RequantizeAvx2 Function Template — pytorch Architecture
Architecture documentation for the RequantizeAvx2 function template in vec256_qint.h from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/cpu/vec/vec256/vec256_qint.h lines 452–493
template <typename T>
__m256i RequantizeAvx2(
const std::array<Vectorized<c10::qint32>, 4>& inp,
__m256 multiplier,
__m256i zp) {
  // Requantize four vectors of 8 x int32 into 32 x 8-bit values:
  // scale by `multiplier`, round to int32, add the zero point `zp`,
  // saturate-pack down to T, and restore sequential element order.
  static_assert(
      std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>,
      "Only int8_t/uint8_t are supported");
  constexpr auto min_val = std::numeric_limits<T>::min();
  constexpr auto max_val = std::numeric_limits<T>::max();

  // Scale each int32 vector in float, convert back to int32
  // (cvtps_epi32 rounds per the current rounding mode, round-to-nearest-even
  // by default), then add the zero point.
  __m256i biased[4];
  for (int i = 0; i < 4; ++i) {
    const __m256 scaled =
        _mm256_mul_ps(_mm256_cvtepi32_ps(inp[i]), multiplier);
    biased[i] = _mm256_add_epi32(_mm256_cvtps_epi32(scaled), zp);
  }

  // Saturating pack: int32 -> int16 pairs, then int16 -> 8-bit with a
  // final clamp to [min_val, max_val] for the target type T.
  const __m256i xy_pair = _mm256_packs_epi32(biased[0], biased[1]);
  const __m256i zw_pair = _mm256_packs_epi32(biased[2], biased[3]);
  const __m256i clamped =
      pack_saturate_and_clamp<T>(xy_pair, zw_pair, min_val, max_val);

  // The pack instructions operate within each 128-bit lane, leaving the
  // bytes ordered x0-3 y0-3 z0-3 w0-3 x4-7 y4-7 z4-7 w4-7; permute the
  // 32-bit groups back into sequential order.
  const __m256i permute_mask =
      _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
  return _mm256_permutevar8x32_epi32(clamped, permute_mask);
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free