Home / Class / RequantizeAvx2 Class — pytorch Architecture

RequantizeAvx2 Class — pytorch Architecture

Architecture documentation for the RequantizeAvx2 function template in vec256_qint.h from the PyTorch codebase.

Entity Profile

Source Code

aten/src/ATen/cpu/vec/vec256/vec256_qint.h lines 452–493

/**
 * Requantizes four vectors of 32-bit integer values (32 values in total) to
 * T and returns them packed into a single 256-bit register.
 *
 * Every input vector is converted to float, multiplied lane-wise by
 * `multiplier`, converted back to int32 with _mm256_cvtps_epi32, and offset by
 * `zp`. The four results are then narrowed with saturation, clamped to the
 * range of T, and permuted so that the values of each input vector end up
 * contiguous in the output.
 *
 * @tparam T          Output element type; must be int8_t or uint8_t.
 * @param inp         The four qint32 vectors to requantize.
 * @param multiplier  Float factor applied lane-wise before rounding.
 * @param zp          Zero point added lane-wise after rounding.
 * @return            The narrowed, clamped and reordered values.
 */
template <typename T>
__m256i RequantizeAvx2(
    const std::array<Vectorized<c10::qint32>, 4>& inp,
    __m256 multiplier,
    __m256i zp) {
  static_assert(
      std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>,
      "Only int8_t/uint8_t are supported");

  constexpr auto lowest = std::numeric_limits<T>::min();
  constexpr auto highest = std::numeric_limits<T>::max();

  // int32 -> float, scale, convert back to int32, then add the zero point.
  const auto scale_round_offset = [&](const Vectorized<c10::qint32>& acc) {
    const __m256 scaled = _mm256_mul_ps(_mm256_cvtepi32_ps(acc), multiplier);
    return _mm256_add_epi32(_mm256_cvtps_epi32(scaled), zp);
  };

  const __m256i q0 = scale_round_offset(inp[0]);
  const __m256i q1 = scale_round_offset(inp[1]);
  const __m256i q2 = scale_round_offset(inp[2]);
  const __m256i q3 = scale_round_offset(inp[3]);

  // Narrow int32 -> int16 with signed saturation, two vectors at a time.
  const __m256i packed01 = _mm256_packs_epi32(q0, q1);
  const __m256i packed23 = _mm256_packs_epi32(q2, q3);

  // Narrow further to T and clamp to [lowest, highest].
  const __m256i narrowed =
      pack_saturate_and_clamp<T>(packed01, packed23, lowest, highest);

  // The packs above work per 128-bit half, so `narrowed` is laid out as
  //   x0-3 y0-3 z0-3 w0-3 | x4-7 y4-7 z4-7 w4-7
  // Output dword i is taken from source dword dword_order[i], which brings
  // the two halves of each input vector next to each other.
  const __m256i dword_order = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
  return _mm256_permutevar8x32_epi32(narrowed, dword_order);
}

Analyze Your Own Codebase

Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.

Try Supermodel Free