Vectorized&lt;c10::qint32&gt; — PyTorch Architecture
Architecture documentation for the Vectorized&lt;c10::qint32&gt; template specialization (including its size()/float_num_vecs() accessors) in vec256_qint.h from the PyTorch codebase.
Entity Profile
Source Code
aten/src/ATen/cpu/vec/vec256/vec256_qint.h lines 292–426
// AVX2-backed vector of 8 quantized 32-bit integers (c10::qint32).
// Provides load/store, dequantize/quantize round-trips, and element-wise
// min/max/relu operations used by quantized CPU kernels.
template <>
struct Vectorized<c10::qint32> : public Vectorizedqi {
  using size_type = int;
  // Number of qint32 lanes; matches the plain int vector width (8 for AVX2).
  static constexpr size_type kSize = Vectorized<int>::size();
  static constexpr size_type size() {
    return kSize;
  }
  // How many Vectorized<float> are produced by dequantize().
  // 8 int32 lanes dequantize into exactly one 8-lane float vector.
  static constexpr int kFloatNumVecs = kSize / Vectorized<float>::size();
  static constexpr int float_num_vecs() {
    return kFloatNumVecs;
  }
  static constexpr int int_num_vecs() {
    return 1;
  }
  using float_vec_return_type = std::array<Vectorized<float>, kFloatNumVecs>;
  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 1>;
  using value_type = c10::qint32::underlying;

 public:
  using Vectorizedqi::Vectorizedqi;
  Vectorized() {}
  Vectorized(__m256i vals_) {
    vals = vals_;
  }

  // Broadcast constructor: replicate a single qint32 value into all lanes.
  Vectorized(const c10::qint32& val) {
    value_type uw = val.val_;
    vals = _mm256_set1_epi32(uw);
  }

  // Store `count` elements to `ptr`. A full-width store uses a single
  // unaligned 256-bit store; a partial store falls back to memcpy so no
  // bytes past `count` elements are written.
  void store(void* ptr, int count = size()) const {
    if (count != size()) {
      memcpy(ptr, &vals, count * sizeof(value_type));
    } else {
      _mm256_storeu_si256((__m256i*)ptr, vals);
    }
  }

  // Unaligned full-width load.
  static Vectorized<c10::qint32> loadu(const void* ptr) {
    return Vectorized<c10::qint32>(ptr);
  }

  // Partial load: read only `count` elements from `ptr`; remaining lanes
  // are zero-filled.
  static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    // Ensure uninitialized memory does not change the output value See
    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
    // not initialize arrays to zero using "={0}" because gcc would compile it
    // to two instructions while a loop would be compiled to one instruction.
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(
        tmp_values,
        reinterpret_cast<const value_type*>(ptr),
        count * sizeof(value_type));
    return _mm256_loadu_si256((const __m256i*)tmp_values);
  }

  // Dequantize using a precomputed scale * zero_point product:
  // result = scale * float(vals) + scale_zp_premul
  // (scale_zp_premul is presumably -scale * zero_point, supplied by the
  // caller so the fused multiply-add can be used — confirm against callers).
  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> /*zero_point*/,
      Vectorized<float> scale_zp_premul) const {
    __m256 float_vals = _mm256_cvtepi32_ps(vals);
    return {vec::fmadd(scale, Vectorized<float>(float_vals), scale_zp_premul)};
  }

  // Dequantize without the premultiplied term:
  // result = (float(vals) - zero_point) * scale
  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point) const {
    __m256 float_vals = _mm256_cvtepi32_ps(vals);
    return {(Vectorized<float>(float_vals) - zero_point) * scale};
  }

  // Quantize a block of floats to qint32 via the scalar reference
  // implementation (no SIMD fast path for 32-bit quantization here).
  static Vectorized<c10::qint32> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float /*inverse_scale*/) {
    Vectorized<c10::qint32> retval;
    auto rhs_data = (__m256)rhs[0];
    at::native::quantize_vec<c10::qint32, /*precision=*/32>(
        scale,
        zero_point,
        (float*)&rhs_data,
        (c10::qint32*)&retval.vals,
        size());
    return retval;
  }

  // Element-wise signed max.
  Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
    return _mm256_max_epi32(vals, b.vals);
  }

  // Element-wise signed min.
  Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
    return _mm256_min_epi32(vals, b.vals);
  }

  // ReLU in the quantized domain: clamp below at the quantized zero point.
  Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const {
    return maximum(zero_point);
  }

  // ReLU6 in the quantized domain: clamp to [zero_point, q_six].
  // NOTE: marked const for consistency with maximum/minimum/relu — it
  // reads but never mutates `vals`.
  Vectorized<c10::qint32> relu6(
      Vectorized<c10::qint32> zero_point,
      Vectorized<c10::qint32> q_six) const {
    return _mm256_min_epi32(
        _mm256_max_epi32(vals, zero_point.vals), q_six.vals);
  }

  // Element-wise 32-bit subtract (no widening needed: values are already
  // 32-bit, so the "widened" result is a single qint32 vector).
  int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
    return {_mm256_sub_epi32(vals, b)};
  }

  // Requantize: result = round(float(inp) * multiplier) + zero_point.
  // _mm256_cvtps_epi32 rounds per the current MXCSR mode
  // (round-to-nearest-even by default).
  static Vectorized<c10::qint32> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    __m256 multiplier_v = _mm256_set1_ps(multiplier);
    __m256i zero_point_v = _mm256_set1_epi32(zero_point);
    __m256 scaled = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[0]), multiplier_v);
    __m256i rounded = _mm256_cvtps_epi32(scaled);
    return _mm256_add_epi32(rounded, zero_point_v);
  }

 private:
  // Load-from-memory constructor (unaligned 256-bit load); private so
  // callers go through the named loadu() entry points.
  Vectorized(const void* ptr) {
    vals = _mm256_loadu_si256((const __m256i*)ptr);
  }
};
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free