quantize_tensor_arm_q8 Class — pytorch Architecture

Architecture documentation for the quantize_tensor_arm_q8 function template in QuantizedOpKernels.cpp from the pytorch codebase.

Class cpp

Entity Profile

Source Code

aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp lines 3586–3659

// Quantizes N contiguous floats from `in` into 8-bit values written to `out`,
// computing round(in[k] / scale) + zero_point with saturation.
//
// Template parameters:
//   scalar_t        - element type of `out` as seen by the caller; the buffer
//                     is reinterpreted as underlying_t for the stores.
//   underlying_t    - 8-bit integer type actually stored.
//   underlying_x8_t - NEON vector type holding eight underlying_t lanes.
//
// Parameters:
//   in         - source floats (at least N readable elements).
//   out        - destination buffer (at least N writable elements).
//   N          - number of elements to quantize.
//   scale      - quantization scale; the kernel multiplies by 1.0f / scale.
//   zero_point - quantization zero point added after rounding.
//
// Elements are processed eight at a time with NEON; any remainder (N % 8) is
// handled one element at a time through at::native::quantize_val_arm.
template <typename scalar_t, typename underlying_t, typename underlying_x8_t>
void quantize_tensor_arm_q8(
    const float* __restrict__ in,
    scalar_t* __restrict__ out,
    const int64_t N,
    const float scale,
    const int32_t zero_point) {
  const float inv_scale = 1.0f / scale;
  // The index must be as wide as N. With a 32-bit unsigned index, `i + 8` is
  // evaluated modulo 2**32 before being compared against the 64-bit N, so for
  // N close to or above 2**32 the loop condition would never become false and
  // the kernel would run past the end of both buffers.
  int64_t i = 0;
  underlying_t* out_underlying = reinterpret_cast<underlying_t*>(out);
  const float32x4_t vinv_scale = vdupq_n_f32(inv_scale);
#if defined(__ARM_NEON__)
  // magic float and magic int to take care of rounding
  // int magic_round(float f): interpret_int32(f + 12582912.0f) - 0x4B400000
  // Some detail:
  // 12582912.0f is 2**23 + 2**22. The trick is based on the fact that when you
  // add a small number to a large number, the result rounds to the precision of
  // the least significant bit of the large number. For IEEE-754
  // single-precision number mantissa has 23 bits, and adding 2**23 would cause
  // rounding to the nearest even integer. Then we cast to int and subtract the
  // same number (0x4B400000 is the integer representation of 12582912.0f) to
  // get only the mantissa. This works if -2**22 < x < 2**22, but preserves the
  // sign for negative numbers.
  //
  // The zero point is folded into the integer offset so a single integer add
  // both removes the magic bias and applies the zero point.
  const int32x4_t voffset = vdupq_n_s32(zero_point - 0x4B400000);
  const float32x4_t vmagic_float = vdupq_n_f32(12582912.0f);
  for (i = 0; i + 8 <= N; i += 8) {
    const float32x4_t vin0123 = vld1q_f32(in);
    in += 4;
    const float32x4_t vin4567 = vld1q_f32(in);
    in += 4;
    const int32x4_t vraw0123 = vaddq_s32(
        voffset,
        vreinterpretq_s32_f32(
            vaddq_f32(vmagic_float, vmulq_f32(vin0123, vinv_scale))));
    const int32x4_t vraw4567 = vaddq_s32(
        voffset,
        vreinterpretq_s32_f32(
            vaddq_f32(vmagic_float, vmulq_f32(vin4567, vinv_scale))));
    // Saturating narrow 32 -> 16 bits, then 16 -> 8 bits via the vqmov helper.
    const int16x8_t vraw01234567 =
        vcombine_s16(vqmovn_s32(vraw0123), vqmovn_s32(vraw4567));
    const underlying_x8_t vout01234567 =
        quantize_tensor_arm_intrinsics::vqmov<underlying_x8_t>(vraw01234567);
    quantize_tensor_arm_intrinsics::vst1<underlying_t, underlying_x8_t>(
        out_underlying, vout01234567);
    out_underlying += 8;
  }
  // Scalar tail for the remaining (fewer than eight) elements.
  for (; i < N; ++i) {
    (*out_underlying++) =
        at::native::quantize_val_arm<underlying_t>(scale, zero_point, (*in++));
  }
#else
  // NOTE(review): this branch relies on vcvtnq_s32_f32 / vqmovn_high_s32,
  // presumably the AArch64 build where __ARM_NEON__ is not defined - confirm.
  // Rounding is done by the convert instruction itself, so no magic-number
  // trick is needed; the zero point is added afterwards in 16-bit lanes with
  // a saturating add.
  const int16x8_t vzero_point =
      vdupq_n_s16(static_cast<int16_t>(static_cast<uint16_t>(zero_point)));
  for (i = 0; i + 8 <= N; i += 8) {
    const float32x4_t vin0123 = vld1q_f32(in);
    in += 4;
    const float32x4_t vin4567 = vld1q_f32(in);
    in += 4;
    const int32x4_t v0123_rounded = vcvtnq_s32_f32(vmulq_f32(vin0123, vinv_scale));
    const int32x4_t v4567_rounded = vcvtnq_s32_f32(vmulq_f32(vin4567, vinv_scale));
    const int16x8_t v01234567_packed = vqaddq_s16(
        vqmovn_high_s32(vqmovn_s32(v0123_rounded), v4567_rounded), vzero_point);
    const underlying_x8_t vout01234567 =
        quantize_tensor_arm_intrinsics::vqmov<underlying_x8_t>(
            v01234567_packed);
    quantize_tensor_arm_intrinsics::vst1<underlying_t, underlying_x8_t>(
        out_underlying, vout01234567);
    out_underlying += 8;
  }
  // Scalar tail for the remaining (fewer than eight) elements.
  for (; i < N; ++i) {
    (*out_underlying++) =
        at::native::quantize_val_arm<underlying_t>(scale, zero_point, (*in++));
  }
#endif
}

Source

View on GitHub

Analyze Your Own Codebase

Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.

Try Supermodel Free