Home / Class / _qadaptive_avg_pool_kernel Class — pytorch Architecture

_qadaptive_avg_pool_kernel Class — pytorch Architecture

Architecture documentation for the _qadaptive_avg_pool_kernel class in QuantizedOpKernels.cpp from the pytorch codebase.

Entity Profile

Source Code

aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp lines 1792–1901

// Adaptive average pooling over a quantized NDHWC/NHWC tensor `qx`, writing
// the quantized result into `qy`. Works for both 2d and 3d pooling: for the
// 2d case the depth dimension collapses to a single plane (isizeD/osizeD = 1).
// T is a quantized cell type (e.g. c10::qint8/quint8) with a `T::underlying`
// integral representation — assumed from the casts below; confirm at call sites.
template <typename T>
void _qadaptive_avg_pool_kernel(
    const Tensor& qx,    // quantized input tensor (source of scale/zero-point)
    Tensor& qy,          // quantized output tensor, pre-allocated by the caller
    int64_t nBatch,
    int64_t sizeC,
    int64_t isizeD,  // Set to 1 for 2d
    int64_t isizeH,
    int64_t isizeW,
    int64_t osizeD,  // Set to 1 for 2d
    int64_t osizeH,
    int64_t osizeW,
    int64_t istrideB,
    int64_t istrideC,
    int64_t istrideD,  // Set to 1 for 2d
    int64_t istrideH,
    int64_t istrideW) {

  T* idata = static_cast<T*>(qx.data_ptr());
  T* odata = static_cast<T*>(qy.data_ptr());

  // Quantization parameters of both tensors; the kernel requantizes directly
  // from the input's (scale, zero_point) to the output's without an
  // intermediate float tensor.
  const float input_scale = qx.q_scale();
  const float output_scale = qy.q_scale();
  const int input_zero_point = qx.q_zero_point();
  const int output_zero_point = qy.q_zero_point();

  // Parallelize over the batch dimension; each batch element is independent.
  at::parallel_for(0, nBatch, 0, [&](int64_t batch_start, int64_t batch_end) {
    for (const auto b : c10::irange(batch_start, batch_end)) {
      // Raw integer view of this batch element's input plane. T and
      // T::underlying are assumed to have identical size/layout, so pointer
      // arithmetic in T units before the cast is equivalent.
      auto* i_p = reinterpret_cast<typename T::underlying*>(
          idata + b * istrideB);

      // Adaptive pooling: each output index o covers input rows
      // [floor(o*isize/osize), ceil((o+1)*isize/osize)), so windows tile the
      // input exactly and may differ in size by one.
      for (const auto od : c10::irange(osizeD)) {
        int istartD = (int)std::floor((float)(od * isizeD) / osizeD);
        int iendD = (int)std::ceil((float)((od + 1) * isizeD) / osizeD);
        int kD = iendD - istartD;
        for (const auto oh : c10::irange(osizeH)) {
          int istartH = (int)std::floor((float)(oh * isizeH) / osizeH);
          int iendH = (int)std::ceil((float)((oh + 1) * isizeH) / osizeH);
          int kH = iendH - istartH;
          for (const auto ow : c10::irange(osizeW)) {
            // Output is channels-last and contiguous, so its offset is
            // computed directly rather than via explicit strides.
            auto* o_p = reinterpret_cast<typename T::underlying*>(
                odata +
                b * osizeD * osizeH * osizeW * sizeC +
                od * osizeH * osizeW * sizeC +
                oh * osizeW * sizeC +
                ow * sizeC);
            int istartW = (int)std::floor((float)(ow * isizeW) / osizeW);
            int iendW = (int)std::ceil((float)((ow + 1) * isizeW) / osizeW);
            int kW = iendW - istartW;
            int size = kD * kH * kW;
            // Requantization factor: converts an int32 sum of `size` input
            // cells into the output's quantized domain (averaging folded in).
            float multiplier = input_scale / output_scale / size;
            // Pre-computed bias that cancels the input zero point once per
            // accumulated window (sum of `size` values each offset by -zp).
            int input_zero_point_m_size = -input_zero_point * size;
            int64_t c = 0;
            // For int8 or uint8 quantization, we implicitly use int32 as
            // accumulation. Or else, it will go to the slow path.
            // TODO: support 16bit, 32bit, and etc.
            // Top-left corner of this pooling window in the input plane.
            auto* internal_i_p = i_p +
                                istartD * istrideD +
                                istartH * istrideH +
                                istartW * istrideW;

            // Note: If AVX is not available, `do_avg_pool_on_AVX_n` is a noop.
            //       In that case, the following loop takes over.
            // `c` is presumably taken by reference and advanced past the
            // channels the vectorized helper processed — the scalar loop
            // below resumes from it (confirm against the helper's signature).
            // TODO: more vectorization with loop interleaving
            do_avg_pool_on_AVX_n<T>(
                internal_i_p,
                o_p,
                c,
                sizeC,
                1,
                input_zero_point_m_size,
                output_zero_point,
                multiplier,
                0,
                kD,
                0,
                kH,
                0,
                kW,
                istrideC,
                istrideD,
                istrideH,
                istrideW);
            // 1) The following loop handles the remaining channels
            // 2) It also handles the Non-AVX2 path
            for (; c < sizeC; ++c) {
              // Start from the zero-point bias so the loop can add raw
              // quantized values without per-element subtraction.
              int32_t acc_int32 = input_zero_point_m_size;
              int64_t tcntr = 0;
              for (const auto id : c10::irange(kD)) {
                for (const auto ih : c10::irange(kH)) {
                  for (const auto iw : c10::irange(kW)) {
                    tcntr = id * istrideD +
                        ih * istrideH +
                        iw * istrideW;
                    auto val = *(internal_i_p + tcntr + c * istrideC);
                    acc_int32 += val;
                  }
                }
              }
              // Requantize the window sum: quantize_val divides by its scale
              // argument, so pass 1/multiplier to apply `multiplier`; it also
              // clamps to T's representable range.
              o_p[c] = at::native::quantize_val<T>(1.0f / multiplier,
                                                          output_zero_point,
                                                          acc_int32).val_;
            } // c
          } // ow
        } // oh
      } // od
    }
  });
}

Analyze Your Own Codebase

Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.

Try Supermodel Free