_qadaptive_avg_pool_kernel Function Template — pytorch Architecture
Architecture documentation for the _qadaptive_avg_pool_kernel function template (a free function, not a class) in QuantizedOpKernels.cpp from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp lines 1792–1901
// Shared CPU kernel for quantized adaptive average pooling (2d and 3d).
// Averages each adaptive window of the quantized input `qx` into `qy`,
// requantizing from the input (scale, zero_point) to the output's.
// T is a quantized cell type whose T::underlying is the raw int8/uint8
// storage type. For the 2d case the D-dimension arguments are set to 1
// (see the per-parameter comments below), collapsing the depth loop to a
// single iteration.
template <typename T>
void _qadaptive_avg_pool_kernel(
    const Tensor& qx,
    Tensor& qy,
    int64_t nBatch,
    int64_t sizeC,
    int64_t isizeD, // Set to 1 for 2d
    int64_t isizeH,
    int64_t isizeW,
    int64_t osizeD, // Set to 1 for 2d
    int64_t osizeH,
    int64_t osizeW,
    int64_t istrideB,
    int64_t istrideC,
    int64_t istrideD, // Set to 1 for 2d
    int64_t istrideH,
    int64_t istrideW) {
  T* idata = static_cast<T*>(qx.data_ptr());
  T* odata = static_cast<T*>(qy.data_ptr());
  // Affine quantization parameters of input and output. The requantization
  // below folds input_scale/output_scale and the window size into a single
  // `multiplier` applied to the int32 accumulator.
  const float input_scale = qx.q_scale();
  const float output_scale = qy.q_scale();
  const int input_zero_point = qx.q_zero_point();
  const int output_zero_point = qy.q_zero_point();
  // Parallelism is over the batch dimension only; each task handles a
  // contiguous range of batches.
  at::parallel_for(0, nBatch, 0, [&](int64_t batch_start, int64_t batch_end) {
    for (const auto b : c10::irange(batch_start, batch_end)) {
      // Raw (underlying-int) pointer to the start of batch b in the input.
      auto* i_p = reinterpret_cast<typename T::underlying*>(
          idata + b * istrideB);
      for (const auto od : c10::irange(osizeD)) {
        // Adaptive window along D: output index od maps to input slice
        // [istartD, iendD); neighboring windows may differ in size by one.
        int istartD = (int)std::floor((float)(od * isizeD) / osizeD);
        int iendD = (int)std::ceil((float)((od + 1) * isizeD) / osizeD);
        int kD = iendD - istartD;
        for (const auto oh : c10::irange(osizeH)) {
          // Adaptive window along H, same construction as D.
          int istartH = (int)std::floor((float)(oh * isizeH) / osizeH);
          int iendH = (int)std::ceil((float)((oh + 1) * isizeH) / osizeH);
          int kH = iendH - istartH;
          for (const auto ow : c10::irange(osizeW)) {
            // Output pointer for position (b, od, oh, ow); the output is
            // indexed channels-last with channel stride 1 — presumably
            // NHWC/NDHWC contiguous, confirm against the caller's layout.
            auto* o_p = reinterpret_cast<typename T::underlying*>(
                odata +
                b * osizeD * osizeH * osizeW * sizeC +
                od * osizeH * osizeW * sizeC +
                oh * osizeW * sizeC +
                ow * sizeC);
            // Adaptive window along W, same construction as D and H.
            int istartW = (int)std::floor((float)(ow * isizeW) / osizeW);
            int iendW = (int)std::ceil((float)((ow + 1) * isizeW) / osizeW);
            int kW = iendW - istartW;
            int size = kD * kH * kW;
            // Combined requantization factor: rescale from input to output
            // quantization and divide by the window element count.
            float multiplier = input_scale / output_scale / size;
            // Pre-subtract the input zero point once per window: the
            // accumulator starts at -zero_point * size instead of
            // subtracting zero_point from each of the `size` elements.
            int input_zero_point_m_size = -input_zero_point * size;
            int64_t c = 0;
            // For int8 or uint8 quantization, we implicitly use int32 as
            // accumulation. Or else, it will go to the slow path.
            // TODO: support 16bit, 32bit, and etc.
            // Pointer to the window's first element; the per-element loop
            // below offsets from here.
            auto* internal_i_p = i_p +
                                 istartD * istrideD +
                                 istartH * istrideH +
                                 istartW * istrideW;
            // Note: If AVX is not available, `do_avg_pool_on_AVX_n` is a
            // no-op. In that case, the following loop takes over.
            // It advances `c` (passed by reference) past the channels it
            // vectorized, so the scalar loop resumes where it left off.
            // TODO: more vectorization with loop interleaving
            do_avg_pool_on_AVX_n<T>(
                internal_i_p,
                o_p,
                c,
                sizeC,
                1,
                input_zero_point_m_size,
                output_zero_point,
                multiplier,
                0,
                kD,
                0,
                kH,
                0,
                kW,
                istrideC,
                istrideD,
                istrideH,
                istrideW);
            // 1) The following loop handles the remaining channels
            // 2) It also handles the Non-AVX2 path
            for (; c < sizeC; ++c) {
              // Sum the window for channel c into an int32 accumulator,
              // with the zero-point offset already folded in.
              int32_t acc_int32 = input_zero_point_m_size;
              int64_t tcntr = 0;
              for (const auto id : c10::irange(kD)) {
                for (const auto ih : c10::irange(kH)) {
                  for (const auto iw : c10::irange(kW)) {
                    tcntr = id * istrideD +
                            ih * istrideH +
                            iw * istrideW;
                    auto val = *(internal_i_p + tcntr + c * istrideC);
                    acc_int32 += val;
                  }
                }
              }
              // Requantize: quantize_val divides by its scale argument, so
              // passing 1/multiplier multiplies acc_int32 by `multiplier`,
              // adds output_zero_point, and clamps to T's representable
              // range before storing the underlying integer value.
              o_p[c] = at::native::quantize_val<T>(1.0f / multiplier,
                                                   output_zero_point,
                                                   acc_int32).val_;
            } // c
          } // ow
        } // oh
      } // od
    }
  });
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free