_qmaxpool_2d_nhwc_kernel Class — pytorch Architecture

Architecture documentation for the _qmaxpool_2d_nhwc_kernel function template in QuantizedOpKernels.cpp from the pytorch codebase.

Class cpp

Entity Profile

Source Code

aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp lines 1431–1530

// 2-D max pooling over a channels-last buffer.
//
// The input is indexed as [batch][iH][iW][iC] and the output as
// [batch][oH][oW][iC], both with the channel as the fastest-moving dimension
// (see the pointer arithmetic below). For every output pixel the kernel takes
// the per-channel maximum over the taps of a kH x kW window with stride
// (sH, sW), padding (pH, pW) and dilation (dH, dW). Taps that fall outside the
// input (i.e. in the padding) are skipped rather than read.
//
// Template parameters:
//   scalar_t            - element type used for the data pointers and for
//                         Vectorized<scalar_t>.
//   scalar_t_underlying - type the element pointers are reinterpreted as for
//                         the scalar comparisons and for the initial "lowest"
//                         value. NOTE(review): presumably the raw integer type
//                         wrapped by a quantized scalar_t with identical size
//                         and layout - confirm at the call sites.
//
// Parameters:
//   qx - input tensor; size(0) is used as the batch size and data_ptr() as the
//        element buffer.
//   qy - output tensor; written through data_ptr(). It is assumed to be
//        already allocated with nBatch * oH * oW * iC elements - TODO confirm
//        against the caller.
//   Remaining parameters are described inline in the signature.
//
// If a window contains no valid tap, the output for that pixel is
// std::numeric_limits<scalar_t_underlying>::lowest() in every channel.
template <typename scalar_t, typename scalar_t_underlying>
void _qmaxpool_2d_nhwc_kernel(
    const Tensor& qx,
    int64_t iC, // input/output channels
    int64_t iH,
    int64_t iW, // input sizes
    int64_t oH,
    int64_t oW, // output sizes
    int64_t kH,
    int64_t kW, // kernel size
    int64_t sH,
    int64_t sW, // strides
    int64_t pH,
    int64_t pW, // padding
    int64_t dH,
    int64_t dW, // dilation
    Tensor& qy) {
    scalar_t* idata = static_cast<scalar_t*>(qx.data_ptr());
    scalar_t* odata = static_cast<scalar_t*>(qy.data_ptr());

    int64_t nBatch = qx.size(0);
    // The work is split over the flattened (batch, output row, output column)
    // index space; each index produces one output pixel (iC channel values).
    at::parallel_for(0, nBatch * oH * oW, 0, [&](int64_t begin, int64_t end) {
      int64_t b{0}, row{0}, col{0};
      // Presumably decomposes the flat index `begin` into (b, row, col) -
      // verify against the data_index_init helper.
      data_index_init(begin, b, nBatch, row, oH, col, oW);

      for (const auto i : c10::irange(begin, end)) {
        // Start of the current batch image, and of the current output pixel.
        auto* i_p = reinterpret_cast<scalar_t_underlying*>(idata + b * iW * iH * iC);
        auto* o_p = reinterpret_cast<scalar_t_underlying*>(odata + i * iC);

        // Loop over reduction block
        int64_t h_start = row * sH - pH;
        int64_t w_start = col * sW - pW;
        const int64_t h_end = std::min(h_start + (kH - 1) * dH + 1, iH);
        const int64_t w_end = std::min(w_start + (kW - 1) * dW + 1, iW);
        // Advance past taps that lie in the top/left padding, stepping by the
        // dilation so the remaining taps stay on the window's sampling grid.
        while (h_start < 0)
          h_start += dH;
        while (w_start < 0)
          w_start += dW;

        int64_t c = 0;

        // Interleaved vector loop 4x
        constexpr auto vec_width = Vectorized<scalar_t>::size();
        for (; c + 4 * vec_width <= iC; c += 4 * vec_width) {
          Vectorized<scalar_t> acc{
              scalar_t(std::numeric_limits<scalar_t_underlying>::lowest())};
          // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
          Vectorized<scalar_t> accs[4] = {acc, acc, acc, acc};
          for (int64_t y = h_start; y < h_end; y += dH) {
            for (int64_t x = w_start; x < w_end; x += dW) {
              // Address of channel `c` at input tap (y, x); this does not
              // depend on the unroll index, so compute it once per tap.
              const auto* tap = i_p + (y * iW + x) * iC + c;
              for (const auto k : c10::irange(4)) {
                auto vals = Vectorized<scalar_t>::loadu(tap + vec_width * k);
                accs[k] = vec::maximum(accs[k], vals);
              }
            } // for x
          } // for y
          for (const auto k : c10::irange(4)) {
            accs[k].store(o_p + c + vec_width * k);
          }
        } // for c

        // Vector loop
        for (; c + vec_width <= iC; c += vec_width) {
          Vectorized<scalar_t> acc{
              scalar_t(std::numeric_limits<scalar_t_underlying>::lowest())};
          for (int64_t y = h_start; y < h_end; y += dH) {
            for (int64_t x = w_start; x < w_end; x += dW) {
              auto vals =
                  Vectorized<scalar_t>::loadu(i_p + (y * iW + x) * iC + c);
              acc = vec::maximum(acc, vals);
            } // for x
          } // for y
          acc.store(o_p + c);
        } // for c

        // Scalar tail for the channels that do not fill a whole vector.
        for (; c < iC; ++c) {
          auto max_val = std::numeric_limits<scalar_t_underlying>::lowest();
          for (int64_t y = h_start; y < h_end; y += dH) {
            for (int64_t x = w_start; x < w_end; x += dW) {
              auto val = *(i_p + (y * iW + x) * iC + c);
              max_val = std::max(max_val, val);
            } // for x
          } // for y

          o_p[c] = max_val;
        } // for c

        // Move (b, row, col) on to the next output pixel.
        data_index_step(b, nBatch, row, oH, col, oW);
      }
    });
}

Source

View on GitHub

Analyze Your Own Codebase

Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.

Try Supermodel Free