
ReluFused Template Parameter — PyTorch Architecture

Architecture documentation for ReluFused, a compile-time boolean template parameter on PackedLinearWeight::apply_impl in qlinear.cpp from the PyTorch codebase. When ReluFused is true, a ReLU is fused into the requantization stage of the quantized linear operation; when false, the plain linear output is produced.
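As a rough sketch of how the two public operators select the instantiation: the wrapper bodies below are simplified assumptions, and create_output_tensor is a hypothetical helper standing in for the real output allocation (the actual code allocates an affine-quantized output tensor).

at::Tensor PackedLinearWeight::apply(
    at::Tensor input, double output_scale, int64_t output_zero_point) {
  // create_output_tensor is hypothetical; it stands in for allocating an
  // empty affine-quantized output tensor with the given scale / zero point.
  auto output = create_output_tensor(input, output_scale, output_zero_point);
  return apply_impl<false>(input, output_scale, output_zero_point, output);
}

at::Tensor PackedLinearWeight::apply_relu(
    at::Tensor input, double output_scale, int64_t output_zero_point) {
  auto output = create_output_tensor(input, output_scale, output_zero_point);
  return apply_impl<true>(input, output_scale, output_zero_point, output);
}

Because the flag is a template parameter, the fused-ReLU decision is resolved at compile time; each operator gets its own specialized GEMM pipeline with no per-call branch.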

Entity Profile

Source Code

aten/src/ATen/native/quantized/cpu/qlinear.cpp lines 47–246

template <bool ReluFused>
at::Tensor& PackedLinearWeight::apply_impl(
    const at::Tensor& input,
    double output_scale,
    int64_t output_zero_point,
    at::Tensor& output) {
  // uint8 * int8 -> uint8 (no quantization/dequantization)

  // We make a strong guarantee that models using these operators will have
  // the same numerics across different machines. Therefore, we do not provide
  // a fallback path and rather fail loudly if we cannot run FBGEMM.
  TORCH_CHECK(
      fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
  TORCH_CHECK(input.scalar_type() == c10::kQUInt8,
                "Expected input data type ",
                toString(c10::kQUInt8),
                " but got ",
                toString(input.scalar_type()));

  // TODO: contiguous is called for further jit optimizations.
  auto input_contig = input.expect_contiguous();
  const auto* input_ptr =
      reinterpret_cast<uint8_t*>(input_contig->data_ptr<c10::quint8>());

  TORCH_CHECK(
      input.dim() >= 2,
      "The dimension of input tensor should be larger than or equal to 2");
  // C(output) = A(input) x B(weight), where C, A, B are M x N, M x K, K x N
  // matrices, respectively.
  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
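  // Illustrative example: a {2, 3, K} input collapses its leading
  // dimensions, so M = 2 * 3 = 6 rows enter the GEMM.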

  auto packB = w.get();

  int64_t N = static_cast<int64_t>(packB->numCols());
  int64_t K = input.sizes()[input.dim() - 1];
  TORCH_CHECK(
      K == static_cast<int64_t>(packB->numRows()),
      "The number of rows in the packB should be equal to K: " +
          std::to_string(K));

  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  float input_scale_float = input.q_scale();
  int32_t input_zero_point_int32 = input.q_zero_point();

  std::vector<float> output_multiplier_float(1, 0.0);
  std::vector<float> act_times_w_scale(1, 0.0);
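  // The requantization multiplier is (input_scale * w_scale) / output_scale;
  // it rescales the int32 accumulator of the uint8 x int8 GEMM into the
  // output's uint8 domain, either per tensor or per output channel below.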
  TORCH_CHECK(
      w_scale.size() == w_zp.size(),
      "Weight scales and zero points vectors should have the same size.");
  if (q_scheme == c10::kPerTensorAffine) {
    // Process the per tensor quantization.
    act_times_w_scale[0] = (input_scale_float * w_scale[0]);
    output_multiplier_float[0] =
        act_times_w_scale[0] / static_cast<float>(output_scale);
  } else if (q_scheme == c10::kPerChannelAffine) {
    // Process the per channel quantization.
    output_multiplier_float.resize(N, 0.0);
    act_times_w_scale.resize(N, 1.0f);
    for (const auto i : c10::irange(N)) {
      act_times_w_scale[i] = (input_scale_float * w_scale[i]);
      output_multiplier_float[i] =
          act_times_w_scale[i] / static_cast<float>(output_scale);
    }
  }
  int32_t output_zero_point_int32 = static_cast<int32_t>(output_zero_point);

  const float* bias_ptr = nullptr;
  c10::MaybeOwned<at::Tensor> bias_contig;
  if (this->bias_.has_value()) {
    auto& bias = this->bias_.value();
    bias_contig = bias.expect_contiguous();
    TORCH_CHECK(bias_contig->dim() == 1, "bias should be a vector (1D Tensor)");
    TORCH_CHECK(
        bias_contig->sizes()[0] == N, "bias should have N elements: " + std::to_string(N));
    bias_ptr = reinterpret_cast<float*>(bias_contig->data_ptr<float>());
  }

  // The resulting matrix here is 2-D, let's view it with the original
  // left hand dimensions of the input. Here are two examples:
  // 1. If the input tensor is {M, K}, the output tensor is {M, N}.
  // 2. If the input tensor is {b, M, K}, the output tensor is {b, M, N}.
  at::DimVector out_sizes(input.sizes());
  out_sizes.back() = N;
  // Resize output Tensor
  output.resize_(out_sizes);

  // Allocate a buffer for fbgemmPacked to use
  auto buffer = at::empty(out_sizes, output.options().dtype(at::kInt));
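  // fbgemmPacked accumulates into this int32 buffer; requantization back to
  // uint8 happens in the ReQuantizeOutput pipeline below.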

  auto output_data = reinterpret_cast<uint8_t*>(output.data_ptr<c10::quint8>());

  int num_tasks = at::get_num_threads();
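  // Each task forwards its task_id / num_tasks pair to fbgemmPacked, which
  // partitions the GEMM internally; parallel_for only launches the workers.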
  at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) {
    for (const auto task_id : c10::irange(begin, end)) {
      // This operation does the following:
      // 1) Creates a "row buffer" vector with offset values that must be
      //    added to the integer matrix multiplication operation to ensure
      //    correctness. This "row buffer" is also called the row offset, and
      //    it is needed when we use affine quantization for weights.
      // 2) Packs the resulting quantized matrix into vector-register and
      //    cache friendly tiles.
      //
      //  Note this is not executed eagerly, but rather within the
      //  fbgemmPacked call below.
      fbgemm::PackAWithRowOffset<uint8_t> packA(
          /*trans=*/fbgemm::matrix_op_t::NoTranspose,
          /*nRow=*/M,
          /*nCol=*/K,
          /*smat=*/input_ptr,
          /*ld=*/K,
          /*pmat=*/nullptr); // Currently, packA manages ownership of `pmat`.
                             // TODO: Consider a way to pre-allocate and reuse
                             // pmat buffer.

      // ReQuantizeOutput requires pointers to the zero point values,
      // since in the case of rowwise quantization these will be arrays rather
      // than scalars. But in this case, we're doing whole-tensor quantization
      // so we just pass a pointer to the scale values (and internally
      // ReQuantizeOutput won't index past 0).

      // This is the end of the pipeline, pass the resulting matrix through.
      fbgemm::DoNothing<> doNothingObj{};

      if (q_scheme == c10::kPerTensorAffine) {
        // Process the per tensor quantization.
        //
        // After the uint8 * int8 matrix multiplication is performed, this
        // operation does:
        //  1) Add in row and column offsets to the rows and columns,
        //  respectively.
        //  2) Add in the bias term.
        fbgemm::ReQuantizeOutput<
            ReluFused,
            fbgemm::QuantizationGranularity::TENSOR,
            float>
            outputProcObj(
                doNothingObj,
                output_multiplier_float.data(),
                output_zero_point_int32,
                input_zero_point_int32,
                w_zp.data(),
                packA.getRowOffsetBuffer(),
                col_offsets.data(),
                bias_ptr,
                N, /* nCol */
                1 /* groups */,
                act_times_w_scale.data());

        // Do the GEMM
        fbgemm::fbgemmPacked(
            /*packA=*/packA,
            /*packB=*/*packB,
            /*C=*/output_data,
            /*C_buffer=*/buffer.data_ptr<int32_t>(),
            /*ldc=*/N,
            /*outProcess=*/outputProcObj,
            /*thread_id=*/task_id,
            /*num_threads=*/num_tasks);
      } else if (q_scheme == c10::kPerChannelAffine) {
        // Process the per channel quantization.
        //
        // After the uint8 * int8 matrix multiplication is performed, this
        // operation does:
        //  1) Add in row and column offsets to the rows and columns,
        //  respectively.
        //  2) Add in the bias term.
        fbgemm::ReQuantizeOutput<
            ReluFused,
            fbgemm::QuantizationGranularity::OUT_CHANNEL,
            float>
            outputProcObj(
                doNothingObj,
                output_multiplier_float.data(),
                output_zero_point_int32,
                input_zero_point_int32,
                w_zp.data(),
                packA.getRowOffsetBuffer(),
                col_offsets.data(),
                bias_ptr,
                N, /* nCol */
                1, /* groups */
                act_times_w_scale.data());

        // Do the GEMM
        fbgemm::fbgemmPacked(
            /*packA=*/packA,
            /*packB=*/*packB,
            /*C=*/output_data,
            /*C_buffer=*/buffer.data_ptr<int32_t>(),
            /*ldc=*/N,
            /*outProcess=*/outputProcObj,
            /*thread_id=*/task_id,
            /*num_threads=*/num_tasks);
      }
    }
  });

  return output;
}
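For intuition, the per-element effect of fbgemm::ReQuantizeOutput can be modeled with the scalar sketch below. This is a simplified reconstruction, not FBGEMM's actual implementation: requantize_element is a hypothetical name, only the per-tensor case is shown (the per-channel case indexes the multiplier, weight zero point, and bias by output column), and the bias is assumed pre-divided by act_times_w_scale, matching the float-bias path above.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical scalar model of one output element. `acc` is the raw int32
// dot product of a uint8 activation row with an int8 weight column.
uint8_t requantize_element(
    int32_t acc,
    int32_t row_offset,    // sum of the uint8 activations in this row
    int32_t col_offset,    // sum of the int8 weights in this column
    int32_t a_zero_point,  // activation (input) zero point
    int32_t w_zero_point,  // weight zero point
    int32_t out_zero_point,
    float multiplier,      // (input_scale * w_scale) / output_scale
    float bias,            // bias[j] / (input_scale * w_scale)
    bool relu_fused) {
  // 1) Undo the affine zero points of both operands.
  acc -= a_zero_point * col_offset;
  acc -= w_zero_point * row_offset;
  // 2) Add the (rescaled) bias and map into the output's quantized domain.
  float scaled = (static_cast<float>(acc) + bias) * multiplier;
  int32_t rounded =
      static_cast<int32_t>(std::nearbyint(scaled)) + out_zero_point;
  // 3) Fused ReLU clamps at the output zero point (the quantized zero);
  //    either way the result saturates to the uint8 range.
  int32_t lo = relu_fused ? out_zero_point : 0;
  return static_cast<uint8_t>(std::min<int32_t>(255, std::max(lo, rounded)));
}

With ReluFused true, the lower clamp moves from 0 up to out_zero_point, which is exactly a ReLU expressed in the quantized domain, so no separate activation kernel or extra pass over the output is needed.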
