kReluFused Template Parameter — pytorch Architecture
Architecture documentation for the apply_impl_xnnp function template (parameterized on the bool kReluFused) in qlinear.cpp from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/quantized/cpu/qlinear.cpp lines 463–616
/// Runs a quantized linear (optionally with fused ReLU) through the XNNPACK
/// backend.
///
/// @tparam scalar_t    quantized scalar type (e.g. c10::qint8/quint8); its
///                     `underlying` typedef is the raw integer storage type.
/// @tparam kReluFused  when true, clamps the output to the ReLU range and
///                     reports errors as "quantized::linear_relu (xnnpack)".
/// @param input             quantized input tensor, rank >= 2, last dim = in_features.
/// @param output_scale      quantization scale of the output tensor.
/// @param output_zero_point quantization zero point of the output tensor.
/// @return quantized output tensor with the input's leading dims and
///         out_features as the last dim.
template <typename scalar_t, bool kReluFused>
at::Tensor PackedLinearWeightsQnnp::apply_impl_xnnp(
    const at::Tensor& input,
    double output_scale,
    int64_t output_zero_point) {
  using underlying_t = typename scalar_t::underlying;

  // Serialize access: the cached XNNPACK operator and the cached input scale
  // are shared mutable state on this packed-weights object.
  std::lock_guard<std::mutex> lock(qnnp_mutex_);

  const std::string func_name = kReluFused ? "quantized::linear_relu (xnnpack)"
                                           : "quantized::linear (xnnpack)";
  TORCH_CHECK(
      input.dim() >= 2, func_name, ": Input tensor rank should be >= 2.");
  TORCH_CHECK(
      !per_channel(),
      func_name,
      ": xnnpack does not currently have per_channel support.");

  const auto input_contig = input.contiguous();
  const auto input_scale = input_contig.q_scale();

  // bias_ has one entry per output channel; the last input dim is in_features.
  const size_t rows_w = bias_.size(0);
  const size_t cols_w = input_contig.size(input_contig.dim() - 1);

  auto status = xnn_status_invalid_state;

  // (Re)create the operator only if none is cached yet, or if the input scale
  // changed since the cached operator was built (the scale is baked into the
  // operator's requantization parameters).
  if (!xnnp_linear_op ||
      (!this->input_scale.has_value() ||
       this->input_scale.value() != input_scale)) {
    // Remember the scale the cached operator was built for.
    this->input_scale = input_scale;

    xnn_operator_t xnnp_op = nullptr;
    const float* weight_scales_data = w_scales.const_data_ptr<float>();

    // prepare weights: shift the zero point by 128 when converting a signed
    // underlying type to XNNPACK's unsigned representation.
    underlying_t w_zp = static_cast<underlying_t>(
        orig_weight.q_zero_point() +
        (std::is_same_v<underlying_t, uint8_t> ? 128 : 0));

    at::Tensor xnnp_weight = at::_empty_affine_quantized(
        orig_weight.sizes(),
        c10::CppTypeToScalarType<scalar_t>::value,
        weight_scales_data[0],
        w_zp);

    // copy from the original weight and take care of dtype change if necessary
    at::native::xnnp_utils::q8_copy_int8_weight_and_add_offset<scalar_t>(
        orig_weight, xnnp_weight);

    // Original bias was float, so we requantize it here.
    at::Tensor qbias =
        quant_utils::QuantizeBias(false, bias_, orig_weight, input_scale);

    // output limits: full representable range, narrowed to the ReLU range
    // when the activation is fused.
    auto output_min = kReluFused
        // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
        ? activationLimits<underlying_t>(output_scale, output_zero_point, Activation::RELU).first
        : std::numeric_limits<underlying_t>::min();
    auto output_max = kReluFused
        // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
        ? activationLimits<underlying_t>(output_scale, output_zero_point, Activation::RELU).second
        : std::numeric_limits<underlying_t>::max();

    // Create the fully-connected operator.
    status = at::native::xnnp_utils::xnnp_create_fully_connected_nc(
        cols_w, /* input_channels */
        rows_w, /* output_channels */
        cols_w, /* input_stride */
        rows_w, /* output_stride */
        input_contig.q_zero_point(),
        input_contig.q_scale(),
        w_zp,
        weight_scales_data[0],
        reinterpret_cast<const underlying_t*>(
            xnnp_weight.template data_ptr<scalar_t>()),
        reinterpret_cast<int32_t*>(qbias.data_ptr<c10::qint32>()),
        output_zero_point,
        output_scale,
        output_min,
        output_max,
        0, /* flags */
        &xnnp_op);
    // Wrap immediately so the raw operator is released even if the check
    // below throws.
    xnnp_linear_op = xnnpack_operator(xnnp_op);
    TORCH_CHECK(
        status == xnn_status_success,
        func_name,
        ": xnn create operator failed(",
        status,
        ")");
  }

  /*
   * Allocate output Tensor and a buffer for XNNPACK to use
   * The resulting matrix here is 2-D, let's view it with the original
   * left hand dimensions of the input. Here are two examples:
   * 1. If the input tensor is {M, K}, the output tensor is {M, N}.
   * 2. If the input tensor is {b, M, K}, the output tensor is {b, M, N}.
   */
  std::vector<int64_t> out_sizes = input.sizes().vec();
  out_sizes.back() = static_cast<int64_t>(rows_w);
  at::Tensor output = at::native::empty_affine_quantized(
      out_sizes,
      c10::CppTypeToScalarType<scalar_t>::value,
      std::nullopt /* layout */,
      c10::kCPU,
      std::nullopt /* pin_memory */,
      output_scale,
      output_zero_point,
      input.suggest_memory_format());

  // calculate batch_size: product of all dims except the last (channels).
  size_t rows_input = 1;
  for (const auto i : c10::irange(input_contig.dim() - 1)) {
    rows_input *= input_contig.size(i);
  }

  // Reshape the operator for this batch size.
  status = at::native::xnnp_utils::xnnp_reshape_fully_connected_nc(
      xnnp_linear_op.get(),
      rows_input, /* batch_size */
      caffe2::pthreadpool_());
  // BUGFIX: the reshape status used to be discarded (immediately overwritten
  // by the setup call), silently swallowing reshape failures.
  TORCH_CHECK(
      status == xnn_status_success,
      func_name,
      ": xnn reshape operator failed(",
      status,
      ")");

  // Setup the operator: bind input/output pointers.
  status = at::native::xnnp_utils::xnnp_setup_fully_connected_nc(
      xnnp_linear_op.get(),
      reinterpret_cast<const underlying_t*>(
          input_contig.template data_ptr<scalar_t>()),
      reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()));
  TORCH_CHECK(
      status == xnn_status_success,
      func_name,
      ": xnn setup operator failed(",
      status,
      ")");

  // Run the operator
  status = xnn_run_operator(
      xnnp_linear_op.get(), // Linear op
      caffe2::pthreadpool_() // threadpool
  );
  TORCH_CHECK(
      status == xnn_status_success,
      func_name,
      ": xnn run operator failed(",
      status,
      ")");

  return output;
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free