q_batch_norm1d_impl (ReluFused) — pytorch Architecture
Architecture documentation for the q_batch_norm1d_impl function template — parameterized by the boolean ReluFused template parameter — in Normalization.cpp from the pytorch codebase. (Note: ReluFused is a non-type template parameter, not a class.)
Entity Profile
Source Code
aten/src/ATen/native/quantized/cpu/Normalization.cpp lines 55–161
// Quantized 1D batch normalization over an (N, C) or (N, C, L) input,
// optionally fused with ReLU.
//
// ReluFused is a compile-time flag: when true, the batch-norm output is
// clamped at the output zero point (quantized ReLU) inside the same kernel.
//
// Strategy: fake trailing dimensions are appended so the rank-4 NHWC
// (ChannelsLast) batch-norm kernels can be reused, then squeezed back off.
//
// Args:
//   qx                 - quantized input tensor of rank 2 or 3
//   mb_weight, mb_bias - per-channel affine params; must be present, C elements each
//   mean, var          - running statistics, C elements each
//   eps                - numerical-stability term added to var
//   output_scale, output_zero_point - quantization params of the result
// Returns a quantized tensor with the same shape as qx.
template <bool ReluFused>
Tensor q_batch_norm1d_impl(
    Tensor qx,
    std::optional<Tensor> mb_weight,
    std::optional<Tensor> mb_bias,
    Tensor mean,
    Tensor var,
    double eps,
    double output_scale,
    int64_t output_zero_point) {
  TORCH_CHECK(mb_weight.has_value(), "Weight must be provided");
  TORCH_CHECK(mb_bias.has_value(), "Bias must be provided");
  const auto& weight = *mb_weight;
  const auto& bias = *mb_bias;

  if (qx.numel() == 0) {
    // Nothing to normalize; return an (empty) copy of the input.
    auto out = qx.clone();
    return out;
  }
  int64_t ndim = qx.dim();
  TORCH_CHECK(ndim == 2 || ndim == 3, "Expecting the input tensor of rank 2 or 3.");
  const int64_t N = qx.size(0);
  const int64_t C = qx.size(1);
  const int64_t H = ndim == 3 ? qx.size(2) : 1;

  TORCH_CHECK(weight.numel() == C, "Expect weight size to match C");
  // Fix: this message previously said "weight" (copy-paste) for the bias check.
  TORCH_CHECK(bias.numel() == C, "Expect bias size to match C");

  const float* weight_data = weight.template const_data_ptr<float>();
  const float* bias_data = bias.template const_data_ptr<float>();

  TORCH_CHECK(mean.numel() == C, "Mean size must match channel dimension");
  TORCH_CHECK(var.numel() == C, "Variance size must match channel dimension");

  // Per-channel fused multiply/add coefficients (filled by compute_fused_params).
  Tensor alpha = at::empty_like(mean, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  Tensor beta = at::empty_like(mean, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  float* alpha_data = alpha.mutable_data_ptr<float>();
  // Use mutable_data_ptr here too, consistent with alpha above
  // (data_ptr<T>() is the legacy spelling of the same mutable accessor).
  float* beta_data = beta.mutable_data_ptr<float>();

  const float* mean_data = mean.template const_data_ptr<float>();
  const float* var_data = var.template const_data_ptr<float>();

  if (ndim == 2) {
    // create a fake H and W dimension so we can use NHWC
    qx = qx.unsqueeze(-1).unsqueeze(-1);
  } else {
    // create a fake W dimension so we can use NHWC
    qx = qx.unsqueeze(-1);
  }

  auto oSizes = qx.sizes();
  auto qx_nhwc = qx.contiguous(MemoryFormat::ChannelsLast);
  Tensor qy = at::_empty_affine_quantized(
      oSizes,
      at::device(kCPU)
          .dtype(qx_nhwc.scalar_type())
          .memory_format(MemoryFormat::ChannelsLast),
      output_scale,
      output_zero_point,
      std::nullopt);

  // Fold weight/bias/mean/var/eps and the input/output scales into one
  // per-channel affine transform (y = alpha * x + beta in the float domain).
  compute_fused_params(
      C,
      weight_data,
      bias_data,
      mean_data,
      var_data,
      eps,
      qx.q_scale(),
      output_scale,
      alpha_data,
      beta_data);

  if (ReluFused) {
    qbatch_norm_relu_stub(
        qx.device().type(),
        N,
        C,
        H,
        qx.q_zero_point(),
        output_zero_point,
        qx_nhwc,
        alpha,
        beta,
        qy);
  } else {
    qbatch_norm_stub(
        qx.device().type(),
        N,
        C,
        H,
        qx.q_zero_point(),
        output_zero_point,
        qx_nhwc,
        alpha,
        beta,
        qy);
  }

  // Remove the fake dimension, and go back to contiguous format
  // (since there is no 4th channel). Note, this has a performance
  // cost.
  Tensor result = qy.contiguous(MemoryFormat::Contiguous).squeeze(-1);
  if (ndim == 2) {
    result = result.squeeze(-1);
  }
  return result;
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free