q_batch_norm1d_impl (ReluFused) — pytorch Architecture
Architecture documentation for the q_batch_norm1d_impl function template — parameterized by the boolean ReluFused template parameter — in Normalization.cpp from the pytorch codebase. (Note: ReluFused is a non-type template parameter, not a class.)
Entity Profile
Source Code
aten/src/ATen/native/quantized/cpu/Normalization.cpp lines 55–161
// Quantized 1D batch normalization over an (N, C) or (N, C, L) input,
// optionally fused with ReLU.
//
// ReluFused is a compile-time flag: when true, the batch-norm output is
// clamped at the output zero point (quantized ReLU) inside the same kernel.
//
// Strategy: fake trailing dimensions are appended so the rank-4 NHWC
// (ChannelsLast) batch-norm kernels can be reused, then squeezed back off.
//
// Args:
//   qx                 - quantized input tensor of rank 2 or 3
//   mb_weight, mb_bias - per-channel affine params; must be present, C elements each
//   mean, var          - running statistics, C elements each
//   eps                - numerical-stability term added to var
//   output_scale, output_zero_point - quantization params of the result
// Returns a quantized tensor with the same shape as qx.
template <bool ReluFused>
Tensor q_batch_norm1d_impl(
    Tensor qx,
    std::optional<Tensor> mb_weight,
    std::optional<Tensor> mb_bias,
    Tensor mean,
    Tensor var,
    double eps,
    double output_scale,
    int64_t output_zero_point) {
  TORCH_CHECK(mb_weight.has_value(), "Weight must be provided");
  TORCH_CHECK(mb_bias.has_value(), "Bias must be provided");
  const auto& weight = *mb_weight;
  const auto& bias = *mb_bias;

  if (qx.numel() == 0) {
    // Nothing to normalize; return an (empty) copy of the input.
    auto out = qx.clone();
    return out;
  }
  int64_t ndim = qx.dim();
  TORCH_CHECK(ndim == 2 || ndim == 3, "Expecting the input tensor of rank 2 or 3.");
  const int64_t N = qx.size(0);
  const int64_t C = qx.size(1);
  const int64_t H = ndim == 3 ? qx.size(2) : 1;

  TORCH_CHECK(weight.numel() == C, "Expect weight size to match C");
  // Fix: this message previously said "weight" (copy-paste) for the bias check.
  TORCH_CHECK(bias.numel() == C, "Expect bias size to match C");

  const float* weight_data = weight.template const_data_ptr<float>();
  const float* bias_data = bias.template const_data_ptr<float>();

  TORCH_CHECK(mean.numel() == C, "Mean size must match channel dimension");
  TORCH_CHECK(var.numel() == C, "Variance size must match channel dimension");

  // Per-channel fused multiply/add coefficients (filled by compute_fused_params).
  Tensor alpha = at::empty_like(mean, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  Tensor beta = at::empty_like(mean, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  float* alpha_data = alpha.mutable_data_ptr<float>();
  // Use mutable_data_ptr here too, consistent with alpha above
  // (data_ptr<T>() is the legacy spelling of the same mutable accessor).
  float* beta_data = beta.mutable_data_ptr<float>();

  const float* mean_data = mean.template const_data_ptr<float>();
  const float* var_data = var.template const_data_ptr<float>();

  if (ndim == 2) {
    // create a fake H and W dimension so we can use NHWC
    qx = qx.unsqueeze(-1).unsqueeze(-1);
  } else {
    // create a fake W dimension so we can use NHWC
    qx = qx.unsqueeze(-1);
  }

  auto oSizes = qx.sizes();
  auto qx_nhwc = qx.contiguous(MemoryFormat::ChannelsLast);
  Tensor qy = at::_empty_affine_quantized(
      oSizes,
      at::device(kCPU)
          .dtype(qx_nhwc.scalar_type())
          .memory_format(MemoryFormat::ChannelsLast),
      output_scale,
      output_zero_point,
      std::nullopt);

  // Fold weight/bias/mean/var/eps and the input/output scales into one
  // per-channel affine transform (y = alpha * x + beta in the float domain).
  compute_fused_params(
      C,
      weight_data,
      bias_data,
      mean_data,
      var_data,
      eps,
      qx.q_scale(),
      output_scale,
      alpha_data,
      beta_data);

  if (ReluFused) {
    qbatch_norm_relu_stub(
        qx.device().type(),
        N,
        C,
        H,
        qx.q_zero_point(),
        output_zero_point,
        qx_nhwc,
        alpha,
        beta,
        qy);
  } else {
    qbatch_norm_stub(
        qx.device().type(),
        N,
        C,
        H,
        qx.q_zero_point(),
        output_zero_point,
        qx_nhwc,
        alpha,
        beta,
        qy);
  }

  // Remove the fake dimension, and go back to contiguous format
  // (since there is no 4th channel). Note, this has a performance
  // cost.
  Tensor result = qy.contiguous(MemoryFormat::Contiguous).squeeze(-1);
  if (ndim == 2) {
    result = result.squeeze(-1);
  }
  return result;
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free