PackedLinearWeight::apply_impl&lt;ReluFused&gt; — pytorch Architecture
Architecture documentation for the apply_impl member-function template in qlinear.cpp from the pytorch codebase. Note: ReluFused is not a class — it is a non-type (bool) template parameter that selects whether a ReLU is fused into the requantization epilogue.
Entity Profile
Source Code
aten/src/ATen/native/quantized/cpu/qlinear.cpp lines 47–246
// Quantized fully-connected layer computed with FBGEMM:
//   uint8 activations * int8 prepacked weights -> uint8 output,
// staying in the quantized domain throughout (no intermediate
// dequantize/requantize round trip). The ReluFused template parameter is
// forwarded to fbgemm::ReQuantizeOutput below, which fuses the ReLU into
// the requantization epilogue when true (FBGEMM-internal behavior; the
// clamp itself is not visible here).
//
// Reads packed-weight state from the enclosing PackedLinearWeight object:
// `w` (FBGEMM-prepacked weight matrix), `w_scale`/`w_zp` (weight scales
// and zero points — scalar for per-tensor, length-N for per-channel),
// `col_offsets`, `q_scheme`, and the optional fp32 `bias_`. These members
// are declared outside this view.
//
// Writes the result into the caller-provided `output` tensor (resized in
// place) and returns a reference to it.
template <bool ReluFused>
at::Tensor& PackedLinearWeight::apply_impl(
    const at::Tensor& input,
    double output_scale,
    int64_t output_zero_point,
    at::Tensor& output) {
  // uint8 * int8 -> uint8 (no quantization/dequantization)
  // We make a strong guarantee that models using these operators will have
  // the same numerics across different machines. Therefore, we do not provide
  // a fallback path and rather fail loudly if we cannot run FBGEMM.
  TORCH_CHECK(
      fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
  // Activations must be quantized uint8; reject anything else up front.
  TORCH_CHECK(input.scalar_type() == c10::kQUInt8,
      "Expected input data type ",
      toString(c10::kQUInt8),
      " but got ",
      toString(input.scalar_type()));
  // TODO: contiguous is called for further jit optimizations.
  auto input_contig = input.expect_contiguous();
  const auto* input_ptr =
      reinterpret_cast<uint8_t*>(input_contig->data_ptr<c10::quint8>());
  TORCH_CHECK(
      input.dim() >= 2,
      "The dimension of input tensor should be larger than or equal to 2");
  // C(output) = A(input) x B(weight), where C, A, B are M x N, M x K, K x N
  // matrices, respectively. All leading dimensions of `input` fold into M;
  // the trailing dimension is K.
  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
  auto packB = w.get();
  int64_t N = static_cast<int64_t>(packB->numCols());
  int64_t K = input.sizes()[input.dim() - 1];
  TORCH_CHECK(
      K == static_cast<int64_t>(packB->numRows()),
      "The number of rows in the packB should be equal to K: " +
          std::to_string(K));
  // Intentional double -> float narrowing: FBGEMM consumes float scales.
  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  float input_scale_float = input.q_scale();
  int32_t input_zero_point_int32 = input.q_zero_point();
  // Requantization parameters. Sized 1 for per-tensor quantization;
  // resized to N (one entry per output channel) for per-channel below.
  std::vector<float> output_multiplier_float(1, 0.0);
  std::vector<float> act_times_w_scale(1, 0.0);
  TORCH_CHECK(
      w_scale.size() == w_zp.size(),
      "Weight scales and zero points vectors should have the same size.");
  if (q_scheme == c10::kPerTensorAffine) {
    // Process the per tensor quantization.
    act_times_w_scale[0] = (input_scale_float * w_scale[0]);
    output_multiplier_float[0] =
        act_times_w_scale[0] / static_cast<float>(output_scale);
  } else if (q_scheme == c10::kPerChannelAffine) {
    // Process the per channel quantization: one requantization multiplier
    // per output channel (column of B).
    output_multiplier_float.resize(N, 0.0);
    act_times_w_scale.resize(N, 1.0f);
    for (const auto i : c10::irange(N)) {
      act_times_w_scale[i] = (input_scale_float * w_scale[i]);
      output_multiplier_float[i] =
          act_times_w_scale[i] / static_cast<float>(output_scale);
    }
  }
  int32_t output_zero_point_int32 = static_cast<int32_t>(output_zero_point);
  // Optional fp32 bias: must be a 1-D tensor with N elements. Leaving
  // bias_ptr null tells FBGEMM's output pipeline to skip bias addition.
  const float* bias_ptr = nullptr;
  c10::MaybeOwned<at::Tensor> bias_contig;
  if (this->bias_.has_value()) {
    auto& bias = this->bias_.value();
    bias_contig = bias.expect_contiguous();
    TORCH_CHECK(bias_contig->dim() == 1, "bias should be a vector (1D Tensor)");
    TORCH_CHECK(
        bias_contig->sizes()[0] == N, "bias should have N elements: " + std::to_string(N));
    bias_ptr = reinterpret_cast<float*>(bias_contig->data_ptr<float>());
  }
  // The resulting matrix here is 2-D, let's view it with the original
  // left hand dimensions of the input. Here are two examples:
  // 1. If the input tensor is {M, K}, the output tensor is {M, N}.
  // 2. If the input tensor is {b, M, K}, the output tensor is {b, M, N}.
  at::DimVector out_sizes(input.sizes());
  out_sizes.back() = N;
  // Resize output Tensor
  output.resize_(out_sizes);
  // Allocate an int32 scratch buffer for fbgemmPacked's pre-requantization
  // accumulators.
  auto buffer = at::empty(out_sizes, output.options().dtype(at::kInt));
  auto output_data = reinterpret_cast<uint8_t*>(output.data_ptr<c10::quint8>());
  // One task per ATen thread. Each task calls fbgemmPacked on the whole
  // problem; FBGEMM partitions the work internally from
  // (thread_id, num_threads).
  int num_tasks = at::get_num_threads();
  at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) {
    for (const auto task_id : c10::irange(begin, end)) {
      // This operation does the following:
      // 1) Creates a "row buffer" vector with offset values that must be
      //    added to the integer matrix multiplication operation to ensure
      //    correctness. This "row buffer" is also called the row offset, and
      //    it is needed when we use affine quantization for weights.
      // 2) Packs the resulting quantized matrix into vector-register and
      //    cache friendly tiles.
      //
      // Note this is not executed eagerly, but rather within the
      // fbgemmPacked call below.
      fbgemm::PackAWithRowOffset<uint8_t> packA(
          /*trans=*/fbgemm::matrix_op_t::NoTranspose,
          /*nRow=*/M,
          /*nCol=*/K,
          /*smat=*/input_ptr,
          /*ld=*/K,
          /*pmat=*/nullptr); // Currently, packA manages ownership of `pmat`.
                             // TODO: Consider a way to pre-allocate and reuse
                             // pmat buffer.
      // ReQuantizeOutput requires pointers to the zero point values,
      // since in the case of rowwise quantization these will be arrays
      // rather than scalars. In the per-tensor case we pass a pointer to a
      // single value (and internally ReQuantizeOutput won't index past 0).
      // This is the end of the pipeline, pass the resulting matrix through.
      fbgemm::DoNothing<> doNothingObj{};
      if (q_scheme == c10::kPerTensorAffine) {
        // Process the per tensor quantization.
        //
        // After the uint8 * int8 matrix multiplication is performed, this
        // operation does:
        // 1) Add in row and column offsets to the rows and columns,
        //    respectively.
        // 2) Add in the bias term.
        fbgemm::ReQuantizeOutput<
            ReluFused,
            fbgemm::QuantizationGranularity::TENSOR,
            float>
            outputProcObj(
                doNothingObj,
                output_multiplier_float.data(),
                output_zero_point_int32,
                input_zero_point_int32,
                w_zp.data(),
                packA.getRowOffsetBuffer(),
                col_offsets.data(),
                bias_ptr,
                N, /* nCol */
                1 /* groups */,
                act_times_w_scale.data());
        // Do the GEMM
        fbgemm::fbgemmPacked(
            /*packA=*/packA,
            /*packB=*/*packB,
            /*C=*/output_data,
            /*C_buffer=*/buffer.data_ptr<int32_t>(),
            /*ldc=*/N,
            /*outProcess=*/outputProcObj,
            /*thread_id=*/task_id,
            /*num_threads=*/num_tasks);
      } else if (q_scheme == c10::kPerChannelAffine) {
        // Process the per channel quantization.
        //
        // After the uint8 * int8 matrix multiplication is performed, this
        // operation does:
        // 1) Add in row and column offsets to the rows and columns,
        //    respectively.
        // 2) Add in the bias term.
        fbgemm::ReQuantizeOutput<
            ReluFused,
            fbgemm::QuantizationGranularity::OUT_CHANNEL,
            float>
            outputProcObj(
                doNothingObj,
                output_multiplier_float.data(),
                output_zero_point_int32,
                input_zero_point_int32,
                w_zp.data(),
                packA.getRowOffsetBuffer(),
                col_offsets.data(),
                bias_ptr,
                N, /* nCol */
                1, /* groups */
                act_times_w_scale.data());
        // Do the GEMM
        fbgemm::fbgemmPacked(
            /*packA=*/packA,
            /*packB=*/*packB,
            /*C=*/output_data,
            /*C_buffer=*/buffer.data_ptr<int32_t>(),
            /*ldc=*/N,
            /*outProcess=*/outputProcObj,
            /*thread_id=*/task_id,
            /*num_threads=*/num_tasks);
      }
    }
  });
  return output;
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free