ReluFused Template Parameter — pytorch Architecture
Architecture documentation for the ReluFused non-type (bool) template parameter of PackedLinearWeight::apply_dynamic_impl in qlinear_dynamic.cpp from the pytorch codebase. When true, a ReLU activation is fused into the requantization step.
Entity Profile
Source Code
aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp lines 35–228
// Dynamically quantized linear layer: fp32 * int8 -> fp32.
// The fp32 activation `input` is quantized on the fly (per-tensor, uint8)
// using min/max statistics computed here; the GEMM runs in int8 against the
// prepacked weight `w`, and the result is dequantized back to fp32 (with the
// bias added and, if ReluFused, a fused ReLU applied).
//
// Template parameter:
//   ReluFused - when true, fuse a ReLU into the requantization epilogue.
// Parameters:
//   input        - fp32 tensor of rank >= 2; the last dim is K.
//   reduce_range - forwarded to ChooseQuantizationParams to shrink the
//                  quantized range (avoids overflow on some ISAs).
// Returns: fp32 tensor with the same leading dims as `input` and last dim N.
template <bool ReluFused>
at::Tensor PackedLinearWeight::apply_dynamic_impl(
    at::Tensor input,
    bool reduce_range) {
  using at::Tensor;
  // fp32 * int8 -> fp32 (with quantization on activation, and dequantization
  // on the result).

  // We make a strong guarantee that models using these operators will have
  // the same numerics across different machines. Therefore, we do not provide
  // a fallback path and rather fail loudly if we cannot run FBGEMM.
  TORCH_CHECK(
      fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");

  // TODO: contiguous is called for further jit optimizations.
  auto input_contig = input.contiguous();
  const auto* input_ptr = input_contig.const_data_ptr<float>();

  TORCH_CHECK(
      input.dim() >= 2,
      "The dimension of input tensor should be larger than or equal to 2");
  // C(output) = A(input) x B(weight), where C, A, B are M x N, M x K, K x N
  // matrices, respectively.
  // All leading dims of `input` are folded into M; the last dim is K.
  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  int64_t M = size_to_dim_(input.dim() - 1, input.sizes());

  auto packB = w.get();

  int64_t N = static_cast<int64_t>(packB->numCols());
  int64_t K = input.size(input.dim() - 1);
  TORCH_CHECK(
      K == static_cast<int64_t>(packB->numRows()),
      "The number of rows in the packB should be equal to K: " +
          std::to_string(K));

  // Calculate statistics for quantization of the input Tensor.
  // NaN seeds make an unexpected FindMinMax failure loud downstream.
  float x_min = std::numeric_limits<float>::quiet_NaN(), x_max = std::numeric_limits<float>::quiet_NaN();
  fbgemm::FindMinMax(
      /*m=*/input_ptr,
      /*min=*/&x_min,
      /*max=*/&x_max,
      /*len=*/input.numel());

  // Input tensor is quantized as 8-bit unsigned values
  static constexpr int precision = 8;
  static constexpr bool is_signed = false;

  // Calculate scale and zero point for quantization of input tensor
  auto q_params = quant_utils::ChooseQuantizationParams(
      /*min=*/x_min,
      /*max=*/x_max,
      /*qmin=*/is_signed ? -(1 << (precision - 1)) : 0,
      /*qmax=*/
      is_signed ? ((1 << (precision - 1)) - 1) : (1 << precision) - 1,
      /*preserve_sparsity=*/false,
      /*force_scale_power_of_two=*/false,
      /*reduce_range=*/reduce_range);

  q_params.precision = precision;

  // ReQuantizeForFloat requires pointers to the zero point values,
  // since in the case of rowwise quantization these will be arrays rather
  // than scalars. But in this case, we're doing whole-tensor quantization so
  // we just pass a pointer to the scale values (and internally
  // ReQuantizeForFloat won't index past 0.

  const float* bias_ptr = nullptr;
  at::Tensor bias_vec;
  if (bias_.has_value()) {
    bias_vec = bias_.value();
    TORCH_CHECK(bias_vec.dim() == 1, "bias should be a vector (1D Tensor)");
    TORCH_CHECK(
        bias_vec.size(0) == N,
        "bias should have N elements: " + std::to_string(N));
    // TODO: contiguous is called for further jit optimizations.
    // BUGFIX: keep the contiguous tensor alive in `bias_vec` (declared in the
    // enclosing scope). Previously the contiguous tensor was a local of this
    // if-block; when `bias_vec` was non-contiguous, contiguous() allocated a
    // fresh tensor that died at the closing brace, leaving `bias_ptr`
    // dangling by the time it is used inside the parallel_for below.
    bias_vec = bias_vec.contiguous();
    bias_ptr = bias_vec.data_ptr<float>();
  }
  // The resulting matrix here is 2-D, let's view it with the original
  // left hand dimensions of the input. Here are two examples:
  // 1. If the input tensor is {M, K}, the output tensor is {M, N}.
  // 2. If the input tensor is {b, M, K}, the output tensor is {b, M, N}.
  std::vector<int64_t> out_sizes = input.sizes().vec();
  out_sizes.back() = N;
  // Allocate output Tensor and a buffer for fbgemmPacked to use
  auto output = at::empty(out_sizes, input.options().dtype(at::kFloat));
  // int32 scratch buffer for the integer GEMM accumulation.
  auto buffer = at::empty_like(
      output,
      output.options().dtype(at::kInt),
      LEGACY_CONTIGUOUS_MEMORY_FORMAT);

  int num_tasks = at::get_num_threads();
  at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) {
    // This operation does the following:
    // 1) Quantizes the input matrix given the statistics we've calculated
    //    above
    // 2) Creates a "row buffer" vector with offset values that must be
    //    added
    //    to the integer matrix multiplication operation to ensure
    //    correctness. This "row buffer" is also called the row offset, and it
    //    is needed when we use affine quantization for weights.
    // 3) Packs the resulting quantized matrix into vector-register and cache
    //    friendly tiles.
    //
    //  Note this is not executed eagerly, but rather within the fbgemmPacked
    //  call below.
    fbgemm::PackAWithQuantRowOffset<uint8_t> packA(
        /*trans=*/fbgemm::matrix_op_t::NoTranspose,
        /*nRow=*/M,
        /*nCol=*/K,
        /*smat=*/input_ptr,
        /*ld=*/K,
        /*pmat=*/nullptr, // Currently, packA manages ownership of `pmat`.
        // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
        /*scale=*/q_params.scale,
        /*zero_pt=*/q_params.zero_point);
    // TODO: Consider a way to pre-allocate and reuse
    // pmat buffer.

    // This is the end of the pipeline, pass the resulting matrix through.
    fbgemm::DoNothing<float, float> doNothingObj{};

    for (const auto task_id : c10::irange(begin, end)) {
      if (q_scheme == c10::kPerTensorAffine) {
        // Process the per tensor quantization.
        //
        // After the uint8 * int8 matrix multiplication is performed, this
        // operation does:
        //  1) Add in row and column offsets to the rows and columns,
        //  respectively.
        //  2) Dequantize the results into floating point.
        //  3) Add in the bias term.
        fbgemm::ReQuantizeForFloat<ReluFused> outputProcObj(
            /*nextop=*/doNothingObj,
            /*Aq_scale=*/q_params.scale,
            /*Bq_scale=*/w_scale.data(),
            /*Aq_zero_point=*/q_params.zero_point,
            /*Bq_zero_point=*/w_zp.data(),
            /*row_offsets=*/packA.getRowOffsetBuffer(),
            /*col_offsets=*/col_offsets.data(),
            /*bias=*/bias_ptr,
            /*nCol=*/N);

        // Do the GEMM
        fbgemm::fbgemmPacked(
            /*packA=*/packA,
            /*packB=*/*packB,
            /*C=*/output.data_ptr<float>(),
            /*C_buffer=*/buffer.data_ptr<int32_t>(),
            /*ldc=*/N,
            /*outProcess=*/outputProcObj,
            /*thread_id=*/task_id,
            /*num_threads=*/num_tasks);
      } else if (q_scheme == c10::kPerChannelAffine) {
        // Process the per channel quantization.
        //
        // After the uint8 * int8 matrix multiplication is performed, this
        // operation does:
        //  1) Add in row and column offsets to the rows and columns,
        //  respectively.
        //  2) Dequantize the results into floating point.
        //  3) Add in the bias term.
        fbgemm::ReQuantizeForFloat<
            ReluFused,
            fbgemm::QuantizationGranularity::OUT_CHANNEL>
            outputProcObj(
                /*nextop=*/doNothingObj,
                /*Aq_scale=*/q_params.scale,
                /*Bq_scale=*/w_scale.data(),
                /*Aq_zero_point=*/q_params.zero_point,
                /*Bq_zero_point=*/w_zp.data(),
                /*row_offsets=*/packA.getRowOffsetBuffer(),
                /*col_offsets=*/col_offsets.data(),
                /*bias=*/bias_ptr,
                /*nCol=*/N);

        // Do the GEMM
        fbgemm::fbgemmPacked(
            /*packA=*/packA,
            /*packB=*/*packB,
            /*C=*/output.data_ptr<float>(),
            /*C_buffer=*/buffer.data_ptr<int32_t>(),
            /*ldc=*/N,
            /*outProcess=*/outputProcObj,
            /*thread_id=*/task_id,
            /*num_threads=*/num_tasks);
      }
    }
  });

  return output;
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free