kReluFused Template Parameter — pytorch Architecture
Architecture documentation for the apply_impl_xnnp function template (parameterized on the bool kReluFused) in qlinear.cpp from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/quantized/cpu/qlinear.cpp lines 463–616
/// Runs a quantized linear (optionally with fused ReLU) through the XNNPACK
/// backend.
///
/// @tparam scalar_t    quantized scalar type (e.g. c10::qint8/quint8); its
///                     `underlying` typedef is the raw integer storage type.
/// @tparam kReluFused  when true, clamps the output to the ReLU range and
///                     reports errors as "quantized::linear_relu (xnnpack)".
/// @param input             quantized input tensor, rank >= 2, last dim = in_features.
/// @param output_scale      quantization scale of the output tensor.
/// @param output_zero_point quantization zero point of the output tensor.
/// @return quantized output tensor with the input's leading dims and
///         out_features as the last dim.
template <typename scalar_t, bool kReluFused>
at::Tensor PackedLinearWeightsQnnp::apply_impl_xnnp(
    const at::Tensor& input,
    double output_scale,
    int64_t output_zero_point) {
  using underlying_t = typename scalar_t::underlying;

  // Serialize access: the cached XNNPACK operator and the cached input scale
  // are shared mutable state on this packed-weights object.
  std::lock_guard<std::mutex> lock(qnnp_mutex_);

  const std::string func_name = kReluFused ? "quantized::linear_relu (xnnpack)"
                                           : "quantized::linear (xnnpack)";
  TORCH_CHECK(
      input.dim() >= 2, func_name, ": Input tensor rank should be >= 2.");
  TORCH_CHECK(
      !per_channel(),
      func_name,
      ": xnnpack does not currently have per_channel support.");

  const auto input_contig = input.contiguous();
  const auto input_scale = input_contig.q_scale();

  // bias_ has one entry per output channel; the last input dim is in_features.
  const size_t rows_w = bias_.size(0);
  const size_t cols_w = input_contig.size(input_contig.dim() - 1);

  auto status = xnn_status_invalid_state;

  // (Re)create the operator only if none is cached yet, or if the input scale
  // changed since the cached operator was built (the scale is baked into the
  // operator's requantization parameters).
  if (!xnnp_linear_op ||
      (!this->input_scale.has_value() ||
       this->input_scale.value() != input_scale)) {
    // Remember the scale the cached operator was built for.
    this->input_scale = input_scale;

    xnn_operator_t xnnp_op = nullptr;
    const float* weight_scales_data = w_scales.const_data_ptr<float>();

    // prepare weights: shift the zero point by 128 when converting a signed
    // underlying type to XNNPACK's unsigned representation.
    underlying_t w_zp = static_cast<underlying_t>(
        orig_weight.q_zero_point() +
        (std::is_same_v<underlying_t, uint8_t> ? 128 : 0));

    at::Tensor xnnp_weight = at::_empty_affine_quantized(
        orig_weight.sizes(),
        c10::CppTypeToScalarType<scalar_t>::value,
        weight_scales_data[0],
        w_zp);

    // copy from the original weight and take care of dtype change if necessary
    at::native::xnnp_utils::q8_copy_int8_weight_and_add_offset<scalar_t>(
        orig_weight, xnnp_weight);

    // Original bias was float, so we requantize it here.
    at::Tensor qbias =
        quant_utils::QuantizeBias(false, bias_, orig_weight, input_scale);

    // output limits: full representable range, narrowed to the ReLU range
    // when the activation is fused.
    auto output_min = kReluFused
        // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
        ? activationLimits<underlying_t>(output_scale, output_zero_point, Activation::RELU).first
        : std::numeric_limits<underlying_t>::min();
    auto output_max = kReluFused
        // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
        ? activationLimits<underlying_t>(output_scale, output_zero_point, Activation::RELU).second
        : std::numeric_limits<underlying_t>::max();

    // Create the fully-connected operator.
    status = at::native::xnnp_utils::xnnp_create_fully_connected_nc(
        cols_w, /* input_channels */
        rows_w, /* output_channels */
        cols_w, /* input_stride */
        rows_w, /* output_stride */
        input_contig.q_zero_point(),
        input_contig.q_scale(),
        w_zp,
        weight_scales_data[0],
        reinterpret_cast<const underlying_t*>(
            xnnp_weight.template data_ptr<scalar_t>()),
        reinterpret_cast<int32_t*>(qbias.data_ptr<c10::qint32>()),
        output_zero_point,
        output_scale,
        output_min,
        output_max,
        0, /* flags */
        &xnnp_op);
    // Wrap immediately so the raw operator is released even if the check
    // below throws.
    xnnp_linear_op = xnnpack_operator(xnnp_op);
    TORCH_CHECK(
        status == xnn_status_success,
        func_name,
        ": xnn create operator failed(",
        status,
        ")");
  }

  /*
   * Allocate output Tensor and a buffer for XNNPACK to use
   * The resulting matrix here is 2-D, let's view it with the original
   * left hand dimensions of the input. Here are two examples:
   * 1. If the input tensor is {M, K}, the output tensor is {M, N}.
   * 2. If the input tensor is {b, M, K}, the output tensor is {b, M, N}.
   */
  std::vector<int64_t> out_sizes = input.sizes().vec();
  out_sizes.back() = static_cast<int64_t>(rows_w);
  at::Tensor output = at::native::empty_affine_quantized(
      out_sizes,
      c10::CppTypeToScalarType<scalar_t>::value,
      std::nullopt /* layout */,
      c10::kCPU,
      std::nullopt /* pin_memory */,
      output_scale,
      output_zero_point,
      input.suggest_memory_format());

  // calculate batch_size: product of all dims except the last (channels).
  size_t rows_input = 1;
  for (const auto i : c10::irange(input_contig.dim() - 1)) {
    rows_input *= input_contig.size(i);
  }

  // Reshape the operator for this batch size.
  status = at::native::xnnp_utils::xnnp_reshape_fully_connected_nc(
      xnnp_linear_op.get(),
      rows_input, /* batch_size */
      caffe2::pthreadpool_());
  // BUGFIX: the reshape status used to be discarded (immediately overwritten
  // by the setup call), silently swallowing reshape failures.
  TORCH_CHECK(
      status == xnn_status_success,
      func_name,
      ": xnn reshape operator failed(",
      status,
      ")");

  // Setup the operator: bind input/output pointers.
  status = at::native::xnnp_utils::xnnp_setup_fully_connected_nc(
      xnnp_linear_op.get(),
      reinterpret_cast<const underlying_t*>(
          input_contig.template data_ptr<scalar_t>()),
      reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()));
  TORCH_CHECK(
      status == xnn_status_success,
      func_name,
      ": xnn setup operator failed(",
      status,
      ")");

  // Run the operator
  status = xnn_run_operator(
      xnnp_linear_op.get(), // Linear op
      caffe2::pthreadpool_() // threadpool
  );
  TORCH_CHECK(
      status == xnn_status_success,
      func_name,
      ": xnn run operator failed(",
      status,
      ")");

  return output;
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free