kReluFused Template Parameter — pytorch Architecture
Architecture documentation for the kReluFused template parameter in qconv.cpp from the pytorch codebase. kReluFused is a compile-time bool on PackedConvWeight&lt;kSpatialDim&gt;::apply_impl that selects whether a ReLU activation is fused into the convolution's requantization step.
Entity Profile
Source Code
aten/src/ATen/native/quantized/cpu/qconv.cpp lines 263–539
template <int kSpatialDim>
template <bool kReluFused>
at::Tensor PackedConvWeight<kSpatialDim>::apply_impl(
const at::Tensor& act,
double output_scale,
int64_t output_zero_point) {
// Quantized kernels are all written with NHWC (channels last) layout in
// mind. Ideally, we'd be compatible with conv2d behavior and preserve the
// inputs layout as is (doing necessary upconversions).
//
// However, to be more robust, for now we just force output layout to always
// be NHWC (channels last), thus opportunistically improving perf.
//
// This might change when full memory format support lands
// See https://github.com/pytorch/pytorch/issues/23403
const std::string func_name = transpose() ? "quantized::conv_transpose"
: "quantized::conv";
TORCH_CHECK(
fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
TORCH_CHECK(act.scalar_type() == c10::kQUInt8,
func_name,
"(FBGEMM): Expected activation data type ",
toString(c10::kQUInt8),
" but got ",
toString(act.scalar_type()));
ConvDimChecks<kSpatialDim>(
act.ndimension(), stride().size(), padding().size(),
output_padding().size(), dilation().size(), func_name, transpose());
const int N = act.size(0);
const int C = act.size(1);
const int D = kSpatialDim == 2 ? 1 : act.size(2);
const int H = act.size(kSpatialDim);
const int W = act.size(kSpatialDim + 1);
const at::Tensor act_ndhwc = kSpatialDim == 2
? act.contiguous(c10::MemoryFormat::ChannelsLast)
: at::native::fbgemm_utils::ConvertToChannelsLast3dTensor(act);
const uint8_t* act_data =
reinterpret_cast<uint8_t*>(act_ndhwc.data_ptr<c10::quint8>());
auto* pack_w = w.get();
const int M = pack_w->outputChannels();
const int kernel_d = kSpatialDim == 2 ? 1 : kernel[0];
const int kernel_h = kernel[kSpatialDim - 2];
const int kernel_w = kernel[kSpatialDim - 1];
const int pad_d = kSpatialDim == 2 ? 0 : padding_[0];
const int pad_h = padding_[kSpatialDim - 2];
const int pad_w = padding_[kSpatialDim - 1];
const int stride_d = kSpatialDim == 2 ? 1 : stride_[0];
const int stride_h = stride_[kSpatialDim - 2];
const int stride_w = stride_[kSpatialDim - 1];
const int dilation_d = kSpatialDim == 2 ? 1 : dilation_[0];
const int dilation_h = dilation_[kSpatialDim - 2];
const int dilation_w = dilation_[kSpatialDim - 1];
const int output_padding_d = kSpatialDim == 2 ? 0 : output_padding_[0];
const int output_padding_h = output_padding_[kSpatialDim - 2];
const int output_padding_w = output_padding_[kSpatialDim - 1];
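  // For kSpatialDim == 2 the depth components above are identity
  // placeholders (size 1, pad 0, stride 1, dilation 1, output padding 0);
  // only the 3D code paths read them.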
if (kSpatialDim == 2) {
TORCH_CHECK(
C == pack_w->inputChannels(),
"[QConv2D] Given groups=",
groups_,
", weight of size ",
M,
", ",
kernel_h,
", ",
kernel_w,
", ",
pack_w->inputChannels(),
", expected input (NCHW) ",
N,
", ",
C,
", ",
H,
", ",
W,
" to have ",
pack_w->inputChannels(),
" channels, but got ",
C,
" channels instead");
} else {
TORCH_CHECK(
C == pack_w->inputChannels(),
"[QConv3D] Given groups=",
groups_,
", weight of size ",
M,
", ",
kernel_d,
", ",
kernel_h,
", ",
kernel_w,
", ",
pack_w->inputChannels(),
", expected input (NCDHW) ",
N,
", ",
C,
", ",
D,
", ",
H,
", ",
W,
" to have ",
pack_w->inputChannels(),
" channels, but got ",
C,
" channels instead");
}
fbgemm::conv_param_t<kSpatialDim> conv_p =
at::native::fbgemm_utils::MakeFbgemmConvParam<kSpatialDim>(
N, // Batch size
C, // Number of input channels
M, // Number of output channels
kSpatialDim == 2 ? std::vector<int>{H, W} : std::vector<int>{D, H, W},
groups_,
kSpatialDim == 2 ? std::vector<int>{kernel_h, kernel_w}
: std::vector<int>{kernel_d, kernel_h, kernel_w},
kSpatialDim == 2 ? std::vector<int>{stride_h, stride_w}
: std::vector<int>{stride_d, stride_h, stride_w},
kSpatialDim == 2 ? std::vector<int>{pad_h, pad_w}
: std::vector<int>{pad_d, pad_h, pad_w},
kSpatialDim == 2
? std::vector<int>{dilation_h, dilation_w}
: std::vector<int>{dilation_d, dilation_h, dilation_w},
kSpatialDim == 2
? std::vector<int>{output_padding_h, output_padding_w}
: std::vector<int>{output_padding_d,
output_padding_h,
output_padding_w},
transpose());
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
const float act_scale = act.q_scale();
const int32_t act_zero_point = act.q_zero_point();
at::Tensor bias;
const float* bias_data = GetBiasData(&bias);
TORCH_CHECK(
w_scale.size() == w_zp.size(),
"Weight scales and zero points vectors should have the same size.");
std::vector<float> output_multiplier_float;
std::vector<float> act_times_w_scale;
GetQuantizationParams(
act_scale, output_scale, &output_multiplier_float, &act_times_w_scale);
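  // Each entry of output_multiplier_float is the requantization scale
  // act_scale * w_scale[i] / output_scale (a single entry under per-tensor
  // quantization); act_times_w_scale caches the act_scale * w_scale products.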
at::SmallVector<int64_t, kSpatialDim + 2> output_shape;
if (transpose()) {
output_shape = MakeDeConvOutputShape<kSpatialDim>(
N,
M,
kSpatialDim == 2 ? std::vector<int64_t>{H, W} : std::vector<int64_t>{D, H, W},
kernel,
stride(),
padding(),
output_padding(),
dilation());
    // If the direct convolution implementation is used, compute the
    // col_offsets of the weight matrix at model initialization time.
    // The output matrix shape must be known to compute col_offsets for
    // direct convolution, so this cannot be done inside the weight
    // packing function as in the other quantized conv implementations.
if (pack_w->getPackedWForDirectconv().get() &&
pack_w->getPackedWForDirectconv().get()->is_first_call()) {
pack_w->getPackedWForDirectconv().get()->col_offsets_with_zero_pt_s8acc32_DirectConvT(
conv_p,
w_zp.data(),
col_offsets,
M);
}
} else {
output_shape = MakeConvOutputShape<kSpatialDim>(N, M, conv_p.OUT_DIM);
}
if (N > 0) {
TORCH_CHECK(
std::all_of(
output_shape.begin(),
output_shape.end(),
[](int64_t i) { return i > 0; }),
"[QConv",
kSpatialDim,
"D] each dimension of output tensor should be greater than 0");
}
at::Tensor output = kSpatialDim == 2
? at::_empty_affine_quantized(
output_shape,
at::device(c10::kCPU)
.dtype(c10::kQUInt8)
.memory_format(c10::MemoryFormat::ChannelsLast),
output_scale,
output_zero_point,
std::nullopt)
: at::native::fbgemm_utils::MakeEmptyAffineQuantizedChannelsLast3dTensor(
output_shape[0],
output_shape[1],
output_shape[2],
output_shape[3],
output_shape[4],
at::device(c10::kCPU).dtype(c10::kQUInt8),
output_scale,
output_zero_point);
at::Tensor buffer =
at::empty(output.sizes(), output.options().dtype(c10::kInt));
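  // int32 scratch buffer: fbgemmConv accumulates raw 32-bit sums here, and
  // the requantization epilogue converts them to quint8 in `output`.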
const int num_tasks = at::get_num_threads();
at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) {
fbgemm::DoNothing<> kNoOpObj{};
for (const auto task_id : c10::irange(begin, end)) {
if (q_scheme == c10::kPerTensorAffine) {
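        // FBGEMM's requantization epilogue: applies zero-point and offset
        // corrections, the float multiplier, and the bias, then clamps to
        // uint8. With kReluFused == true the lower clamp bound is the output
        // zero point, which implements ReLU at no extra cost.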
fbgemm::ReQuantizeOutput<
kReluFused,
fbgemm::QuantizationGranularity::TENSOR,
float>
output_proc_obj(
kNoOpObj,
output_multiplier_float.data(),
output_zero_point,
act_zero_point,
w_zp.data(),
nullptr, /* row offset buffer */
col_offsets.data(),
bias_data,
M,
groups_,
act_times_w_scale.data());
fbgemm::fbgemmConv<decltype(output_proc_obj), kSpatialDim, int32_t>(
conv_p,
act_data,
*pack_w,
reinterpret_cast<uint8_t*>(output.data_ptr<c10::quint8>()),
buffer.data_ptr<int32_t>(),
output_proc_obj,
task_id /* thread_id*/,
num_tasks /* num_threads */);
} else if (q_scheme == c10::kPerChannelAffine) {
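        // Same epilogue with per-output-channel multipliers and weight zero
        // points; kReluFused clamps identically under both granularities.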
fbgemm::ReQuantizeOutput<
kReluFused,
fbgemm::QuantizationGranularity::OUT_CHANNEL,
float>
output_proc_obj(
kNoOpObj,
output_multiplier_float.data(),
output_zero_point,
act_zero_point,
w_zp.data(),
nullptr, /* row offset buffer */
col_offsets.data(),
bias_data,
M,
groups_,
act_times_w_scale.data());
fbgemm::fbgemmConv<decltype(output_proc_obj), kSpatialDim, int32_t>(
conv_p,
act_data,
*pack_w,
reinterpret_cast<uint8_t*>(output.data_ptr<c10::quint8>()),
buffer.data_ptr<int32_t>(),
output_proc_obj,
task_id /* thread_id*/,
num_tasks /* num_threads */);
}
}
});
return output;
}
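apply_impl is a member template; the kReluFused flag is supplied by the public entry points defined alongside it in qconv.cpp. A minimal sketch of that dispatch pattern (see qconv.cpp for the exact definitions; signatures here are illustrative):

template <int kSpatialDim>
at::Tensor PackedConvWeight<kSpatialDim>::apply(
    const at::Tensor& input,
    double output_scale,
    int64_t output_zero_point) {
  // Plain quantized convolution: requantize without an activation.
  return apply_impl<false>(input, output_scale, output_zero_point);
}

template <int kSpatialDim>
at::Tensor PackedConvWeight<kSpatialDim>::apply_relu(
    const at::Tensor& input,
    double output_scale,
    int64_t output_zero_point) {
  // Conv + ReLU in one pass: the epilogue clamps at the output zero point.
  return apply_impl<true>(input, output_scale, output_zero_point);
}

Because kReluFused is a non-type template parameter, each entry point instantiates its own copy of apply_impl and the ReLU clamp is resolved at compile time, so the fused variant adds no per-element runtime branch.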