int4pack_mm_kernel_ Function Template — pytorch Architecture
Architecture documentation for the int4pack_mm_kernel_ function template in int4mm_kernel.cpp from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cpu/int4mm_kernel.cpp lines 697–759
// Tiled matrix multiply for int4-quantized weights.
// Computes C (M x N) from A (M x K, element type T) and B, where B stores the
// (N x K) weight matrix with two 4-bit values packed per byte (i.e. the
// product is effectively A times B-transposed, with B dequantized per group).
// qScaleAndZeros supplies per-quantization-group dequantization parameters as
// interleaved (scale, zero-point) pairs — hence the factor-of-2 indexing
// below. NOTE(review): the exact qScaleAndZeros layout (presumably
// [K/qGroupSize, N, 2]) and the per-tile math live behind
// LAUNCH_TINYGEMM_NB_SIZE and are not visible here — confirm against the
// macro definition earlier in this file.
template<typename T>
void int4pack_mm_kernel_(
const Tensor& C,
const Tensor& A,
const Tensor& B,
int qGroupSize,
const Tensor& qScaleAndZeros) {
const auto* A_data = A.const_data_ptr<T>();
// B is opaque packed storage; reinterpret as raw bytes (2 nibbles per byte).
const auto* B_data = reinterpret_cast<const uint8_t*>(B.const_data_ptr());
auto* C_data = C.data_ptr<T>();
const auto* S_data = qScaleAndZeros.const_data_ptr<T>();
// Problem dimensions: A is (M x K), B is (N x K), so C is (M x N).
int M = A.size(0);
int N = B.size(0);
int K = A.size(1);
// Each parallel task handles one output tile of up to BLOCK_M x BLOCK_N.
constexpr int BLOCK_M = 4;
// 64 for avx512 and 32 for avx2/non-vectorized
constexpr int BLOCK_N = vec::Vectorized<float>::size() * 4;
// 32, 64, 128, 256
const int BLOCK_K = qGroupSize;
// Tile counts along M and N, rounded up so ragged edges are covered.
const int MB = (M + BLOCK_M - 1) / BLOCK_M;
const int NB = (N + BLOCK_N - 1) / BLOCK_N;
// Parallelize over the flattened 2D tile grid; grain size 0 lets the
// ATen scheduler choose the chunking.
at::parallel_for(0, MB * NB, 0, [&](int begin, int end) {
// Decompose the flat start index into (mb, nb) tile coordinates once,
// then step through the grid in row-major order inside the loop.
int mb{0}, nb{0};
data_index_init(begin, mb, MB, nb, NB);
for ([[maybe_unused]] const auto i : c10::irange(begin, end)) {
// Clamp tile extents so edge tiles don't run past M or N.
int mb_start = mb * BLOCK_M;
int mb_size = std::min(BLOCK_M, M - mb_start);
int nb_start = nb * BLOCK_N;
int nb_size = std::min(BLOCK_N, N - nb_start);
// Per-tile base pointers. B advances by K/2 bytes per row because each
// byte packs two 4-bit values; S advances by 2 per column for the
// interleaved (scale, zero) pairs.
const auto* A_ptr = A_data + mb_start * K;
const auto* B_ptr = B_data + nb_start * K / 2;
const auto* S_ptr = S_data + nb_start * 2;
auto* C_ptr = C_data + mb_start * N + nb_start;
// Dispatch on the row count so the inner kernel gets it as a
// compile-time constant. The macro expands to code that reads the
// surrounding locals (A_ptr/B_ptr/C_ptr/S_ptr, K, N, BLOCK_K,
// nb_size, ...) by name — do not rename them.
switch (mb_size) {
case 1:
LAUNCH_TINYGEMM_NB_SIZE(1);
break;
case 2:
LAUNCH_TINYGEMM_NB_SIZE(2);
break;
case 3:
LAUNCH_TINYGEMM_NB_SIZE(3);
break;
case 4:
LAUNCH_TINYGEMM_NB_SIZE(4);
break;
default:
// mb_size is min(BLOCK_M, M - mb_start) with BLOCK_M == 4, so values
// outside 1..4 indicate a logic error upstream.
TORCH_CHECK(false, "Unsupported m block size: ", mb_size);
}
// move to the next index
data_index_step(mb, MB, nb, NB);
}
});
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free