transpose_pad_2x32_block Function Template — PyTorch Architecture
Architecture documentation for the transpose_pad_2x32_block function template (parameterized on the 2-byte element type scalar_t) in vec_half.h from the PyTorch codebase.
Entity Profile
Source Code
aten/src/ATen/cpu/vec/vec_half.h lines 15–72
// Interleave-transpose a 2 x 32 tile of 16-bit elements with zero padding.
//
// Reads up to `krem` rows (any value other than 2 is treated as "second row
// absent") of up to `nrem` columns (nrem <= 32) from `src`, whose rows are
// `ld_src` elements apart, and writes the transposed result contiguously to
// `dst`: for each column j the pair (row0[j], row1[j]) lands at
// dst[2*j] / dst[2*j + 1], 2 * nrem elements in total. A missing second row
// is replaced by zeros, and masked loads/stores keep partial tiles from
// touching memory outside the block.
//
// Only enabled for 2-byte element types (e.g. fp16 / bf16) via the
// sizeof(scalar_t) == 2 constraint; only implemented for AVX512 builds.
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 2>>
static inline void transpose_pad_2x32_block(
    const scalar_t* src,
    scalar_t* dst,
    int64_t ld_src,
    int krem = 2,
    int nrem = 32) {
#if defined(CPU_CAPABILITY_AVX512)
  const bool has_row1 = (krem == 2);

  // Load the two source rows, zero-filling lanes beyond `nrem` and the
  // whole second row when it is absent.
  __m512i row0, row1;
  if (nrem < 32) {
    const __mmask32 col_mask = (1LL << nrem) - 1;
    row0 = _mm512_maskz_loadu_epi16(col_mask, src);
    row1 = has_row1 ? _mm512_maskz_loadu_epi16(col_mask, src + ld_src)
                    : _mm512_setzero_si512();
  } else {
    row0 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src));
    row1 = has_row1
        ? _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + ld_src))
        : _mm512_setzero_si512();
  }

  // Interleave the rows element-wise. unpacklo/hi pair up elements within
  // each 128-bit lane; the two shuffle_i32x4 rounds then reorder the
  // 128-bit lanes so `lo`/`hi` hold the pairs for columns 0-15 and 16-31
  // in column order.
  const __m512i even = _mm512_unpacklo_epi16(row0, row1);
  const __m512i odd = _mm512_unpackhi_epi16(row0, row1);
  const __m512i pass0 = _mm512_shuffle_i32x4(even, odd, 0x88);
  const __m512i pass1 = _mm512_shuffle_i32x4(even, odd, 0xdd);
  const __m512i lo = _mm512_shuffle_i32x4(pass0, pass1, 0x88);
  const __m512i hi = _mm512_shuffle_i32x4(pass0, pass1, 0xdd);

  // Store the 2 * nrem interleaved outputs, masking the tail so partial
  // tiles never write past the destination block.
  if (nrem < 16) {
    // Everything fits in the first vector: 2 * nrem (< 32) lanes.
    const __mmask32 out_mask = (1LL << (nrem * 2)) - 1;
    _mm512_mask_storeu_epi16(dst, out_mask, lo);
  } else if (nrem == 16) {
    // Exactly one full vector of output.
    _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), lo);
  } else {
    _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), lo);
    if (nrem < 32) {
      // Full first vector plus a masked tail of 2 * nrem - 32 lanes.
      const __mmask32 out_mask = (1LL << (nrem * 2 - 32)) - 1;
      _mm512_mask_storeu_epi16(
          reinterpret_cast<__m512i*>(dst + 32), out_mask, hi);
    } else {
      // Two full vectors.
      _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 32), hi);
    }
  }
#else
  TORCH_CHECK(
      false,
      "transpose_pad_2x32_block is only supported when avx512 is supported")
#endif
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free