left_shift Class — pytorch Architecture
Architecture documentation for the shift_256_16 function template (parameterized by the boolean template argument left_shift) in vec256_int.h from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/cpu/vec/vec256/vec256_int.h lines 1735–1862
// Shifts every 16-bit element of `a` by the count held in the matching
// 16-bit element of `b`. The template flag picks the direction:
// `left_shift == true` uses a logical left shift, otherwise an
// arithmetic right shift is applied.
template <bool left_shift>
Vectorized<int16_t> inline shift_256_16(
    const Vectorized<int16_t>& a,
    const Vectorized<int16_t>& b) {
  // The shift is emulated with the 32-bit variable-shift intrinsics:
  // the 256 bits are viewed as pairs of adjacent 16-bit elements (one
  // pair per 32-bit slot), and the even-indexed and odd-indexed
  // elements of each pair are processed in two separate passes.

  // Byte-shuffle controls (0x80 makes the shuffle write a zero byte).
  // `even_to_odd` copies the even element of every pair into the odd
  // position and clears the even position; `odd_to_even` does the
  // reverse.
  const __m256i even_to_odd = _mm256_set_epi8(
      29, 28, 0x80, 0x80, 25, 24, 0x80, 0x80,
      21, 20, 0x80, 0x80, 17, 16, 0x80, 0x80,
      13, 12, 0x80, 0x80, 9, 8, 0x80, 0x80,
      5, 4, 0x80, 0x80, 1, 0, 0x80, 0x80);
  const __m256i odd_to_even = _mm256_set_epi8(
      0x80, 0x80, 31, 30, 0x80, 0x80, 27, 26,
      0x80, 0x80, 23, 22, 0x80, 0x80, 19, 18,
      0x80, 0x80, 15, 14, 0x80, 0x80, 11, 10,
      0x80, 0x80, 7, 6, 0x80, 0x80, 3, 2);

  // Bitwise-AND masks: `even_only` preserves the even element of every
  // pair (low 16 bits of each 32-bit slot) and zeroes the odd one;
  // `odd_only` preserves the odd element (high 16 bits) and zeroes the
  // even one.
  const __m256i even_only = _mm256_set1_epi32(0xFFFF);
  const __m256i odd_only = _mm256_set1_epi32(0xFFFF0000);

  // Pass 1: even-indexed elements.
  // Each value is placed in the high half of its 32-bit slot with
  // zeros underneath, so after a 32-bit shift the high half holds
  // exactly the 16-bit result. That half is then moved back to the
  // even position, leaving the odd position zeroed.
  //
  // The shift count is widened by zero-filling the upper bits, which
  // is not a true sign extension for negative counts. The shift
  // intrinsics read the count as unsigned, though, so a negative count
  // is out of range in either representation and the outcome is
  // identical.
  const __m256i even_values = _mm256_shuffle_epi8(a, even_to_odd);
  const __m256i even_counts = _mm256_and_si256(b, even_only);
  const __m256i even_shifted = left_shift
      ? _mm256_sllv_epi32(even_values, even_counts)
      : _mm256_srav_epi32(even_values, even_counts);
  const __m256i even_result = _mm256_shuffle_epi8(even_shifted, odd_to_even);

  // Pass 2: odd-indexed elements.
  // The value already occupies the high half of its slot, so it only
  // needs the even neighbour masked away; the count is moved down into
  // the low half. After shifting, everything outside the odd position
  // is cleared.
  const __m256i odd_values = _mm256_and_si256(a, odd_only);
  const __m256i odd_counts = _mm256_shuffle_epi8(b, odd_to_even);
  const __m256i odd_shifted = left_shift
      ? _mm256_sllv_epi32(odd_values, odd_counts)
      : _mm256_srav_epi32(odd_values, odd_counts);
  const __m256i odd_result = _mm256_and_si256(odd_shifted, odd_only);

  // The two partial results occupy disjoint 16-bit positions, so OR-ing
  // them yields the complete vector.
  return _mm256_or_si256(even_result, odd_result);
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free