Vectorized&lt;int64_t&gt; Class — PyTorch Architecture
Architecture documentation for the Vectorized&lt;int64_t&gt; specialization (built on the Vectorizedi base class) in vec256_int.h from the PyTorch codebase.
Entity Profile
Source Code
aten/src/ATen/cpu/vec/vec256/vec256_int.h lines 46–181
// AVX2 specialization of Vectorized for int64_t: wraps a single __m256i
// register (inherited `values` member from Vectorizedi) holding 4 x 64-bit
// signed integer lanes. All operations map to AVX2 intrinsics.
template <>
class Vectorized<int64_t> : public Vectorizedi {
private:
// All-ones-per-lane constant; defined out of line (not visible in this excerpt).
static const Vectorized<int64_t> ones;
public:
using value_type = int64_t;
using size_type = int;
// Number of int64_t lanes in one 256-bit register: 256 / 64 = 4.
static constexpr size_type size() {
return 4;
}
using Vectorizedi::Vectorizedi;
// Default-construct as all zeros.
Vectorized() {
values = _mm256_setzero_si256();
}
// Broadcast a single scalar to all 4 lanes.
Vectorized(int64_t v) {
values = _mm256_set1_epi64x(v);
}
// Construct from 4 explicit lane values; val1 is lane 0 (lowest).
Vectorized(int64_t val1, int64_t val2, int64_t val3, int64_t val4) {
values = _mm256_setr_epi64x(val1, val2, val3, val4);
}
// Compile-time lane blend: for each set bit i in `mask` (bits 0..3),
// lane i of the result comes from `b`; otherwise from `a`.
// Implemented via a stack round-trip because there is no single AVX2
// intrinsic for a 64-bit-lane immediate blend on integer vectors.
template <int64_t mask>
static Vectorized<int64_t> blend(
Vectorized<int64_t> a,
Vectorized<int64_t> b) {
__at_align__ int64_t tmp_values[size()];
a.store(tmp_values);
if (mask & 0x01)
tmp_values[0] = _mm256_extract_epi64(b.values, 0);
if (mask & 0x02)
tmp_values[1] = _mm256_extract_epi64(b.values, 1);
if (mask & 0x04)
tmp_values[2] = _mm256_extract_epi64(b.values, 2);
if (mask & 0x08)
tmp_values[3] = _mm256_extract_epi64(b.values, 3);
return loadu(tmp_values);
}
// Runtime lane blend: selects bytes from `b` where the corresponding byte
// of `mask` has its high bit set, else from `a`. Correct per-lane selection
// assumes each 64-bit lane of `mask` is all-ones or all-zeros, as produced
// by the comparison operators below.
static Vectorized<int64_t> blendv(
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& b,
const Vectorized<int64_t>& mask) {
return _mm256_blendv_epi8(a.values, b.values, mask.values);
}
// Returns {base, base+step, base+2*step, base+3*step}.
template <typename step_t>
static Vectorized<int64_t> arange(
int64_t base = 0,
step_t step = static_cast<step_t>(1)) {
return Vectorized<int64_t>(
base, base + step, base + 2 * step, base + 3 * step);
}
// Result takes its first `count` lanes from `b` and the rest from `a`.
// count outside [0, 3] returns `b` unchanged (count == size() is the
// common full-copy case).
static Vectorized<int64_t> set(
Vectorized<int64_t> a,
Vectorized<int64_t> b,
int64_t count = size()) {
switch (count) {
case 0:
return a;
case 1:
return blend<1>(a, b);
case 2:
return blend<3>(a, b);
case 3:
return blend<7>(a, b);
}
return b;
}
// Unaligned load of 4 int64_t from `ptr`.
static Vectorized<int64_t> loadu(const void* ptr) {
return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
}
// Partial unaligned load: copies `count` elements from `ptr`; the remaining
// lanes are deterministically filled with 1 so uninitialized memory cannot
// leak into (or perturb) the result.
static Vectorized<int64_t> loadu(const void* ptr, int64_t count) {
__at_align__ int64_t tmp_values[size()];
// Ensure uninitialized memory does not change the output value See
// https://github.com/pytorch/pytorch/issues/32502 for more details. We do
// not initialize arrays to one using "={1}" because gcc would compile it
// to two instructions while a loop would be compiled to one instruction.
for (const auto i : c10::irange(size())) {
tmp_values[i] = 1;
}
std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
return loadu(tmp_values);
}
// Store `count` lanes to `ptr` (unaligned OK). A partial store goes through
// an aligned temporary + memcpy to avoid writing past `count` elements.
// count <= 0 is a no-op.
void store(void* ptr, int count = size()) const {
if (count == size()) {
// ptr need not to be aligned here. See
// https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
} else if (count > 0) {
__at_align__ int64_t tmp_values[size()];
_mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
std::memcpy(ptr, tmp_values, count * sizeof(int64_t));
}
}
// Per-lane access is deliberately disabled: extract lanes via store() instead.
const int64_t& operator[](int idx) const = delete;
int64_t& operator[](int idx) = delete;
// Branchless absolute value. `is_larger` is all-ones in lanes where the
// value is negative (0 > value); for those lanes, xor flips all bits and
// subtracting -1 adds 1, i.e. two's-complement negation. Non-negative
// lanes pass through unchanged. Note: INT64_MIN maps to itself (standard
// two's-complement abs overflow).
Vectorized<int64_t> abs() const {
auto zero = _mm256_set1_epi64x(0);
auto is_larger = _mm256_cmpgt_epi64(zero, values);
auto inverse = _mm256_xor_si256(values, is_larger);
return _mm256_sub_epi64(inverse, is_larger);
}
// Complex-number interface for real integers: real part is the value itself.
Vectorized<int64_t> real() const {
return *this;
}
// Imaginary part of a real integer is zero.
Vectorized<int64_t> imag() const {
return _mm256_set1_epi64x(0);
}
// Conjugate of a real value is itself.
Vectorized<int64_t> conj() const {
return *this;
}
// Defined out of line (not visible in this excerpt).
Vectorized<int64_t> neg() const;
// Comparison operators return per-lane bitmasks: all-ones (-1) where the
// predicate holds, all-zeros otherwise. AVX2 only provides cmpeq/cmpgt for
// 64-bit lanes, so !=, <= and >= are built by inverting the complementary
// predicate (`invert` is presumably bitwise-NOT from Vectorizedi — defined
// outside this excerpt).
Vectorized<int64_t> operator==(const Vectorized<int64_t>& other) const {
return _mm256_cmpeq_epi64(values, other.values);
}
Vectorized<int64_t> operator!=(const Vectorized<int64_t>& other) const {
return invert(_mm256_cmpeq_epi64(values, other.values));
}
// a < b computed as b > a (only cmpgt exists for epi64).
Vectorized<int64_t> operator<(const Vectorized<int64_t>& other) const {
return _mm256_cmpgt_epi64(other.values, values);
}
Vectorized<int64_t> operator<=(const Vectorized<int64_t>& other) const {
return invert(_mm256_cmpgt_epi64(values, other.values));
}
Vectorized<int64_t> operator>(const Vectorized<int64_t>& other) const {
return _mm256_cmpgt_epi64(values, other.values);
}
Vectorized<int64_t> operator>=(const Vectorized<int64_t>& other) const {
return invert(_mm256_cmpgt_epi64(other.values, values));
}
// Boolean-style comparisons (defined out of line, not visible here); by
// PyTorch convention these presumably return 0/1 per lane rather than
// bitmasks — confirm against the .h definitions below this excerpt.
Vectorized<int64_t> eq(const Vectorized<int64_t>& other) const;
Vectorized<int64_t> ne(const Vectorized<int64_t>& other) const;
Vectorized<int64_t> gt(const Vectorized<int64_t>& other) const;
Vectorized<int64_t> ge(const Vectorized<int64_t>& other) const;
Vectorized<int64_t> lt(const Vectorized<int64_t>& other) const;
Vectorized<int64_t> le(const Vectorized<int64_t>& other) const;
};
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free