cpu_hflip_vec Function — PyTorch Architecture
Architecture documentation for the cpu_hflip_vec function template in IndexKernel.cpp from the PyTorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cpu/IndexKernel.cpp lines 484–545
template <typename scalar_t>
// Vectorized horizontal flip kernel: copies each row of the input into the
// output in reversed element order. The output operand has been restrided so
// its inner stride is negative (see the ntensors comment below); the input is
// read forward while the output is written backward, with each SIMD vector
// reversed in-register via flip().
void cpu_hflip_vec(at::TensorIterator& iter) {
  auto loop2d = [&](char** base, const int64_t *strides, int64_t size0, int64_t size1) {
    // Here ntensors is defined for output and 1 input. But tensor iterator has defined output, input
    // and restrided_input (see aten/src/ATen/native/TensorTransformations.cpp#L64-L66) but we use only
    // output and input.
    static constexpr int ntensors = 2;
    // The iterator carries 3 operands, so strides[0..2] are the inner-dim
    // strides and strides[3..] are the outer-dim strides; we only advance
    // the first two (output, input) below.
    const int64_t *outer_strides = &strides[3];

    // Working copies of the base pointers for output (data_arr[0]) and
    // input (data_arr[1]); advanced per outer iteration.
    std::array<char*, ntensors> data_arr;
    std::copy_n(base, ntensors, data_arr.data());

    using Vec = Vectorized<scalar_t>;

    constexpr auto stride = sizeof(scalar_t);
    // Inner dim must be element-contiguous: output written backwards
    // (stride == -strides[0]), input read forwards (stride == strides[1]).
    TORCH_INTERNAL_ASSERT(stride == -strides[0] && stride == strides[1]);

    for ([[maybe_unused]] const auto j : c10::irange(size1)) {

      // vectorized loop with negative stride for output
      // n elements per row; i counts elements processed so far.
      int64_t n = size0;
      int64_t i = 0;

      // data_arr[0] unaligned pre-pass
      // Scalar prologue: copy a few leading elements one-by-one so the
      // vector stores below land closer to a 32-byte boundary. The count is
      // derived from the row's global element position modulo 32 — this is a
      // heuristic on element counts, not a strict byte-alignment guarantee
      // (NOTE(review): presumably tuned for 256-bit vectors; verify).
      int64_t offset = (j * n + (n - i - Vec::size())) % 32;
      // Clamp so the prologue never runs past the end of a short row.
      offset = (offset >= n) ? n : offset;
      for (; i < offset; i++) {
        scalar_t* out_ptr = (scalar_t*)(data_arr[0] - i * stride);
        *out_ptr = c10::load((scalar_t *)(data_arr[1] + i * stride));
      }
      // Empirically found that it is faster to process 3 data items together vs 2 or 4
      // Main loop: load 3 vectors forward from the input, reverse each
      // in-register, and store them at the mirrored (descending) output
      // positions. Elements [i, i+Vec::size()) land at output bytes
      // [-(i+Vec::size()-1)*stride, -i*stride].
      for (; i <= n - 3 * Vec::size(); i += 3 * Vec::size()) {
        auto out1 = Vec::loadu(data_arr[1] + i * stride);
        auto out2 = Vec::loadu(data_arr[1] + (i + Vec::size()) * stride);
        auto out3 = Vec::loadu(data_arr[1] + (i + 2 * Vec::size()) * stride);
        // flip the vector: 1234 -> 4321
        out1 = flip(out1);
        out2 = flip(out2);
        out3 = flip(out3);
        out1.store(data_arr[0] - (i + Vec::size() - 1) * stride);
        out2.store(data_arr[0] - (i + 2 * Vec::size() - 1) * stride);
        out3.store(data_arr[0] - (i + 3 * Vec::size() - 1) * stride);
      }
      // Scalar epilogue: fewer than 3*Vec::size() elements remain.
      if (i < n) {
        for (; i < n; i++) {
          scalar_t* out_ptr = (scalar_t*)(data_arr[0] - i * stride);
          *out_ptr = c10::load((scalar_t *)(data_arr[1] + i * stride));
        }
      }

      // advance:
      // Step output and input base pointers to the next outer-dim row.
      for (const auto arg : c10::irange(ntensors)) {
        data_arr[arg] += outer_strides[arg];
      }
    }
  };

  int64_t grain_size = at::internal::GRAIN_SIZE;
  iter.for_each(loop2d, grain_size);
  iter.cast_outputs();
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free