vectorized_reduction Function — pytorch Architecture
Architecture documentation for the vectorized_reduction function template in Reduce.h from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cpu/Reduce.h lines 36–68
template <typename func_t, typename vec_func_t>
inline void vectorized_reduction(char** data, int64_t n, int64_t stride,
func_t op, vec_func_t vop, bool reduce) {
VEC_LOOP_HEADER(func_t, data)
const char* in1_ptr = data[1];
Vec acc[4];
for (const auto j : c10::irange(4)) {
acc[j] = Vec::loadu(in1_ptr + j * Vec::size() * sizeof(scalar_t));
}
for (const auto i : c10::irange(1, n)) {
const char* ptr = in1_ptr + stride * i;
acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size() * sizeof(scalar_t))));
acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size() * sizeof(scalar_t))));
acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size() * sizeof(scalar_t))));
acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size() * sizeof(scalar_t))));
}
if (reduce) {
scalar_t buffer[Vec::size()];
acc[0] = vop(vop(acc[0], acc[1]), vop(acc[2], acc[3]));
acc[0].store(buffer);
for (const auto j : c10::irange(1, Vec::size())) {
buffer[0] = op(buffer[0], buffer[j]);
}
auto dst = (scalar_t*)out_ptr;
*dst = op(*dst, buffer[0]);
} else {
for (const auto j : c10::irange(4)) {
auto dst = out_ptr + j * Vec::size() * sizeof(scalar_t);
acc[j] = vop(acc[j], Vec::loadu(dst));
acc[j].store(dst);
}
}
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free