vectorized_inner_sum Function Template — PyTorch Architecture
Architecture documentation for the vectorized_inner_sum function template in SumKernel.cpp from the PyTorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cpu/SumKernel.cpp lines 433–461
template <typename acc_t, typename VecLoadPolicy, typename ScalarLoadPolicy, typename StorePolicy>
void vectorized_inner_sum(
// NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
char * C10_RESTRICT data[2], int64_t outer_stride, int64_t out_stride,
int64_t size0, int64_t size1) {
using vacc_t = Vectorized<acc_t>;
constexpr int64_t vec_stride = VecLoadPolicy::memsize();
constexpr int64_t scalar_stride = ScalarLoadPolicy::memsize();
constexpr int64_t vec_numel = vec_stride / scalar_stride;
const int64_t vec_size = size0 / vec_numel;
// Input is contiguous over the first (reduced) dimension
for (const auto j : c10::irange(size1)) {
const auto *row_in = data[1] + j * outer_stride;
auto vec_acc = row_sum<vacc_t, VecLoadPolicy>(row_in, vec_stride, vec_size);
acc_t final_acc = 0;
for (int64_t k = vec_size * vec_numel; k < size0; ++k) {
final_acc += ScalarLoadPolicy::load(row_in, scalar_stride, k);
}
alignas(64) std::array<acc_t, vacc_t::size()> partials{};
vec_acc.store(partials.data());
for (const auto k : c10::irange(partials.size())) {
final_acc += partials[k];
}
store<StorePolicy>(data[0], out_stride, j, final_acc);
}
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free