vectorized_outer_sum Function Template — pytorch Architecture
Architecture documentation for the vectorized_outer_sum function template in SumKernel.cpp from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cpu/SumKernel.cpp lines 475–511
template <typename acc_t, typename VecLoadPolicy, typename ScalarLoadPolicy, typename StorePolicy>
void vectorized_outer_sum(
// NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
char * C10_RESTRICT data[2], int64_t inner_stride, int64_t out_stride,
int64_t size0, int64_t size1) {
using vacc_t = Vectorized<acc_t>;
constexpr int64_t scalar_stride = ScalarLoadPolicy::memsize();
constexpr int64_t vec_stride = VecLoadPolicy::memsize();
constexpr int64_t nrows = 4;
// Input is contiguous over the second (non-reduced) dimension
int64_t j = 0;
for (; j + nrows * vacc_t::size() <= size1; j += nrows * vacc_t::size()) {
const auto *row_in = data[1] + j * scalar_stride;
auto sums = multi_row_sum<vacc_t, nrows, VecLoadPolicy>(
row_in, inner_stride, vec_stride, size0);
for (const auto i : c10::irange(nrows)) {
const int64_t base_idx = j + i * vacc_t::size();
store<StorePolicy>(data[0], out_stride, base_idx, sums[i]);
}
}
for (; j + vacc_t::size() <= size1; j += vacc_t::size()) {
const auto *row_in = data[1] + j * scalar_stride;
const vacc_t sums = row_sum<vacc_t, VecLoadPolicy>(
row_in, inner_stride, size0);
store<StorePolicy>(data[0], out_stride, j, sums);
}
for (; j < size1; ++j) {
const auto *row_in = data[1] + j * scalar_stride;
auto ans = row_sum<acc_t, ScalarLoadPolicy>(row_in, inner_stride, size0);
store<StorePolicy>(data[0], out_stride, j, ans);
}
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free