vectorized_loop Function — PyTorch Architecture
Architecture documentation for the vectorized_loop function template in Loops.h from the PyTorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cpu/Loops.h lines 198–228
template <typename func_t, typename vec_func_t>
inline void
vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, vec_func_t&& vop) {
using traits = function_traits<vec_func_t>;
using scalar_t = typename function_traits<func_t>::result_type;
using Vec = Vectorized<scalar_t>;
constexpr int ntensors = traits::arity + 1;
char* C10_RESTRICT data[ntensors];
for (const auto arg : c10::irange(ntensors)) {
data[arg] = data_[arg];
}
Vec opt_scalar = Vec(S > 0 ? c10::load((scalar_t*)data[S]) : scalar_t(0));
int64_t i = 0;
for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
auto args1 = dereference_vec<traits>(&data[1], opt_scalar, S, i);
auto args2 = dereference_vec<traits>(&data[1], opt_scalar, S, i + Vec::size());
auto out1 = std::apply(vop, std::move(args1));
auto out2 = std::apply(vop, std::move(args2));
out1.store(data[0] + i * sizeof(scalar_t));
out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t));
}
if (i < n) {
int64_t strides[ntensors];
for (const auto arg : c10::irange(ntensors)) {
strides[arg] = (S > 0 && arg == S) ? 0 : sizeof(scalar_t);
}
basic_loop(data, strides, i, n, std::forward<func_t>(op));
}
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free