count_nonzero_impl Function — PyTorch Architecture
Architecture documentation for the count_nonzero_impl function template in TensorAdvancedIndexing.cpp from the PyTorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/TensorAdvancedIndexing.cpp lines 2787–2822
// Returns how many of the elements visited by `iter.serial_for_each(..., range)`
// compare unequal to `scalar_t(0)`. Only entry 0 of the per-chunk data/stride
// arrays is read.
//
// Each chunk is scanned kLanes elements at a time, with one running counter
// per lane so consecutive increments do not all target the same variable
// (presumably to expose instruction-level parallelism -- the original constant
// was named `ilp_factor`). Leftover elements are handled one by one.
template <typename scalar_t>
static int64_t count_nonzero_impl(TensorIteratorBase& iter, Range range) {
  int64_t total = 0;

  auto count_chunk = [&](char** data, const int64_t* strides, int64_t n) {
    constexpr int kLanes = 4;
    const auto step = strides[0];  // applied to a char pointer, i.e. a byte step
    const char* cursor = data[0];
    int64_t lane_counts[kLanes] = {0};

    // Main part: consume full groups of kLanes elements.
    int64_t done = 0;
    while (n - done >= kLanes) {
      c10::ForcedUnroll<kLanes>{}([&](int lane) {
        const auto& elem = c10::load<scalar_t>(cursor + lane * step);
        lane_counts[lane] += (elem != scalar_t(0)) ? 1 : 0;
      });
      cursor += kLanes * step;
      done += kLanes;
    }

    // Tail: fewer than kLanes elements remain; accumulate them into lane 0.
    for (int64_t remaining = n - done; remaining > 0; --remaining) {
      const auto& elem = c10::load<scalar_t>(cursor);
      lane_counts[0] += (elem != scalar_t(0)) ? 1 : 0;
      cursor += step;
    }

    // Fold this chunk's per-lane counters into the overall result.
    for (const int64_t lane_count : lane_counts) {
      total += lane_count;
    }
  };

  iter.serial_for_each(count_chunk, range);
  return total;
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free