unique_cpu_sorted_template Function Template — PyTorch Architecture
Architecture documentation for the unique_cpu_sorted_template function template in Unique.cpp from the PyTorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/Unique.cpp lines 157–264
template <typename scalar_t, typename CompareOp>
std::tuple<Tensor, Tensor, Tensor> unique_cpu_sorted_template(
const Tensor& self,
const bool return_inverse,
const bool return_counts,
CompareOp is_unique) {
const Tensor& input = self.contiguous();
int64_t numel = input.numel();
Tensor output = at::empty({0}, self.options());
Tensor inverse_indices = at::empty({0}, self.options().dtype(kLong));
Tensor counts = at::empty({0}, self.options().dtype(kLong));
if (numel == 0) {
if (return_inverse) {
inverse_indices.resize_(input.sizes());
}
return std::make_tuple(output, inverse_indices, counts);
}
// index of first unique in each consecutive section
// this is used to compute counts for parallelization purpose
Tensor unique_index = at::empty({0}, self.options().dtype(kLong));
// original behavior with unique on scalar tensor
// is to return a output size of ([1]), `flatten` here will do the job
auto input_flattened = input.flatten();
auto [input_sorted, indices] = input_flattened.sort();
scalar_t* input_sorted_data = input_sorted.data_ptr<scalar_t>();
int64_t* indices_data = indices.data_ptr<int64_t>();
int num_threads = at::get_num_threads();
std::vector<int64_t> unique_count_thread(num_threads, 0);
std::vector<int64_t> offset_thread(num_threads, 0);
const int64_t grain_size = at::internal::GRAIN_SIZE;
// calculate unique count from each thread
at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
int tid = at::get_thread_num();
for (const auto i : c10::irange(begin, end)) {
if (is_unique(input_sorted_data, i)) {
unique_count_thread[tid]++;
}
}
});
// calculate thread offset in output and
// `unique_count` records total count of uniques at last
int64_t unique_count = 0;
for (const auto t : c10::irange(num_threads)) {
offset_thread[t] = unique_count;
unique_count += unique_count_thread[t];
}
output.resize_({unique_count});
scalar_t* output_data = output.data_ptr<scalar_t>();
int64_t* inverse_indices_data = nullptr;
if (return_inverse) {
inverse_indices.resize_(input.sizes());
inverse_indices_data = inverse_indices.data_ptr<int64_t>();
}
int64_t* counts_data = nullptr;
int64_t* unique_index_data = nullptr;
if (return_counts) {
counts.resize_({unique_count});
counts_data = counts.data_ptr<int64_t>();
unique_index.resize_({unique_count + 1});
unique_index_data = unique_index.data_ptr<int64_t>();
unique_index_data[unique_count] = numel;
}
at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
int tid = at::get_thread_num();
int64_t offset = offset_thread[tid];
for (const auto i : c10::irange(begin, end)) {
if (is_unique(input_sorted_data, i)) {
output_data[offset] = c10::load(&input_sorted_data[i]);
if (return_counts) {
unique_index_data[offset] = i;
}
offset++;
}
if (return_inverse) {
int64_t inverse_index = offset - 1;
int64_t perm = indices_data[i];
inverse_indices_data[perm] = inverse_index;
}
}
});
if (return_counts) {
// do diff to get count
at::parallel_for(0, unique_count, grain_size, [&](int64_t begin, int64_t end) {
for (const auto i : c10::irange(begin, end)) {
counts_data[i] = unique_index_data[i + 1] - unique_index_data[i];
}
});
}
return std::make_tuple(output, inverse_indices, counts);
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free