cpu_hflip_vec Function — PyTorch Architecture
Architecture documentation for the cpu_hflip_vec function template in IndexKernel.cpp from the PyTorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cpu/IndexKernel.cpp lines 484–545
template <typename scalar_t>
// Vectorized horizontal flip kernel: copies each row of the input into the
// output in reversed element order. The output operand has been restrided so
// its inner stride is negative (see the ntensors comment below); the input is
// read forward while the output is written backward, with each SIMD vector
// reversed in-register via flip().
void cpu_hflip_vec(at::TensorIterator& iter) {
  auto loop2d = [&](char** base, const int64_t *strides, int64_t size0, int64_t size1) {
    // Here ntensors is defined for output and 1 input. But tensor iterator has defined output, input
    // and restrided_input (see aten/src/ATen/native/TensorTransformations.cpp#L64-L66) but we use only
    // output and input.
    static constexpr int ntensors = 2;
    // The iterator carries 3 operands, so strides[0..2] are the inner-dim
    // strides and strides[3..] are the outer-dim strides; we only advance
    // the first two (output, input) below.
    const int64_t *outer_strides = &strides[3];

    // Working copies of the base pointers for output (data_arr[0]) and
    // input (data_arr[1]); advanced per outer iteration.
    std::array<char*, ntensors> data_arr;
    std::copy_n(base, ntensors, data_arr.data());

    using Vec = Vectorized<scalar_t>;

    constexpr auto stride = sizeof(scalar_t);
    // Inner dim must be element-contiguous: output written backwards
    // (stride == -strides[0]), input read forwards (stride == strides[1]).
    TORCH_INTERNAL_ASSERT(stride == -strides[0] && stride == strides[1]);

    for ([[maybe_unused]] const auto j : c10::irange(size1)) {

      // vectorized loop with negative stride for output
      // n elements per row; i counts elements processed so far.
      int64_t n = size0;
      int64_t i = 0;

      // data_arr[0] unaligned pre-pass
      // Scalar prologue: copy a few leading elements one-by-one so the
      // vector stores below land closer to a 32-byte boundary. The count is
      // derived from the row's global element position modulo 32 — this is a
      // heuristic on element counts, not a strict byte-alignment guarantee
      // (NOTE(review): presumably tuned for 256-bit vectors; verify).
      int64_t offset = (j * n + (n - i - Vec::size())) % 32;
      // Clamp so the prologue never runs past the end of a short row.
      offset = (offset >= n) ? n : offset;
      for (; i < offset; i++) {
        scalar_t* out_ptr = (scalar_t*)(data_arr[0] - i * stride);
        *out_ptr = c10::load((scalar_t *)(data_arr[1] + i * stride));
      }
      // Empirically found that it is faster to process 3 data items together vs 2 or 4
      // Main loop: load 3 vectors forward from the input, reverse each
      // in-register, and store them at the mirrored (descending) output
      // positions. Elements [i, i+Vec::size()) land at output bytes
      // [-(i+Vec::size()-1)*stride, -i*stride].
      for (; i <= n - 3 * Vec::size(); i += 3 * Vec::size()) {
        auto out1 = Vec::loadu(data_arr[1] + i * stride);
        auto out2 = Vec::loadu(data_arr[1] + (i + Vec::size()) * stride);
        auto out3 = Vec::loadu(data_arr[1] + (i + 2 * Vec::size()) * stride);
        // flip the vector: 1234 -> 4321
        out1 = flip(out1);
        out2 = flip(out2);
        out3 = flip(out3);
        out1.store(data_arr[0] - (i + Vec::size() - 1) * stride);
        out2.store(data_arr[0] - (i + 2 * Vec::size() - 1) * stride);
        out3.store(data_arr[0] - (i + 3 * Vec::size() - 1) * stride);
      }
      // Scalar epilogue: fewer than 3*Vec::size() elements remain.
      if (i < n) {
        for (; i < n; i++) {
          scalar_t* out_ptr = (scalar_t*)(data_arr[0] - i * stride);
          *out_ptr = c10::load((scalar_t *)(data_arr[1] + i * stride));
        }
      }

      // advance:
      // Step output and input base pointers to the next outer-dim row.
      for (const auto arg : c10::irange(ntensors)) {
        data_arr[arg] += outer_strides[arg];
      }
    }
  };

  int64_t grain_size = at::internal::GRAIN_SIZE;
  iter.for_each(loop2d, grain_size);
  iter.cast_outputs();
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free