Home / Class / mexp_impl — PyTorch Architecture

mexp_impl Function Template — PyTorch Architecture

Architecture documentation for the mexp_impl function template in LinearAlgebra.cpp from the PyTorch codebase. (Note: mexp_impl is a templated free function, not a class.)

Entity Profile

Source Code

aten/src/ATen/native/LinearAlgebra.cpp lines 2642–2717

// Computes the matrix exponential of a batch of square matrices `a`
// (batched along dim 0), dispatching each matrix to a Taylor-style
// approximant of increasing degree chosen from its operator 1-norm.
//
// Args:
//   a: batched input matrices; dim 0 is the batch dimension.
//   thetas: ascending norm thresholds, one per approximation degree.
//     A matrix whose 1-norm lies in (thetas[i-1], thetas[i]] is handled
//     by the i-th (cheapest sufficient) approximant; norms above
//     thetas[total_n_degs - 2] fall through to the degree-18
//     scale-and-square routine parameterized by thetas[total_n_degs - 1].
//   compute_highest_degree_approx: when true, skip the per-interval
//     dispatch entirely and apply the degree-18 scale-and-square
//     approximation to the whole input.
//
// Returns: a tensor shaped like `a` holding the matrix exponentials.
template <typename scalar_t>
Tensor mexp_impl(
  const Tensor& a,
  std::array<scalar_t, total_n_degs> thetas,
  bool compute_highest_degree_approx = false
) {
  const auto norm = operator_1_norm(a);
  const auto batch_size = a.size(0);
  // NOTE(review): batched inputs (batch_size > 1) are always forced onto the
  // highest-degree path, which makes the per-interval dispatch below reachable
  // only for a single matrix — presumably a deliberate correctness/simplicity
  // trade-off; confirm against the change that introduced this override.
  if (batch_size > 1) {
    compute_highest_degree_approx = true;
  }

  if (!compute_highest_degree_approx) {
    // Pre-fill `res` with NaN so that a matrix containing NaN values — which
    // falls into no norm interval and is therefore never written — comes back
    // as NaN, instead of an undefined "normal-looking" result.
    auto res = at::full_like(a, std::numeric_limits<double>::quiet_NaN(), {},
                             at::MemoryFormat::Contiguous);
    // `norm_cpu` is used to decide which Tensors require which approximation
    // based on their norm. This decision takes place on CPU.
    // It requires moving data back and forth between devices when `a` is on CUDA,
    // but at the cost of only one single CPU-CUDA synchronization (instead of 6),
    // and better performance overall (benchmarked).
    const auto norm_cpu = (a.device().type() == at::kCUDA)
      ? norm.to(at::kCPU) : norm;

    // Dispatch table: one Taylor approximant per norm interval, in order of
    // increasing degree (T1, T2, T4, T8, T12). The last degree (T18) is
    // handled separately below via scale-and-square.
    constexpr std::array<
      Tensor(*)(const Tensor&),
      total_n_degs - 1>
    compute_Ts = {
      compute_T1, compute_T2, compute_T4<scalar_t>,
      compute_T8<scalar_t>, compute_T12<scalar_t>
    };

    for (int i = 0; i < total_n_degs - 1; ++i) {
      // Half-open interval (thetas[i-1], thetas[i]]; the first interval's
      // lower bound of -1 admits every non-negative norm.
      auto norm_lower_bound = (i == 0) ? static_cast<scalar_t>(-1) : thetas[i - 1];
      auto norm_upper_bound = thetas[i];
      // Elementwise multiply of boolean tensors acts as logical AND.
      // nonzero returns a 2D tensor, hence squeeze(-1) to make it 1D
      auto idx_curr_norm_interval = (
        (norm_lower_bound < norm_cpu) * (norm_cpu <= norm_upper_bound)
      ).nonzero().squeeze(-1);

      if (idx_curr_norm_interval.numel()) {
        // Indices were computed on CPU; move them to `a`'s device before
        // gathering the sub-batch for this interval.
        auto idx_to_device = _move_memory_if_cuda_input(
          idx_curr_norm_interval, a
        );
        auto sub_a = at::index_select(a, 0, idx_to_device);
        res.index_put_({idx_to_device}, compute_Ts[i](sub_a));
      }
    }

    // Remaining matrices: norm above the last per-degree threshold; these need
    // the degree-18 approximant combined with scaling and squaring.
    // nonzero returns a 2D tensor, hence squeeze(-1) to make it 1D
    auto idx_large_norm = (norm_cpu >= thetas[total_n_degs - 2])
      .nonzero().squeeze(-1);

    if (idx_large_norm.numel()) {
      auto idx_to_device = _move_memory_if_cuda_input(
        idx_large_norm, a
      );
      auto a_large_norm = at::index_select(a, 0, idx_to_device);
      // Pass the (on-device) norms of the selected matrices so the scaling
      // factor can be derived per matrix.
      auto large_norm_subset = at::index_select(norm, 0, idx_to_device);
      auto mexp_out = compute_T18_scale_square(
        a_large_norm,
        large_norm_subset,
        thetas[total_n_degs - 1]
      );
      // NOTE(review): unlike the loop above, this index_put_ uses the
      // possibly-CPU `idx_large_norm` rather than `idx_to_device`; PyTorch
      // indexing accepts CPU index tensors on CUDA tensors, but confirm the
      // asymmetry is intentional.
      res.index_put_({idx_large_norm}, mexp_out);
    }
    return res;
  }

  // Highest-degree path: degree-18 approximant with scale-and-square applied
  // to the entire batch at once.
  return compute_T18_scale_square(
    a, norm,
    thetas[total_n_degs - 1]
  );
}

Analyze Your Own Codebase

Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.

Try Supermodel Free