Home / Class / serial_vec_log_softmax_lastdim_range Class — pytorch Architecture

serial_vec_log_softmax_lastdim_range Class — pytorch Architecture

Architecture documentation for the serial_vec_log_softmax_lastdim_range class in LogSoftmaxKernelImpl.h from the pytorch codebase.

Entity Profile

Source Code

aten/src/ATen/native/cpu/LogSoftmaxKernelImpl.h lines 30–97

// Serially computes log-softmax over the last (contiguous) dimension for the
// rows in [begin, end), processing them in groups of `chunk_size` so the
// per-row max / log-sum scratch buffers stay small and cache-resident.
// Uses the numerically stable formulation: out = x - max(x) - log(sum(exp(x - max(x)))).
template <typename scalar_t>
void serial_vec_log_softmax_lastdim_range(
    const scalar_t* input_data_base,
    scalar_t* output_data_base,
    int64_t dim_size,
    int64_t chunk_size,
    int64_t begin,
    int64_t end) {
  if (end <= begin) {
    return;
  }
  using Vec = vec::Vectorized<vec::vec_scalar_t<scalar_t>>;
  // MSVC requires such a declaration of dynamic arrays
  // Source: https://stackoverflow.com/a/33423538
  auto tmp_sum_scalar = std::make_unique<scalar_t[]>(chunk_size);
  auto max_input_arr = std::make_unique<scalar_t[]>(chunk_size);
  for (int64_t row_start = begin; row_start < end; row_start += chunk_size) {
    // The final chunk may be shorter than chunk_size.
    const int64_t rows_in_chunk =
        (row_start + chunk_size > end) ? end - row_start : chunk_size;
    // Pass 1: per-row maximum, needed for a numerically stable exp() below.
    for (const auto idx : c10::irange(rows_in_chunk)) {
      const scalar_t* row = input_data_base + (row_start + idx) * dim_size;
      max_input_arr[idx] = vec::reduce_all<scalar_t>(
          [](Vec& x, Vec& y) { return vec::maximum(x, y); },
          row,
          dim_size);
    }
    // Pass 2: per-row sum of exp(x - max).
    for (const auto idx : c10::irange(rows_in_chunk)) {
      const scalar_t* row = input_data_base + (row_start + idx) * dim_size;
      const scalar_t row_max = max_input_arr[idx];
      tmp_sum_scalar[idx] = vec::map_reduce_all<scalar_t>(
          [row_max](Vec x) { return (x - Vec(row_max)).exp(); },
          [](Vec x, Vec y) { return x + y; },
          row,
          dim_size);
    }
    // Turn the sums into log-sums in one vectorized sweep over the chunk.
    // See [Note AVX-SSE transitions] for why this should call the
    // vectorized version (aside from perf improvements).
    vec::map(
        [](Vec x) { return x.log(); },
        tmp_sum_scalar.get(),
        tmp_sum_scalar.get(),
        rows_in_chunk);
    // Pass 3: write out = x - max - log(sum) for every row in the chunk.
    for (const auto idx : c10::irange(rows_in_chunk)) {
      const int64_t row_idx = row_start + idx;
      const scalar_t* row_in = input_data_base + row_idx * dim_size;
      scalar_t* row_out = output_data_base + row_idx * dim_size;
      const scalar_t log_sum = tmp_sum_scalar[idx];
      const scalar_t row_max = max_input_arr[idx];

      // It's necessary to keep the order of the operations below.
      // In some cases that input is large digits and the difference
      // is small, if we compute `max_input` plus `tmp_sum` before,
      // there would be a numerical problem. See an example in
      // https://github.com/pytorch/pytorch/issues/11752#issuecomment-422883379
      vec::map(
          [log_sum, row_max](Vec x) {
            return x - Vec(row_max) - Vec(log_sum);
          },
          row_out,
          row_in,
          dim_size);
    }
  }
}

Analyze Your Own Codebase

Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.

Try Supermodel Free