Home / Function / _cudnn_impl Function Template — pytorch Architecture

_cudnn_impl Function Template — pytorch Architecture

Architecture documentation for the _cudnn_impl function template in RNN.cpp from the PyTorch codebase.

Entity Profile

Source Code

aten/src/ATen/native/cudnn/RNN.cpp lines 2561–2638

// Shared forward implementation for cuDNN RNNs over *packed* (variable
// batch size) input. `hidden_type` abstracts over the hidden-state shape:
// unpack_hidden/pack_hidden (defined elsewhere in this file) convert it
// to/from an (hx, cx) tensor pair — cx may be an undefined Tensor for
// cell-less RNN modes, which is why it is guarded with cx.defined() below.
//
// Returns {output, packed hidden state} from at::_cudnn_rnn_symint.
template <typename hidden_type>
std::pair<Tensor, hidden_type> _cudnn_impl(
    const Tensor& input,
    const Tensor& _batch_sizes, // 1-D CPU int64 tensor of per-step batch sizes (validated below)
    const hidden_type& hidden,
    TensorList params,
    bool has_biases,
    cudnnRNNMode_t mode,
    int64_t num_layers,
    double dropout_p,
    bool train,
    bool bidirectional) {
  auto [hx, cx] = unpack_hidden(hidden);
  auto hidden_size = hx.sym_size(2);
  SymInt proj_size = 0;
  // For LSTM models with projections hidden size could be different
  // (cx carries the true cell size; hx carries the projected size).
  if (cx.defined() && cx.sym_size(2) != hx.sym_size(2)) {
    hidden_size = cx.sym_size(2);
    proj_size = hx.sym_size(2);
  }

  // TODO:  try_get_weight_buf returns a Tensor, but _cudnn_rnn below takes a
  // std::optional<Tensor> in weight_buf's slot.  Do we want try_get_weight_buf
  // to return a std::optional<Tensor> instead of a defined or undefined Tensor?
  // Make input's device the current CUDA device before touching cuDNN state.
  at::cuda::OptionalCUDAGuard guard(input.get_device());
  auto weight_buf = try_get_weight_buf(
      input,
      params,
      has_biases,
      mode,
      hidden_size,
      proj_size,
      num_layers,
      bidirectional);

  TORCH_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D");
  TORCH_CHECK(
      _batch_sizes.device().is_cpu(),
      "batch_sizes tensor should be on CPU, but got ",
      _batch_sizes.device());
  // Non-owning view over the tensor's storage; valid only while
  // _batch_sizes stays alive (it does for the duration of this call).
  IntArrayRef batch_sizes{
      _batch_sizes.data_ptr<int64_t>(),
      static_cast<size_t>(_batch_sizes.size(0))};

  // DropoutState models Lockable, so unique_lock serializes concurrent use
  // of the shared cuDNN dropout state for this (dropout_p, train) config.
  auto& dropout_state = get_dropout_state(dropout_p, train, input.options());
  std::unique_lock<DropoutState> lock{dropout_state};
  // Per-layer(-direction) parameter count: presumably {w_ih, w_hh, b_ih,
  // b_hh} with biases vs. {w_ih, w_hh} without, plus one projection weight
  // when proj_size != 0 — TODO confirm against _cudnn_rnn's contract.
  int64_t num_params = has_biases ? 4 : 2;
  if (proj_size != 0) {
    ++num_params;
  }
  // NOTE(review): reinterprets the raw int64_t buffer as SymInt. This
  // relies on c10::SymInt being layout-compatible with int64_t for
  // non-symbolic values — confirm that invariant still holds if SymInt's
  // representation ever changes.
  auto sym_batch_sizes = c10::SymIntArrayRef(
      reinterpret_cast<const c10::SymInt*>(batch_sizes.data()),
      batch_sizes.size());
  // cudnn_output = std::tuple<output, hy, cy, reserve, new_weight_buf>
  auto cudnn_output = at::_cudnn_rnn_symint(
      input,
      params,
      num_params,
      weight_buf,
      hx,
      cx,
      static_cast<int>(mode),
      hidden_size,
      proj_size,
      num_layers,
      /*batch_first=*/false, // packed input is always sequence-major
      dropout_p,
      train,
      bidirectional,
      sym_batch_sizes,
      dropout_state.buffer);

  // Repack (hy, cy) into the caller's hidden_type; reserve and
  // new_weight_buf (tuple slots 3 and 4) are not needed by the caller.
  return {
      std::move(std::get<0>(cudnn_output)),
      pack_hidden<hidden_type>(
          std::move(std::get<1>(cudnn_output)),
          std::move(std::get<2>(cudnn_output)))};
}

Analyze Your Own Codebase

Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.

Try Supermodel Free