_cudnn_impl Class — PyTorch Architecture
Architecture documentation for the _cudnn_impl function in RNN.cpp from the PyTorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cudnn/RNN.cpp lines 2561–2638
template <typename hidden_type>
std::pair<Tensor, hidden_type> _cudnn_impl(
const Tensor& input,
const Tensor& _batch_sizes,
const hidden_type& hidden,
TensorList params,
bool has_biases,
cudnnRNNMode_t mode,
int64_t num_layers,
double dropout_p,
bool train,
bool bidirectional) {
auto [hx, cx] = unpack_hidden(hidden);
auto hidden_size = hx.sym_size(2);
SymInt proj_size = 0;
// For LSTM models with projections hidden size could be different
if (cx.defined() && cx.sym_size(2) != hx.sym_size(2)) {
hidden_size = cx.sym_size(2);
proj_size = hx.sym_size(2);
}
// TODO: try_get_weight_buf returns a Tensor, but _cudnn_rnn below takes a
// std::optional<Tensor> in weight_buf's slot. Do we want try_get_weight_buf
// to return a std::optional<Tensor> instead of a defined or undefined Tensor?
at::cuda::OptionalCUDAGuard guard(input.get_device());
auto weight_buf = try_get_weight_buf(
input,
params,
has_biases,
mode,
hidden_size,
proj_size,
num_layers,
bidirectional);
TORCH_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D");
TORCH_CHECK(
_batch_sizes.device().is_cpu(),
"batch_sizes tensor should be on CPU, but got ",
_batch_sizes.device());
IntArrayRef batch_sizes{
_batch_sizes.data_ptr<int64_t>(),
static_cast<size_t>(_batch_sizes.size(0))};
auto& dropout_state = get_dropout_state(dropout_p, train, input.options());
std::unique_lock<DropoutState> lock{dropout_state};
int64_t num_params = has_biases ? 4 : 2;
if (proj_size != 0) {
++num_params;
}
auto sym_batch_sizes = c10::SymIntArrayRef(
reinterpret_cast<const c10::SymInt*>(batch_sizes.data()),
batch_sizes.size());
// cudnn_output = std::tuple<output, hy, cy, reserve, new_weight_buf>
auto cudnn_output = at::_cudnn_rnn_symint(
input,
params,
num_params,
weight_buf,
hx,
cx,
static_cast<int>(mode),
hidden_size,
proj_size,
num_layers,
/*batch_first=*/false,
dropout_p,
train,
bidirectional,
sym_batch_sizes,
dropout_state.buffer);
return {
std::move(std::get<0>(cudnn_output)),
pack_hidden<hidden_type>(
std::move(std::get<1>(cudnn_output)),
std::move(std::get<2>(cudnn_output)))};
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free