bernoulli_tensor_cuda_kernel Class — pytorch Architecture
Architecture documentation for the bernoulli_tensor_cuda_kernel class in DistributionTemplates.h from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cuda/DistributionTemplates.h lines 606–649
// Fills `ret` with Bernoulli samples whose per-element probabilities are read
// from `p`.
//
// Template parameters:
//   scalar_t - element type written into `ret`.
//   prob_t   - element type of the probabilities read from `p`.
//
// Parameters:
//   ret         - output tensor; receives the comparison result cast to
//                 scalar_t.
//   p           - probability tensor; every processed element is asserted on
//                 the device to lie in [0, 1].
//   philox_args - packed Philox state, unpacked on the device to obtain the
//                 values passed to curand_init.
template<typename scalar_t, typename prob_t>
void bernoulli_tensor_cuda_kernel(
    const TensorBase &ret, const at::TensorBase &p,
    PhiloxCudaState philox_args) {
  // Per-invocation device callback. `n` is the number of valid elements in
  // this invocation; for n in 1..4 only the first n (v, p) pairs are touched,
  // and any other n leaves every output untouched.
  auto sample_op = [philox_args] __device__(
      int n, scalar_t& v1, scalar_t& v2, scalar_t& v3, scalar_t& v4,
      const prob_t& p1, const prob_t& p2, const prob_t& p3, const prob_t& p4) {
    const auto philox = at::cuda::philox::unpack(philox_args);
    // Global linear thread index, used as the second argument of curand_init.
    const auto thread_index = blockIdx.x * blockDim.x + threadIdx.x;

    curandStatePhilox4_32_10_t rng;
    curand_init(std::get<0>(philox), thread_index, std::get<1>(philox), &rng);

    // See Note [Register spilling in curand call for CUDA < 10]
    // A single call yields four uniform floats, one per potential output.
    const float4 draws = curand_uniform4(&rng);

    // Intentional fallthrough: entering at case `n` handles element n and then
    // every lower-numbered element, in descending order.
    switch (n) {
      case 4: {
        CUDA_KERNEL_ASSERT(0 <= p4 && p4 <= 1);
        v4 = static_cast<scalar_t>(draws.w <= p4);
        [[fallthrough]];
      }
      case 3: {
        CUDA_KERNEL_ASSERT(0 <= p3 && p3 <= 1);
        v3 = static_cast<scalar_t>(draws.z <= p3);
        [[fallthrough]];
      }
      case 2: {
        CUDA_KERNEL_ASSERT(0 <= p2 && p2 <= 1);
        v2 = static_cast<scalar_t>(draws.y <= p2);
        [[fallthrough]];
      }
      case 1: {
        CUDA_KERNEL_ASSERT(0 <= p1 && p1 <= 1);
        v1 = static_cast<scalar_t>(draws.x <= p1);
      }
    }
  };

  // Process four elements per callback invocation, matching the four values
  // produced by one curand_uniform4 call.
  // See NOTE [ CUDA_tensor_applyN helpers ] for details.
  constexpr int kElementsPerStep = 4;
  constexpr int kMaxThreadsPerBlock = 512;
  constexpr int kMinBlocksPerSm = 2;

  at::cuda::CUDA_tensor_apply2<
      scalar_t, const prob_t, kElementsPerStep, decltype(sample_op),
      kMaxThreadsPerBlock, kMinBlocksPerSm>(ret, p, sample_op);
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free