RMSNormBackward Class — PyTorch Architecture
Architecture documentation for the RMSNormBackward class in kernels.py from the PyTorch codebase.
Source Code
benchmarks/dynamo/genai_layers/kernels.py lines 448–568
class RMSNormBackward(BenchmarkKernel):
    def __init__(self, script_args):
        super().__init__(script_args)
        self.available_backends = [
            "eager",
            "compiled",
            "quack",
            "liger",
        ]
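    # Backends compared: "eager" (reference autograd), "compiled" (torch.compile),
    # "quack" (CuTe-DSL kernels from the quack library), and "liger"
    # (Triton kernels from liger_kernel).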
    def get_shapes(self) -> tuple[tuple[int, ...], ...]:
        # TODO: OOM for (32768, 65536) on h100
        return (
            (32768, 256),
            (32768, 512),
            (32768, 1024),
            (32768, 2048),
            (32768, 4096),
            (32768, 8192),
            (32768, 16384),
        ) + extra_shapes_for_norm
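    # M is fixed at 32768 while N sweeps 256..16384; extra_shapes_for_norm is a
    # tuple of additional (M, N) pairs defined elsewhere in kernels.py and
    # shared by the norm benchmarks.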
    def get_memory_bytes(self, args, kwargs) -> int:
        x, w, dy = args
        # x, dy: [M, N], w: [N]
        M, N = x.shape
        # Read x, w, dy; write dx, dw
        return 3 * M * N * x.dtype.itemsize + 2 * N * w.dtype.itemsize
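    # Worked example for (M, N) = (32768, 4096) with bf16 x/dy and f32 w:
    #   x, dy, dx: 3 * 32768 * 4096 * 2 bytes = 805,306,368 bytes (768 MiB)
    #   w, dw:     2 * 4096 * 4 bytes         =        32,768 bytes
    # so each backward pass moves roughly 768 MiB of data.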
    def rms_norm_ref(self, x, w):
        x_f32 = x.float()
        return (
            x_f32
            * torch.rsqrt(torch.mean(x_f32.square(), dim=-1, keepdim=True) + 1e-6)
            * w
        ).to(x.dtype)
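    # Reference forward: y = x * rsqrt(mean(x^2) + eps) * w with eps = 1e-6,
    # computed in float32 and cast back to the input dtype.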
    def eager(self, args, kwargs=None) -> Any:
        if kwargs is not None:
            raise AssertionError(f"Expected kwargs to be None, but got {kwargs}")
        x, w, dy = args
        y = self.rms_norm_ref(x, w)
        return lambda: torch.autograd.grad(
            y, [x, w], grad_outputs=dy, retain_graph=True
        )
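    # The forward pass runs once outside the closure; retain_graph=True keeps the
    # autograd graph alive so the benchmark can time the backward pass repeatedly.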
    def compiled(self, args, kwargs=None) -> Any:
        if kwargs is not None:
            raise AssertionError(f"Expected kwargs to be None, but got {kwargs}")
        x, w, dy = args
        y = torch.compile(self.rms_norm_ref, mode=self.compile_mode, fullgraph=True)(
            x, w
        )
        return lambda: torch.autograd.grad(
            y, [x, w], grad_outputs=dy, retain_graph=True
        )
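    # torch.compile traces rms_norm_ref with fullgraph=True; AOTAutograd also
    # produces a compiled backward graph, which the grad closure then exercises.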
    def compute_rstd(self, x, eps):
        return torch.rsqrt(torch.mean(x.float().square(), dim=-1, keepdim=True) + eps)
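    # rstd = 1 / sqrt(mean(x^2) + eps) is the per-row statistic a fused backward
    # consumes; it is recomputed here, outside the timed closure, standing in for
    # the value a forward kernel would normally have saved.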
    def quack(self, args, kwargs=None) -> Any:
        if kwargs is not None:
            raise AssertionError(f"Expected kwargs to be None, but got {kwargs}")
        from quack.rmsnorm import _get_sm_count, _rmsnorm_bwd

        x, w, dy = args
        M, N = x.shape
        rstd = self.compute_rstd(x, eps=1e-6)
        dx = torch.empty_like(x)
        sm_count = _get_sm_count(x.size(1), x.device)
        dw_partial = torch.empty(
            sm_count, x.size(1), device=x.device, dtype=torch.float32
        )

        def quack_bwd():
            _rmsnorm_bwd(
                x,
                w,
                dy,
                rstd,
                dx,
                dw_partial,
                db_partial=None,
                dresidual_out=None,
                dresidual=None,
                sm_count=sm_count,
            )
            dw = dw_partial.sum(dim=0).to(w.dtype)
            return dx, dw

        return quack_bwd
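    # _rmsnorm_bwd writes one partial dw row per SM into dw_partial; the timed
    # closure then reduces those partials into the final dw. db_partial and the
    # dresidual arguments are left as None because this benchmark has no bias or
    # residual path.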
    def liger(self, args, kwargs=None) -> Any:
        if kwargs is not None:
            raise AssertionError(f"Expected kwargs to be None, but got {kwargs}")
        from liger_kernel.transformers.rms_norm import LigerRMSNorm

        x, w, dy = args
        M, N = x.shape
        liger_rmsnorm = LigerRMSNorm(
            hidden_size=N, eps=1e-6, casting_mode="gemma"
        ).cuda()
        liger_rmsnorm.weight.data.copy_(w)
        y = liger_rmsnorm(x)
        return lambda: torch.autograd.grad(
            y, [x, liger_rmsnorm.weight], grad_outputs=dy, retain_graph=True
        )
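    # casting_mode="gemma" makes LigerRMSNorm do its math in float32, matching
    # rms_norm_ref. The module holds a copy of w as its own parameter, so the
    # gradient is taken with respect to liger_rmsnorm.weight rather than w.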
    def benchmark(self):
        for M, N in self.get_shapes():
            print(f"Tensor dimensions: [{M}, {N}]")
            torch_dtype = cutlass_torch.dtype(cutlass.BFloat16)
            x = torch.randn(M, N, device="cuda", dtype=torch_dtype, requires_grad=True)
            w = torch.randn(N, device="cuda", dtype=torch.float32, requires_grad=True)
            dy = torch.randn(M, N, device="cuda", dtype=torch_dtype)
            self.benchmark_single_shape((x, w, dy), setting=f"shape: [{M}, {N}]")
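All four backends must produce the same closed-form gradients. With rstd = 1 / sqrt(mean(x^2) + eps), they are dx = rstd * (dy*w - x * rstd^2 * mean(dy*w*x)) and dw = sum over rows of dy * x * rstd. The sketch below is a hypothetical standalone helper, not part of kernels.py, that derives both directly and checks them against the eager autograd path; the function name, shape, and tolerances are assumptions.

import torch

def rms_norm_bwd_ref(x, w, dy, eps=1e-6):
    # Closed-form RMSNorm gradients, computed in float32 like rms_norm_ref.
    x_f32, dy_f32 = x.float(), dy.float()
    rstd = torch.rsqrt(torch.mean(x_f32.square(), dim=-1, keepdim=True) + eps)
    dyw = dy_f32 * w  # gradient flowing into (x * rstd), shape [M, N]
    # Product rule through rstd: d(rstd)/dx_k = -x_k * rstd^3 / N
    dx = rstd * (
        dyw - x_f32 * rstd.square() * (dyw * x_f32).mean(dim=-1, keepdim=True)
    )
    # dw accumulates each element's contribution over the batch dimension
    dw = (dy_f32 * x_f32 * rstd).sum(dim=0)
    return dx.to(x.dtype), dw.to(w.dtype)

# Hypothetical check against the eager path on a small shape
x = torch.randn(128, 256, device="cuda", dtype=torch.bfloat16, requires_grad=True)
w = torch.randn(256, device="cuda", dtype=torch.float32, requires_grad=True)
dy = torch.randn(128, 256, device="cuda", dtype=torch.bfloat16)

x_f32 = x.float()
y = (
    x_f32
    * torch.rsqrt(torch.mean(x_f32.square(), dim=-1, keepdim=True) + 1e-6)
    * w
).to(x.dtype)
dx_ref, dw_ref = torch.autograd.grad(y, [x, w], grad_outputs=dy)
dx, dw = rms_norm_bwd_ref(x, w, dy)
torch.testing.assert_close(dx, dx_ref, rtol=1e-2, atol=1e-2)
torch.testing.assert_close(dw, dw_ref, rtol=1e-3, atol=1e-3)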