
RMSNormBackward Class — pytorch Architecture

Architecture documentation for the RMSNormBackward class in benchmarks/dynamo/genai_layers/kernels.py from the pytorch codebase. The class benchmarks the backward pass of RMS normalization, producing the input gradient dx and the weight gradient dw, across four backends: eager autograd, torch.compile, Quack, and Liger.

Source Code

benchmarks/dynamo/genai_layers/kernels.py lines 448–568

class RMSNormBackward(BenchmarkKernel):
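    # Benchmarks the RMSNorm backward pass (dx and dw) across the eager,
    # torch.compile, Quack, and Liger backends listed below.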
    def __init__(self, script_args):
        super().__init__(script_args)
        self.available_backends = [
            "eager",
            "compiled",
            "quack",
            "liger",
        ]

    def get_shapes(self) -> tuple[tuple[int, ...], ...]:
        # TODO: OOM for (32768, 65536) on h100
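        # M is fixed at 32768; N sweeps powers of two from 256 to 16384,
        # plus the module-level extra_shapes_for_norm.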
        return (
            (32768, 256),
            (32768, 512),
            (32768, 1024),
            (32768, 2048),
            (32768, 4096),
            (32768, 8192),
            (32768, 16384),
        ) + extra_shapes_for_norm

    def get_memory_bytes(self, args, kwargs) -> int:
        x, w, dy = args
        # x, dy: [M, N], w: [N]
        M, N = x.shape
        # Read x, w, dy, write dx, dw
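        # e.g. the largest swept shape [32768, 16384] in bf16 moves
        # 3 * 32768 * 16384 * 2 bytes = 3.0 GiB per call; the w/dw term
        # is negligible by comparison.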
        return 3 * M * N * x.dtype.itemsize + 2 * N * w.dtype.itemsize

    def rms_norm_ref(self, x, w):
        x_f32 = x.float()
        return (
            x_f32
            * torch.rsqrt(torch.mean(x_f32.square(), dim=-1, keepdim=True) + 1e-6)
            * w
        ).to(x.dtype)

    def eager(self, args, kwargs=None) -> Any:
        if kwargs is not None:
            raise AssertionError(f"Expected kwargs to be None, but got {kwargs}")
        x, w, dy = args
        y = self.rms_norm_ref(x, w)
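        # Return a thunk so only the backward pass is timed; retain_graph
        # lets the harness call it repeatedly against the same graph.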
        return lambda: torch.autograd.grad(
            y, [x, w], grad_outputs=dy, retain_graph=True
        )

    def compiled(self, args, kwargs=None) -> Any:
        if kwargs is not None:
            raise AssertionError(f"Expected kwargs to be None, but got {kwargs}")
        x, w, dy = args
        y = torch.compile(self.rms_norm_ref, mode=self.compile_mode, fullgraph=True)(
            x, w
        )
        return lambda: torch.autograd.grad(
            y, [x, w], grad_outputs=dy, retain_graph=True
        )

    def compute_rstd(self, x, eps):
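        # rstd = 1 / sqrt(mean(x^2) + eps) per row, shape [M, 1]; computed
        # once outside the timed region so backward-only backends (Quack)
        # are measured in isolation.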
        return torch.rsqrt(torch.mean(x.float().square(), dim=-1, keepdim=True) + eps)

    def quack(self, args, kwargs=None) -> Any:
        if kwargs is not None:
            raise AssertionError(f"Expected kwargs to be None, but got {kwargs}")
        from quack.rmsnorm import _get_sm_count, _rmsnorm_bwd

        x, w, dy = args
        M, N = x.shape

        rstd = self.compute_rstd(x, eps=1e-6)
        dx = torch.empty_like(x)
        sm_count = _get_sm_count(x.size(1), x.device)
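        # dw is accumulated as per-SM partial sums and reduced over dim 0
        # after the kernel runs (see quack_bwd below).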
        dw_partial = torch.empty(
            sm_count, x.size(1), device=x.device, dtype=torch.float32
        )

        def quack_bwd():
            _rmsnorm_bwd(
                x,
                w,
                dy,
                rstd,
                dx,
                dw_partial,
                db_partial=None,
                dresidual_out=None,
                dresidual=None,
                sm_count=sm_count,
            )
            dw = dw_partial.sum(dim=0).to(w.dtype)
            return dx, dw

        return quack_bwd

    def liger(self, args, kwargs=None) -> Any:
        if kwargs is not None:
            raise AssertionError(f"Expected kwargs to be None, but got {kwargs}")
        from liger_kernel.transformers.rms_norm import LigerRMSNorm

        x, w, dy = args
        M, N = x.shape
        liger_rmsnorm = LigerRMSNorm(
            hidden_size=N, eps=1e-6, casting_mode="gemma"
        ).cuda()
        liger_rmsnorm.weight.data.copy_(w)
        y = liger_rmsnorm(x)
        return lambda: torch.autograd.grad(
            y, [x, liger_rmsnorm.weight], grad_outputs=dy, retain_graph=True
        )

    def benchmark(self):
        for M, N in self.get_shapes():
            print(f"Tensor dimensions: [{M}, {N}]")
            torch_dtype = cutlass_torch.dtype(cutlass.BFloat16)
            x = torch.randn(M, N, device="cuda", dtype=torch_dtype, requires_grad=True)
            w = torch.randn(N, device="cuda", dtype=torch.float32, requires_grad=True)
            dy = torch.randn(M, N, device="cuda", dtype=torch_dtype)
            self.benchmark_single_shape((x, w, dy), setting=f"shape: [{M}, {N}]")
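
For reference, the backward pass these backends implement has a simple closed form. Writing rstd = 1/sqrt(mean(x^2) + eps) and g = dy * w, each row satisfies dx = rstd * g - x * rstd^3 * <g, x> / N, and dw = sum over rows of dy * x * rstd. The sketch below checks that formula against autograd in float64; it is a minimal illustration written for this page (the helper name rmsnorm_bwd_ref and the small test shape are ours), not the kernel any of the benchmarked backends actually runs.

import torch

def rmsnorm_bwd_ref(x, w, dy, eps=1e-6):
    # Illustrative analytic backward; eps matches rms_norm_ref above.
    rstd = torch.rsqrt(x.square().mean(dim=-1, keepdim=True) + eps)  # [M, 1]
    g = dy * w                                 # upstream grad scaled by weight
    dot = (g * x).sum(dim=-1, keepdim=True)    # per-row <g, x>
    dx = rstd * g - x * rstd.pow(3) * dot / x.size(-1)
    dw = (dy * x * rstd).sum(dim=0)            # reduce over rows
    return dx, dw

M, N = 64, 128
x = torch.randn(M, N, dtype=torch.float64, requires_grad=True)
w = torch.randn(N, dtype=torch.float64, requires_grad=True)
dy = torch.randn(M, N, dtype=torch.float64)

y = x * torch.rsqrt(x.square().mean(dim=-1, keepdim=True) + 1e-6) * w
dx_ref, dw_ref = torch.autograd.grad(y, [x, w], grad_outputs=dy)
dx, dw = rmsnorm_bwd_ref(x.detach(), w.detach(), dy)
torch.testing.assert_close(dx, dx_ref)
torch.testing.assert_close(dw, dw_ref)

This per-row form is what makes the backward memory-bound: each element of x and dy is read once and each element of dx written once, which is exactly the 3 * M * N traffic model in get_memory_bytes above.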
