
LayerNormBackward Class — pytorch Architecture

Architecture documentation for the LayerNormBackward class in benchmarks/dynamo/genai_layers/kernels.py from the pytorch codebase. The class benchmarks the backward pass of layer normalization over a sweep of [M, N] shapes, comparing three backends: eager PyTorch autograd, torch.compile, and the Liger Triton kernel.

Source Code

benchmarks/dynamo/genai_layers/kernels.py, lines 647–734. The excerpt depends on module-level context from kernels.py: the torch, torch.nn.functional (F), typing.Any, and cutlass/cutlass_torch imports, plus the BenchmarkKernel base class and the extra_shapes_for_norm tuple defined elsewhere in the benchmark suite.

class LayerNormBackward(BenchmarkKernel):
    def __init__(self, script_args):
        super().__init__(script_args)
        self.available_backends = ["eager", "compiled", "liger"]

    def get_shapes(self) -> tuple[tuple[int, ...], ...]:
        # OOM for (16384, 131072), (8192, 262144)
        return (
            (32768, 256),
            (32768, 512),
            (32768, 1024),
            (32768, 2048),
            (32768, 4096),
            (32768, 8192),
            (32768, 16384),
            (32768, 32768),
            (32768, 65536),
        ) + extra_shapes_for_norm

    def get_memory_bytes(self, args, kwargs) -> int:
        x, w, dy = args
        M, N = x.shape
        # Read x ([M, N]), w ([N]), dy ([M, N]), write dx ([M, N]), dw ([N])
        return (
            2 * M * N * x.dtype.itemsize
            + 2 * N * w.dtype.itemsize
            + M * N * dy.dtype.itemsize
        )
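
    # Worked example (illustrative, not from the source): for M = 32768,
    # N = 4096 with bf16 x/dy (2 bytes/element) and fp32 w (4 bytes/element),
    # the model gives 2*M*N*2 + 2*N*4 + M*N*2 = 805,339,136 bytes
    # (~0.75 GiB) of traffic per backward pass.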

    def layernorm_ref(self, x: torch.Tensor, w: torch.Tensor, eps: float = 1e-6):
        x_f32 = x.float()
        return F.layer_norm(x_f32, w.shape, w, None, eps).to(x.dtype)

    def eager(self, args, kwargs=None) -> Any:
        if kwargs is not None:
            raise AssertionError(f"Expected kwargs to be None, but got {kwargs}")
        x, w, dy = args
        y = self.layernorm_ref(x, w)
        return lambda: torch.autograd.grad(
            y, [x, w], grad_outputs=dy, retain_graph=True
        )

    def compiled(self, args, kwargs=None) -> Any:
        if kwargs is not None:
            raise AssertionError(f"Expected kwargs to be None, but got {kwargs}")
        x, w, dy = args
        compiled_layernorm = torch.compile(
            self.layernorm_ref, mode=self.compile_mode, fullgraph=True
        )
        y = compiled_layernorm(x, w)
        return lambda: torch.autograd.grad(
            y, [x, w], grad_outputs=dy, retain_graph=True
        )

    def compute_mean_rstd(self, x, eps):
        x = x.float()

        var, mean = torch.var_mean(x, dim=-1, keepdim=True, correction=0)
        rstd = torch.rsqrt(var + eps)
        return mean, rstd

    def liger(self, args, kwargs) -> Any:
        """
        Call layer_norm_backward directly rather than calling
        liger_kernel.transformers.layer_norm.LigerLayerNorm and
        torch.autograd.grad.

        The latter fashion saves mean/rstd in x.dtype which can fail
        accuracy test. We call layer_norm_backward with fp32 mean and
        rstd.
        """
        from liger_kernel.ops.layer_norm import layer_norm_backward

        x, w, dy = args
        eps = 1e-6
        mean, rstd = self.compute_mean_rstd(x, eps)
        M, N = x.shape

        return lambda: layer_norm_backward(dy, x, w, None, mean, rstd)[0:2]

    def benchmark(self):
        for M, N in self.get_shapes():
            print(f"Tensor dimensions: [{M}, {N}]")
            torch_dtype = cutlass_torch.dtype(cutlass.BFloat16)
            x = torch.randn(M, N, device="cuda", dtype=torch_dtype, requires_grad=True)
            w = torch.randn(N, device="cuda", dtype=torch.float32, requires_grad=True)
            dy = torch.randn(M, N, device="cuda", dtype=torch_dtype)
            self.benchmark_single_shape((x, w, dy), setting=f"shape: [{M}, {N}]")
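
For orientation, the following is a minimal standalone sketch of what the eager and liger paths above compute, assuming a CUDA device and an installed liger-kernel package (the small shape and the final comparison print are illustrative, not part of kernels.py):

import torch
import torch.nn.functional as F
from liger_kernel.ops.layer_norm import layer_norm_backward

M, N, eps = 1024, 2048, 1e-6
x = torch.randn(M, N, device="cuda", dtype=torch.bfloat16, requires_grad=True)
w = torch.randn(N, device="cuda", dtype=torch.float32, requires_grad=True)
dy = torch.randn(M, N, device="cuda", dtype=torch.bfloat16)

# Eager path: fp32 forward, then dx and dw via autograd, as in eager() above.
y = F.layer_norm(x.float(), w.shape, w, None, eps).to(x.dtype)
dx_ref, dw_ref = torch.autograd.grad(y, [x, w], grad_outputs=dy)

# Liger path: call the fused backward directly with fp32 mean/rstd,
# as in liger() above.
var, mean = torch.var_mean(x.float(), dim=-1, keepdim=True, correction=0)
rstd = torch.rsqrt(var + eps)
dx, dw = layer_norm_backward(dy, x, w, None, mean, rstd)[0:2]

# Rough agreement check between the two backends (tolerance is dtype-dependent).
print((dx.float() - dx_ref.float()).abs().max())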
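
The byte count from get_memory_bytes is what turns a measured latency into an achieved-bandwidth figure. A sketch of that conversion with CUDA-event timing follows; the helper name and iteration count are ours, not part of BenchmarkKernel:

import torch

def achieved_gbps(model_bytes: int, fn, iters: int = 10) -> float:
    # Time fn() with CUDA events and report GB/s against the byte model.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    fn()  # warmup
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    seconds = start.elapsed_time(end) / 1e3 / iters  # elapsed_time is in ms
    return model_bytes / seconds / 1e9

Here model_bytes would come from get_memory_bytes((x, w, dy), None) and fn from the callable returned by eager(), compiled(), or liger().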
