CrossEntropyBackward Class — pytorch Architecture

Architecture documentation for the CrossEntropyBackward class in kernels.py from the pytorch codebase.

Class python DynamoBenchmarks

Entity Profile

DynamoBenchmarks → CrossEntropyBackward Class — pytorch Architecture

Relationship Graph

Source Code

benchmarks/dynamo/genai_layers/kernels.py lines 121–210

class CrossEntropyBackward(BenchmarkKernel):
    """Benchmark the backward pass of per-sample cross entropy.

    Each backend method runs the forward pass once, up front, and returns a
    zero-argument callable that performs only the backward step
    (``torch.autograd.grad`` of the loss with respect to the logits ``x``).
    All backends take ``args = (x, target, dloss)`` and require ``kwargs`` to
    be ``None``.
    """

    def __init__(self, script_args):
        super().__init__(script_args)
        # Names of the backend methods defined on this class.
        self.available_backends = ["eager", "compiled", "quack", "liger"]

    def get_shapes(self) -> tuple[tuple[int, ...], ...]:
        """Return the ``(M, N)`` sizes to benchmark.

        ``benchmark`` builds logits of shape ``(M, N)`` and ``M`` targets drawn
        from ``[0, N)`` for each entry.
        """
        return (
            (32768, 256),
            (32768, 512),
            (32768, 1024),
            (32768, 2048),
            (32768, 4096),
            (32768, 8192),
            (32768, 16384),
            (32768, 32768),
            (32768, 65536),
            # Beyond N = 65536, M is halved each time N doubles, so M * N
            # stays at the same element count as (32768, 65536).
            (16384, 131072),
            (8192, 262144),
        )

    def get_memory_bytes(self, args, kwargs) -> int:
        """Return the number of bytes assumed to move in one backward pass.

        The model counts: read ``x`` (M*N elements), read ``target``
        (M elements), read ``dloss`` (M elements) and write the gradient
        (M*N elements, counted at the element size of ``x``).

        Args:
            args: The ``(x, target, dloss)`` tensors.
            kwargs: Not used by this method.

        Returns:
            Total byte count as described above.
        """
        x, target, dloss = args
        M, N = x.shape
        # Factor of 2: one read of x plus one write of its gradient.
        return (
            2 * M * N * x.dtype.itemsize
            + M * target.dtype.itemsize
            + M * dloss.dtype.itemsize
        )

    @staticmethod
    def _unpack_backward_args(args, kwargs):
        """Check that ``kwargs`` is ``None`` and return ``(x, target, dloss)``.

        Raises:
            AssertionError: If ``kwargs`` is not ``None``.
        """
        if kwargs is not None:
            raise AssertionError(f"Expected kwargs to be None, but got {kwargs}")
        x, target, dloss = args
        return x, target, dloss

    @staticmethod
    def _make_backward_fn(loss, x, dloss) -> Any:
        """Return a callable computing the gradient of ``loss`` w.r.t. ``x``.

        ``dloss`` is passed as ``grad_outputs``. ``retain_graph=True`` keeps
        the autograd graph alive so the returned callable can be invoked more
        than once on the same ``loss``.
        """
        return lambda: torch.autograd.grad(
            loss, x, grad_outputs=dloss, retain_graph=True
        )

    def eager(self, args, kwargs=None) -> Any:
        """Backward callable for ``F.cross_entropy`` with ``reduction="none"``."""
        x, target, dloss = self._unpack_backward_args(args, kwargs)
        loss = F.cross_entropy(x, target, reduction="none")
        return self._make_backward_fn(loss, x, dloss)

    def compiled(self, args, kwargs=None) -> Any:
        """Backward callable for a ``torch.compile``-d ``F.cross_entropy``.

        The forward is compiled with ``mode=self.compile_mode`` and
        ``fullgraph=True`` and executed once here; the returned callable only
        runs the backward step.
        """
        x, target, dloss = self._unpack_backward_args(args, kwargs)

        compiled_cross_entropy = torch.compile(
            lambda x, target: F.cross_entropy(x, target, reduction="none"),
            mode=self.compile_mode,
            fullgraph=True,
        )
        loss = compiled_cross_entropy(x, target)
        return self._make_backward_fn(loss, x, dloss)

    def quack(self, args, kwargs=None) -> Any:
        """Backward callable for ``quack.cross_entropy.cross_entropy``.

        The import is local, so the package is only required when this backend
        is actually invoked.
        """
        x, target, dloss = self._unpack_backward_args(args, kwargs)

        from quack.cross_entropy import cross_entropy

        # NOTE(review): no reduction is passed; presumably the result is
        # per-sample so that it matches dloss -- confirm against quack.
        loss = cross_entropy(x, target)
        return self._make_backward_fn(loss, x, dloss)

    def liger(self, args, kwargs=None) -> Any:
        """Backward callable for ``LigerCrossEntropyLoss(reduction="none")``.

        The import is local, so the package is only required when this backend
        is actually invoked.
        """
        x, target, dloss = self._unpack_backward_args(args, kwargs)

        from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss

        cross_entropy = LigerCrossEntropyLoss(reduction="none")
        loss = cross_entropy(x, target)
        return self._make_backward_fn(loss, x, dloss)

    def benchmark(self):
        """Run ``benchmark_single_shape`` on random CUDA inputs for every shape.

        For each ``(M, N)`` from ``get_shapes`` this creates logits ``x`` of
        shape ``(M, N)`` (scaled by 0.1, gradient-tracking), int64 targets in
        ``[0, N)`` and a float32 ``dloss`` of length ``M``.
        """
        # Loop-invariant, so resolved once instead of per shape.
        # Presumably maps to the torch bfloat16 dtype -- confirm against cutlass.
        torch_dtype = cutlass_torch.dtype(cutlass.BFloat16)
        for M, N in self.get_shapes():
            print(f"Tensor dimensions: [{M}, {N}]")
            x = 0.1 * torch.randn(
                M, N, device="cuda", dtype=torch_dtype, requires_grad=True
            )
            target = torch.randint(0, N, (M,), device="cuda", dtype=torch.int64)
            dloss = torch.randn(M, device="cuda", dtype=torch.float32)
            self.benchmark_single_shape(
                (x, target, dloss), setting=f"shape: [{M}, {N}]"
            )

Domain

DynamoBenchmarks

Source

View on GitHub

Analyze Your Own Codebase

Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.

Try Supermodel Free