
sweep() — pytorch Function Reference

Architecture documentation for the sweep() function in benchmarks/distributed/ddp/benchmark.py from the PyTorch codebase.


Entity Profile

Dependency Diagram

graph TD
  8a4f57e1_634a_07c6_3a9a_c44f7ba2dbe1["sweep()"]
  32bc5158_e977_7f97_b62d_d5c3b8b333f7["main()"]
  32bc5158_e977_7f97_b62d_d5c3b8b333f7 -->|calls| 8a4f57e1_634a_07c6_3a9a_c44f7ba2dbe1
  5a01be28_6429_ebf7_fc47_02232b445a1f["run_benchmark()"]
  8a4f57e1_634a_07c6_3a9a_c44f7ba2dbe1 -->|calls| 5a01be28_6429_ebf7_fc47_02232b445a1f
  style 8a4f57e1_634a_07c6_3a9a_c44f7ba2dbe1 fill:#6366f1,stroke:#818cf8,color:#fff


Source Code

benchmarks/distributed/ddp/benchmark.py lines 100–152

def sweep(benchmark):
    # Synthesize the set of benchmarks to run.
    # This list contains tuples of ("string prefix", [rank...]).
    benchmarks = []

    def append_benchmark(prefix, ranks, opts=None):
        prefix = f"{len(ranks):4} GPUs -- {prefix}"
        benchmarks.append((prefix, ranks, opts))

    def local_print(msg):
        if dist.get_rank() == 0:
            print(msg, end="", flush=True)  # noqa: E999

    def print_header():
        local_print("\n")
        local_print(" " * 22)
        for _ in [50, 75, 90, 95]:
            local_print(f"{'sec/iter':14s}{'ex/sec':10s}")
        local_print("\n")

    def print_measurements(prefix, nelem, measurements):
        measurements = sorted(measurements)
        local_print(f"{prefix:8s}:")
        for p in [50, 75, 90, 95]:
            v = np.percentile(measurements, p)
            local_print(f"  p{p:02d}:  {v:1.3f}s  {int(nelem / v):6d}/s")
        local_print("\n")

    # Every process runs once by themselves to warm up (CUDA init, etc).
    append_benchmark("  warmup", [dist.get_rank()], {"use_ddp_for_single_rank": False})

    # Single machine baselines
    append_benchmark("  no ddp", range(1), {"use_ddp_for_single_rank": False})
    append_benchmark("   1M/1G", range(1))
    append_benchmark("   1M/2G", range(2))
    append_benchmark("   1M/4G", range(4))

    # Multi-machine benchmarks
    for i in range(1, (dist.get_world_size() // 8) + 1):
        append_benchmark(f"   {i:d}M/8G", range(i * 8))

    # Run benchmarks in order of increasing number of GPUs
    print_header()
    results = []
    for prefix, ranks, opts in sorted(benchmarks, key=lambda tup: len(tup[1])):
        # Turn range into materialized list.
        ranks = list(ranks)
        measurements = run_benchmark(benchmark, ranks, opts)
        if "warmup" not in prefix:
            print_measurements(prefix, benchmark.batch_size, measurements)
            results.append({"ranks": ranks, "measurements": measurements})

    return results
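The percentile reporting inside print_measurements can be exercised in isolation. The sketch below is a hypothetical, stand-alone reduction (the function name summarize and the sample timings are illustrative, not part of benchmark.py); it reproduces the sec/iter and ex/sec arithmetic with np.percentile:

```python
import numpy as np

def summarize(measurements, nelem):
    """Reduce per-iteration timings (seconds) to a mapping
    percentile -> (sec/iter, ex/sec), mirroring the arithmetic in
    sweep()'s print_measurements helper."""
    measurements = sorted(measurements)
    out = {}
    for p in [50, 75, 90, 95]:
        # np.percentile uses linear interpolation between data points
        # by default, same as the call in print_measurements.
        v = np.percentile(measurements, p)
        out[p] = (round(float(v), 3), int(nelem / v))
    return out

# Hypothetical example: batch of 32 examples, five iteration timings.
stats = summarize([0.10, 0.12, 0.11, 0.20, 0.15], nelem=32)
```

Note that throughput is truncated to an integer with int(), matching the `:6d` format specifier in the source; formatting a raw float with `:d` would raise a ValueError.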


Frequently Asked Questions

What does sweep() do?
sweep() synthesizes a list of DDP benchmark configurations (a warmup run, single-machine baselines, and multi-machine runs), executes them in order of increasing GPU count, prints percentile timings from rank 0, and returns the collected measurements.
What does sweep() call?
sweep() calls one function: run_benchmark().
What calls sweep()?
sweep() is called by one function: main().
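The run order comes from sorting the synthesized (prefix, ranks, opts) tuples by the length of their rank list. A minimal stand-alone sketch of that ordering (hypothetical 16-rank world size; no torch.distributed required):

```python
# Synthesize entries the way sweep() does, for an assumed 16-rank world.
world_size = 16
benchmarks = []

def append_benchmark(prefix, ranks, opts=None):
    # Mirror sweep(): the prefix gains a right-aligned GPU count.
    prefix = f"{len(ranks):4} GPUs -- {prefix}"
    benchmarks.append((prefix, list(ranks), opts))

append_benchmark("   1M/1G", range(1))
append_benchmark("   1M/2G", range(2))
append_benchmark("   1M/4G", range(4))
for i in range(1, (world_size // 8) + 1):
    append_benchmark(f"   {i:d}M/8G", range(i * 8))

# sweep() runs benchmarks from fewest to most participating ranks.
order = [len(ranks) for _, ranks, _ in sorted(benchmarks, key=lambda t: len(t[1]))]
```

The sort key len(tup[1]) is what guarantees single-GPU baselines run before the multi-machine configurations.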
