run_performance_test_non_alternate() — pytorch Function Reference
Architecture documentation for the run_performance_test_non_alternate() function in common.py from the pytorch codebase.
Entity Profile
Dependency Diagram
graph TD c52cc8f1_b576_9d50_98d9_34f721215c0e["run_performance_test_non_alternate()"] 9bf8449e_2d7f_c370_514b_b3c7bf20f8e1["run_one_model()"] 9bf8449e_2d7f_c370_514b_b3c7bf20f8e1 -->|calls| c52cc8f1_b576_9d50_98d9_34f721215c0e da0c865a_ac14_7a10_8fc5_8a3b7509426d["maybe_cast()"] c52cc8f1_b576_9d50_98d9_34f721215c0e -->|calls| da0c865a_ac14_7a10_8fc5_8a3b7509426d 6a0a2015_4bf4_1e7a_daa6_dbf2c23883c7["deepcopy_and_maybe_parallelize()"] c52cc8f1_b576_9d50_98d9_34f721215c0e -->|calls| 6a0a2015_4bf4_1e7a_daa6_dbf2c23883c7 6c83aab9_f1ee_6751_91aa_682a715a5746["init_optimizer()"] c52cc8f1_b576_9d50_98d9_34f721215c0e -->|calls| 6c83aab9_f1ee_6751_91aa_682a715a5746 f00a6213_f2a3_0eef_9451_ee5a24d1ab9f["get_dynamo_stats()"] c52cc8f1_b576_9d50_98d9_34f721215c0e -->|calls| f00a6213_f2a3_0eef_9451_ee5a24d1ab9f 06ff896d_4db0_aa47_b3cd_be1da620f0ea["empty_gpu_cache()"] c52cc8f1_b576_9d50_98d9_34f721215c0e -->|calls| 06ff896d_4db0_aa47_b3cd_be1da620f0ea 7a733239_de08_527b_74a9_4a187c4bb634["get_peak_memory()"] c52cc8f1_b576_9d50_98d9_34f721215c0e -->|calls| 7a733239_de08_527b_74a9_4a187c4bb634 7d8e78e3_8a0c_828f_ff82_254897550856["write_csv_when_exception()"] c52cc8f1_b576_9d50_98d9_34f721215c0e -->|calls| 7d8e78e3_8a0c_828f_ff82_254897550856 b8cdd827_b831_469a_75e3_9eb4a7bb1874["output_signpost()"] c52cc8f1_b576_9d50_98d9_34f721215c0e -->|calls| b8cdd827_b831_469a_75e3_9eb4a7bb1874 ae211a9b_17fb_b7cc_e24a_8b7ff181116b["reset_counters()"] c52cc8f1_b576_9d50_98d9_34f721215c0e -->|calls| ae211a9b_17fb_b7cc_e24a_8b7ff181116b 2e9cdee4_896b_80c3_3a69_14e0ea0cfb08["maybe_snapshot_memory()"] c52cc8f1_b576_9d50_98d9_34f721215c0e -->|calls| 2e9cdee4_896b_80c3_3a69_14e0ea0cfb08 ad071b86_1922_09f7_c760_edbf58540ed3["get_excess_memory()"] c52cc8f1_b576_9d50_98d9_34f721215c0e -->|calls| ad071b86_1922_09f7_c760_edbf58540ed3 252d25b3_f1f5_8378_1d89_3019fda40b3e["latency_experiment_summary()"] c52cc8f1_b576_9d50_98d9_34f721215c0e -->|calls| 252d25b3_f1f5_8378_1d89_3019fda40b3e style 
c52cc8f1_b576_9d50_98d9_34f721215c0e fill:#6366f1,stroke:#818cf8,color:#fff
Relationship Graph
Source Code
benchmarks/dynamo/common.py lines 2583–2749
def run_performance_test_non_alternate(
    self, name, model, example_inputs, optimize_ctx, experiment, tag=None
):
    """Run a non-alternating performance test: warm up and time the eager
    model first, then the compiled model, and summarize latency/memory.

    Args:
        name: benchmark model name.
        model: the model under test.
        example_inputs: inputs passed to every timed iteration.
        optimize_ctx: compile wrapper for ``self.model_iter_fn`` (or, when
            exporting with AOTInductor, the already-optimized callable).
        experiment: a ``functools.partial``; its ``.func`` must be
            ``latency_experiment``.
        tag: optional tag forwarded to the experiment output.

    Returns:
        Space-joined string of per-experiment result summaries.

    Raises:
        AssertionError: if ``experiment`` does not wrap ``latency_experiment``.
    """
    if experiment.func is not latency_experiment:
        raise AssertionError(
            f"Must run with latency_experiment, got {experiment.func}"
        )

    def warmup(fn, model, example_inputs, mode, niters=10):
        # Run `fn` for `niters` iterations and return
        # (latency_seconds, peak_mem_gb, dynamo_counter_deltas).
        # On any failure, record the error and exit the process.
        gc.collect()
        peak_mem = 0
        start_stats = get_dynamo_stats()
        try:
            if current_device == "cuda":
                torch.cuda.reset_peak_memory_stats()
                empty_gpu_cache(current_device)
            elif current_device == "hpu":
                torch.hpu.reset_peak_memory_stats()
            t0 = time.perf_counter()
            for _ in range(niters):
                fn(model, example_inputs)
            t1 = time.perf_counter()
            latency = t1 - t0
            if current_device == "cuda":
                peak_mem = get_peak_memory()
            elif current_device == "hpu":
                peak_mem = torch.hpu.max_memory_allocated() / 10**9
            elif current_device == "cpu":
                # Approximate peak memory from this process's share of RAM.
                total = psutil.virtual_memory().total
                percentage = psutil.Process(os.getpid()).memory_percent()
                peak_mem = percentage * total / 10**9
        except Exception:
            log.exception("Backend %s failed in warmup()", mode)
            write_csv_when_exception(
                self.args, current_name, "warmup_failed", current_device
            )
            output_signpost({}, self.args, self.suite_name, error="warmup_failed")
            # sys.exit raises SystemExit; the `return` is never reached but
            # makes the abort explicit.
            return sys.exit(-1)
        dynamo_stats = get_dynamo_stats()
        dynamo_stats.subtract(start_stats)
        return latency, peak_mem, dynamo_stats

    # Cast the model to float16/float32 as necessary
    model, example_inputs = self.maybe_cast(model, example_inputs)

    # Use distributed wrapping as necessary
    model = self.deepcopy_and_maybe_parallelize(model)

    # BUG FIX: the original tested hasattr(model, name) — i.e. whether the
    # model has an attribute named after the benchmark (e.g. "resnet50") —
    # but the intent is clearly to ensure a `name` attribute exists.
    if not hasattr(model, "name"):
        model.name = name

    self.init_optimizer(name, current_device, model.parameters())

    # The self.autocast context is needed for the model we export with
    # aot_compile, similar to what we do in the check_accuracy function
    ctx = (
        self.autocast(**self.autocast_arg)
        if self.args.export_aot_inductor
        else contextlib.nullcontext()
    )

    with self.pick_grad(name, self.args.training), ctx:
        ok, total = Stats.reset_counters()
        experiment_kwargs = {}
        if tag is not None:
            experiment_kwargs["tag"] = tag
        results = []

        with maybe_snapshot_memory(
            self.args.snapshot_memory, f"eager_{self.args.only}"
        ):
            eager_latency, eager_peak_mem, _ = warmup(
                self.model_iter_fn, model, example_inputs, "eager"
            )
            if self.args.use_warm_peak_memory:
                # Re-measure after warmup so one-time allocation transients
                # are excluded from the reported peak.
                _, eager_peak_mem, _ = warmup(
                    self.model_iter_fn, model, example_inputs, "eager", niters=1
                )

        baseline_timings = experiment(
            self.model_iter_fn,
            model,
            example_inputs,
            mark="expected",
            **experiment_kwargs,
        )

        # reset dynamo
        torch._dynamo.reset()

        if self.args.export_aot_inductor:
            optimized_model_iter_fn = optimize_ctx
        else:
            optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)

        with maybe_snapshot_memory(
            self.args.snapshot_memory, f"compiled_{self.args.only}"
        ):
            dynamo_latency, dynamo_peak_mem, dynamo_stats = warmup(
                optimized_model_iter_fn, model, example_inputs, "dynamo"
            )
            if self.args.use_warm_peak_memory:
                _, dynamo_peak_mem, _ = warmup(
                    optimized_model_iter_fn,
                    model,
                    example_inputs,
                    "dynamo",
                    niters=1,
                )
            # If we use warm peak memory, the AOT model loading transient memory
            # won't be present on the warm measurement. We only have to account for
            # it when using cold memory.
            elif self.args.export_aot_inductor:
                dynamo_peak_mem -= AOTInductorModelCache.get_excess_memory(model)

        if self.args.profile_dynamo_cache_lookup:
            with torch.profiler.profile(
                activities=[torch.profiler.ProfilerActivity.CPU]
            ) as prof:
                warmup(optimized_model_iter_fn, model, example_inputs, "dynamo")

            events = list(
                filter(
                    lambda event: "TorchDynamo Cache Lookup" in event.key,
                    prof.key_averages(),
                )
            )
            dynamo_cache_lookup_latency = events[0].self_cpu_time_total

        compilation_time = dynamo_latency - eager_latency
        compression_ratio = (
            eager_peak_mem / dynamo_peak_mem if dynamo_peak_mem else 0.0
        )
        if self.args.print_memory:
            print(
                f"memory: eager: {eager_peak_mem:.2f} GB, "
                f"dynamo: {dynamo_peak_mem:.2f} GB, "
                f"ratio: {compression_ratio:.2f}"
            )

        if self.args.print_compilation_time:
            print(f"Compilation time: {compilation_time:.2f}")

        if experiment.func is speedup_experiment:
            experiment_kwargs["compilation_latency"] = compilation_time
            experiment_kwargs["compression_ratio"] = compression_ratio
            experiment_kwargs["eager_peak_mem"] = eager_peak_mem
            experiment_kwargs["dynamo_peak_mem"] = dynamo_peak_mem
            experiment_kwargs["dynamo_stats"] = dynamo_stats
            if self.args.profile_dynamo_cache_lookup:
                experiment_kwargs["cache_lookup_latency"] = (
                    dynamo_cache_lookup_latency
                )

        # BUG FIX: the backend measurement must time the optimized callable;
        # the original passed self.model_iter_fn (eager) here, so the
        # "backend" timings merely duplicated the baseline.
        backend_timings = experiment(
            optimized_model_iter_fn,
            model,
            example_inputs,
            mark="expected",
            **experiment_kwargs,
        )
        timings = np.stack((baseline_timings, backend_timings), axis=1)
        result_summary = latency_experiment_summary(
            self.suite_name, self.args, model, timings, **experiment_kwargs
        )
        results.append(result_summary)
    return " ".join(map(str, results))
Domain
Subdomains
Calls
Called By
Source
Frequently Asked Questions
What does run_performance_test_non_alternate() do?
run_performance_test_non_alternate() is a method in the pytorch dynamo benchmark suite (benchmarks/dynamo/common.py) that warms up and times a model in eager mode and then under the compiled backend, records latency, peak memory, and dynamo counter statistics, and returns a summary of the latency-experiment results.
What does run_performance_test_non_alternate() call?
run_performance_test_non_alternate() calls 12 function(s): deepcopy_and_maybe_parallelize, empty_gpu_cache, get_dynamo_stats, get_excess_memory, get_peak_memory, init_optimizer, latency_experiment_summary, maybe_cast, and 4 more.
What calls run_performance_test_non_alternate()?
run_performance_test_non_alternate() is called by 1 function(s): run_one_model.
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free