[dynamo][benchmarks] Stop benchmarking compile time of dead code (#145590)

Fixes https://github.com/pytorch/pytorch/issues/144775.

See details on the problem: https://github.com/pytorch/pytorch/issues/144775#issuecomment-2611699385
We fixed some silent incorrectness there, but the fix results in fewer nodes being DCE'd: the benchmark iteration loop contained dead code that could include side-effecting ops, which aren't safe to DCE. The regression is therefore expected.

This PR removes the compile-time benchmarking of that dead code, which should reduce noise in the benchmark and aligns it with the benchmarking used by the performance tests.
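For illustration, here is a minimal sketch of the change in what gets compiled and timed. The names below are simplified stand-ins, not the actual harness code in `benchmarks/dynamo/common.py`:

```python
import torch

# Simplified stand-ins for the benchmark harness pieces.
def model_iter_fn(mod, inputs, collect_outputs=True):
    out = mod(*inputs)
    return out if collect_outputs else None

def run_n_iterations(mod, inputs, iter_fn, n=2):
    # Iterations whose outputs are discarded are dead code from the
    # compiled graph's point of view when the whole loop is traced.
    for _ in range(n - 1):
        iter_fn(mod, inputs, collect_outputs=False)
    return iter_fn(mod, inputs, collect_outputs=True)

mod = torch.nn.Linear(8, 8)
inputs = (torch.randn(4, 8),)

# Before: the entire n-iteration loop was compiled, so measured compile
# time included tracing the discarded (dead) iterations.
compiled_loop = torch.compile(lambda m, i: run_n_iterations(m, i, model_iter_fn))
compiled_loop(mod, inputs)

# After: only a single iteration is compiled, and run_n_iterations drives
# it from eager Python, matching the performance-test setup.
compiled_iter = torch.compile(model_iter_fn)
run_n_iterations(mod, inputs, compiled_iter)
```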

New benchmark results:
```csv
dev,name,batch_size,accuracy,calls_captured,unique_graphs,graph_breaks,unique_graph_breaks,autograd_captures,autograd_compiles,cudagraph_skips,compilation_latency
cuda,BartForConditionalGeneration,1,pass,897,1,0,0,0,0,0,39.322364  # after https://github.com/pytorch/pytorch/pull/144319
cuda,BartForConditionalGeneration,1,pass,897,1,0,0,0,0,0,38.972257  # before https://github.com/pytorch/pytorch/pull/144319
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/145590
Approved by: https://github.com/jansel
ghstack dependencies: #145447
Author: Simon Fan
Date: 2025-01-27 14:32:07 -08:00
Committed by: PyTorch MergeBot
Commit: e02c038a23
Parent: 793dfc27e0
3 changed files with 47 additions and 13 deletions

benchmarks/dynamo/common.py:

```diff
@@ -2781,11 +2781,11 @@ class BenchmarkRunner:
             batch_size = self.decay_batch_exp(batch_size)
         return 1
 
-    def run_n_iterations(self, mod, inputs):
+    def run_n_iterations(self, mod, inputs, model_iter_fn):
         n = self.args.iterations
         for _ in range(n - 1):
-            self.model_iter_fn(mod, inputs, collect_outputs=False)
-        return self.model_iter_fn(mod, inputs, collect_outputs=True)
+            model_iter_fn(mod, inputs, collect_outputs=False)
+        return model_iter_fn(mod, inputs, collect_outputs=True)
 
     @torch._disable_dynamo(recursive=True)
     def optimizer_zero_grad(self, mod):
@@ -2953,7 +2953,9 @@ class BenchmarkRunner:
                 clone_inputs(example_inputs),
             )
            self.init_optimizer(name, current_device, model_fp64.parameters())
-            fp64_outputs = self.run_n_iterations(model_fp64, inputs_fp64)
+            fp64_outputs = self.run_n_iterations(
+                model_fp64, inputs_fp64, self.model_iter_fn
+            )
             fp64_outputs = tree_map(
                 lambda x: x.to(torch.float64)
                 if isinstance(x, torch.Tensor) and x.is_floating_point()
@@ -2986,7 +2988,7 @@ class BenchmarkRunner:
                 model_copy = self.deepcopy_and_maybe_parallelize(model)
                 self.init_optimizer(name, current_device, model_copy.parameters())
                 correct_result = self.run_n_iterations(
-                    model_copy, clone_inputs(example_inputs)
+                    model_copy, clone_inputs(example_inputs), self.model_iter_fn
                 )
             except Exception as e:
                 accuracy_status = (
@@ -3007,7 +3009,7 @@ class BenchmarkRunner:
                 model_copy = self.deepcopy_and_maybe_parallelize(model)
                 self.init_optimizer(name, current_device, model_copy.parameters())
                 correct_rerun_result = self.run_n_iterations(
-                    model_copy, clone_inputs(example_inputs)
+                    model_copy, clone_inputs(example_inputs), self.model_iter_fn
                 )
             except Exception as e:
                 accuracy_status = (
@@ -3066,13 +3068,15 @@ class BenchmarkRunner:
                 )
                 new_result = optimized_model_iter_fn(model_copy, example_inputs)
             else:
-                optimized_model_iter_fn = optimize_ctx(self.run_n_iterations)
+                optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
                 with maybe_enable_compiled_autograd(
                     self.args.compiled_autograd,
                     fullgraph=self.args.nopython,
                     dynamic=self.args.dynamic_shapes,
                 ):
-                    new_result = optimized_model_iter_fn(model_copy, example_inputs)
+                    new_result = self.run_n_iterations(
+                        model_copy, example_inputs, optimized_model_iter_fn
+                    )
         except Exception as e:
             log.exception("")
             print(
@@ -3167,7 +3171,9 @@ class BenchmarkRunner:
                 lambda x: x.to(base_device), example_inputs_copy
             )
             self.init_optimizer(name, base_device, model_copy.parameters())
-            correct_result = self.run_n_iterations(model_copy, example_inputs_copy)
+            correct_result = self.run_n_iterations(
+                model_copy, example_inputs_copy, self.model_iter_fn
+            )
 
         # Run with Dynamo
         # Sometime CI fails with random triton compilation failure which will be skipped for now
@@ -3176,8 +3182,10 @@ class BenchmarkRunner:
         torch._dynamo.reset()
         try:
             self.init_optimizer(name, current_device, model.parameters())
-            optimized_model_iter_fn = optimize_ctx(self.run_n_iterations)
-            new_result = optimized_model_iter_fn(model, example_inputs)
+            optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
+            new_result = self.run_n_iterations(
+                model_copy, example_inputs, optimized_model_iter_fn
+            )
         except Exception:
             log.exception("")
             print(
@@ -4460,6 +4468,16 @@ def run(runner, args, original_dir=None):
         # Stricter check to disable fallbacks
         args.suppress_errors = False
 
+    if not args.disable_cudagraphs:
+        runner.skip_models.update(
+            {
+                # xfail: https://github.com/pytorch/pytorch/issues/145773
+                "convit_base",
+                "llama",
+                "cm3leon_generate",
+            }
+        )
+
    if args.device_index is not None:
        if args.multiprocess:
            print("Cannot specify both --device_index and --multiprocess")
```

benchmarks/dynamo/timm_models.py:

```diff
@@ -10,9 +10,9 @@ import warnings
 
 try:
-    from .common import BenchmarkRunner, download_retry_decorator, main
+    from .common import BenchmarkRunner, download_retry_decorator, load_yaml_file, main
 except ImportError:
-    from common import BenchmarkRunner, download_retry_decorator, main
+    from common import BenchmarkRunner, download_retry_decorator, load_yaml_file, main
 
 
 import torch
 from torch._dynamo.testing import collect_results, reduce_to_scalar_loss
@@ -218,6 +218,18 @@ class TimmRunner(BenchmarkRunner):
         super().__init__()
         self.suite_name = "timm_models"
 
+    @property
+    def _config(self):
+        return load_yaml_file("timm_models.yaml")
+
+    @property
+    def _skip(self):
+        return self._config["skip"]
+
+    @property
+    def skip_models(self):
+        return self._skip["all"]
+
     @property
     def force_amp_for_fp16_bf16_models(self):
         return FORCE_AMP_FOR_FP16_BF16_MODELS
```
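The `TimmRunner` now sources its skip list from `timm_models.yaml` via `load_yaml_file`, which is imported from `common.py` and not shown in this diff. A minimal sketch of what such a helper might look like, assuming PyYAML and a YAML file that sits next to the module:

```python
import os
import yaml

def load_yaml_file(filename):
    # Hypothetical sketch only; the real load_yaml_file is defined in
    # benchmarks/dynamo/common.py. Resolve the YAML file relative to
    # this module and parse it into a plain dict.
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
    with open(path) as f:
        return yaml.safe_load(f)
```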

benchmarks/dynamo/timm_models.yaml (new file):

```diff
@@ -0,0 +1,4 @@
+# removing this file prevents the TimmRunner from dynamically skipping models
+skip:
+  all:
+    - ~
```
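Note that `~` is YAML's null literal, so this file parses to a skip list containing a single `None` placeholder rather than any real model names, which keeps the `skip: all:` key present and iterable without actually skipping anything:

```python
import yaml

text = "skip:\n  all:\n    - ~\n"
print(yaml.safe_load(text))  # {'skip': {'all': [None]}}
```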