mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[dynamo][benchmarks] Stop benchmarking compile time of dead code (#145590)
FIXES https://github.com/pytorch/pytorch/issues/144775 frfr See details on the problem: https://github.com/pytorch/pytorch/issues/144775#issuecomment-2611699385 We fixed some silent incorrectness, but it results in less nodes DCE'd. The benchmark iteration loop had some dead code which could contain side effect ops that aren't safe to DCE. The regression is expected. This PR removes the compile time benchmarking of the dead code, which should reduce the noise of the benchmark and aligns with the benchmarking used by performance tests New benchmark results: ```python dev,name,batch_size,accuracy,calls_captured,unique_graphs,graph_breaks,unique_graph_breaks,autograd_captures,autograd_compiles,cudagraph_skips,compilation_latency cuda,BartForConditionalGeneration,1,pass,897,1,0,0,0,0,0,39.322364 # after https://github.com/pytorch/pytorch/pull/144319 cuda,BartForConditionalGeneration,1,pass,897,1,0,0,0,0,0,38.972257 # before https://github.com/pytorch/pytorch/pull/144319 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/145590 Approved by: https://github.com/jansel ghstack dependencies: #145447
This commit is contained in:
committed by
PyTorch MergeBot
parent
793dfc27e0
commit
e02c038a23
@ -2781,11 +2781,11 @@ class BenchmarkRunner:
|
||||
batch_size = self.decay_batch_exp(batch_size)
|
||||
return 1
|
||||
|
||||
def run_n_iterations(self, mod, inputs):
|
||||
def run_n_iterations(self, mod, inputs, model_iter_fn):
|
||||
n = self.args.iterations
|
||||
for _ in range(n - 1):
|
||||
self.model_iter_fn(mod, inputs, collect_outputs=False)
|
||||
return self.model_iter_fn(mod, inputs, collect_outputs=True)
|
||||
model_iter_fn(mod, inputs, collect_outputs=False)
|
||||
return model_iter_fn(mod, inputs, collect_outputs=True)
|
||||
|
||||
@torch._disable_dynamo(recursive=True)
|
||||
def optimizer_zero_grad(self, mod):
|
||||
@ -2953,7 +2953,9 @@ class BenchmarkRunner:
|
||||
clone_inputs(example_inputs),
|
||||
)
|
||||
self.init_optimizer(name, current_device, model_fp64.parameters())
|
||||
fp64_outputs = self.run_n_iterations(model_fp64, inputs_fp64)
|
||||
fp64_outputs = self.run_n_iterations(
|
||||
model_fp64, inputs_fp64, self.model_iter_fn
|
||||
)
|
||||
fp64_outputs = tree_map(
|
||||
lambda x: x.to(torch.float64)
|
||||
if isinstance(x, torch.Tensor) and x.is_floating_point()
|
||||
@ -2986,7 +2988,7 @@ class BenchmarkRunner:
|
||||
model_copy = self.deepcopy_and_maybe_parallelize(model)
|
||||
self.init_optimizer(name, current_device, model_copy.parameters())
|
||||
correct_result = self.run_n_iterations(
|
||||
model_copy, clone_inputs(example_inputs)
|
||||
model_copy, clone_inputs(example_inputs), self.model_iter_fn
|
||||
)
|
||||
except Exception as e:
|
||||
accuracy_status = (
|
||||
@ -3007,7 +3009,7 @@ class BenchmarkRunner:
|
||||
model_copy = self.deepcopy_and_maybe_parallelize(model)
|
||||
self.init_optimizer(name, current_device, model_copy.parameters())
|
||||
correct_rerun_result = self.run_n_iterations(
|
||||
model_copy, clone_inputs(example_inputs)
|
||||
model_copy, clone_inputs(example_inputs), self.model_iter_fn
|
||||
)
|
||||
except Exception as e:
|
||||
accuracy_status = (
|
||||
@ -3066,13 +3068,15 @@ class BenchmarkRunner:
|
||||
)
|
||||
new_result = optimized_model_iter_fn(model_copy, example_inputs)
|
||||
else:
|
||||
optimized_model_iter_fn = optimize_ctx(self.run_n_iterations)
|
||||
optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
|
||||
with maybe_enable_compiled_autograd(
|
||||
self.args.compiled_autograd,
|
||||
fullgraph=self.args.nopython,
|
||||
dynamic=self.args.dynamic_shapes,
|
||||
):
|
||||
new_result = optimized_model_iter_fn(model_copy, example_inputs)
|
||||
new_result = self.run_n_iterations(
|
||||
model_copy, example_inputs, optimized_model_iter_fn
|
||||
)
|
||||
except Exception as e:
|
||||
log.exception("")
|
||||
print(
|
||||
@ -3167,7 +3171,9 @@ class BenchmarkRunner:
|
||||
lambda x: x.to(base_device), example_inputs_copy
|
||||
)
|
||||
self.init_optimizer(name, base_device, model_copy.parameters())
|
||||
correct_result = self.run_n_iterations(model_copy, example_inputs_copy)
|
||||
correct_result = self.run_n_iterations(
|
||||
model_copy, example_inputs_copy, self.model_iter_fn
|
||||
)
|
||||
|
||||
# Run with Dynamo
|
||||
# Sometime CI fails with random triton compilation failure which will be skipped for now
|
||||
@ -3176,8 +3182,10 @@ class BenchmarkRunner:
|
||||
torch._dynamo.reset()
|
||||
try:
|
||||
self.init_optimizer(name, current_device, model.parameters())
|
||||
optimized_model_iter_fn = optimize_ctx(self.run_n_iterations)
|
||||
new_result = optimized_model_iter_fn(model, example_inputs)
|
||||
optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
|
||||
new_result = self.run_n_iterations(
|
||||
model_copy, example_inputs, optimized_model_iter_fn
|
||||
)
|
||||
except Exception:
|
||||
log.exception("")
|
||||
print(
|
||||
@ -4460,6 +4468,16 @@ def run(runner, args, original_dir=None):
|
||||
# Stricter check to disable fallbacks
|
||||
args.suppress_errors = False
|
||||
|
||||
if not args.disable_cudagraphs:
|
||||
runner.skip_models.update(
|
||||
{
|
||||
# xfail: https://github.com/pytorch/pytorch/issues/145773
|
||||
"convit_base",
|
||||
"llama",
|
||||
"cm3leon_generate",
|
||||
}
|
||||
)
|
||||
|
||||
if args.device_index is not None:
|
||||
if args.multiprocess:
|
||||
print("Cannot specify both --device_index and --multiprocess")
|
||||
|
@ -10,9 +10,9 @@ import warnings
|
||||
|
||||
|
||||
try:
|
||||
from .common import BenchmarkRunner, download_retry_decorator, main
|
||||
from .common import BenchmarkRunner, download_retry_decorator, load_yaml_file, main
|
||||
except ImportError:
|
||||
from common import BenchmarkRunner, download_retry_decorator, main
|
||||
from common import BenchmarkRunner, download_retry_decorator, load_yaml_file, main
|
||||
|
||||
import torch
|
||||
from torch._dynamo.testing import collect_results, reduce_to_scalar_loss
|
||||
@ -218,6 +218,18 @@ class TimmRunner(BenchmarkRunner):
|
||||
super().__init__()
|
||||
self.suite_name = "timm_models"
|
||||
|
||||
@property
|
||||
def _config(self):
|
||||
return load_yaml_file("timm_models.yaml")
|
||||
|
||||
@property
|
||||
def _skip(self):
|
||||
return self._config["skip"]
|
||||
|
||||
@property
|
||||
def skip_models(self):
|
||||
return self._skip["all"]
|
||||
|
||||
@property
|
||||
def force_amp_for_fp16_bf16_models(self):
|
||||
return FORCE_AMP_FOR_FP16_BF16_MODELS
|
||||
|
4
benchmarks/dynamo/timm_models.yaml
Normal file
4
benchmarks/dynamo/timm_models.yaml
Normal file
@ -0,0 +1,4 @@
|
||||
# removing this file prevents the TimmRunner from dynamically skipping models
|
||||
skip:
|
||||
all:
|
||||
- ~
|
Reference in New Issue
Block a user