angelayi
2025-09-14 07:41:06 +00:00
committed by PyTorch MergeBot
parent 84186c39ed
commit 972140b7e9
22 changed files with 533 additions and 47 deletions

View File

@ -72,6 +72,12 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
"timm_vovnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)

View File

@ -55,6 +55,12 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
"timm_nfnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)

View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -167,3 +167,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_accuracy,0
openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
google/gemma-2-2b,pass,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -733,7 +733,7 @@ def timed(
time_total = 0
# Dont collect outputs to correctly measure timing
for _ in range(times):
for i in range(times):
# If batch_size is 1, it too often collides with other non batch size
# dimensions resulting in errors.
if batch_size and batch_size > 1:
@ -1106,7 +1106,13 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
elif args.torchscript_jit_trace:
frozen_model_iter_fn = torchscript_jit_trace(model, example_inputs)
else:
frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
if kwargs["hf_llm"]:
# If it's an llm, we want to optimize model.forward, and use
# the generate function
model.forward = torch._dynamo.run(model)
frozen_model_iter_fn = model_iter_fn
else:
frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
for rep in trange(args.repeat, desc="running benchmark"):
inputs = (
@ -1120,7 +1126,10 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
maybe_mark_step(args)
# interleave the runs to handle frequency scaling and load changes
with maybe_mark_profile(p=p, mark="expected"):
with (
maybe_mark_profile(p=p, mark="expected"),
torch.compiler.set_stance("force_eager"),
):
timings[rep, 0], expected_output = timed(
model,
model_iter_fn,
@ -2233,11 +2242,12 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
with torch.compiler.set_stance("force_eager"):
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
except Exception as e:
accuracy_status = (
"eager_1st_run_OOM"
@ -2254,11 +2264,12 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_rerun_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
with torch.compiler.set_stance("force_eager"):
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_rerun_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
except Exception as e:
accuracy_status = (
"eager_2nd_run_OOM"
@ -2542,7 +2553,11 @@ class BenchmarkRunner:
)
baseline_timings = experiment(
model, example_inputs, mark="expected", **experiment_kwargs
self.model_iter_fn,
model,
example_inputs,
mark="expected",
**experiment_kwargs,
)
if self.args.export_aot_inductor:
@ -2610,7 +2625,11 @@ class BenchmarkRunner:
)
backend_timings = experiment(
model, example_inputs, mark="expected", **experiment_kwargs
self.model_iter_fn,
model,
example_inputs,
mark="expected",
**experiment_kwargs,
)
timings = np.stack((baseline_timings, backend_timings), axis=1)
result_summary = latency_experiment_summary(
@ -2629,9 +2648,17 @@ class BenchmarkRunner:
tag=None,
batch_size=None,
):
niters = 5
if getattr(self, "hf_llm", False):
# If we're benchmarking an llm, we want to use the generate function
self.model_iter_fn = self.generate
niters = 1
if self.args.xla:
with self.pick_grad(name, self.args.training):
return experiment(*self.maybe_cast(model, example_inputs))
return experiment(
self.model_iter_fn, *self.maybe_cast(model, example_inputs)
)
def warmup(fn, model, example_inputs, mode, niters=5):
gc.collect()
@ -2696,17 +2723,22 @@ class BenchmarkRunner:
with maybe_snapshot_memory(
self.args.snapshot_memory, f"eager_{self.args.only}"
):
eager_latency, eager_peak_mem, _ = warmup(
self.model_iter_fn, copy.deepcopy(model), example_inputs, "eager"
)
if self.args.use_warm_peak_memory:
_, eager_peak_mem, _ = warmup(
with torch.compiler.set_stance("force_eager"):
eager_latency, eager_peak_mem, _ = warmup(
self.model_iter_fn,
copy.deepcopy(model),
example_inputs,
"eager",
niters=1,
niters=niters,
)
if self.args.use_warm_peak_memory:
_, eager_peak_mem, _ = warmup(
self.model_iter_fn,
copy.deepcopy(model),
example_inputs,
"eager",
niters=1,
)
if (
self.args.export_aot_inductor
@ -2715,7 +2747,13 @@ class BenchmarkRunner:
):
optimized_model_iter_fn = optimize_ctx
else:
optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
if getattr(self, "hf_llm", False):
# If it's an llm, we want to optimize model.forward, and use
# the generate function
model = optimize_ctx(model)
optimized_model_iter_fn = self.model_iter_fn
else:
optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
with maybe_snapshot_memory(
self.args.snapshot_memory, f"compiled_{self.args.only}"
@ -2793,7 +2831,13 @@ class BenchmarkRunner:
f"{ok:3}/{total:3} +{frames_third_pass} frames {compilation_time:3.0f}s"
)
results.append(experiment(model, example_inputs, **experiment_kwargs))
experiment_kwargs["hf_llm"] = getattr(self, "hf_llm", False)
results.append(
experiment(
self.model_iter_fn, model, example_inputs, **experiment_kwargs
)
)
return " ".join(map(str, results))
def minify_model(
@ -4084,7 +4128,7 @@ def run(runner, args, original_dir=None):
# Overwrite 'translation_validation' config, if specified.
torch.fx.experimental._config.translation_validation = False
experiment = functools.partial(experiment, args, runner.model_iter_fn)
experiment = functools.partial(experiment, args)
if args.only and should_diff_branch(args):
import git
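
A minimal standalone sketch (separate from the diff above) of the LLM speedup flow this file introduces: only model.forward is optimized, while model.generate drives the decoding loop that actually gets timed, and the baseline run is forced to eager. torch.compile is used here for illustration where the harness uses torch._dynamo.run; model and example_inputs are placeholders.

import torch

def llm_speedup_sketch(model, example_inputs):
    # Baseline pass: force eager so no compiled artifacts are used for the
    # "expected" timing, mirroring torch.compiler.set_stance("force_eager") above.
    with torch.compiler.set_stance("force_eager"):
        expected = model.generate(**example_inputs)

    # Compiled pass: only forward is optimized; generate() still runs the
    # generation loop, so the end-to-end decoding path is what gets measured.
    model.forward = torch.compile(model.forward)
    actual = model.generate(**example_inputs)
    return expected, actual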

View File

@ -7,6 +7,7 @@ import os
import re
import subprocess
import sys
import types
import warnings
@ -128,6 +129,12 @@ with open(MODELS_FILENAME) as fh:
assert len(BATCH_SIZE_KNOWN_MODELS)
try:
from .huggingface_llm_models import HF_LLM_MODELS
except ImportError:
from huggingface_llm_models import HF_LLM_MODELS
def get_module_cls_by_model_name(model_cls_name):
_module_by_model_name = {
"Speech2Text2Decoder": "transformers.models.speech_to_text_2.modeling_speech_to_text_2",
@ -418,11 +425,8 @@ class HuggingfaceRunner(BenchmarkRunner):
use_eval_mode = self.args.use_eval_mode
dtype = torch.float32
reset_rng_state()
model_cls, config = self._get_model_cls_and_config(model_name)
model = self._download_model(model_name)
model = model.to(device, dtype=dtype)
if self.args.enable_activation_checkpointing:
model.gradient_checkpointing_enable()
# Get batch size
if model_name in BATCH_SIZE_KNOWN_MODELS:
batch_size_default = BATCH_SIZE_KNOWN_MODELS[model_name]
elif batch_size is None:
@ -440,14 +444,46 @@ class HuggingfaceRunner(BenchmarkRunner):
f"Running smaller batch size={batch_size} for {model_name}, orig batch_size={batch_size_default}" # noqa: G004
)
example_inputs = generate_inputs_for_model(
model_cls, model, model_name, batch_size, device, include_loss_args=True
)
# Get model and example inputs
if model_name in HF_LLM_MODELS:
benchmark_cls = HF_LLM_MODELS[model_name]
model, example_inputs = benchmark_cls.get_model_and_inputs(
model_name, device
)
# So we can check for correct gradients without eliminating the dropout computation
for attr in dir(config):
if "drop" in attr and isinstance(getattr(config, attr), float):
setattr(config, attr, 1e-30)
# Set this flag so that when we test for speedup, we use
# model.generate instead of using model.forward
self.hf_llm = True
def generate(self, _, example_inputs, collect_outputs=True):
return model.generate(**example_inputs)
self.generate = types.MethodType(generate, self)
else:
self.hf_llm = False
model_cls, config = self._get_model_cls_and_config(model_name)
model = self._download_model(model_name)
model = model.to(device, dtype=dtype)
example_inputs = generate_inputs_for_model(
model_cls, model, model_name, batch_size, device, include_loss_args=True
)
# So we can check for correct gradients without eliminating the dropout computation
for attr in dir(config):
if "drop" in attr and isinstance(getattr(config, attr), float):
setattr(config, attr, 1e-30)
# Turning off kv cache for torchbench models. This is not the right
# thing to do, but the pt2 dashboard is outdated. Real transformers
# benchmarks will be added soon using a different infra.
if hasattr(model, "config") and hasattr(model.config, "use_cache"):
model.config.use_cache = False
if self.args.enable_activation_checkpointing:
model.gradient_checkpointing_enable()
if (
is_training
@ -460,12 +496,6 @@ class HuggingfaceRunner(BenchmarkRunner):
else:
model.eval()
# Turning off kv cache for torchbench models. This is not the right
# thing to do, but the pt2 dashboard is outdated. Real transformers
# benchmarks will be added soon using a different infra.
if hasattr(model, "config") and hasattr(model.config, "use_cache"):
model.config.use_cache = False
self.validate_model(model, example_inputs)
return device, model_name, model, example_inputs, batch_size
@ -530,7 +560,8 @@ class HuggingfaceRunner(BenchmarkRunner):
def forward_pass(self, mod, inputs, collect_outputs=True):
with self.autocast(**self.autocast_arg):
return mod(**inputs)
res = mod(**inputs)
return res.logits if self.hf_llm else res
def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
cloned_inputs = clone_inputs(inputs)
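
For reference, a small self-contained sketch of the per-instance method binding pattern used above; the classes and names are placeholders, not the runner itself.

import types

class DummyRunner:
    pass

def generate(self, _, example_inputs, collect_outputs=True):
    # The real runner calls model.generate(**example_inputs); here we just
    # echo the inputs so the sketch runs without downloading a model.
    return example_inputs

runner = DummyRunner()
# Bind generate() to this particular runner instance, as the diff does with
# types.MethodType, so only LLM runs switch to the generate-based iteration fn.
runner.generate = types.MethodType(generate, runner)
print(runner.generate(None, {"input_ids": [0, 1, 2]}))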

View File

@ -9,9 +9,16 @@ skip:
# Fails with even batch size = 1
- GPTJForCausalLM
- GPTJForQuestionAnswering
# Model too big
- google/gemma-3-4b-it
device:
cpu: []
cpu:
- meta-llama/Llama-3.2-1B
- google/gemma-2-2b
- google/gemma-3-4b-it
- openai/whisper-tiny
- Qwen/Qwen3-0.6B
control_flow:
- AllenaiLongformerBase
@ -67,6 +74,11 @@ batch_size:
XGLMForCausalLM: 4
XLNetLMHeadModel: 2
YituTechConvBert: 2
meta-llama/Llama-3.2-1B: 8
google/gemma-2-2b: 8
google/gemma-3-4b-it: 8
openai/whisper-tiny: 8
Qwen/Qwen3-0.6B: 8
tolerance:

View File

@ -0,0 +1,102 @@
import subprocess
import sys
import torch
def pip_install(package):
subprocess.check_call([sys.executable, "-m", "pip", "install", package])
try:
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
WhisperForConditionalGeneration,
WhisperProcessor,
)
except ModuleNotFoundError:
print("Installing HuggingFace Transformers...")
pip_install("git+https://github.com/huggingface/transformers.git#egg=transformers")
finally:
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
WhisperForConditionalGeneration,
WhisperProcessor,
)
class Benchmark:
@staticmethod
def get_model_and_inputs(model_name, device):
raise NotImplementedError("get_model_and_inputs() not implemented")
class WhisperBenchmark(Benchmark):
SAMPLE_RATE = 16000
DURATION = 30.0 # seconds
@staticmethod
def get_model_and_inputs(model_name, device):
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
model.config.forced_decoder_ids = None
model.generation_config.do_sample = False
model.generation_config.temperature = 0.0
num_samples = int(WhisperBenchmark.DURATION * WhisperBenchmark.SAMPLE_RATE)
audio = torch.randn(num_samples) * 0.1
inputs = dict(
processor(
audio, sampling_rate=WhisperBenchmark.SAMPLE_RATE, return_tensors="pt"
)
)
inputs["input_features"] = inputs["input_features"].to(device)
decoder_start_token = model.config.decoder_start_token_id
inputs["decoder_input_ids"] = torch.tensor(
[[decoder_start_token]], device=device
)
return model, inputs
class TextGenerationBenchmark(Benchmark):
INPUT_LENGTH = 1000
OUTPUT_LENGTH = 2000
@staticmethod
def get_model_and_inputs(model_name, device):
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)
model.eval()
model.generation_config.do_sample = False
model.generation_config.use_cache = True
model.generation_config.cache_implementation = "static"
model.generation_config.max_new_tokens = TextGenerationBenchmark.OUTPUT_LENGTH
model.generation_config.pad_token_id = tokenizer.eos_token_id
model.generation_config.temperature = 0.0
vocab_size = tokenizer.vocab_size
input_ids = torch.randint(
low=0,
high=vocab_size,
size=(1, TextGenerationBenchmark.INPUT_LENGTH),
device=device,
dtype=torch.long,
)
example_inputs = {"input_ids": input_ids}
return model, example_inputs
HF_LLM_MODELS: dict[str, Benchmark] = {
"meta-llama/Llama-3.2-1B": TextGenerationBenchmark,
"google/gemma-2-2b": TextGenerationBenchmark,
"google/gemma-3-4b-it": TextGenerationBenchmark,
"openai/whisper-tiny": WhisperBenchmark,
"Qwen/Qwen3-0.6B": TextGenerationBenchmark,
}
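
A hedged usage sketch of the registry above: it assumes network access to download the checkpoint (gated models such as Llama additionally need HuggingFace authentication), and device is chosen locally. Note the generation config requests up to OUTPUT_LENGTH new tokens, so a CPU run will be slow.

import torch
from huggingface_llm_models import HF_LLM_MODELS

device = "cuda" if torch.cuda.is_available() else "cpu"
name = "Qwen/Qwen3-0.6B"
model, example_inputs = HF_LLM_MODELS[name].get_model_and_inputs(name, device)

# Greedy decode using the static KV cache configured by TextGenerationBenchmark.
with torch.no_grad():
    output_ids = model.generate(**example_inputs)
print(output_ids.shape)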

View File

@ -46,3 +46,8 @@ TrOCRForCausalLM,64
XGLMForCausalLM,32
XLNetLMHeadModel,16
YituTechConvBert,32
meta-llama/Llama-3.2-1B,8
google/gemma-2-2b,8
google/gemma-3-4b-it,8
openai/whisper-tiny,8
Qwen/Qwen3-0.6B,8