mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 12:54:11 +08:00

[benchmark] Add HF LLM benchmarks (#156967)

Results in https://docs.google.com/spreadsheets/d/1xXOPg9JjEmPx0zc5QBNdyXQq8-K2_r4ybHaiS-q7pZ0/edit?gid=88695043#gid=88695043
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156967
Approved by: https://github.com/huydhn
Co-authored-by: Huy Do <huydhn@gmail.com>

Committed by: PyTorch MergeBot
parent 84186c39ed
commit 972140b7e9
@@ -72,6 +72,12 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
         "timm_vovnet",
         "torchrec_dlrm",
         "vgg16",
+        # LLM
+        "meta-llama/Llama-3.2-1B",
+        "google/gemma-2-2b",
+        "google/gemma-3-4b-it",
+        "openai/whisper-tiny",
+        "Qwen/Qwen3-0.6B",
     }
 )
@@ -55,6 +55,12 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
         "timm_nfnet",
         "torchrec_dlrm",
         "vgg16",
+        # LLM
+        "meta-llama/Llama-3.2-1B",
+        "google/gemma-2-2b",
+        "google/gemma-3-4b-it",
+        "openai/whisper-tiny",
+        "Qwen/Qwen3-0.6B",
     }
 )
@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
 YituTechConvBert,pass,0
+meta-llama/Llama-3.2-1B,pass,5
+google/gemma-2-2b,pass,5
+google/gemma-3-4b-it,pass_due_to_skip,0
+openai/whisper-tiny,pass,6
+Qwen/Qwen3-0.6B,pass,5
@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
 YituTechConvBert,pass,5
+meta-llama/Llama-3.2-1B,eager_fail_to_run,0
+google/gemma-2-2b,eager_fail_to_run,0
+google/gemma-3-4b-it,eager_fail_to_run,0
+openai/whisper-tiny,eager_fail_to_run,0
+Qwen/Qwen3-0.6B,eager_fail_to_run,0
@@ -167,3 +167,23 @@ XLNetLMHeadModel,pass,0
 YituTechConvBert,pass,0
+meta-llama/Llama-3.2-1B,fail_accuracy,0
+google/gemma-2-2b,fail_accuracy,0
+google/gemma-3-4b-it,fail_accuracy,0
+openai/whisper-tiny,fail_to_run,0
+Qwen/Qwen3-0.6B,fail_accuracy,0
@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
 YituTechConvBert,pass,0
+meta-llama/Llama-3.2-1B,pass_due_to_skip,0
+google/gemma-2-2b,pass_due_to_skip,0
+google/gemma-3-4b-it,pass_due_to_skip,0
+openai/whisper-tiny,pass_due_to_skip,0
+Qwen/Qwen3-0.6B,pass_due_to_skip,0
@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
 YituTechConvBert,pass,0
+meta-llama/Llama-3.2-1B,pass_due_to_skip,0
+google/gemma-2-2b,pass_due_to_skip,0
+google/gemma-3-4b-it,pass_due_to_skip,0
+openai/whisper-tiny,pass_due_to_skip,0
+Qwen/Qwen3-0.6B,pass_due_to_skip,0
@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
 YituTechConvBert,pass,0
+meta-llama/Llama-3.2-1B,pass_due_to_skip,0
+google/gemma-2-2b,pass_due_to_skip,0
+google/gemma-3-4b-it,pass_due_to_skip,0
+openai/whisper-tiny,pass_due_to_skip,0
+Qwen/Qwen3-0.6B,pass_due_to_skip,0
@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
 YituTechConvBert,pass,0
+meta-llama/Llama-3.2-1B,pass,5
+google/gemma-2-2b,pass,5
+google/gemma-3-4b-it,pass_due_to_skip,0
+openai/whisper-tiny,pass,6
+Qwen/Qwen3-0.6B,pass,5
@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
 YituTechConvBert,pass,5
+meta-llama/Llama-3.2-1B,eager_fail_to_run,0
+google/gemma-2-2b,eager_fail_to_run,0
+google/gemma-3-4b-it,eager_fail_to_run,0
+openai/whisper-tiny,eager_fail_to_run,0
+Qwen/Qwen3-0.6B,eager_fail_to_run,0
@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
 YituTechConvBert,pass,0
+meta-llama/Llama-3.2-1B,pass,0
+google/gemma-2-2b,pass,0
+google/gemma-3-4b-it,pass_due_to_skip,0
+openai/whisper-tiny,pass,0
+Qwen/Qwen3-0.6B,pass,0
@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
 YituTechConvBert,pass,0
+meta-llama/Llama-3.2-1B,pass,5
+google/gemma-2-2b,pass,5
+google/gemma-3-4b-it,pass,0
+openai/whisper-tiny,pass,6
+Qwen/Qwen3-0.6B,pass,5
@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
 YituTechConvBert,pass,5
+meta-llama/Llama-3.2-1B,eager_fail_to_run,0
+google/gemma-2-2b,eager_fail_to_run,0
+google/gemma-3-4b-it,eager_fail_to_run,0
+openai/whisper-tiny,eager_fail_to_run,0
+Qwen/Qwen3-0.6B,eager_fail_to_run,0
@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
 YituTechConvBert,pass,0
+meta-llama/Llama-3.2-1B,pass,5
+google/gemma-2-2b,pass,5
+google/gemma-3-4b-it,pass_due_to_skip,0
+openai/whisper-tiny,pass,6
+Qwen/Qwen3-0.6B,pass,5
@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
 YituTechConvBert,pass,0
+meta-llama/Llama-3.2-1B,pass,5
+google/gemma-2-2b,pass,5
+google/gemma-3-4b-it,pass_due_to_skip,0
+openai/whisper-tiny,pass,6
+Qwen/Qwen3-0.6B,pass,5
@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
 YituTechConvBert,pass,5
+meta-llama/Llama-3.2-1B,eager_fail_to_run,0
+google/gemma-2-2b,eager_fail_to_run,0
+google/gemma-3-4b-it,eager_fail_to_run,0
+openai/whisper-tiny,eager_fail_to_run,0
+Qwen/Qwen3-0.6B,eager_fail_to_run,0
@@ -733,7 +733,7 @@ def timed(
     time_total = 0
     # Dont collect outputs to correctly measure timing
-    for _ in range(times):
+    for i in range(times):
        # If batch_size is 1, it too often collides with other non batch size
        # dimensions resulting in errors.
        if batch_size and batch_size > 1:
@@ -1106,7 +1106,13 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
     elif args.torchscript_jit_trace:
         frozen_model_iter_fn = torchscript_jit_trace(model, example_inputs)
     else:
-        frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
+        if kwargs["hf_llm"]:
+            # If it's an llm, we want to optimize model.forward, and use
+            # the generate function
+            model.forward = torch._dynamo.run(model)
+            frozen_model_iter_fn = model_iter_fn
+        else:
+            frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)

     for rep in trange(args.repeat, desc="running benchmark"):
         inputs = (
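For the HF LLM benchmarks the compiled artifact is applied to model.forward rather than to the whole iteration function, and the timed call then goes through model.generate. A minimal sketch of the same idea, using torch.compile for illustration; the model and example_inputs objects are hypothetical placeholders, not part of this diff:

import torch

def compile_forward_keep_generate(model):
    # Compile only the per-token forward pass; generate() itself stays in
    # eager Python, so the sampling loop and KV-cache bookkeeping are untouched.
    model.forward = torch.compile(model.forward)
    return model

# usage sketch (assumes a HuggingFace-style model with a generate() method):
# model = compile_forward_keep_generate(model)
# output_ids = model.generate(**example_inputs)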
@@ -1120,7 +1126,10 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
         maybe_mark_step(args)

         # interleave the runs to handle frequency scaling and load changes
-        with maybe_mark_profile(p=p, mark="expected"):
+        with (
+            maybe_mark_profile(p=p, mark="expected"),
+            torch.compiler.set_stance("force_eager"),
+        ):
             timings[rep, 0], expected_output = timed(
                 model,
                 model_iter_fn,
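torch.compiler.set_stance("force_eager") is used here (and in the accuracy and warmup paths below) so that the "expected" eager measurement bypasses Dynamo even when the callable has already been wrapped by torch.compile. A small self-contained sketch of that behavior, assuming a PyTorch build recent enough to provide torch.compiler.set_stance:

import torch

@torch.compile
def f(x):
    return torch.sin(x) + x

x = torch.randn(8)
y_compiled = f(x)  # compiles and runs the optimized artifact

with torch.compiler.set_stance("force_eager"):
    y_eager = f(x)  # same call, but compilation is skipped and eager code runs

print(torch.allclose(y_compiled, y_eager))  # expected: True (up to numerical tolerance)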
@@ -2233,11 +2242,12 @@ class BenchmarkRunner:
             reset_rng_state()
             model_copy = None
             try:
-                model_copy = self.deepcopy_and_maybe_parallelize(model)
-                self.init_optimizer(name, current_device, model_copy.parameters())
-                correct_result = self.run_n_iterations(
-                    model_copy, clone_inputs(example_inputs), self.model_iter_fn
-                )
+                with torch.compiler.set_stance("force_eager"):
+                    model_copy = self.deepcopy_and_maybe_parallelize(model)
+                    self.init_optimizer(name, current_device, model_copy.parameters())
+                    correct_result = self.run_n_iterations(
+                        model_copy, clone_inputs(example_inputs), self.model_iter_fn
+                    )
             except Exception as e:
                 accuracy_status = (
                     "eager_1st_run_OOM"
@@ -2254,11 +2264,12 @@ class BenchmarkRunner:
             reset_rng_state()
             model_copy = None
             try:
-                model_copy = self.deepcopy_and_maybe_parallelize(model)
-                self.init_optimizer(name, current_device, model_copy.parameters())
-                correct_rerun_result = self.run_n_iterations(
-                    model_copy, clone_inputs(example_inputs), self.model_iter_fn
-                )
+                with torch.compiler.set_stance("force_eager"):
+                    model_copy = self.deepcopy_and_maybe_parallelize(model)
+                    self.init_optimizer(name, current_device, model_copy.parameters())
+                    correct_rerun_result = self.run_n_iterations(
+                        model_copy, clone_inputs(example_inputs), self.model_iter_fn
+                    )
             except Exception as e:
                 accuracy_status = (
                     "eager_2nd_run_OOM"
@@ -2542,7 +2553,11 @@ class BenchmarkRunner:
         )

         baseline_timings = experiment(
-            model, example_inputs, mark="expected", **experiment_kwargs
+            self.model_iter_fn,
+            model,
+            example_inputs,
+            mark="expected",
+            **experiment_kwargs,
         )

         if self.args.export_aot_inductor:
@@ -2610,7 +2625,11 @@ class BenchmarkRunner:
         )

         backend_timings = experiment(
-            model, example_inputs, mark="expected", **experiment_kwargs
+            self.model_iter_fn,
+            model,
+            example_inputs,
+            mark="expected",
+            **experiment_kwargs,
         )
         timings = np.stack((baseline_timings, backend_timings), axis=1)
         result_summary = latency_experiment_summary(
@@ -2629,9 +2648,17 @@ class BenchmarkRunner:
         tag=None,
         batch_size=None,
     ):
+        niters = 5
+        if getattr(self, "hf_llm", False):
+            # If we're benchmarking an llm, we want to use the generate function
+            self.model_iter_fn = self.generate
+            niters = 1
+
         if self.args.xla:
             with self.pick_grad(name, self.args.training):
-                return experiment(*self.maybe_cast(model, example_inputs))
+                return experiment(
+                    self.model_iter_fn, *self.maybe_cast(model, example_inputs)
+                )

         def warmup(fn, model, example_inputs, mode, niters=5):
             gc.collect()
@@ -2696,17 +2723,22 @@ class BenchmarkRunner:
         with maybe_snapshot_memory(
             self.args.snapshot_memory, f"eager_{self.args.only}"
         ):
-            eager_latency, eager_peak_mem, _ = warmup(
-                self.model_iter_fn, copy.deepcopy(model), example_inputs, "eager"
-            )
-            if self.args.use_warm_peak_memory:
-                _, eager_peak_mem, _ = warmup(
+            with torch.compiler.set_stance("force_eager"):
+                eager_latency, eager_peak_mem, _ = warmup(
                     self.model_iter_fn,
                     copy.deepcopy(model),
                     example_inputs,
                     "eager",
-                    niters=1,
+                    niters=niters,
                 )
+                if self.args.use_warm_peak_memory:
+                    _, eager_peak_mem, _ = warmup(
+                        self.model_iter_fn,
+                        copy.deepcopy(model),
+                        example_inputs,
+                        "eager",
+                        niters=1,
+                    )

         if (
             self.args.export_aot_inductor
@@ -2715,7 +2747,13 @@ class BenchmarkRunner:
         ):
             optimized_model_iter_fn = optimize_ctx
         else:
-            optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
+            if getattr(self, "hf_llm", False):
+                # If it's an llm, we want to optimize model.forward, and use
+                # the generate function
+                model = optimize_ctx(model)
+                optimized_model_iter_fn = self.model_iter_fn
+            else:
+                optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)

         with maybe_snapshot_memory(
             self.args.snapshot_memory, f"compiled_{self.args.only}"
@@ -2793,7 +2831,13 @@ class BenchmarkRunner:
                 f"{ok:3}/{total:3} +{frames_third_pass} frames {compilation_time:3.0f}s"
             )

-        results.append(experiment(model, example_inputs, **experiment_kwargs))
+        experiment_kwargs["hf_llm"] = getattr(self, "hf_llm", False)
+
+        results.append(
+            experiment(
+                self.model_iter_fn, model, example_inputs, **experiment_kwargs
+            )
+        )
         return " ".join(map(str, results))

     def minify_model(
@@ -4084,7 +4128,7 @@ def run(runner, args, original_dir=None):
     # Overwrite 'translation_validation' config, if specified.
        torch.fx.experimental._config.translation_validation = False

-    experiment = functools.partial(experiment, args, runner.model_iter_fn)
+    experiment = functools.partial(experiment, args)

    if args.only and should_diff_branch(args):
        import git
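The call convention for experiment changes here: model_iter_fn is no longer baked into the functools.partial, so each call site passes the (possibly swapped-out) iteration function explicitly. A small runnable sketch of the difference, with hypothetical stand-in objects:

import functools

def experiment(args, model_iter_fn, model, example_inputs, **kwargs):
    # stand-in for speedup_experiment(args, model_iter_fn, model, example_inputs, ...)
    return model_iter_fn(model, example_inputs)

args = {"repeat": 1}                      # hypothetical args namespace
model, example_inputs = object(), {"x": 1}

# before: the iteration fn was frozen in when the partial was built
#   run = functools.partial(experiment, args, default_iter_fn)
# after: only args is bound, so callers choose the iteration fn per model
run = functools.partial(experiment, args)
print(run(lambda m, inp: "generate path", model, example_inputs))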
@@ -7,6 +7,7 @@ import os
 import re
 import subprocess
 import sys
+import types
 import warnings
@@ -128,6 +129,12 @@ with open(MODELS_FILENAME) as fh:
 assert len(BATCH_SIZE_KNOWN_MODELS)


+try:
+    from .huggingface_llm_models import HF_LLM_MODELS
+except ImportError:
+    from huggingface_llm_models import HF_LLM_MODELS
+
+
 def get_module_cls_by_model_name(model_cls_name):
     _module_by_model_name = {
         "Speech2Text2Decoder": "transformers.models.speech_to_text_2.modeling_speech_to_text_2",
@@ -418,11 +425,8 @@ class HuggingfaceRunner(BenchmarkRunner):
         use_eval_mode = self.args.use_eval_mode
         dtype = torch.float32
         reset_rng_state()
-        model_cls, config = self._get_model_cls_and_config(model_name)
-        model = self._download_model(model_name)
-        model = model.to(device, dtype=dtype)
-        if self.args.enable_activation_checkpointing:
-            model.gradient_checkpointing_enable()

         # Get batch size
         if model_name in BATCH_SIZE_KNOWN_MODELS:
             batch_size_default = BATCH_SIZE_KNOWN_MODELS[model_name]
         elif batch_size is None:
@@ -440,14 +444,46 @@ class HuggingfaceRunner(BenchmarkRunner):
                 f"Running smaller batch size={batch_size} for {model_name}, orig batch_size={batch_size_default}"  # noqa: G004
             )

-        example_inputs = generate_inputs_for_model(
-            model_cls, model, model_name, batch_size, device, include_loss_args=True
-        )
+        # Get model and example inputs
+        if model_name in HF_LLM_MODELS:
+            benchmark_cls = HF_LLM_MODELS[model_name]
+            model, example_inputs = benchmark_cls.get_model_and_inputs(
+                model_name, device
+            )

-        # So we can check for correct gradients without eliminating the dropout computation
-        for attr in dir(config):
-            if "drop" in attr and isinstance(getattr(config, attr), float):
-                setattr(config, attr, 1e-30)
+            # Set this flag so that when we test for speedup, we use
+            # model.generate instead of using model.forward
+            self.hf_llm = True
+
+            def generate(self, _, example_inputs, collect_outputs=True):
+                return model.generate(**example_inputs)
+
+            self.generate = types.MethodType(generate, self)
+
+        else:
+            self.hf_llm = False
+
+            model_cls, config = self._get_model_cls_and_config(model_name)
+            model = self._download_model(model_name)
+            model = model.to(device, dtype=dtype)
+
+            example_inputs = generate_inputs_for_model(
+                model_cls, model, model_name, batch_size, device, include_loss_args=True
+            )
+
+            # So we can check for correct gradients without eliminating the dropout computation
+            for attr in dir(config):
+                if "drop" in attr and isinstance(getattr(config, attr), float):
+                    setattr(config, attr, 1e-30)
+
+        # Turning off kv cache for torchbench models. This is not the right
+        # thing to do, but the pt2 dashboard is outdated. Real transformers
+        # benchmarks will be added soon using a different infra.
+        if hasattr(model, "config") and hasattr(model.config, "use_cache"):
+            model.config.use_cache = False
+
+        if self.args.enable_activation_checkpointing:
+            model.gradient_checkpointing_enable()

         if (
             is_training
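The generate closure above is attached per instance with types.MethodType, so only LLM runs gain a runner.generate method and the closed-over model is the one just loaded. A self-contained sketch of that binding trick; DummyModel and Runner are placeholders, not benchmark classes:

import types

class DummyModel:
    def generate(self, **inputs):
        return [len(inputs["input_ids"])]

class Runner:
    pass

runner = Runner()
model = DummyModel()  # stands in for the HF model captured by the closure

def generate(self, _, example_inputs, collect_outputs=True):
    # mirrors the closure in load_model: 'model' comes from the enclosing scope
    return model.generate(**example_inputs)

# bind the function as a method of this particular runner instance only
runner.generate = types.MethodType(generate, runner)
print(runner.generate(None, {"input_ids": [1, 2, 3]}))  # [3]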
@@ -460,12 +496,6 @@ class HuggingfaceRunner(BenchmarkRunner):
         else:
             model.eval()

-        # Turning off kv cache for torchbench models. This is not the right
-        # thing to do, but the pt2 dashboard is outdated. Real transformers
-        # benchmarks will be added soon using a different infra.
-        if hasattr(model, "config") and hasattr(model.config, "use_cache"):
-            model.config.use_cache = False
-
         self.validate_model(model, example_inputs)
         return device, model_name, model, example_inputs, batch_size
@@ -530,7 +560,8 @@ class HuggingfaceRunner(BenchmarkRunner):

     def forward_pass(self, mod, inputs, collect_outputs=True):
         with self.autocast(**self.autocast_arg):
-            return mod(**inputs)
+            res = mod(**inputs)
+            return res.logits if self.hf_llm else res

     def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
         cloned_inputs = clone_inputs(inputs)
@@ -9,9 +9,16 @@ skip:
   # Fails with even batch size = 1
   - GPTJForCausalLM
   - GPTJForQuestionAnswering
+  # Model too big
+  - google/gemma-3-4b-it

 device:
-  cpu: []
+  cpu:
+    - meta-llama/Llama-3.2-1B
+    - google/gemma-2-2b
+    - google/gemma-3-4b-it
+    - openai/whisper-tiny
+    - Qwen/Qwen3-0.6B

 control_flow:
   - AllenaiLongformerBase
@@ -67,6 +74,11 @@ batch_size:
   XGLMForCausalLM: 4
   XLNetLMHeadModel: 2
   YituTechConvBert: 2
+  meta-llama/Llama-3.2-1B: 8
+  google/gemma-2-2b: 8
+  google/gemma-3-4b-it: 8
+  openai/whisper-tiny: 8
+  Qwen/Qwen3-0.6B: 8


 tolerance:
benchmarks/dynamo/huggingface_llm_models.py (new file, 102 lines)
@@ -0,0 +1,102 @@
import subprocess
import sys

import torch


def pip_install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


try:
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        WhisperForConditionalGeneration,
        WhisperProcessor,
    )
except ModuleNotFoundError:
    print("Installing HuggingFace Transformers...")
    pip_install("git+https://github.com/huggingface/transformers.git#egg=transformers")
finally:
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        WhisperForConditionalGeneration,
        WhisperProcessor,
    )


class Benchmark:
    @staticmethod
    def get_model_and_inputs(model_name, device):
        raise NotImplementedError("get_model_and_inputs() not implemented")


class WhisperBenchmark(Benchmark):
    SAMPLE_RATE = 16000
    DURATION = 30.0  # seconds

    @staticmethod
    def get_model_and_inputs(model_name, device):
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
        model.config.forced_decoder_ids = None

        model.generation_config.do_sample = False
        model.generation_config.temperature = 0.0

        num_samples = int(WhisperBenchmark.DURATION * WhisperBenchmark.SAMPLE_RATE)
        audio = torch.randn(num_samples) * 0.1
        inputs = dict(
            processor(
                audio, sampling_rate=WhisperBenchmark.SAMPLE_RATE, return_tensors="pt"
            )
        )
        inputs["input_features"] = inputs["input_features"].to(device)

        decoder_start_token = model.config.decoder_start_token_id
        inputs["decoder_input_ids"] = torch.tensor(
            [[decoder_start_token]], device=device
        )

        return model, inputs


class TextGenerationBenchmark(Benchmark):
    INPUT_LENGTH = 1000
    OUTPUT_LENGTH = 2000

    @staticmethod
    def get_model_and_inputs(model_name, device):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)
        model.eval()

        model.generation_config.do_sample = False
        model.generation_config.use_cache = True
        model.generation_config.cache_implementation = "static"
        model.generation_config.max_new_tokens = TextGenerationBenchmark.OUTPUT_LENGTH
        model.generation_config.pad_token_id = tokenizer.eos_token_id
        model.generation_config.temperature = 0.0

        vocab_size = tokenizer.vocab_size
        input_ids = torch.randint(
            low=0,
            high=vocab_size,
            size=(1, TextGenerationBenchmark.INPUT_LENGTH),
            device=device,
            dtype=torch.long,
        )
        example_inputs = {"input_ids": input_ids}

        return model, example_inputs


HF_LLM_MODELS: dict[str, Benchmark] = {
    "meta-llama/Llama-3.2-1B": TextGenerationBenchmark,
    "google/gemma-2-2b": TextGenerationBenchmark,
    "google/gemma-3-4b-it": TextGenerationBenchmark,
    "openai/whisper-tiny": WhisperBenchmark,
    "Qwen/Qwen3-0.6B": TextGenerationBenchmark,
}
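A sketch of how the new registry could be exercised on its own, assuming transformers is installed and the checkpoint can be downloaded; the plain import path is the one that applies when running from benchmarks/dynamo/:

import torch
from huggingface_llm_models import HF_LLM_MODELS

device = "cuda" if torch.cuda.is_available() else "cpu"
name = "Qwen/Qwen3-0.6B"

benchmark_cls = HF_LLM_MODELS[name]
model, example_inputs = benchmark_cls.get_model_and_inputs(name, device)

with torch.no_grad():
    # one benchmark "iteration": a full greedy decode of up to OUTPUT_LENGTH new tokens
    output_ids = model.generate(**example_inputs)
print(output_ids.shape)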
@@ -46,3 +46,8 @@ TrOCRForCausalLM,64
 XGLMForCausalLM,32
 XLNetLMHeadModel,16
 YituTechConvBert,32
+meta-llama/Llama-3.2-1B,8
+google/gemma-2-2b,8
+google/gemma-3-4b-it,8
+openai/whisper-tiny,8
+Qwen/Qwen3-0.6B,8