Compare commits

...

9 Commits

Author SHA1 Message Date
ed11cccc84 [no ci] Not fit for CI run
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-25 10:59:23 -07:00
2c6a990d19 [no ci] Spurious change
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-24 20:49:58 -07:00
d8b389269d [no ci] Include all new LLM models
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-24 20:49:10 -07:00
39dd999c34 Merge branch 'main' into prepare-perf-baseline-number-2.8
2025-09-24 20:46:15 -07:00
6e0ee18905 Skip LLM training and increase the number of H100 shard for HF
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-23 04:22:00 -07:00
66c5d4cbc4 Revert "[benchmarks] Add nativert benchmark (#159922)"
This reverts commit 017259f9c65b6fad55fb9597d7077e2543eaae46.
2025-09-19 22:01:45 -07:00
a22515f573 Add a debug
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-19 03:30:27 -07:00
7e16093692 [no ci ] Merge branch 'main' into prepare-perf-baseline-number-2.8
2025-09-19 03:27:42 -07:00
d4456bde3b [no ci] Run TorchInductor benchmark on PyTorch 2.8
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-18 17:18:13 -07:00
5 changed files with 31 additions and 53 deletions

View File

@@ -826,6 +826,13 @@ test_dynamo_benchmark() {
local shard_id="$1"
shift
+ ### Perf benchmark 2.8 baseline
+ pip_uninstall torch torchvision torchaudio torchrec fbgemm-gpu triton pytorch-triton
+ pip_install torch==2.8.0 torchvision torchaudio torchrec fbgemm-gpu
+ pip freeze
if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
elif [[ "${TEST_CONFIG}" == *perf* ]]; then

View File

@@ -100,11 +100,12 @@ jobs:
cuda-arch-list: '9.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cuda_h100", shard: 1, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 2, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 1, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 2, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 6, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 1, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 2, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 3, num_shards: 7, runner: "linux.aws.h100" },

View File

@@ -21,7 +21,6 @@ import shutil
import signal
import subprocess
import sys
- import tempfile
import time
import weakref
from contextlib import contextmanager
@@ -42,7 +41,6 @@ import torch._export
import torch.distributed
import torch.multiprocessing as mp
from torch._C import _has_cuda as HAS_CUDA, _has_xpu as HAS_XPU
- from torch._C._nativert import PyModelRunner
from torch._dynamo.profiler import fx_insert_profiling, Profiler
from torch._dynamo.testing import (
dummy_fx_compile,
@@ -1101,10 +1099,6 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
frozen_model_iter_fn = export_aot_inductor(
model, example_inputs, args.inductor_compile_mode
)
- elif args.export_nativert:
- frozen_model_iter_fn = export_nativert(model, example_inputs)
- elif args.torchscript_jit_trace:
- frozen_model_iter_fn = torchscript_jit_trace(model, example_inputs)
else:
if kwargs["hf_llm"]:
# If it's an llm, we want to optimize model.forward, and use
@@ -1540,16 +1534,6 @@ def export(model, example_inputs):
return opt_export
- def export_nativert(model, example_inputs):
- optimized = NativeRTCache.load(model, example_inputs)
- def opt_nativert(_, example_inputs, collect_outputs=False):
- example_args, example_kwargs = _normalize_bench_inputs(example_inputs)
- return optimized.run(*example_args, **example_kwargs)
- return opt_nativert
def export_aot_inductor(model, example_inputs, mode):
optimized = AOTInductorModelCache.load(model, example_inputs, mode)
@@ -2318,12 +2302,7 @@ class BenchmarkRunner:
try:
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
- if (
- self.args.export
- or self.args.export_aot_inductor
- or self.args.export_nativert
- or self.args.torchscript_jit_trace
- ):
+ if self.args.export or self.args.export_aot_inductor:
# apply export on module directly
# no need for n iterations
# the logic should be the same to self.model_iter_fn (forward_pass)
@@ -2740,11 +2719,7 @@ class BenchmarkRunner:
niters=1,
)
- if (
- self.args.export_aot_inductor
- or self.args.export_nativert
- or self.args.torchscript_jit_trace
- ):
+ if self.args.export_aot_inductor:
optimized_model_iter_fn = optimize_ctx
else:
if getattr(self, "hf_llm", False):
@@ -3355,7 +3330,7 @@ def parse_args(args=None):
parser.add_argument(
"--timeout",
type=int,
- default=2000,
+ default=3600,
help="timeout (second) for benchmarking.",
)
@@ -3509,16 +3484,6 @@ def parse_args(args=None):
action="store_true",
help="Measure pass rate with Export+AOTInductor",
)
- group.add_argument(
- "--export-nativert",
- action="store_true",
- help="Measure pass rate with Export+NativeRT",
- )
- group.add_argument(
- "--torchscript-jit-trace",
- action="store_true",
- help="Measure pass rate with TorchScript jit.trace",
- )
group.add_argument(
"--xla", action="store_true", help="Compare TorchXLA to eager PyTorch"
)
@@ -3952,14 +3917,6 @@ def run(runner, args, original_dir=None):
optimize_ctx = export
experiment = speedup_experiment
output_filename = "export.csv"
- elif args.export_nativert:
- optimize_ctx = export_nativert
- experiment = speedup_experiment
- output_filename = "export_nativert.csv"
- elif args.torchscript_jit_trace:
- optimize_ctx = torchscript_jit_trace
- experiment = speedup_experiment
- output_filename = "torchscript_jit_trace.csv"
elif args.xla:
(dev,) = args.devices
os.environ["PJRT_DEVICE"] = {"cuda": "GPU", "cpu": "CPU"}[dev]

View File

@@ -373,6 +373,10 @@ class HuggingfaceRunner(BenchmarkRunner):
def skip_models_due_to_control_flow(self):
return self._skip["control_flow"]
+ @property
+ def skip_not_suitable_for_training_models(self):
+ return self._skip["test"]["training"]
def use_larger_multiplier_for_smaller_tensor(self, name):
return name in [
"ElectraForQuestionAnswering",

View File

@@ -9,10 +9,9 @@ skip:
# Fails with even batch size = 1
- GPTJForCausalLM
- GPTJForQuestionAnswering
- # Model too big
+ # Model too big or the benchmark is taking too long (timeout)
- google/gemma-3-4b-it
- openai/gpt-oss-20b
- - mistralai/Mistral-7B-Instruct-v0.3
device:
cpu:
@@ -27,6 +26,16 @@ skip:
control_flow:
- AllenaiLongformerBase
+ test:
+ training:
+ - meta-llama/Llama-3.2-1B
+ - google/gemma-2-2b
+ - google/gemma-3-4b-it
+ - openai/whisper-tiny
+ - Qwen/Qwen3-0.6B
+ - mistralai/Mistral-7B-Instruct-v0.3
+ - openai/gpt-oss-20b
batch_size:
# TODO - Fails even after fake tensors
divisors:
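
Taken together, the skip_not_suitable_for_training_models property added to the HuggingFace runner and the test: training: block above let these models keep running inference benchmarks while being skipped under --training. A minimal standalone sketch of how the two pieces fit, assuming the runner loads this YAML into self._skip at startup; the class name, file path, and usage below are illustrative rather than the benchmark's actual code:

import yaml

class SkipConfig:
    """Illustrative stand-in for the HuggingFace runner's skip handling."""

    def __init__(self, config_path):
        # The real runner populates self._skip from its YAML config; here we
        # load the file directly for demonstration.
        with open(config_path) as f:
            self._skip = yaml.safe_load(f)["skip"]

    @property
    def skip_not_suitable_for_training_models(self):
        # Mirrors the property added in the runner diff above.
        return self._skip["test"]["training"]

# Hypothetical usage: decide whether to benchmark a model under --training.
cfg = SkipConfig("huggingface.yaml")  # path is an assumption
model = "mistralai/Mistral-7B-Instruct-v0.3"
if model in cfg.skip_not_suitable_for_training_models:
    print(f"{model}: skipping training benchmark, inference only")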