Compare commits

...

9 Commits

Author SHA1 Message Date
ed11cccc84 [no ci] Not fit for CI run
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-25 10:59:23 -07:00
2c6a990d19 [no ci] Spurious change
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-24 20:49:58 -07:00
d8b389269d [no ci] Include all new LLM models
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-24 20:49:10 -07:00
39dd999c34 Merge branch 'main' into prepare-perf-baseline-number-2.8
2025-09-24 20:46:15 -07:00
6e0ee18905 Skip LLM training and increase the number of H100 shard for HF
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-23 04:22:00 -07:00
66c5d4cbc4 Revert "[benchmarks] Add nativert benchmark (#159922)"
This reverts commit 017259f9c65b6fad55fb9597d7077e2543eaae46.
2025-09-19 22:01:45 -07:00
a22515f573 Add a debug
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-19 03:30:27 -07:00
7e16093692 [no ci ] Merge branch 'main' into prepare-perf-baseline-number-2.8
2025-09-19 03:27:42 -07:00
d4456bde3b [no ci] Run TorchInductor benchmark on PyTorch 2.8
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-18 17:18:13 -07:00
5 changed files with 31 additions and 53 deletions

View File

@@ -826,6 +826,13 @@ test_dynamo_benchmark() {
local shard_id="$1"
shift
+ ### Perf benchmark 2.8 baseline
+ pip_uninstall torch torchvision torchaudio torchrec fbgemm-gpu triton pytorch-triton
+ pip_install torch==2.8.0 torchvision torchaudio torchrec fbgemm-gpu
+ pip freeze
if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
elif [[ "${TEST_CONFIG}" == *perf* ]]; then

View File

@@ -100,11 +100,12 @@ jobs:
cuda-arch-list: '9.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cuda_h100", shard: 1, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 2, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 1, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 2, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 6, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 1, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 2, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 3, num_shards: 7, runner: "linux.aws.h100" },

View File

@@ -21,7 +21,6 @@ import shutil
import signal
import subprocess
import sys
- import tempfile
import time
import weakref
from contextlib import contextmanager
@@ -42,7 +41,6 @@ import torch._export
import torch.distributed
import torch.multiprocessing as mp
from torch._C import _has_cuda as HAS_CUDA, _has_xpu as HAS_XPU
- from torch._C._nativert import PyModelRunner
from torch._dynamo.profiler import fx_insert_profiling, Profiler
from torch._dynamo.testing import (
dummy_fx_compile,
@@ -1101,10 +1099,6 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
frozen_model_iter_fn = export_aot_inductor(
model, example_inputs, args.inductor_compile_mode
)
- elif args.export_nativert:
- frozen_model_iter_fn = export_nativert(model, example_inputs)
- elif args.torchscript_jit_trace:
- frozen_model_iter_fn = torchscript_jit_trace(model, example_inputs)
else:
if kwargs["hf_llm"]:
# If it's an llm, we want to optimize model.forward, and use
@@ -1540,16 +1534,6 @@ def export(model, example_inputs):
return opt_export
- def export_nativert(model, example_inputs):
- optimized = NativeRTCache.load(model, example_inputs)
- def opt_nativert(_, example_inputs, collect_outputs=False):
- example_args, example_kwargs = _normalize_bench_inputs(example_inputs)
- return optimized.run(*example_args, **example_kwargs)
- return opt_nativert
def export_aot_inductor(model, example_inputs, mode):
optimized = AOTInductorModelCache.load(model, example_inputs, mode)
@@ -2318,12 +2302,7 @@ class BenchmarkRunner:
try:
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
- if (
- self.args.export
- or self.args.export_aot_inductor
- or self.args.export_nativert
- or self.args.torchscript_jit_trace
- ):
+ if self.args.export or self.args.export_aot_inductor:
# apply export on module directly
# no need for n iterations
# the logic should be the same to self.model_iter_fn (forward_pass)
@@ -2740,11 +2719,7 @@ class BenchmarkRunner:
niters=1,
)
- if (
- self.args.export_aot_inductor
- or self.args.export_nativert
- or self.args.torchscript_jit_trace
- ):
+ if self.args.export_aot_inductor:
optimized_model_iter_fn = optimize_ctx
else:
if getattr(self, "hf_llm", False):
@@ -3355,7 +3330,7 @@ def parse_args(args=None):
parser.add_argument(
"--timeout",
type=int,
- default=2000,
+ default=3600,
help="timeout (second) for benchmarking.",
)
@@ -3509,16 +3484,6 @@ def parse_args(args=None):
action="store_true",
help="Measure pass rate with Export+AOTInductor",
)
- group.add_argument(
- "--export-nativert",
- action="store_true",
- help="Measure pass rate with Export+NativeRT",
- )
- group.add_argument(
- "--torchscript-jit-trace",
- action="store_true",
- help="Measure pass rate with TorchScript jit.trace",
- )
group.add_argument(
"--xla", action="store_true", help="Compare TorchXLA to eager PyTorch"
)
@@ -3952,14 +3917,6 @@ def run(runner, args, original_dir=None):
optimize_ctx = export
experiment = speedup_experiment
output_filename = "export.csv"
- elif args.export_nativert:
- optimize_ctx = export_nativert
- experiment = speedup_experiment
- output_filename = "export_nativert.csv"
- elif args.torchscript_jit_trace:
- optimize_ctx = torchscript_jit_trace
- experiment = speedup_experiment
- output_filename = "torchscript_jit_trace.csv"
elif args.xla:
(dev,) = args.devices
os.environ["PJRT_DEVICE"] = {"cuda": "GPU", "cpu": "CPU"}[dev]

View File

@@ -373,6 +373,10 @@ class HuggingfaceRunner(BenchmarkRunner):
def skip_models_due_to_control_flow(self):
return self._skip["control_flow"]
+ @property
+ def skip_not_suitable_for_training_models(self):
+ return self._skip["test"]["training"]
def use_larger_multiplier_for_smaller_tensor(self, name):
return name in [
"ElectraForQuestionAnswering",

View File

@@ -9,10 +9,9 @@ skip:
# Fails with even batch size = 1
- GPTJForCausalLM
- GPTJForQuestionAnswering
- # Model too big
+ # Model too big or the benchmark is taking too long (timeout)
- google/gemma-3-4b-it
- openai/gpt-oss-20b
- - mistralai/Mistral-7B-Instruct-v0.3
device:
cpu:
@@ -27,6 +26,16 @@ skip:
control_flow:
- AllenaiLongformerBase
+ test:
+ training:
+ - meta-llama/Llama-3.2-1B
+ - google/gemma-2-2b
+ - google/gemma-3-4b-it
+ - openai/whisper-tiny
+ - Qwen/Qwen3-0.6B
+ - mistralai/Mistral-7B-Instruct-v0.3
+ - openai/gpt-oss-20b
batch_size:
# TODO - Fails even after fake tensors
divisors:
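
Taken together, the skip_not_suitable_for_training_models property added to the HuggingFace runner and the test: training: block above let these models keep running inference benchmarks while being skipped under --training. A minimal standalone sketch of how the two pieces fit, assuming the runner loads this YAML into self._skip at startup; the class name, file path, and usage below are illustrative rather than the benchmark's actual code:

import yaml

class SkipConfig:
    """Illustrative stand-in for the HuggingFace runner's skip handling."""

    def __init__(self, config_path):
        # The real runner populates self._skip from its YAML config; here we
        # load the file directly for demonstration.
        with open(config_path) as f:
            self._skip = yaml.safe_load(f)["skip"]

    @property
    def skip_not_suitable_for_training_models(self):
        # Mirrors the property added in the runner diff above.
        return self._skip["test"]["training"]

# Hypothetical usage: decide whether to benchmark a model under --training.
cfg = SkipConfig("huggingface.yaml")  # path is an assumption
model = "mistralai/Mistral-7B-Instruct-v0.3"
if model in cfg.skip_not_suitable_for_training_models:
    print(f"{model}: skipping training benchmark, inference only")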