From f7c33abab3a6233a51d7d4fd116625be14df68ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Ouazan?= <83456801+remi-or@users.noreply.github.com>
Date: Thu, 16 Oct 2025 17:25:49 +0200
Subject: [PATCH] Small changes to benchmarking script (#41662)

---
 benchmark_v2/framework/benchmark_config.py | 33 ++++++------
 benchmark_v2/framework/benchmark_runner.py | 21 ++++----
 benchmark_v2/framework/data_classes.py     | 50 ++++++++++--------
 benchmark_v2/run_benchmarks.py             | 61 ++++++++++++----------
 4 files changed, 88 insertions(+), 77 deletions(-)

diff --git a/benchmark_v2/framework/benchmark_config.py b/benchmark_v2/framework/benchmark_config.py
index 3e3e37171ae..28f5824ea6a 100644
--- a/benchmark_v2/framework/benchmark_config.py
+++ b/benchmark_v2/framework/benchmark_config.py
@@ -104,7 +104,7 @@ class BenchmarkConfig:
             "attn_implementation": self.attn_implementation,
             "sdpa_backend": self.sdpa_backend,
             "compile_mode": self.compile_mode,
-            "compile_options": self.compile_options,
+            "compile_options": self.compile_options | {},  # to avoid inplace modification of the original dict
             "kernelize": self.kernelize,
         }
 
@@ -191,7 +191,7 @@
     )
 
 
-def generate_default_configs(
+def generate_main_configs(
     warmup_iterations: int = 5,
     measurement_iterations: int = 20,
     batch_size: int = 1,
@@ -199,20 +199,17 @@
     num_tokens_to_generate: int = 128,
     gpu_monitoring: bool = False,
 ) -> list[BenchmarkConfig]:
-    all_attn_implementations = [
-        ("flash_attention_2", None),
-        ("eager", None),
-        ("sdpa", "math"),
-        ("sdpa", "flash_attention"),  # note: this one can fail with compile because of attn mask
+    # Create kwargs common to all configs
+    kwargs = {
+        "warmup_iterations": warmup_iterations,
+        "measurement_iterations": measurement_iterations,
+        "batch_size": batch_size,
+        "sequence_length": sequence_length,
+        "num_tokens_to_generate": num_tokens_to_generate,
+        "gpu_monitoring": gpu_monitoring,
+    }
+    return [  # TODO: test max-autotune instead of default
+        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", **kwargs),
+        BenchmarkConfig(attn_implementation="eager", compile_mode="default", **kwargs),
+        BenchmarkConfig(attn_implementation="flash_attention_2", **kwargs),
     ]
-    return cross_generate_configs(
-        attn_impl_and_sdpa_backend=all_attn_implementations,
-        compiled_mode=[None, "max-autotune"],
-        kernelized=[False, KERNELIZATION_AVAILABLE],
-        warmup_iterations=warmup_iterations,
-        measurement_iterations=measurement_iterations,
-        batch_size=batch_size,
-        sequence_length=sequence_length,
-        num_tokens_to_generate=num_tokens_to_generate,
-        gpu_monitoring=gpu_monitoring,
-    )
diff --git a/benchmark_v2/framework/benchmark_runner.py b/benchmark_v2/framework/benchmark_runner.py
index a6ec16aecb6..913533b9d50 100644
--- a/benchmark_v2/framework/benchmark_runner.py
+++ b/benchmark_v2/framework/benchmark_runner.py
@@ -144,11 +144,11 @@ class BenchmarkStreamer(BaseStreamer):
 class BenchmarkRunner:
     """Main benchmark runner that coordinates benchmark execution."""
 
-    def __init__(
-        self, logger: logging.Logger, output_dir: str = "benchmark_results", commit_id: str | None = None
-    ) -> None:
+    def __init__(self, logger: logging.Logger, output_dir: str | None = None, commit_id: str | None = None) -> None:
         # Those stay constant for the whole run
         self.logger = logger
+        if output_dir is None:
+            output_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "benchmark_results")
         self.output_dir = output_dir
         self.commit_id = get_git_revision() if commit_id is None else commit_id
         os.makedirs(self.output_dir, exist_ok=True)
@@ -214,7 +214,7 @@ class BenchmarkRunner:
 
         # Quick validation: try one measurement first to see if this scenario works
         flush_memory()
-        e2e_latency, token_generation_times, decoded_output, gpu_metrics = self.time_generate(
+        e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
             max_new_tokens=1, gpu_monitor=None
         )
         if e2e_latency < 0:
@@ -231,11 +231,11 @@ class BenchmarkRunner:
         result = BenchmarkResult()
         self.logger.info(f"Benchmarking with {config.measurement_iterations} iterations.")
         for _ in trange(config.measurement_iterations):
-            e2e_latency, token_generation_times, decoded_output, gpu_metrics = self.time_generate(
+            e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
                 max_new_tokens=config.num_tokens_to_generate,
                 gpu_monitor=(GPUMonitor(logger=self.logger) if config.gpu_monitoring else None),
             )
-            result.accumulate(e2e_latency, token_generation_times, decoded_output, gpu_metrics)
+            result.accumulate(e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics)
         self.logger.info("Benchmarking done. Cleaning up.")
 
         # Profile if needed
@@ -277,10 +277,11 @@ class BenchmarkRunner:
             raise RuntimeError(f"Generated {new_tokens} tokens, expected {max_new_tokens}")
         # Decode outputs
         decoded_output = self.tokenizer.decode(outputs[0, input_tokens:], skip_special_tokens=True)
+        shape_and_decoded_output = f"{tuple(outputs.shape)} | {decoded_output}"
         # Compute intermediate quantities
         e2e_latency = wall_time_1 - wall_time_0
         token_generation_times = [t - wall_time_0 for t in streamer.timestamps[1:]]
-        return e2e_latency, token_generation_times, decoded_output, gpu_metrics
+        return e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics
 
     def profile_generate(self, num_tokens_to_profile: int, config_name: str) -> None:
         """Profile the latency of a call to model.generate() with the given (inputs) and (max_new_tokens)."""
@@ -351,10 +352,10 @@ class BenchmarkRunner:
         first_metadata = all_results[first_key]["metadata"].to_dict()
         hardware_info = first_metadata.pop("hardware_info")
         pretty_print_dict(first_metadata | hardware_info, tabs=1)
-        for value in all_results.values():
+        for result in all_results.values():
             print("=" * 100)
-            print(f"Config: {value['config'].infer_name(compact=False)}\n")
-            value["measurements"].pprint(tabs=1)
+            print(f"Config: {result['config'].infer_name(compact=False)}\n")
+            result["measurements"].pprint(batch_size=result["config"].batch_size, tabs=1)
             print("=" * 100)
         return all_results
 
diff --git a/benchmark_v2/framework/data_classes.py b/benchmark_v2/framework/data_classes.py
index f5e740d97b5..149b3d9c91a 100644
--- a/benchmark_v2/framework/data_classes.py
+++ b/benchmark_v2/framework/data_classes.py
@@ -82,19 +82,19 @@ class BenchmarkResult:
     def __init__(self) -> None:
         self.e2e_latency = []
         self.token_generation_times = []  # time at which each token was generated (relative to start of the generation)
-        self.decoded_outputs = []
+        self.shape_and_decoded_outputs = []
         self.gpu_metrics = []
 
     def accumulate(
         self,
         e2e_latency: float,
         token_generation_times: list[float],
-        decoded_output: str,
+        shape_and_decoded_output: str,
         gpu_metrics: GPURawMetrics | None,
     ) -> None:
         self.e2e_latency.append(e2e_latency)
         self.token_generation_times.append(token_generation_times)
-        self.decoded_outputs.append(decoded_output)
+        self.shape_and_decoded_outputs.append(shape_and_decoded_output)
         self.gpu_metrics.append(gpu_metrics)
 
     def to_dict(self) -> dict[str, None | int | float]:
@@ -106,7 +106,7 @@ class BenchmarkResult:
         return {
             "e2e_latency": self.e2e_latency,
             "token_generation_times": self.token_generation_times,
-            "decoded_outputs": self.decoded_outputs,
+            "shape_and_decoded_outputs": self.shape_and_decoded_outputs,
             "gpu_metrics": gpu_metrics,
         }
 
@@ -123,7 +123,7 @@ class BenchmarkResult:
             new_instance.accumulate(
                 e2e_latency=data["e2e_latency"][i],
                 token_generation_times=data["token_generation_times"][i],
-                decoded_output=data["decoded_output"][i],
+                shape_and_decoded_output=data["shape_and_decoded_outputs"][i],
                 gpu_metrics=gpu_metrics[i],
             )
         return new_instance
@@ -134,19 +134,27 @@ class BenchmarkResult:
     def get_measured_itl(self) -> list[float]:
         return [(dt[-1] - dt[0]) / (len(dt) - 1) for dt in self.token_generation_times if len(dt) > 1]
 
-    def pprint(self, tabs: int = 0) -> None:
-        collated_stats = equalize_lengths_and_collate(
-            [
-                add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
-                add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
-                add_unit_to_duration(compute_basic_statistics(self.get_measured_itl())),
-            ]
-        )
-        pretty_print_dict(
-            {
-                "E2E Latency": collated_stats[0],
-                "Time to First Token": collated_stats[1],
-                "Inter-Token Latency": collated_stats[2],
-            },
-            tabs=tabs,
-        )
+    def get_throughput(self, batch_size: int) -> float:
+        return [
+            batch_size * len(dt) / e2e_latency
+            for e2e_latency, dt in zip(self.e2e_latency, self.token_generation_times)
+        ]
+
+    def pprint(self, batch_size: int = 0, tabs: int = 0) -> None:
+        stats_to_collate = [
+            add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
+            add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
+            add_unit_to_duration(compute_basic_statistics(self.get_measured_itl())),
+        ]
+        if batch_size > 0:
+            throughput_stats = compute_basic_statistics(self.get_throughput(batch_size))
+            stats_to_collate.append({key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()})
+        collated_stats = equalize_lengths_and_collate(stats_to_collate)
+        dict_to_pprint = {
+            "E2E Latency": collated_stats[0],
+            "Time to First Token": collated_stats[1],
+            "Inter-Token Latency": collated_stats[2],
+        }
+        if batch_size > 0:
+            dict_to_pprint["Throughput"] = collated_stats[3]
+        pretty_print_dict(dict_to_pprint, tabs=tabs)
diff --git a/benchmark_v2/run_benchmarks.py b/benchmark_v2/run_benchmarks.py
index 85fb5a9493f..ea811b423ef 100755
--- a/benchmark_v2/run_benchmarks.py
+++ b/benchmark_v2/run_benchmarks.py
@@ -20,28 +20,28 @@ in the ./benches directory, organizing outputs into model-specific subfolders.
 import argparse
 import logging
-import random
 import sys
 import uuid
 
-from framework.benchmark_config import BenchmarkConfig, generate_all_configs
+from framework.benchmark_config import BenchmarkConfig, generate_all_configs, generate_main_configs
 from framework.benchmark_runner import BenchmarkRunner
 
 
 if __name__ == "__main__":
     # Parse arguments
     parser = argparse.ArgumentParser()
-    parser.add_argument("--output-dir", type=str, default="benchmark_results", help="Output dir for benchmark results")
+    parser.add_argument("--output-dir", type=str, default=None, help="Output dir for benchmark results")
     parser.add_argument("--log-level", type=str, choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO")
 
     parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")
 
-    parser.add_argument("--warmup", type=int, default=5, help="Number of warmup iterations")
-    parser.add_argument("--iterations", type=int, default=20, help="Number of measurement iterations")
+    parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations")
+    parser.add_argument("--iterations", type=int, default=10, help="Number of measurement iterations")
 
     parser.add_argument("--batch-size", "-b", type=int, nargs="+", help="Batch size")
     parser.add_argument("--sequence-length", "-s", type=int, nargs="+", help="Sequence length")
     parser.add_argument("--num-tokens-to-generate", "-n", type=int, nargs="+", help="Number of tokens to generate")
+    parser.add_argument("--cross-generate", action="store_true", help="Cross-generate all combinations of configs")
 
     parser.add_argument("--num-tokens-to-profile", "-p", type=int, default=0, help="Number of tokens to profile")
 
     parser.add_argument("--commit-id", type=str, help="Git commit ID (if not provided, will auto-detect from git)")
@@ -69,42 +69,47 @@ if __name__ == "__main__":
 
     # If there is only one (batch_size, sequence_length, num_tokens_to_generate), we benchmark across configs
     elif len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 1:
-        benchmark_configs = generate_all_configs(
+        if args.cross_generate:
+            benchmark_configs = generate_all_configs(
+                warmup_iterations=args.warmup,
+                measurement_iterations=args.iterations,
+                batch_size=args.batch_size[0],
+                sequence_length=args.sequence_length[0],
+                num_tokens_to_generate=args.num_tokens_to_generate[0],
+            )
+        else:
+            benchmark_configs = generate_main_configs(
+                warmup_iterations=args.warmup,
+                measurement_iterations=args.iterations,
+                batch_size=args.batch_size[0],
+                sequence_length=args.sequence_length[0],
+                num_tokens_to_generate=args.num_tokens_to_generate[0],
+            )
+
+    # Otherwise, we benchmark across all combinations of dimensions
+    else:
+        main_config = generate_main_configs(
             warmup_iterations=args.warmup,
             measurement_iterations=args.iterations,
             batch_size=args.batch_size[0],
             sequence_length=args.sequence_length[0],
             num_tokens_to_generate=args.num_tokens_to_generate[0],
-        )
-        random.shuffle(benchmark_configs)
-
-    # Otherwise, we benchmark across all combinations of dimensions
-    else:
-        kwargs = {
-            "warmup_iterations": args.warmup,
-            "measurement_iterations": args.iterations,
-            "gpu_monitoring": False,
-            "batch_size": args.batch_size[0],
-            "sequence_length": args.sequence_length[0],
-            "num_tokens_to_generate": args.num_tokens_to_generate[0],
-            "attn_implementation": "flex_attention",
-            "sdpa_backend": None,
-            "compile_mode": "default",
-            "kernelize": False,
-        }
+        )[0]
         benchmark_configs = []
         for num_tokens_to_generate in args.num_tokens_to_generate:
             for sequence_length in args.sequence_length:
                 for batch_size in args.batch_size:
-                    kwargs["batch_size"] = batch_size
-                    kwargs["sequence_length"] = sequence_length
-                    kwargs["num_tokens_to_generate"] = num_tokens_to_generate
-                    benchmark_configs.append(BenchmarkConfig(**kwargs))
+                    cfg_dict = main_config.to_dict()
+                    cfg_dict["batch_size"] = batch_size
+                    cfg_dict["sequence_length"] = sequence_length
+                    cfg_dict["num_tokens_to_generate"] = num_tokens_to_generate
+                    cfg_dict.pop("name")
+                    benchmark_configs.append(BenchmarkConfig.from_dict(cfg_dict))
 
     runner = BenchmarkRunner(logger, args.output_dir, args.commit_id)
     results = runner.run_benchmarks(
         args.model_id,
-        benchmark_configs[:3],
+        benchmark_configs,
        args.num_tokens_to_profile,
         pretty_print_summary=True,
     )
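
Usage sketch (appended after the diff, not part of the commit): a minimal example of how the pieces touched above are expected to fit together, inferred only from the signatures visible in this patch. It assumes the working directory is benchmark_v2/ so the framework package is importable; the model id and iteration counts are placeholders.

    import logging

    from framework.benchmark_config import generate_main_configs
    from framework.benchmark_runner import BenchmarkRunner

    logger = logging.getLogger("benchmark_v2")

    # generate_main_configs() (renamed from generate_default_configs) returns three configs
    # sharing the same settings: flex_attention + compile, eager + compile, flash_attention_2.
    configs = generate_main_configs(
        warmup_iterations=3,
        measurement_iterations=10,
        batch_size=1,
        sequence_length=128,
        num_tokens_to_generate=128,
    )

    # output_dir=None now resolves to benchmark_v2/benchmark_results inside the runner,
    # and commit_id=None falls back to get_git_revision().
    runner = BenchmarkRunner(logger, output_dir=None)
    results = runner.run_benchmarks(
        "meta-llama/Llama-3.1-8B",  # placeholder model id
        configs,
        0,  # num_tokens_to_profile
        pretty_print_summary=True,  # per-config pprint now adds a Throughput row when batch_size > 0
    )

The equivalent CLI run is `python run_benchmarks.py -b 1 -s 128 -n 128 --warmup 3 --iterations 10`; passing --cross-generate instead benchmarks the full cross-product of configs via generate_all_configs.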