add initial benchmark

2025-11-16 15:34:57 +08:00 · 2025-06-25 22:58:27 -07:00
5 changed files with 456 additions and 0 deletions
--- a/benchmarks/llm_benchmarks/benchmark.py
+++ b/benchmarks/llm_benchmarks/benchmark.py
@ -0,0 +1,131 @@
+import argparse
+import csv
+import dataclasses
+import gc
+import itertools
+import json
+import os
+
+from common import Experiment
+from generate import test_configs
+from model_zoo import models
+
+
+# Default for --output-file. output_csv() skips creating a parent directory
+# when the output path equals this value.
+DEFAULT_OUTPUT_FILE = "gpt_fast_benchmark.csv"
+
+
+def output_csv(output_file, headers, row):
+    """Append one result row to the CSV at ``output_file``.
+
+    The existing file (if any) is read back, the new row is appended, and the
+    whole file is rewritten so the header row can be repaired and short rows
+    can be padded to the header width.
+
+    Args:
+        output_file: Path of the CSV file. Parent directories are created when
+            the path has a directory component.
+        headers: Column names. They replace the stored header only when they
+            are longer than it; otherwise the stored header is used.
+        row: Values of the new row. Floats are written with six decimals.
+    """
+    if os.path.exists(output_file):
+        # newline="" lets the csv module handle line endings itself.
+        with open(output_file, newline="") as fd:
+            lines = list(csv.reader(fd)) or [[]]
+        if headers and len(headers) > len(lines[0]):
+            # if prior results failed the header might not be filled in yet
+            lines[0] = headers
+        else:
+            headers = lines[0]
+    else:
+        lines = [headers]
+
+    # A bare filename has an empty dirname and os.makedirs("") raises, so only
+    # create the parent directory when there is one.
+    parent_dir = os.path.dirname(output_file)
+    if parent_dir:
+        os.makedirs(parent_dir, exist_ok=True)
+
+    lines.append([(f"{x:.6f}" if isinstance(x, float) else x) for x in row])
+    with open(output_file, "w", newline="") as fd:
+        writer = csv.writer(fd, lineterminator="\n")
+        for line in lines:
+            # Pad short rows with "0" so every row is as wide as the header.
+            writer.writerow(list(line) + ["0"] * (len(headers) - len(line)))
+
+
+def output_json(output_file, headers, row):
+    """
+    Append one benchmark result as a single JSON line.
+
+    The record goes to ``output_file`` with its extension replaced by ``.json``
+    so that it can be uploaded to the benchmark database and displayed on the
+    OSS dashboard. The JSON format is defined at
+    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+    """
+    # Pair every value of the row with the header at the same position.
+    by_header = {headers[idx]: value for idx, value in enumerate(row)}
+
+    benchmark_info = {
+        "name": "PyTorch LLM benchmark",
+        "mode": "inference",
+        "dtype": by_header["dtype"],
+        "extra_info": {
+            "device": by_header["device"],
+            "arch": by_header["arch"],
+        },
+    }
+    model_info = {
+        "name": by_header["name"],
+        "origins": ["pytorch"],
+    }
+    metric_info = {
+        "compilation_time": by_header["compilation_time"],
+        # Only tokens_per_second is wrapped in a list.
+        "tokens_per_second": [by_header["tokens_per_second"]],
+        "memory_bandwidth": by_header["memory_bandwidth"],
+    }
+    record = {
+        "benchmark": benchmark_info,
+        "model": model_info,
+        "metric": metric_info,
+    }
+
+    json_path = f"{os.path.splitext(output_file)[0]}.json"
+    # Append mode: each call adds one more line to the same file.
+    with open(json_path, "a") as f:
+        f.write(json.dumps(record) + "\n")
+
+
+def main(args):
+    """Run the selected benchmarks and save the results as CSV and JSON.
+
+    Args:
+        args: Parsed command-line arguments providing ``only``,
+            ``test_config``, ``device`` and ``output_file``.
+
+    Raises:
+        ValueError: If ``args.only`` names a model that is not in ``models``.
+    """
+    if args.only:
+        if args.only not in models:
+            raise ValueError(f"Unknown model: {args.only}")
+        experiments = [(args.only, models[args.only])]
+    else:
+        experiments = list(models.items())
+
+    if args.test_config:
+        configs_to_test = [args.test_config]
+    else:
+        # test_configs is a set; sort it so runs happen in a reproducible order.
+        configs_to_test = sorted(test_configs)
+
+    devices = [args.device] if args.device else ["cuda", "cpu"]
+
+    results = []
+    for model_name, benchmark_class in experiments:
+        for test_config, device in itertools.product(configs_to_test, devices):
+            print("Processing:", model_name, test_config, device)
+            benchmark = benchmark_class(model_name, device, test_config)
+            row = dataclasses.astuple(benchmark.run_inference())
+            print("Results:", row)
+            results.append(row)
+
+            # Clean up the memory to avoid OOM
+            del benchmark
+            gc.collect()
+
+    # Column names follow the field order of Experiment, matching astuple().
+    headers = [field.name for field in dataclasses.fields(Experiment)]
+
+    for row in results:
+        output_csv(args.output_file, headers, row)
+        output_json(args.output_file, headers, row)
+
+    # Same path derivation as output_json().
+    json_file = f"{os.path.splitext(args.output_file)[0]}.json"
+    print(f"Results saved to {args.output_file}")
+    print(f"Results saved to {json_file}")
+
+
+if __name__ == "__main__":
+    # Command-line entry point. Every option is optional; main() falls back to
+    # all models / configs / devices for any filter that is left out.
+    cli = argparse.ArgumentParser(description="Run experiments.")
+    cli.add_argument(
+        "--output-file",
+        help="Set the output CSV file to save the benchmark results",
+        default=DEFAULT_OUTPUT_FILE,
+    )
+    cli.add_argument("--only", help="Specify a model to run exclusively")
+    cli.add_argument(
+        "--device",
+        choices=["cuda", "cpu"],
+        help="Specify the device to use",
+    )
+    cli.add_argument(
+        "--test-config",
+        choices=test_configs,
+        help="Specify the test config to use",
+    )
+    args = cli.parse_args()
+
+    main(args)
--- a/benchmarks/llm_benchmarks/common.py
+++ b/benchmarks/llm_benchmarks/common.py
@ -0,0 +1,58 @@
+import itertools
+import platform
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+
+
+# Fix PyTorch's global RNG seed as soon as this module is imported.
+torch.manual_seed(1234)
+
+
+def get_arch_name() -> str:
+    """Return a label for the hardware of the current machine.
+
+    The CUDA device name is returned whenever CUDA is available, otherwise
+    the CPU architecture. The function takes no device argument, so the
+    result does not depend on which device a benchmark actually runs on.
+    """
+    if not torch.cuda.is_available():
+        # This returns x86_64 or arm64 (for aarch64)
+        return platform.machine()
+    return torch.cuda.get_device_name()
+
+
+@dataclass
+class Experiment:
+    """One benchmark result row.
+
+    The field order is also the column order of the saved results:
+    benchmark.py builds rows with ``dataclasses.astuple`` and headers with
+    ``dataclasses.fields``.
+    """
+
+    # Model name, as passed to the benchmark class.
+    name: str
+    # str() of the torch dtype the benchmark reports.
+    dtype: str
+    # Device string the benchmark ran on, e.g. "cuda" or "cpu".
+    device: str
+    # Hardware label from get_arch_name().
+    arch: str
+    # Test configuration name, e.g. "eager" or "default".
+    test_config: str
+    # Measured from the first iteration for the "default" config, 0 otherwise.
+    # Annotated float because it is derived from timer differences.
+    compilation_time: float
+    # The remaining metrics are averages, hence float; each benchmark type
+    # fills only the ones it measures and leaves the others as None.
+    tokens_per_second: Optional[float] = None
+    memory_bandwidth: Optional[float] = None
+    real_time_factor: Optional[float] = None
+
+
+# Number of generate() calls each benchmark times; the first call is included
+# in the reported averages.
+N_ITER = 10
+# Sweep values for text generation. generate.py currently reads only the first
+# entry of batch_size_combinations and of max_new_token_combinations.
+batch_size_combinations = [1, 4]
+max_new_token_combinations = [16, 256]
+# Not imported by generate.py; its only mention there is a commented-out line.
+cache_implementation_combinations = ["hybrid", "static"]
+
+
+def device_sync(device):
+    """Wait for pending work on ``device`` so that timings are meaningful.
+
+    Args:
+        device: Device to synchronize, as a string such as "cuda", "cuda:0"
+            or "cpu", or as a ``torch.device``. Any other device only prints
+            a notice and returns without synchronizing.
+    """
+    # Match on the string form: the substring test below only works on
+    # strings, and this lets torch.device objects through as well.
+    device_name = str(device)
+    if "cuda" in device_name:
+        torch.cuda.synchronize(device)
+    elif "cpu" in device_name:
+        # Nothing to synchronize for CPU.
+        pass
+    else:
+        print(f"device={device} is not yet supported")
+
+
+def _get_model_size(model):
+    """Return the size in bytes of the model's non-embedding submodules.
+
+    Only activated parameters and buffers are meant to be counted, so direct
+    children that are ``torch.nn.Embedding`` modules are skipped. Every other
+    direct child contributes all of its parameters and buffers. Tensors
+    registered directly on ``model`` itself are not visited.
+    """
+    total_bytes = 0
+    for child in model.children():
+        if isinstance(child, torch.nn.Embedding):
+            continue
+        for tensor in itertools.chain(child.parameters(), child.buffers()):
+            # Bytes of one tensor: element count times bytes per element.
+            total_bytes += tensor.numel() * tensor.dtype.itemsize
+    return total_bytes
--- a/benchmarks/llm_benchmarks/generate.py
+++ b/benchmarks/llm_benchmarks/generate.py
@ -0,0 +1,188 @@
+import copy
+import time
+from contextlib import nullcontext
+
+from common import (
+    _get_model_size,
+    batch_size_combinations,
+    device_sync,
+    Experiment,
+    get_arch_name,
+    max_new_token_combinations,
+    N_ITER,
+)
+from datasets import load_dataset
+from prompts import FRANCE_ARTICLE
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    WhisperForConditionalGeneration,
+    WhisperProcessor,
+)
+
+import torch
+
+
+class Benchmark:
+    """Base class for benchmarking one model on one device and test config.
+
+    Subclasses implement ``get_model_and_inputs`` (which must set
+    ``self.model``, ``self.inputs`` and ``self.dtype``) and ``run_inference``.
+    """
+
+    def __init__(self, model_name, device, test_config):
+        self.device = device
+        self.model_name = model_name
+        self.test_config = test_config
+
+        # Filled in by the subclass hook below.
+        self.model = self.inputs = self.dtype = None
+        self.get_model_and_inputs()
+        for required in ("model", "inputs", "dtype"):
+            assert getattr(self, required) is not None
+
+        # Context manager that subclasses wrap around generate().
+        # NOTE(review): the set_stance object is created here and entered
+        # later, once per iteration - confirm it is meant to be built outside
+        # a `with` block and re-entered like this.
+        if test_config == "eager":
+            self.stance = torch.compiler.set_stance("force_eager")
+        else:
+            self.stance = nullcontext()
+
+    def get_model_and_inputs(self):
+        """Load the model and inputs; must be overridden by subclasses."""
+        raise NotImplementedError("get_model_and_inputs() not implemented")
+
+    def run_inference(self):
+        """Run the benchmark; must be overridden by subclasses."""
+        raise NotImplementedError("run_inference() not implemented")
+
+
+class ASRBenchmark(Benchmark):
+    """Speech-recognition benchmark that reports the real-time factor.
+
+    Besides the attributes required by ``Benchmark``, ``run_inference`` reads
+    ``self.processor``, which subclasses set in ``get_model_and_inputs``.
+    """
+
+    def run_inference(self):
+        """Run ``generate`` on the first ``N_ITER`` samples of ``self.inputs``.
+
+        Returns:
+            An ``Experiment`` whose ``real_time_factor`` is the mean of
+            (generation time / audio duration) over all iterations, the first
+            one included. ``compilation_time`` is the wall-clock duration in
+            seconds of the first iteration for the "default" config, else 0.
+        """
+        torch.compiler.reset()
+        first_iteration = 0
+        total_real_time_factor = 0
+        for i in range(N_ITER):
+            sample = self.inputs[i]["audio"]
+            features = self.processor(
+                sample["array"],
+                sampling_rate=sample["sampling_rate"],
+                return_tensors="pt",
+            )
+            features["input_features"] = features["input_features"].to(self.device)
+
+            # Synchronize on both sides of the timed region so that queued
+            # device work is not left out of the measurement.
+            device_sync(self.device)
+            # perf_counter() is monotonic; time.time() can jump when the
+            # system clock is adjusted.
+            start = time.perf_counter()
+            with self.stance:
+                _ = self.model.generate(**features)
+            device_sync(self.device)
+            elapsed = time.perf_counter() - start
+
+            # Audio duration in seconds: sample count / samples per second.
+            input_length = len(sample["array"]) / sample["sampling_rate"]
+            if i == 0:
+                # Keep seconds here, not the real-time factor, so that
+                # compilation_time has the same unit as the one reported by
+                # TextGenerationBenchmark.
+                first_iteration = elapsed
+            total_real_time_factor += elapsed / input_length
+
+        avg_real_time_factor = total_real_time_factor / N_ITER
+
+        return Experiment(
+            name=self.model_name,
+            dtype=str(self.dtype),
+            device=self.device,
+            arch=get_arch_name(),
+            test_config=self.test_config,
+            compilation_time=first_iteration if self.test_config == "default" else 0,
+            real_time_factor=avg_real_time_factor,
+        )
+
+
+class WhisperBenchmark(ASRBenchmark):
+    """ASR benchmark for Whisper checkpoints on google/fleurs (en_us)."""
+
+    def get_model_and_inputs(self):
+        """Load the dataset, the compiled model and the processor.
+
+        Sets ``self.dtype``, ``self.inputs``, ``self.processor`` and
+        ``self.model`` for ``self.model_name``.
+        """
+        # NOTE(review): this dtype is only recorded for reporting here; it is
+        # not passed to from_pretrained() - confirm the checkpoint loads as
+        # float32.
+        self.dtype = torch.float32
+        self.inputs = load_dataset("google/fleurs", "en_us", split="validation")
+
+        whisper = WhisperForConditionalGeneration.from_pretrained(self.model_name)
+        whisper = whisper.to(self.device)
+        # Only forward() is compiled; generate() itself stays as it is.
+        whisper.forward = torch.compile(whisper.forward)
+        whisper.config.forced_decoder_ids = None
+
+        self.processor = WhisperProcessor.from_pretrained(self.model_name)
+        self.model = whisper
+
+
+class TextGenerationBenchmark(Benchmark):
+    """Causal-LM benchmark reporting tokens/second and memory bandwidth."""
+
+    def get_model_and_inputs(self):
+        """Load tokenizer and compiled model, and build the generate() kwargs.
+
+        Sets ``self.dtype``, ``self.tokenizer``, ``self.model`` and
+        ``self.inputs``. The latter holds the tokenized prompt plus a
+        ``generation_config`` entry, so it can be splatted into ``generate``.
+        """
+        self.dtype = torch.bfloat16
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            torch_dtype=self.dtype,
+        ).to(self.device)
+
+        # Only forward() is compiled; generate() itself stays as it is.
+        model.forward = torch.compile(model.forward)
+        self.model = model
+
+        batch_size = batch_size_combinations[0]  # batch size = 1 right now
+        # One copy of the prompt per batch element. Multiplying the string
+        # itself would build a single longer prompt instead of a batch.
+        input_ids = self.tokenizer(
+            [FRANCE_ARTICLE] * batch_size, return_tensors="pt"
+        ).to(self.device)
+
+        max_new_tokens = max_new_token_combinations[0]
+        generation_kwargs = {
+            # min == max together with eos_token_id=None asks for exactly
+            # max_new_tokens new tokens on every call.
+            "max_new_tokens": max_new_tokens,
+            "min_new_tokens": max_new_tokens,
+            "eos_token_id": None,
+            "do_sample": False,
+            # cache_implementation is not overridden; the value from the
+            # model's own generation config is kept.
+        }
+        # Work on a copy so the model's own generation config is not mutated.
+        generation_config = copy.deepcopy(model.generation_config)
+        generation_config.update(**generation_kwargs)
+
+        input_ids["generation_config"] = generation_config
+        self.inputs = input_ids
+
+    def run_inference(self):
+        """Call ``generate`` ``N_ITER`` times and return the averaged metrics.
+
+        Returns:
+            An ``Experiment`` with ``tokens_per_second`` and
+            ``memory_bandwidth`` (model bytes * tokens/second / 1e9) averaged
+            over all iterations, the first one included. ``compilation_time``
+            is the wall-clock duration in seconds of the first iteration for
+            the "default" config, else 0.
+        """
+        model_size = _get_model_size(self.model)
+        # Prompt length in tokens, read from the tensor so that it does not
+        # depend on the tokenizer exposing per-sample encodings.
+        prompt_length = self.inputs["input_ids"].shape[-1]
+
+        torch.compiler.reset()
+        first_iteration = 0
+        total_tokens_per_second = 0
+        total_memory_bandwidth = 0
+        for i in range(N_ITER):
+            # Synchronize on both sides of the timed region so that queued
+            # device work is not left out of the measurement.
+            device_sync(self.device)
+            # perf_counter() is monotonic; time.time() can jump when the
+            # system clock is adjusted.
+            start = time.perf_counter()
+            with self.stance:
+                gen_out = self.model.generate(**self.inputs)
+            device_sync(self.device)
+            elapsed = time.perf_counter() - start
+
+            if i == 0:
+                first_iteration = elapsed
+            # The output starts with the prompt, so subtract its length to
+            # get the number of newly generated tokens.
+            num_tokens = len(gen_out[0]) - prompt_length
+            tokens_per_second = num_tokens / elapsed
+            total_tokens_per_second += tokens_per_second
+            total_memory_bandwidth += model_size * tokens_per_second / 1e9
+
+        avg_tokens_per_second = total_tokens_per_second / N_ITER
+        avg_memory_bandwidth = total_memory_bandwidth / N_ITER
+
+        return Experiment(
+            name=self.model_name,
+            dtype=str(self.dtype),
+            device=self.device,
+            arch=get_arch_name(),
+            test_config=self.test_config,
+            compilation_time=first_iteration if self.test_config == "default" else 0,
+            tokens_per_second=avg_tokens_per_second,
+            memory_bandwidth=avg_memory_bandwidth,
+        )
+
+
+def test_export_aot_inductor():
+    """Do nothing (placeholder).
+
+    Presumably reserved for the "export-aot-inductor" entry that is still
+    commented out in ``test_configs``.
+    """
+    return None
+
+
+################################################################
+
+# Names of the available test configurations. Benchmark picks the
+# "force_eager" compiler stance for "eager" and no stance override for any
+# other name; benchmark.py also uses this set as the --test-config choices.
+test_configs = {
+    "eager",
+    "default",
+    # "export-aot-inductor": test_export_aot_inductor,
+}
--- a/benchmarks/llm_benchmarks/model_zoo.py
+++ b/benchmarks/llm_benchmarks/model_zoo.py
@ -0,0 +1,10 @@
+from generate import Benchmark, TextGenerationBenchmark, WhisperBenchmark
+
+
+# Maps a model name (later passed to from_pretrained) to the Benchmark
+# subclass that runs it. The values are classes, not instances; benchmark.py
+# instantiates them once per device and test config.
+models: dict[str, type[Benchmark]] = {
+    "meta-llama/Llama-3.2-1B": TextGenerationBenchmark,
+    "google/gemma-2-2b": TextGenerationBenchmark,
+    "google/gemma-3-4b-it": TextGenerationBenchmark,
+    "openai/whisper-tiny": WhisperBenchmark,
+    "Qwen/Qwen3-0.6B": TextGenerationBenchmark,
+}
--- a/benchmarks/llm_benchmarks/prompts.py
+++ b/benchmarks/llm_benchmarks/prompts.py
@ -0,0 +1,69 @@
+# Fixed long news-article prompt; generate.py tokenizes it as the input of the
+# text-generation benchmark. The text is data - do not reformat or edit it.
+FRANCE_ARTICLE = (
+    "<s>Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings Flight "
+    "9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille "
+    'prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A '
+    "person who has such a video needs to immediately give it to the investigators.\" Robin's comments follow claims "
+    "by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final "
+    "seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. "
+    "Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two "
+    "publications described the supposed video, but did not post it on their websites. The publications said that "
+    "they watched the video, which was found by a source close to the investigation. \"One can hear cries of 'My God' "
+    'in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, '
+    "perhaps of the pilot trying to open the cockpit door with a heavy object.  Towards the end, after a heavy "
+    'shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," '
+    "said Julian Reichelt, editor-in-chief of Bild online. An official with France's accident investigation agency, "
+    "the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie "
+    "spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the "
+    'reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, '
+    'but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be sent '
+    "to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized "
+    "technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent "
+    "to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card "
+    'to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he '
+    'had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip '
+    "is real. He noted that investigators only revealed they'd recovered cell phones from the crash site after "
+    'Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can '
+    "say many things of the investigation weren't revealed by the investigation at the beginning,\" he said. What "
+    "was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas "
+    "Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he's "
+    "accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training "
+    'school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email '
+    "correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, "
+    "included medical documents he submitted in connection with resuming his flight training. The announcement "
+    "indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz's battle with depression, allowed "
+    "him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously "
+    'said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and '
+    "said it was sharing the information and documents -- including training and medical records -- with public "
+    "prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past "
+    "week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center "
+    "set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving "
+    "families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human "
+    "remains were left at the site but recovery teams would keep searching. French President Francois Hollande, "
+    "speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the "
+    "end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the "
+    "victims' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be "
+    "more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our "
+    "correspondents . The details about Lubitz's correspondence with the flight school during his training were "
+    "among several developments as investigators continued to delve into what caused the crash and Lubitz's "
+    "possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid "
+    'medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a '
+    "spokesman for the prosecutor's office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz "
+    "suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before "
+    "he got his pilot's license. Kumpa emphasized there's no evidence suggesting Lubitz was suicidal or acting "
+    "aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition "
+    "would cause him to lose his pilot's license, a European government official briefed on the investigation told "
+    'CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being '
+    "considered. Another source, a law enforcement official briefed on the investigation, also told CNN that "
+    "authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not "
+    "be allowed to fly because of his medical problems. Lubitz's girlfriend told investigators he had seen an eye "
+    "doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had "
+    "psychological issues, the European government official said. But no matter what details emerge about his "
+    "previous mental health struggles, there's more to the story, said Brian Russell, a forensic psychologist. "
+    '"Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they '
+    "weren't going to keep doing their job and they're upset about that and so they're suicidal,\" he said. \"But "
+    "there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it "
+    "outward on 149 other people who had nothing to do with the person's problems.\" Germanwings crash compensation: "
+    "What we know . Who was the captain of Germanwings Flight 9525? CNN's Margot Haddad reported from Marseille and "
+    "Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. CNN's Frederik Pleitgen, Pamela "
+    "Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report."
+)