Mirror of https://github.com/vllm-project/vllm.git (synced 2025-11-04 17:34:34 +08:00)

Compare commits: v1-block-t ... v0.9.0 (1385 commits)
| SHA1 | Author | Date | |
|---|---|---|---|
| 5873877241 | |||
| 696259ca01 | |||
| 6b6d496114 | |||
| aaa4ac1c95 | |||
| 06a0338015 | |||
| 4318c0559d | |||
| a68e293cb9 | |||
| 6881107948 | |||
| e0f0ff87b8 | |||
| c24b1572ac | |||
| 4693a3438c | |||
| bbd9a84dc5 | |||
| a547aeb828 | |||
| fc6d0c290f | |||
| 753944fa9b | |||
| 25a817f202 | |||
| d260f799a9 | |||
| b50602d5f0 | |||
| 1f1b1bc03b | |||
| 1f88dbd2bb | |||
| 0eebd74842 | |||
| 27bebcd897 | |||
| e7523c2e03 | |||
| a869baca73 | |||
| 82e2339b06 | |||
| 9553fdb41e | |||
| 243eb9199f | |||
| 0665e29998 | |||
| e76be06550 | |||
| 0877750029 | |||
| 6d68030f1c | |||
| 5a2c76cbe1 | |||
| 38b13dfe78 | |||
| 61a45e7a72 | |||
| 65523a0995 | |||
| 4b7740a105 | |||
| 4ea62c0ea0 | |||
| 561b77a0d6 | |||
| abd4030d94 | |||
| 8820821b59 | |||
| fba0642704 | |||
| 6071e989df | |||
| 57fd13a707 | |||
| 3a886bd58c | |||
| 35be8fad62 | |||
| f2faac745d | |||
| 279f854519 | |||
| 624b77a2b3 | |||
| 503f8487c2 | |||
| 44073a7ac3 | |||
| 63934543a0 | |||
| 75f81750f3 | |||
| 6ab681bcbe | |||
| cebc22f3b6 | |||
| 6c6dcd8611 | |||
| 7891fdf0c6 | |||
| 6825d9a998 | |||
| b554ab736e | |||
| 9ea7f1abf3 | |||
| 2807271c86 | |||
| b9018a3f9f | |||
| 4ceafb6299 | |||
| 2e6705784f | |||
| 1cb194a018 | |||
| 2cd4d58df4 | |||
| 6d166a8d35 | |||
| ef1dd6870f | |||
| e77dc4bad8 | |||
| 07458a51ce | |||
| c1e4a4052d | |||
| a859320575 | |||
| 441dc63ac7 | |||
| d55e446d13 | |||
| ec82c3e388 | |||
| 45ab403a1f | |||
| 2b10ba7491 | |||
| 4fc1bf813a | |||
| f2036734fb | |||
| 7d9216495c | |||
| 0ddf88e16e | |||
| 1645b60196 | |||
| 2628a69e35 | |||
| 371f7e4ca2 | |||
| 15b45ffb9a | |||
| 273cb3b4d9 | |||
| 8ddd1cf26a | |||
| 6550114c9c | |||
| 9520a989df | |||
| 3d28ad343f | |||
| 6a7988c55b | |||
| 022d8abe29 | |||
| 5221815a00 | |||
| 1068556b2c | |||
| 2cd1fa4556 | |||
| d4c2919760 | |||
| 6220f3c6b0 | |||
| 52fb23f47e | |||
| 6dd51c7ef1 | |||
| 2edb533af2 | |||
| 38a95cb4a8 | |||
| cd821ea5d2 | |||
| 7ab056c273 | |||
| 6526e05111 | |||
| e493e48524 | |||
| 4ce64e2df4 | |||
| fbb13a2c15 | |||
| a1fe24d961 | |||
| d0bc2f810b | |||
| b046cf792d | |||
| 54af915949 | |||
| 71ea614d4a | |||
| 4c611348a7 | |||
| 60cad94b86 | |||
| 9c1baa5bc6 | |||
| 4be2255c81 | |||
| ed5d408255 | |||
| 583507d130 | |||
| e44d8ce8c7 | |||
| 93ecb8139c | |||
| fae453f8ce | |||
| 4b0da7b60e | |||
| c6b636f9fb | |||
| 04eb88dc80 | |||
| 46791e1b4b | |||
| c32e249a23 | |||
| c91fe7b1b9 | |||
| a04720bc36 | |||
| 7b9d832c80 | |||
| 6e588da0f4 | |||
| f8d2cc5f55 | |||
| 721fb9b181 | |||
| 1f3a1200e4 | |||
| 54631f8262 | |||
| cb506ecb5a | |||
| 93f71673ce | |||
| 3f505233fd | |||
| 4e04eceb58 | |||
| 71075029f2 | |||
| ca86a7cf6e | |||
| a35a494745 | |||
| f6037d1907 | |||
| fa72f9a812 | |||
| ebed81fbf5 | |||
| e2d7d31244 | |||
| 23b67b37b2 | |||
| db5a29ba19 | |||
| 51797775c3 | |||
| cf5984b2fe | |||
| d022115cc6 | |||
| acb54ca8e1 | |||
| 6e0fd34d3c | |||
| 176d62e4ea | |||
| 20bd6f4d2e | |||
| 1f079540db | |||
| 94d8ec8d2b | |||
| bb0a311213 | |||
| dd5fa7e04f | |||
| 2b16104557 | |||
| 371376f996 | |||
| c6c10ca920 | |||
| c154d89306 | |||
| eca18691d2 | |||
| 61acfc45bc | |||
| 107f5fc4cb | |||
| 907f935de9 | |||
| 5d7f545204 | |||
| cd8dfc6dfc | |||
| d06dd72ba9 | |||
| ad0012a0ac | |||
| 92247c522e | |||
| 0c15c2e486 | |||
| 3b17ea26e4 | |||
| 23baa2180b | |||
| 980a172474 | |||
| e1f5a71ed7 | |||
| f4a8a37465 | |||
| 8f55962a7f | |||
| be48360c1f | |||
| 86847700d7 | |||
| d6c86d09ae | |||
| 6b35cb10a0 | |||
| 1b1e8e05ff | |||
| bca55b556f | |||
| d981396778 | |||
| 9609327fa4 | |||
| f07a673eb2 | |||
| d565e0976f | |||
| 258bf621d5 | |||
| dc1440cf9f | |||
| 8171221834 | |||
| 7937c2fd52 | |||
| e2ee1e8e9e | |||
| 20d8ce81eb | |||
| 84ab4feb7e | |||
| 6781af5608 | |||
| 1b15df2546 | |||
| 43b5f61dce | |||
| c5bb0ebdc6 | |||
| d637b96099 | |||
| 275c5daeb0 | |||
| 47fda6d089 | |||
| 27d0952600 | |||
| 221cfc2fea | |||
| 9da1095daf | |||
| d1211f8794 | |||
| b6a6e7a529 | |||
| 4fb349f66a | |||
| 908733aca7 | |||
| 1a8f68bb90 | |||
| 9ab2c02ff8 | |||
| 66e63e86ec | |||
| 9214e60631 | |||
| f880d42582 | |||
| dcfe95234c | |||
| 48ac2bed5b | |||
| 3e0d435027 | |||
| 4ee4826ede | |||
| 60017dc841 | |||
| 55f1a468d9 | |||
| fd195b194e | |||
| fabe89bbc4 | |||
| e73b7dfd69 | |||
| 7fdfa01530 | |||
| aef94c6d07 | |||
| 0ceaebf87b | |||
| 1db4f47f81 | |||
| d3d91b6f71 | |||
| 87d871470d | |||
| a5f8c111c2 | |||
| e23564cb70 | |||
| 390ec88905 | |||
| 541817670c | |||
| 67da5720d4 | |||
| 5c04bb8b86 | |||
| 3d2779c29a | |||
| 6b31c84aff | |||
| b18201fe06 | |||
| f4937a51c1 | |||
| ee659e3b60 | |||
| 4e1c6a0264 | |||
| c7852a6d9b | |||
| 8795eb9975 | |||
| 0b34593017 | |||
| e3f3aee6f4 | |||
| 92540529c0 | |||
| fadb8d5c2d | |||
| 2aa5470ac5 | |||
| 51ff154639 | |||
| 566ec04c3d | |||
| 01c22335ba | |||
| 451da4bcbd | |||
| 07ad27121f | |||
| a9944aabfa | |||
| a8f5aec20a | |||
| de71fec81b | |||
| 70f8b96724 | |||
| dd2a94596a | |||
| 420caf7557 | |||
| 4f07a64075 | |||
| e6b8e65d2d | |||
| 26d0419309 | |||
| 83f74c698f | |||
| 2dff093574 | |||
| afe3236e90 | |||
| 65334ef3b9 | |||
| e60f550b38 | |||
| f25e0d1125 | |||
| 09f106a91e | |||
| 2142035b51 | |||
| 78aa341d12 | |||
| 7974736740 | |||
| 2fc9075b82 | |||
| d93c976a0d | |||
| 749f792553 | |||
| 856865008e | |||
| f9c069c85e | |||
| 418d2f8bfb | |||
| 964472b966 | |||
| 59dd311cf5 | |||
| d066e52013 | |||
| c8ea982d9b | |||
| dc372b9c8a | |||
| 9b5b39b650 | |||
| 9ccc6ded42 | |||
| d62a076e84 | |||
| 259127f8b8 | |||
| 612c2edb4f | |||
| 38fe728d60 | |||
| 82e7f9bb03 | |||
| 63dc3426e0 | |||
| 8f5dc41481 | |||
| 63ad622233 | |||
| e7ef61c1f0 | |||
| d4154c35a2 | |||
| 6685890d11 | |||
| 33011318c2 | |||
| 4f8b373225 | |||
| 7b2f28deba | |||
| 2d912fb66f | |||
| 12e6c0b41c | |||
| 9a2a6357de | |||
| 6266c57bae | |||
| 754b699cbe | |||
| 6e27c6d86b | |||
| d5af47a149 | |||
| 65f0f74b66 | |||
| 176a95c670 | |||
| f2ae883b67 | |||
| 40de1ef455 | |||
| 0189a65a2e | |||
| 55aa7af994 | |||
| 0b217da646 | |||
| 19324d660c | |||
| fc407a1425 | |||
| 009d9e7590 | |||
| b922c2ebd2 | |||
| 00b14e0f16 | |||
| 54e467e6f8 | |||
| 79a1d25bbd | |||
| 9944011b30 | |||
| 8c946cecca | |||
| ff334ca1cd | |||
| 6223dd8114 | |||
| 906f0598fc | |||
| cb528d0585 | |||
| 98fcba1575 | |||
| 23b3134eb5 | |||
| ea6ae8cb45 | |||
| 2ff297dce9 | |||
| 8dd0671bac | |||
| f0d610a8ae | |||
| e57e4d6e9e | |||
| ee5be834e7 | |||
| 48545728d8 | |||
| dc1a821768 | |||
| 61e0a506a3 | |||
| 1df491c522 | |||
| d8487ef557 | |||
| c06af9a959 | |||
| 60f7624334 | |||
| f6518b2b48 | |||
| d67085c2c8 | |||
| 307939f299 | |||
| 9d7ea9dbbf | |||
| acee8f48aa | |||
| f065de4e88 | |||
| dc9905368d | |||
| ebab1ac37c | |||
| 2b0db9b0e2 | |||
| 195adb47c0 | |||
| 302f3aca7e | |||
| e9c730c9bd | |||
| 289199feb6 | |||
| b9fd0d7a69 | |||
| 72a3f6b898 | |||
| 98ea35601c | |||
| d19110204c | |||
| 05a4324f8e | |||
| 7ea6cb28b2 | |||
| 9fbf2bfbd5 | |||
| 3a5ea75129 | |||
| 891b9d33de | |||
| 430783018c | |||
| 19a3c78d1f | |||
| ada50aa295 | |||
| 08bf784078 | |||
| d45fe333fb | |||
| 021c16c7ca | |||
| 7de18d541b | |||
| a810b5b088 | |||
| 009b3d5382 | |||
| e4b8713380 | |||
| 06c0922a69 | |||
| cd3edfc908 | |||
| 9cea90eab4 | |||
| d1110f5b5a | |||
| 8132365b74 | |||
| eea22a56ab | |||
| 9112155283 | |||
| 90d0a74b60 | |||
| d74e5f37bc | |||
| ca66a1674c | |||
| 950751a987 | |||
| 4c31218f80 | |||
| 68311891f5 | |||
| fc4441a4ee | |||
| 246e3e0a36 | |||
| 7042cc96b0 | |||
| 0c0fdae84f | |||
| 3b602cdea7 | |||
| 4b2ed7926a | |||
| 7e3571134f | |||
| ea2236bf95 | |||
| 7d4aedae7c | |||
| 22481fbfa3 | |||
| 5c4c08f6f1 | |||
| c44c384b1c | |||
| 85b72cb7b1 | |||
| 6e5595ca39 | |||
| 200da9a517 | |||
| 9f64e93415 | |||
| ec61ea20a8 | |||
| c6798baa9c | |||
| 5b2dcbf0b8 | |||
| 6e4a93e3f7 | |||
| 217db4baa6 | |||
| ff8c400502 | |||
| 89a0315f4c | |||
| 3d1e387652 | |||
| d310e6de98 | |||
| 5e6f939484 | |||
| 760e3ecc8f | |||
| 3c9396a64f | |||
| 376786fac1 | |||
| 4f605a6de5 | |||
| 8342e3abd1 | |||
| a83a0f92b5 | |||
| 226a4272cf | |||
| ec54d73c31 | |||
| a944f8ede7 | |||
| 015815fe01 | |||
| e4ca6e3a99 | |||
| 53d0cb7423 | |||
| f50dcb7c21 | |||
| a1e19b635d | |||
| bb239a730f | |||
| a463555dee | |||
| ca04b97c93 | |||
| 0a9bbaa104 | |||
| 39956efb3f | |||
| 597051e56f | |||
| 96722aa81d | |||
| 843b222723 | |||
| e515668edf | |||
| 5a499e70d5 | |||
| 6930a41116 | |||
| 998eea4a0e | |||
| c747d84576 | |||
| b2da14a05a | |||
| 7ea2adb802 | |||
| 3d13ca0e24 | |||
| 66ab3b13c9 | |||
| a8238bbdb0 | |||
| d43f914d42 | |||
| ed5272cf21 | |||
| c20ef40fd0 | |||
| db593aa67f | |||
| f98e307588 | |||
| 646a31e51e | |||
| be8ff88e66 | |||
| 1a6af1453d | |||
| 32aa74c09c | |||
| 7377dd0307 | |||
| 98c89e16ff | |||
| 324a3119b0 | |||
| 8a15c2603a | |||
| 043e4c4955 | |||
| ba7703e659 | |||
| f80ae5bdcf | |||
| 1a45a61387 | |||
| c3e9d5060e | |||
| 822de7fb94 | |||
| 8d84d836d1 | |||
| 950b71186f | |||
| e50a1f1a9c | |||
| a17cef70ea | |||
| 18dd5e01f2 | |||
| 6de3e13413 | |||
| ed3a1d2106 | |||
| 022afbeb4e | |||
| 2f925e5777 | |||
| de906b95f9 | |||
| d456aea71f | |||
| 621ca2c0ab | |||
| 6115b11582 | |||
| 5b8c390747 | |||
| 7525d5f3d5 | |||
| aabcd2cae3 | |||
| 0d115460a7 | |||
| 175bda67a1 | |||
| cba31c47c4 | |||
| a6fed02068 | |||
| d419aa5dc4 | |||
| f9bc5a0693 | |||
| 05e1f96419 | |||
| 6eae34533a | |||
| 63ced7b43f | |||
| dc47ba32f8 | |||
| edbf2d609e | |||
| 999328be0d | |||
| 98834fefaa | |||
| 90bd2ae172 | |||
| 5941e0b7ea | |||
| 9765940824 | |||
| 5ea5c514da | |||
| d3efde8176 | |||
| aea302be6c | |||
| cc05b90d86 | |||
| 1d0c9d6b2d | |||
| f62cad6431 | |||
| 5394ad7387 | |||
| 68e1ee0072 | |||
| 2858830c39 | |||
| d6484ef3c3 | |||
| 46fae69cf0 | |||
| f66f1e0fa3 | |||
| 887d7af882 | |||
| a92842454c | |||
| c8386fa61d | |||
| 87baebebd8 | |||
| e3d0a1d190 | |||
| d47b605eca | |||
| 22c6f6397f | |||
| 3ec97e2cc5 | |||
| 9b103a1d76 | |||
| b90b0852e9 | |||
| 9352cdb56d | |||
| 182f40ea8b | |||
| 3e887d2e0c | |||
| 0f87d8f7b2 | |||
| 4c33d67321 | |||
| cb234955df | |||
| 3a500cd0b6 | |||
| 868c546da4 | |||
| 99404f53c7 | |||
| 785d75a03b | |||
| 6d1479ca4b | |||
| b8b0859b5c | |||
| d7543862bd | |||
| c777df79f7 | |||
| cc2a77d7f1 | |||
| 9e2de9b9e9 | |||
| 109e15a335 | |||
| f192ca90e6 | |||
| f89d0e11bf | |||
| b4003d11fc | |||
| 292fc59d61 | |||
| afcb3f8863 | |||
| afb12e4294 | |||
| 24aebae177 | |||
| 39c0813a7f | |||
| 9b70e2b4c1 | |||
| 173daac19d | |||
| 04f2cfc894 | |||
| 811a6c0972 | |||
| 9b1769dd9a | |||
| 61c299f81f | |||
| 4acfa3354a | |||
| 88c8304104 | |||
| 6768ff4a22 | |||
| f2e7af9b86 | |||
| 7423cf0a9b | |||
| 460a2b1100 | |||
| 28566d73b3 | |||
| 98060b001d | |||
| f5a3c655b2 | |||
| 7169f87ad0 | |||
| b74d888c63 | |||
| 2007d4d54f | |||
| 48e925fab5 | |||
| 1903c0b8a3 | |||
| 86a1f67a3b | |||
| a257d9bccc | |||
| 015069b017 | |||
| fbefc8a78d | |||
| 26bc4bbcd8 | |||
| 3c3d767201 | |||
| 13cf6b6236 | |||
| 90d0a54c4d | |||
| 7a0a146c54 | |||
| 7ab643e425 | |||
| afb4429b4f | |||
| aa4502e7f3 | |||
| 17b4d85f63 | |||
| 1144a8efe7 | |||
| 08fb5587b4 | |||
| dbc18e7816 | |||
| 02bd654846 | |||
| 200bbf92e8 | |||
| 81ecf425f0 | |||
| 42d9a2c4c7 | |||
| 2ac74d098e | |||
| 584f5fb4c6 | |||
| d586ddc691 | |||
| 0b7e701dd4 | |||
| 947f2f5375 | |||
| 739e03b344 | |||
| da4e7687b5 | |||
| 39317cf42b | |||
| 2990cee95b | |||
| 0be6d05b5e | |||
| 77073c77bc | |||
| a7d5b016bd | |||
| d803786731 | |||
| 1534d389af | |||
| ece5a8b0b6 | |||
| 54072f315f | |||
| be633fba0f | |||
| ed6cfb90c8 | |||
| 6ed9f6047e | |||
| a44c4f1d2f | |||
| 88fcf00dda | |||
| d1f569b1b9 | |||
| 13698db634 | |||
| 2c4f59afc3 | |||
| 1c2bc7ead0 | |||
| 4055130a85 | |||
| 34120f5acd | |||
| 7489ec0bab | |||
| 70788bdbdc | |||
| c9c1b59e59 | |||
| 0350809f3a | |||
| a6977dbd15 | |||
| 2fa2a50bf9 | |||
| 08e15defa9 | |||
| b37685afbb | |||
| 792595b59d | |||
| 0c1c788312 | |||
| 56d64fbe30 | |||
| 608968b7c5 | |||
| 06ffc7e1d3 | |||
| d3cf61b89b | |||
| a39203f99e | |||
| 24e6ad3f16 | |||
| 2ef5d106bb | |||
| 0ed27ef66c | |||
| 900edfa8d4 | |||
| 88ad9ec6b2 | |||
| 40896bdf3f | |||
| 00ee37efa2 | |||
| 890f104cdf | |||
| 4a5e13149a | |||
| 97cc8729f0 | |||
| 4464109219 | |||
| 193e78e35d | |||
| bdb2cddafc | |||
| ebb3930d28 | |||
| cde384cd92 | |||
| 96e06e3cb7 | |||
| 17eb306fcc | |||
| 165cb56329 | |||
| d6da8a8ff2 | |||
| b4ac4fa04d | |||
| e136000595 | |||
| 86d9fc29cb | |||
| 506475de5f | |||
| cfe4532093 | |||
| 8fc88d63f1 | |||
| 6e74fd4945 | |||
| dcbac4cb4b | |||
| ed2462030f | |||
| cc5befbced | |||
| 2c89cd96a8 | |||
| a0304dc504 | |||
| c7941cca18 | |||
| b6dd32aa07 | |||
| f94886946e | |||
| 72dfe4c74f | |||
| 8b464d9660 | |||
| 889ebb2638 | |||
| 3ad986c28b | |||
| 344e193b7d | |||
| fb1c933ade | |||
| 72c5b97231 | |||
| fa93cd9f60 | |||
| aec9674dbe | |||
| 7fcc4223dc | |||
| 8262a3e23b | |||
| f211331c48 | |||
| 9053d0b134 | |||
| cb3f2d8d10 | |||
| c12df53b60 | |||
| d1aeea7553 | |||
| d8bccde686 | |||
| 20e489eaa1 | |||
| 4213475ec7 | |||
| d92879baf6 | |||
| 690fe019f0 | |||
| ed7a29d9f8 | |||
| 756848e79e | |||
| 18445edd0f | |||
| 30215ca61f | |||
| 838cedade7 | |||
| 4283a28c2f | |||
| 93a126fbc7 | |||
| 8e4b351a0c | |||
| 9869453c42 | |||
| 3642c59aa8 | |||
| 43eea2953b | |||
| de7eb10ce4 | |||
| fd11a325b8 | |||
| 4d17e20310 | |||
| 10fd1d7380 | |||
| 52b4f4a8d7 | |||
| e782e0a170 | |||
| dc2ceca5c5 | |||
| f8acd01ff7 | |||
| c48334d405 | |||
| 909fdaf152 | |||
| 8c1c926d00 | |||
| df6f3ce883 | |||
| 513f074766 | |||
| b07bf83c7d | |||
| 53e8cf53a4 | |||
| 54271bb766 | |||
| 9e96f56efb | |||
| b278911229 | |||
| 7bd0c7745c | |||
| 1cf0719ebd | |||
| 537d5ee025 | |||
| c8e5be35f7 | |||
| a6e72e1e4f | |||
| 5e83a7277f | |||
| 68af5f6c5c | |||
| 8de2901fea | |||
| c53e0730cb | |||
| a0e619e62a | |||
| 70116459c3 | |||
| 65e262b93b | |||
| 43faa0461a | |||
| 48cb2109b6 | |||
| a5450f11c9 | |||
| 9d98ab5ec6 | |||
| df5c879527 | |||
| 423e9f1cbe | |||
| 0bd7f8fca5 | |||
| d5615af9ae | |||
| 19dcc02a72 | |||
| 7feae92c1f | |||
| f851b84266 | |||
| fc966e9cc6 | |||
| ef19e67d2c | |||
| a41351f363 | |||
| 6aae216b4e | |||
| b22980a1dc | |||
| 881f735827 | |||
| 2f54045508 | |||
| 5aa6efb9a5 | |||
| 6ca0234478 | |||
| 649818995f | |||
| 7a0a9da72b | |||
| 69bff9bc89 | |||
| 41ca7eb491 | |||
| eef364723c | |||
| 0d6e187e88 | |||
| 9420a1fc30 | |||
| 583e900996 | |||
| 05e1fbfc52 | |||
| fe92176321 | |||
| 6d0df0ebeb | |||
| 0fa939e2d1 | |||
| 0422ce109f | |||
| 47bdee409c | |||
| 49f189439d | |||
| 5adf6f6b7f | |||
| 4115f19958 | |||
| 340d7b1b21 | |||
| 1bcbcbf574 | |||
| 82e43b2d7e | |||
| 67309a1cb5 | |||
| b724afe343 | |||
| 21f4f1c9a4 | |||
| b0c1f6202d | |||
| c0dfd97519 | |||
| a9138e85b1 | |||
| 0a05ed57e6 | |||
| 14288d1332 | |||
| b411418ff0 | |||
| 2bc0f72ae5 | |||
| 9c1244de57 | |||
| db2f8d915c | |||
| 6167c0e5d2 | |||
| ed2e464653 | |||
| 2c8ed8ee48 | |||
| ed50f46641 | |||
| 46e678bcff | |||
| 6b2427f995 | |||
| b07d741661 | |||
| 41fb013d29 | |||
| 32d4b669d0 | |||
| 3cde34a4a4 | |||
| bdb3660312 | |||
| f3a21e9c68 | |||
| 8e630d680e | |||
| af869f6dff | |||
| 53c0fa1e25 | |||
| f7912cba3d | |||
| 6317a5174a | |||
| aa72d9a4ea | |||
| ce17db8085 | |||
| 8c87a9ad46 | |||
| ec69124eb4 | |||
| d0da99fb70 | |||
| b2f195c429 | |||
| 047797ef90 | |||
| eb8ef4224d | |||
| 56a735261c | |||
| e1cf90e099 | |||
| 6bc1e30ef9 | |||
| 7e081ba7ca | |||
| 1e013fa388 | |||
| bc7c4d206b | |||
| f67e9e9f22 | |||
| 36fe78769f | |||
| 83d933718c | |||
| 5175b884f7 | |||
| 5536b30a4c | |||
| 7f58fb9718 | |||
| 30bc3e0f66 | |||
| f34410715f | |||
| 68d4c33202 | |||
| f961d7f6ef | |||
| d059110498 | |||
| 571e8dd65e | |||
| 4b91c927f6 | |||
| 0e237f0035 | |||
| 8f7bace7c3 | |||
| e4d6144232 | |||
| 8d32dc603d | |||
| c4ab9f3e71 | |||
| 2689d5c027 | |||
| acba33a0f1 | |||
| a114bf20a3 | |||
| 3097ce3a32 | |||
| d6da9322c8 | |||
| 71ce44047f | |||
| 188b7f9b8c | |||
| b9b4746950 | |||
| 7b8a2ab76f | |||
| c9acbf1141 | |||
| 5b794cae8d | |||
| 0e4254492f | |||
| 1311913f55 | |||
| 29f395c97c | |||
| fa3bba2a53 | |||
| 986537f1c3 | |||
| 210207525e | |||
| 71eda0bb76 | |||
| 471fe65630 | |||
| 3a0fba5cf4 | |||
| 299ebb62b2 | |||
| f728ab8e35 | |||
| 63e26fff78 | |||
| fe3462c774 | |||
| 3b34fd5273 | |||
| 55d6d3fdb8 | |||
| 7272bfae77 | |||
| d9ac9e3dc5 | |||
| d41faaf9df | |||
| b34f33438a | |||
| 26c0406555 | |||
| 4c41278b77 | |||
| bb3605db85 | |||
| fe742aef5a | |||
| 4b07d36891 | |||
| 87aaadef73 | |||
| 682e0b6d2f | |||
| d6195a748b | |||
| 205d84aaa9 | |||
| 5124f5bf51 | |||
| 83f3c3bd91 | |||
| d9737ca1c6 | |||
| 9d4ca19d50 | |||
| 2ef0dc53b8 | |||
| 1d4680fad2 | |||
| 2c1bd848a6 | |||
| 5c9121203c | |||
| 490b1698a5 | |||
| 5a5e29de88 | |||
| 3d3ab3689f | |||
| 686623c5e7 | |||
| aadb656562 | |||
| 87e067de41 | |||
| 26507f8973 | |||
| 9c1d5b456d | |||
| e31045f95c | |||
| aaec845f8e | |||
| 7bdfd29a35 | |||
| e78587a64c | |||
| 7eb4255628 | |||
| 6a0f547561 | |||
| 30ed81b7ca | |||
| 7a4a5de729 | |||
| c16fb5dae8 | |||
| e37073efd7 | |||
| 183dad7a85 | |||
| 3408e47159 | |||
| 0377b8310b | |||
| e4755f7fac | |||
| 92edf35826 | |||
| eb5819b2d9 | |||
| 5989f4684d | |||
| 5125d72f02 | |||
| a018e555fd | |||
| 6211b92273 | |||
| 05fcd1b430 | |||
| 7c02d6a137 | |||
| 11c3b98491 | |||
| dbe7f07001 | |||
| c69bf4ee06 | |||
| d27ea94034 | |||
| 99ed526101 | |||
| 207da28186 | |||
| 5b1aca2ae3 | |||
| d8e557b5e5 | |||
| 61a44a0b22 | |||
| a6481525b8 | |||
| 8cac35ba43 | |||
| 9dbf7a2dc1 | |||
| 607029e515 | |||
| cb072ce93b | |||
| 95aca283b4 | |||
| 2b05b8ce69 | |||
| 3c776dcefb | |||
| 2cbd4d2999 | |||
| 3092375e27 | |||
| 3cd91dc955 | |||
| 8a7368e069 | |||
| 93e561ec4d | |||
| e1b004839a | |||
| ee378f3d49 | |||
| e82ee40de3 | |||
| facbe2a114 | |||
| 7168920491 | |||
| 21378a2323 | |||
| 976711d9db | |||
| 44fa4d556c | |||
| 3ac98edcb1 | |||
| 966c742ed2 | |||
| 0d7d05f4b6 | |||
| 96bb8aa68b | |||
| 3badb0213b | |||
| fdcb850f14 | |||
| 54a66e5fee | |||
| 280d62b8a2 | |||
| 1666e66443 | |||
| 1575c1701a | |||
| 6ae996a873 | |||
| b590adfdc1 | |||
| b4fe16c75b | |||
| bc5dd4f669 | |||
| dbb036cf61 | |||
| 70e7ed841d | |||
| d06ba4ed3f | |||
| 6b40996ae8 | |||
| d2020acac7 | |||
| 1eb3c2ed48 | |||
| c64ee87267 | |||
| b1308b84a3 | |||
| 7b5ecf79bd | |||
| 9883a18859 | |||
| b3f2fddd17 | |||
| aa29841ede | |||
| 6bf27affb6 | |||
| 1dd23386ec | |||
| 7cbfc10943 | |||
| ce4ddd2d1a | |||
| e51929ebca | |||
| dc1b4a6f13 | |||
| 63d2705edb | |||
| d085a44082 | |||
| f49e5aff11 | |||
| 6c11ecf8d3 | |||
| 93e5f3c5fb | |||
| 70363bccfa | |||
| 3cdc57669f | |||
| 68bb122eb4 | |||
| d9fc8cd9da | |||
| f069f3ea74 | |||
| c5bc0e7fcc | |||
| 4a3a518722 | |||
| fbf722c6e6 | |||
| e92d7085bf | |||
| bd6028d6b0 | |||
| 802329dee9 | |||
| 41cc883c29 | |||
| 57504a4bcf | |||
| ed4792c990 | |||
| 87b836ba77 | |||
| 56c76c2e0e | |||
| c09632a66c | |||
| a3bf8d4a2b | |||
| 16eda8c43a | |||
| cd77382ac1 | |||
| 71b9cde010 | |||
| 5285589f37 | |||
| f41647ee6b | |||
| 4d022cbc75 | |||
| 70de35a881 | |||
| 34b2cf3b33 | |||
| 9e90c9f73f | |||
| e9528f6dc6 | |||
| 51baa9c333 | |||
| 35e076b3a8 | |||
| a26f59ccbc | |||
| aa3b3d76e0 | |||
| f7030df3be | |||
| 905e91e9ac | |||
| f8f9c0ba62 | |||
| dda811021a | |||
| 93195146ea | |||
| ed37599544 | |||
| 99ef59cf7f | |||
| d544d141ec | |||
| 3e397a9484 | |||
| 268c325078 | |||
| 3cc9af88ff | |||
| 7cd0bd7212 | |||
| 56d4aefa33 | |||
| dd143ef541 | |||
| daefed052c | |||
| 5fbab20e02 | |||
| e8224f3dca | |||
| 9665313c39 | |||
| 0c54fc7273 | |||
| c1b57855ec | |||
| 83b824c8b4 | |||
| 7678fcd5b6 | |||
| 8661c0241d | |||
| ce8d6b75fc | |||
| 61de3ef74b | |||
| ec1f9c8c91 | |||
| 65e09094c4 | |||
| c70cf0fe06 | |||
| a5d11a54dc | |||
| 3d4c87758e | |||
| a9bd832fc5 | |||
| 417bcefbae | |||
| baada0e737 | |||
| 82eb61dd4c | |||
| 0d4d06fe2f | |||
| 4aed0ca6a2 | |||
| 1621b25288 | |||
| a564797151 | |||
| 1da6a09274 | |||
| 1e44ffc3ff | |||
| a454748544 | |||
| 1bff42c4b7 | |||
| cb391d85dc | |||
| fee5b8d37f | |||
| b2ce859bd2 | |||
| 566f10a929 | |||
| c3b5189137 | |||
| a25866ac8d | |||
| 098900d7c2 | |||
| 98d01d3ce2 | |||
| d55244df31 | |||
| 04149cce27 | |||
| 24834f4894 | |||
| ec7da6fcf3 | |||
| 819d548e8a | |||
| 477d2a8aa2 | |||
| e484e02857 | |||
| 24f6b9a713 | |||
| 9cdde47289 | |||
| b1eb4ca152 | |||
| 87b4ac56c2 | |||
| cb84e45ac7 | |||
| 4716377fbc | |||
| 4e9cf8c1dd | |||
| 2976dc27e9 | |||
| 102bf967f0 | |||
| 1f4b09b525 | |||
| 86c3369eb8 | |||
| 2755c34a8f | |||
| db10422184 | |||
| e1a2c699dd | |||
| 0115ccd5c0 | |||
| 40b4284fe3 | |||
| 4ebc0b9640 | |||
| dc96fd54c6 | |||
| 1f5d13ab9f | |||
| 90cb44eb02 | |||
| e11880deea | |||
| 9351f91be9 | |||
| 5a1e1c8353 | |||
| 69ecaa7c79 | |||
| 7f00899ff7 | |||
| 995e3d1f41 | |||
| b4ac449a83 | |||
| 8e5314a468 | |||
| 87918e40c4 | |||
| f6b32efb7f | |||
| b99733d092 | |||
| 05a015d6a5 | |||
| ad971af8c7 | |||
| f2ebb6f541 | |||
| 1d01211264 | |||
| f94ab12f79 | |||
| a865bc1ca6 | |||
| 21802c4b6d | |||
| 652907b354 | |||
| 24f1c01e0f | |||
| fad6e2538e | |||
| 7f6d47c1a2 | |||
| 3147586ebd | |||
| ed636d99ca | |||
| 090c856d76 | |||
| ad434d4cfe | |||
| 66d433b94f | |||
| 027b204ff1 | |||
| 55dcce91df | |||
| 8017c8db7f | |||
| dc3529dbf6 | |||
| 7699258ef0 | |||
| e9ba99f296 | |||
| 7c80368710 | |||
| 95d63f38c0 | |||
| bb8dab821e | |||
| fc0f87768a | |||
| 0a57386721 | |||
| 3749e28774 | |||
| 86fc2321ff | |||
| 2549c0dfef | |||
| b10e519895 | |||
| 9bde5ba127 | |||
| 72c8f1ad04 | |||
| da224daaa9 | |||
| 3a100b9278 | |||
| 242a637aea | |||
| c2a9671510 | |||
| d5ae4f7f42 | |||
| b6c502a150 | |||
| 9ca710e525 | |||
| eb07c8cb5b | |||
| ba10801961 | |||
| 620fc2d09e | |||
| 29283eaa7e | |||
| 2fa66ef713 | |||
| 13affc432d | |||
| d8f094a92a | |||
| 97ae6d777f | |||
| 6baeee70d1 | |||
| d2517a4939 | |||
| 6342adc438 | |||
| 0adba91547 | |||
| 4285e423a6 | |||
| 63375f0cdb | |||
| 70ad3f9e98 | |||
| d6fc629f4d | |||
| af51d80fa1 | |||
| f5722a5052 | |||
| 651cf0fec1 | |||
| 4dc52e1c53 | |||
| 4708f13a9c | |||
| a6d042df0a | |||
| 40a36ccfeb | |||
| ef608c37a7 | |||
| 2386803f2a | |||
| 95862f7b4d | |||
| 230b131b54 | |||
| 0812d8dd41 | |||
| bf7e3c51ae | |||
| a35a8a8392 | |||
| 4ef0bb1fcf | |||
| fadc59c0e6 | |||
| 86cbd2eee9 | |||
| 092475f738 | |||
| dcc56d62da | |||
| f15e70d906 | |||
| b6be6f8d1e | |||
| 03a70eacaf | |||
| 45b1ff7a25 | |||
| 15ba07ef25 | |||
| d2b58ca203 | |||
| 82e7e19a6e | |||
| 421c462948 | |||
| 84884cd9ac | |||
| a43aa183dc | |||
| 463bbb1835 | |||
| 5e125e74d1 | |||
| 06f21ce7a5 | |||
| 57a810db9c | |||
| 8b664706aa | |||
| 37bfee92bf | |||
| e73ff24e31 | |||
| bd7599d34a | |||
| 01b6113659 | |||
| 1b84eff03a | |||
| 55acf86bf8 | |||
| f021b97993 | |||
| 1cab43c2d2 | |||
| 8bd651b318 | |||
| 58e234a754 | |||
| e86c414d6a | |||
| 550b2801ad | |||
| cefb9e5a28 | |||
| 98d7367b61 | |||
| 594a8b9030 | |||
| 44f990515b | |||
| 252937806c | |||
| 51826d51fa | |||
| 14e53ed11f | |||
| ddb94c2605 | |||
| 90969fb39a | |||
| 101f1481f9 | |||
| 2edc87b161 | |||
| 4203926f10 | |||
| cdb57015a7 | |||
| aa557e6422 | |||
| 0e00d40e4f | |||
| c920e01242 | |||
| 274d8e8818 | |||
| 2039c6305b | |||
| 6efb195a6e | |||
| 24b7fb455a | |||
| 58f5a59769 | |||
| db9dfcfa6a | |||
| 9ef98d527e | |||
| 93491aefc7 | |||
| 7acd539cd7 | |||
| e75a6301bd | |||
| a79cc68b3a | |||
| 7e3f7a4ee7 | |||
| 9ec8257914 | |||
| 38327cf454 | |||
| dfa82e2a3d | |||
| e59ca942f5 | |||
| a57a3044aa | |||
| 4e5a0f6ae2 | |||
| b63bd14999 | |||
| 2041c0e360 | |||
| 085cbc4f9f | |||
| 2b93162fb0 | |||
| 2e45bd29fe | |||
| 51d7c6a2b2 | |||
| f3aca1ee30 | |||
| 8dd41d6bcc | |||
| 0a298ea418 | |||
| d330558bab | |||
| 656fd72976 | |||
| 79455cf421 | |||
| 30d6a015e0 | |||
| 8af5a5c4e5 | |||
| 3a5f0afcd2 | |||
| c7e63aa4d8 | |||
| 4a9ce1784c | |||
| 7e4e709b43 | |||
| 63d8eabed0 | |||
| e830b01383 | |||
| ff6473980d | |||
| a164aea35d | |||
| a76f547e11 | |||
| b7b7676d67 | |||
| e6e3c55ef2 | |||
| f98a4920f9 | |||
| d4bfc23ef0 | |||
| 9a2160fa55 | |||
| 2de4118243 | |||
| 239b7befdd | |||
| 09e974d483 | |||
| e5ef4fa99a | |||
| 037bcd942c | |||
| c2e7507ad4 | |||
| 3aa2b6a637 | |||
| 555aa21905 | |||
| e7ae3bf3d6 | |||
| b932c048ac | |||
| e85829450d | |||
| effc5d24fa | |||
| 18ed3132d2 | |||
| 9b459eca88 | |||
| 70fedd0f79 | |||
| bb103b29bf | |||
| 248e76c4df | |||
| 803d5c35f3 | |||
| 7fd8c0f85c | |||
| 44c3a5abc3 | |||
| 6909a76201 | |||
| 045533716b | |||
| 3c0ff914ac | |||
| 2bc4be4e32 | |||
| c67abd614f | |||
| 6fa7cd3dbc | |||
| 94744ba41a | |||
| 4965ec42d2 | |||
| 73aa7041bf | |||
| 7c1f760024 | |||
| da461f3cbf | |||
| 5b800f0932 | |||
| 8427f70493 | |||
| 7a7992085b | |||
| 1286211f57 | |||
| 6d531ad7b8 | |||
| 762b424a52 | |||
| de1cb38769 | |||
| c802f5430d | |||
| cff8991a50 | |||
| f3f8d8fff4 | |||
| 26df46ee59 | |||
| c3f687ac22 | |||
| 04437e313d | |||
| 038bededba | |||
| d03308be0c | |||
| c6bc0034d0 | |||
| 70e132244a | |||
| 47e9038d23 | |||
| 432cf22a6a | |||
| 2914006fe0 | |||
| 7329ff5468 | |||
| 541d1df486 | |||
| 3b00ff9138 | |||
| 91276c5721 | |||
| 0b4167526d | |||
| fd5fd26902 | |||
| 3bbaacbe15 | |||
| a10314c6b3 | |||
| 70f2c2a709 | |||
| 280d074103 | |||
| 32b14baf8a | |||
| 2d9045fce8 | |||
| 355f66348c | |||
| 8693e47e6a | |||
| cec8c7d7f8 | |||
| 4d0ec37267 | |||
| e7f720ea56 | |||
| 4ae17bf1e2 | |||
| 8a49eea74b | |||
| b4245a48df | |||
| 4e0f6076be | |||
| 726efc6a32 | |||
| bd45912b99 | |||
| 15dac210f0 | |||
| 112b3e5b3b | |||
| 32d669275b | |||
| 4098b72210 | |||
| 46450b8d33 | |||
| 13ac9cab21 | |||
| 66aa4c0bf4 | |||
| 247181536f | |||
| 07bf813fb5 | |||
| 8958217ad5 | |||
| ac5bc615b0 | |||
| 8063dfc61a | |||
| 6278bc829e | |||
| 3f532cb6a6 | |||
| e6c9053f9e | |||
| 43ed4143c4 | |||
| f4c98b4d4c | |||
| e1e0fd7543 | |||
| df8d3d1287 | |||
| 619d3de8bd | |||
| ecff8309a3 | |||
| dcf2a590f5 | |||
| 54aa619459 | |||
| fb22be5817 | |||
| 7f301dd8ef | |||
| 8095341a01 | |||
| 69db16a46a | |||
| ce78f9af4e | |||
| 9239bf718e | |||
| 7a6d45bc8a | |||
| e74ff409e0 | |||
| 7a888271f5 | |||
| 9d119a86ae | |||
| b2e85e26f4 | |||
| dd8a29da99 | |||
| 27df5199d9 | |||
| 35fad35a48 | |||
| 733e7c9e95 | |||
| 0af4d764d6 | |||
| e64afa455c | |||
| 1711b929b6 | |||
| c091c0a588 | |||
| 1aa162e030 | |||
| cf5c8f1686 | |||
| 4ec2cee000 | |||
| 99f536f830 | |||
| 5ebf66748b | |||
| 781d056280 | |||
| 5aefd6ac31 | |||
| 6c663dfd5e | |||
| 33437bc6e7 | |||
| 23114d3364 | |||
| 997c8811d6 | |||
| e42389f9d7 | |||
| ff38f0a32c | |||
| a5cfbab3c8 | |||
| ac3cd6e83c | |||
| 082ab86f5f | |||
| 6aa196c8dc | |||
| a0dd7dcd49 | |||
| e977c11111 | |||
| 5f063a80bd | |||
| 5d8e1c9279 | |||
@@ -8,12 +8,12 @@ import zipfile
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))


 def print_top_10_largest_files(zip_file):
     """Print the top 10 largest files in the given zip file."""
-    with zipfile.ZipFile(zip_file, 'r') as z:
+    with zipfile.ZipFile(zip_file, "r") as z:
         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
         file_sizes.sort(key=lambda x: x[1], reverse=True)
         for f, size in file_sizes[:10]:
@@ -28,14 +28,18 @@ def check_wheel_size(directory):
                 wheel_path = os.path.join(root, file_name)
                 wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
                 if wheel_size_mb > VLLM_MAX_SIZE_MB:
-                    print(f"Not allowed: Wheel {wheel_path} is larger "
-                          f"({wheel_size_mb:.2f} MB) than the limit "
-                          f"({VLLM_MAX_SIZE_MB} MB).")
+                    print(
+                        f"Not allowed: Wheel {wheel_path} is larger "
+                        f"({wheel_size_mb:.2f} MB) than the limit "
+                        f"({VLLM_MAX_SIZE_MB} MB)."
+                    )
                     print_top_10_largest_files(wheel_path)
                     return 1
                 else:
-                    print(f"Wheel {wheel_path} is within the allowed size "
-                          f"({wheel_size_mb:.2f} MB).")
+                    print(
+                        f"Wheel {wheel_path} is within the allowed size "
+                        f"({wheel_size_mb:.2f} MB)."
+                    )
     return 0
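The hunk above is a formatting-only change to the wheel-size check; the limit still comes from the `VLLM_MAX_SIZE_MB` environment variable (default 400 MiB). A minimal sketch of the size arithmetic, using a made-up wheel size:

```python
import os

# Default mirrors the script above; override via the environment (e.g. VLLM_MAX_SIZE_MB=450).
limit_mb = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))

wheel_bytes = 385 * 1024 * 1024              # hypothetical wheel size in bytes
wheel_size_mb = wheel_bytes / (1024 * 1024)  # same MiB conversion as in the check above

# The check returns 1 (failure) when the wheel exceeds the limit, 0 otherwise.
print(f"{wheel_size_mb:.2f} MB vs limit {limit_mb} MB ->",
      "too large" if wheel_size_mb > limit_mb else "within the allowed size")
```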
@@ -22,5 +22,5 @@ with open("index.html", "w") as f:
     print(f"Generated index.html for {args.wheel}")
     # cloudfront requires escaping the '+' character
     f.write(
-        template.format(wheel=filename,
-                        wheel_html_escaped=filename.replace("+", "%2B")))
+        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
+    )
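For context on the `%2B` replacement: wheel filenames that carry a local version tag contain a literal `+`, which the CloudFront-served index needs escaped (per the comment above). A toy illustration with a hypothetical filename:

```python
filename = "vllm-0.9.0+cu124-cp312-cp312-manylinux1_x86_64.whl"  # hypothetical wheel name
wheel_html_escaped = filename.replace("+", "%2B")
print(wheel_html_escaped)  # vllm-0.9.0%2Bcu124-cp312-cp312-manylinux1_x86_64.whl
```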
@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
 model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tasks:

@@ -1,3 +1,4 @@
+# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
 model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
 tasks:

@@ -1,3 +1,4 @@
+# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 tasks:

@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
 tasks:
 - name: "gsm8k"

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
+model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.335
+  - name: "exact_match,flexible-extract"
+    value: 0.323
+limit: 1319
+num_fewshot: 5

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
 model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
 model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
 model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
 tasks:

@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
 model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
 tasks:
 - name: "gsm8k"
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
+model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.30
+  - name: "exact_match,flexible-extract"
+    value: 0.465
+limit: 1319
+num_fewshot: 5

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
 model_name: "Qwen/Qwen2-57B-A14B-Instruct"
 tasks:

@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
+model_name: "Qwen/Qwen2.5-1.5B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.54
+  - name: "exact_match,flexible-extract"
+    value: 0.59
+limit: 1319
+num_fewshot: 5

@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.47
+  - name: "exact_match,flexible-extract"
+    value: 0.64
+limit: 1319
+num_fewshot: 5

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
 model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
 tasks:
@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
+Meta-Llama-3-8B-QQQ.yaml
@@ -1,10 +1,6 @@
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base-FP8.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
.buildkite/lm-eval-harness/conftest.py (new file, 43 lines)
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: Apache-2.0
+from pathlib import Path
+
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--config-list-file",
+        action="store",
+        help="Path to the file listing model config YAMLs (one per line)",
+    )
+    parser.addoption(
+        "--tp-size",
+        action="store",
+        default="1",
+        help="Tensor parallel size to use for evaluation",
+    )
+
+
+@pytest.fixture(scope="session")
+def config_list_file(pytestconfig, config_dir):
+    rel_path = pytestconfig.getoption("--config-list-file")
+    return config_dir / rel_path
+
+
+@pytest.fixture(scope="session")
+def tp_size(pytestconfig):
+    return pytestconfig.getoption("--tp-size")
+
+
+def pytest_generate_tests(metafunc):
+    if "config_filename" in metafunc.fixturenames:
+        rel_path = metafunc.config.getoption("--config-list-file")
+        config_list_file = Path(rel_path).resolve()
+        config_dir = config_list_file.parent
+        with open(config_list_file, encoding="utf-8") as f:
+            configs = [
+                config_dir / line.strip()
+                for line in f
+                if line.strip() and not line.startswith("#")
+            ]
+        metafunc.parametrize("config_filename", configs)
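As a rough, standalone sketch of what the `pytest_generate_tests` hook above does (the list-file path here is hypothetical): every non-empty, non-comment line of the file passed via `--config-list-file` becomes one `config_filename` parameter, resolved relative to the list file's directory, while `--tp-size` is exposed through the `tp_size` fixture.

```python
from pathlib import Path

# Mimics the collection logic in conftest.py outside of pytest (path is hypothetical).
config_list_file = Path("configs/models-small.txt").resolve()
config_dir = config_list_file.parent

with open(config_list_file, encoding="utf-8") as f:
    configs = [
        config_dir / line.strip()
        for line in f
        if line.strip() and not line.startswith("#")
    ]

# Each resulting path becomes one parametrized test case for config_filename.
for cfg in configs:
    print(cfg)
```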
@@ -1,59 +0,0 @@
-#!/bin/bash
-
-usage() {
-    echo``
-    echo "Runs lm eval harness on GSM8k using vllm and compares to "
-    echo "precomputed baseline (measured by HF transformers.)"
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
-    echo "  -t    - tensor parallel size"
-    echo
-}
-
-SUCCESS=0
-
-while getopts "c:t:" OPT; do
-  case ${OPT} in
-    c )
-        CONFIG="$OPTARG"
-        ;;
-    t )
-        TP_SIZE="$OPTARG"
-        ;;
-    \? )
-        usage
-        exit 1
-        ;;
-  esac
-done
-
-# Parse list of configs.
-IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
-
-for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
-do
-    LOCAL_SUCCESS=0
-
-    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
-
-    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
-    export LM_EVAL_TP_SIZE=$TP_SIZE
-    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
-
-    if [[ $LOCAL_SUCCESS == 0 ]]; then
-        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
-    else
-        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
-    fi
-
-    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
-
-done
-
-if [ "${SUCCESS}" -eq "0" ]; then
-    exit 0
-else
-    exit 1
-fi
@@ -3,67 +3,52 @@
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml

* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
* export LM_EVAL_TP_SIZE=4
* pytest -s test_lm_eval_correctness.py
pytest -s -v test_lm_eval_correctness.py \
    --config-list-file=configs/models-small.txt \
    --tp-size=1
"""

import os
from pathlib import Path

import lm_eval
import numpy
import pytest
import numpy as np
import yaml

RTOL = 0.05
TEST_DATA_FILE = os.environ.get(
    "LM_EVAL_TEST_DATA_FILE",
    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
RTOL = 0.08


def launch_lm_eval(eval_config):
    trust_remote_code = eval_config.get('trust_remote_code', False)

    model_args = f"pretrained={eval_config['model_name']}," \
                 f"tensor_parallel_size={TP_SIZE}," \
                 f"add_bos_token=true," \
                 f"trust_remote_code={trust_remote_code}"

def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager=true,"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code}"
    )
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto")

        batch_size="auto",
    )
    return results


def test_lm_eval_correctness():
    eval_config = yaml.safe_load(
        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
def test_lm_eval_correctness_param(config_filename, tp_size):
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))

    if eval_config[
            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
        pytest.skip("FBGEMM is currently failing on main.")
    results = launch_lm_eval(eval_config, tp_size)

    # Launch eval requests.
    results = launch_lm_eval(eval_config)

    # Confirm scores match ground truth.
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured_value}')
            success = success and numpy.isclose(
                ground_truth, measured_value, rtol=RTOL)
            print(
                f"{task['name']} | {metric['name']}: "
                f"ground_truth={ground_truth} | measured={measured_value}"
            )
            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)

    # Assert at the end, print all scores even on failure for debugging.
    assert success
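The pass/fail criterion in the test above is a relative-tolerance comparison with `RTOL = 0.08`. A small worked example using the Qwen2.5-1.5B-Instruct ground truth from the configs above and hypothetical measured scores:

```python
import numpy as np

RTOL = 0.08          # tolerance used by the updated test above
ground_truth = 0.54  # exact_match,strict-match value for Qwen/Qwen2.5-1.5B-Instruct

for measured_value in (0.52, 0.45):  # hypothetical measurements
    # np.isclose checks |ground_truth - measured_value| <= atol + RTOL * |measured_value|
    ok = np.isclose(ground_truth, measured_value, rtol=RTOL)
    print(measured_value, "passes" if ok else "fails")
# 0.52 passes (diff 0.02 <= 0.0416); 0.45 fails (diff 0.09 > 0.036)
```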
@@ -65,18 +65,18 @@ def read_markdown(file):


 def results_to_json(latency, throughput, serving):
-    return json.dumps({
-        'latency': latency.to_dict(),
-        'throughput': throughput.to_dict(),
-        'serving': serving.to_dict()
-    })
+    return json.dumps(
+        {
+            "latency": latency.to_dict(),
+            "throughput": throughput.to_dict(),
+            "serving": serving.to_dict(),
+        }
+    )


 if __name__ == "__main__":

     # collect results
     for test_file in results_folder.glob("*.json"):

         with open(test_file) as f:
             raw_result = json.loads(f.read())
@@ -120,7 +120,8 @@ if __name__ == "__main__":
             for perc in [10, 25, 50, 75, 90, 99]:
                 # Multiply 1000 to convert the time unit from s to ms
                 raw_result.update(
-                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
+                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
+                )
             raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

             # add the result to raw_result
@@ -153,26 +154,27 @@ if __name__ == "__main__":
     serving_results = pd.DataFrame.from_dict(serving_results)
     throughput_results = pd.DataFrame.from_dict(throughput_results)

-    raw_results_json = results_to_json(latency_results, throughput_results,
-                                       serving_results)
+    raw_results_json = results_to_json(
+        latency_results, throughput_results, serving_results
+    )

     # remapping the key, for visualization purpose
     if not latency_results.empty:
-        latency_results = latency_results[list(
-            latency_column_mapping.keys())].rename(
-                columns=latency_column_mapping)
+        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
+            columns=latency_column_mapping
+        )
     if not serving_results.empty:
-        serving_results = serving_results[list(
-            serving_column_mapping.keys())].rename(
-                columns=serving_column_mapping)
+        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+            columns=serving_column_mapping
+        )
     if not throughput_results.empty:
-        throughput_results = throughput_results[list(
-            throughput_results_column_mapping.keys())].rename(
-                columns=throughput_results_column_mapping)
+        throughput_results = throughput_results[
+            list(throughput_results_column_mapping.keys())
+        ].rename(columns=throughput_results_column_mapping)

-    processed_results_json = results_to_json(latency_results,
-                                             throughput_results,
-                                             serving_results)
+    processed_results_json = results_to_json(
+        latency_results, throughput_results, serving_results
+    )

     for df in [latency_results, serving_results, throughput_results]:
         if df.empty:
@ -184,38 +186,39 @@ if __name__ == "__main__":
 | 
			
		||||
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
 | 
			
		||||
        # we want to turn it into "8xGPUTYPE"
 | 
			
		||||
        df["GPU"] = df["GPU"].apply(
 | 
			
		||||
            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
 | 
			
		||||
            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    # get markdown tables
 | 
			
		||||
    latency_md_table = tabulate(latency_results,
 | 
			
		||||
                                headers='keys',
 | 
			
		||||
                                tablefmt='pipe',
 | 
			
		||||
                                showindex=False)
 | 
			
		||||
    serving_md_table = tabulate(serving_results,
 | 
			
		||||
                                headers='keys',
 | 
			
		||||
                                tablefmt='pipe',
 | 
			
		||||
                                showindex=False)
 | 
			
		||||
    throughput_md_table = tabulate(throughput_results,
 | 
			
		||||
                                   headers='keys',
 | 
			
		||||
                                   tablefmt='pipe',
 | 
			
		||||
                                   showindex=False)
 | 
			
		||||
    latency_md_table = tabulate(
 | 
			
		||||
        latency_results, headers="keys", tablefmt="pipe", showindex=False
 | 
			
		||||
    )
 | 
			
		||||
    serving_md_table = tabulate(
 | 
			
		||||
        serving_results, headers="keys", tablefmt="pipe", showindex=False
 | 
			
		||||
    )
 | 
			
		||||
    throughput_md_table = tabulate(
 | 
			
		||||
        throughput_results, headers="keys", tablefmt="pipe", showindex=False
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    # document the result
 | 
			
		||||
    with open(results_folder / "benchmark_results.md", "w") as f:
 | 
			
		||||
 | 
			
		||||
        results = read_markdown("../.buildkite/nightly-benchmarks/" +
 | 
			
		||||
                                "performance-benchmarks-descriptions.md")
 | 
			
		||||
        results = read_markdown(
 | 
			
		||||
            "../.buildkite/nightly-benchmarks/"
 | 
			
		||||
            + "performance-benchmarks-descriptions.md"
 | 
			
		||||
        )
 | 
			
		||||
        results = results.format(
 | 
			
		||||
            latency_tests_markdown_table=latency_md_table,
 | 
			
		||||
            throughput_tests_markdown_table=throughput_md_table,
 | 
			
		||||
            serving_tests_markdown_table=serving_md_table,
 | 
			
		||||
            benchmarking_results_in_json_string=processed_results_json)
 | 
			
		||||
            benchmarking_results_in_json_string=processed_results_json,
 | 
			
		||||
        )
 | 
			
		||||
        f.write(results)
 | 
			
		||||
 | 
			
		||||
    # document benchmarking results in json
 | 
			
		||||
    with open(results_folder / "benchmark_results.json", "w") as f:
 | 
			
		||||
 | 
			
		||||
        results = latency_results.to_dict(
 | 
			
		||||
            orient='records') + throughput_results.to_dict(
 | 
			
		||||
                orient='records') + serving_results.to_dict(orient='records')
 | 
			
		||||
        results = (
 | 
			
		||||
            latency_results.to_dict(orient="records")
 | 
			
		||||
            + throughput_results.to_dict(orient="records")
 | 
			
		||||
            + serving_results.to_dict(orient="records")
 | 
			
		||||
        )
 | 
			
		||||
        f.write(json.dumps(results))
 | 
			
		||||
 | 
			
		||||
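The summary script above keeps only the mapped columns, renames them for display, and renders pipe-format markdown with tabulate. A minimal, self-contained sketch of that pattern (the DataFrame contents and column names here are made up for illustration, not vLLM's real ones):

import pandas as pd
from tabulate import tabulate

column_mapping = {"test_name": "Test name", "p99_latency": "P99 latency (ms)"}
df = pd.DataFrame({"test_name": ["llama8B_tp1"], "p99_latency": [123.4], "extra": [0]})

if not df.empty:
    # keep only the mapped columns, then rename them for display
    df = df[list(column_mapping.keys())].rename(columns=column_mapping)

md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
print(md_table)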
@@ -14,15 +14,12 @@ def main(model, cachedir):

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer")
    parser.add_argument("--model",
                        type=str,
                        required=True,
                        help="Name of the model")
    parser.add_argument("--cachedir",
                        type=str,
                        required=True,
                        help="Directory to save the tokenizer")
        description="Download and save Hugging Face tokenizer"
    )
    parser.add_argument("--model", type=str, required=True, help="Name of the model")
    parser.add_argument(
        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
    )

    args = parser.parse_args()
    main(args.model, args.cachedir)

@@ -11,33 +11,33 @@ from tabulate import tabulate

def parse_arguments():
    parser = argparse.ArgumentParser(
        description=
        'Parse command line arguments for summary-nightly-results script.')
    parser.add_argument('--results-folder',
                        type=str,
                        required=True,
                        help='The folder where the results are stored.')
    parser.add_argument('--description',
                        type=str,
                        required=True,
                        help='Description of the results.')
        description="Parse command line arguments for summary-nightly-results script."
    )
    parser.add_argument(
        "--results-folder",
        type=str,
        required=True,
        help="The folder where the results are stored.",
    )
    parser.add_argument(
        "--description", type=str, required=True, help="Description of the results."
    )

    args = parser.parse_args()
    return args


def get_perf(df, method, model, metric):

    means = []

    for qps in [2, 4, 8, 16, "inf"]:
        target = df['Test name'].str.contains(model)
        target = target & df['Engine'].str.contains(method)
        target = target & df['Test name'].str.contains("qps_" + str(qps))
        target = df["Test name"].str.contains(model)
        target = target & df["Engine"].str.contains(method)
        target = target & df["Test name"].str.contains("qps_" + str(qps))
        filtered_df = df[target]

        if filtered_df.empty:
            means.append(0.)
            means.append(0.0)
        else:
            means.append(filtered_df[metric].values[0])

@@ -45,7 +45,6 @@ def get_perf(df, method, model, metric):


def get_perf_w_std(df, method, model, metric):

    if metric in ["TTFT", "ITL"]:
        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
        mean = mean.tolist()
@@ -60,7 +59,8 @@ def get_perf_w_std(df, method, model, metric):
    else:
        assert metric == "Tput"
        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
            df, method, model, "Output Tput (tok/s)")
            df, method, model, "Output Tput (tok/s)"
        )
        mean = mean.tolist()
        std = None

@@ -80,18 +80,17 @@ def main(args):
    # generate markdown table
    df = pd.DataFrame.from_dict(results)

    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)

    with open(args.description) as f:
        description = f.read()

    description = description.format(
        nightly_results_benchmarking_table=md_table)
    description = description.format(nightly_results_benchmarking_table=md_table)

    with open("nightly_results.md", "w") as f:
        f.write(description)


if __name__ == '__main__':
if __name__ == "__main__":
    args = parse_arguments()
    main(args)

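get_perf builds one boolean mask per (model, engine, qps) combination by chaining str.contains filters, and falls back to 0.0 when a combination produced no result. Restated below as a runnable sketch under made-up data; the row contents and the returned list are illustrative assumptions, since the tail of the real function is outside this hunk:

import pandas as pd

df = pd.DataFrame({
    "Test name": ["llama8B_qps_2", "llama8B_qps_inf"],
    "Engine": ["vllm", "vllm"],
    "Tput (req/s)": [10.5, 42.0],
})

def get_perf(df, method, model, metric):
    means = []
    for qps in [2, 4, 8, 16, "inf"]:
        target = df["Test name"].str.contains(model)
        target = target & df["Engine"].str.contains(method)
        target = target & df["Test name"].str.contains("qps_" + str(qps))
        filtered = df[target]
        # 0.0 when this (model, engine, qps) combination produced no result
        means.append(0.0 if filtered.empty else filtered[metric].values[0])
    return means

print(get_perf(df, "vllm", "llama8B", "Tput (req/s)"))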
@@ -10,15 +10,24 @@ set -x
set -o pipefail

check_gpus() {
  # check the number of GPUs and GPU type.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if command -v nvidia-smi; then
    # check the number of GPUs and GPU type.
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  elif command -v amd-smi; then
    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
  fi

  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  if command -v nvidia-smi; then
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
  fi
  echo "GPU type is $gpu_type"
}

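The updated check_gpus() probes for nvidia-smi first and falls back to amd-smi, so the same benchmark entry point works on both vendors. A Python sketch of the same vendor detection, assuming one of the two CLIs is on PATH (this helper is hypothetical, not part of vLLM):

import shutil
import subprocess

def detect_gpu_count():
    if shutil.which("nvidia-smi"):
        out = subprocess.run(["nvidia-smi", "--list-gpus"],
                             capture_output=True, text=True, check=True).stdout
        return len(out.splitlines())
    if shutil.which("amd-smi"):
        out = subprocess.run(["amd-smi", "list"],
                             capture_output=True, text=True, check=True).stdout
        return sum("GPU" in line for line in out.splitlines())
    raise RuntimeError("Need at least 1 GPU to run benchmarking.")

print(detect_gpu_count())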
@@ -90,9 +99,15 @@ kill_gpu_processes() {


  # wait until GPU memory usage smaller than 1GB
  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
    sleep 1
  done
  if command -v nvidia-smi; then
    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
      sleep 1
    done
  elif command -v amd-smi; then
    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
      sleep 1
    done
  fi

  # remove vllm config file
  rm -rf ~/.config/vllm

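A small Python sketch of the "wait until used GPU memory drops below 1 GB" loop on the NVIDIA path, polling nvidia-smi's CSV output once a second; the amd-smi branch above does the same thing with its USED_VRAM metric:

import subprocess
import time

def wait_for_gpu_memory_to_clear(threshold_mib: int = 1000) -> None:
    while True:
        out = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"],
            capture_output=True, text=True, check=True,
        ).stdout
        used = int(out.splitlines()[0])  # first GPU, reported in MiB
        if used < threshold_mib:
            return
        time.sleep(1)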
@@ -34,10 +34,8 @@ serving_column_mapping = {
}

if __name__ == "__main__":

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file) as f:
            raw_result = json.loads(f.read())

@@ -56,17 +54,16 @@ if __name__ == "__main__":
    serving_results = pd.DataFrame.from_dict(serving_results)

    if not serving_results.empty:
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
            columns=serving_column_mapping
        )

    serving_md_table_with_headers = tabulate(serving_results,
                                             headers='keys',
                                             tablefmt='pipe',
                                             showindex=False)
    serving_md_table_with_headers = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    # remove the first line of header
    serving_md_table_lines = serving_md_table_with_headers.split('\n')
    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
    serving_md_table_lines = serving_md_table_with_headers.split("\n")
    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])

    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
@@ -76,10 +73,9 @@ if __name__ == "__main__":
        # document results with header.
        # for those who wants to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
        f.write('\n')
        f.write("\n")

    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:

        results = serving_results.to_dict(orient='records')
        results = serving_results.to_dict(orient="records")
        f.write(json.dumps(results))

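The nightly summary also keeps a header-less copy of the markdown table by dropping the first two lines of tabulate's pipe output (the header row and the separator row), presumably so per-engine tables can be stitched together under one shared header. A minimal sketch of that step with an illustrative row:

from tabulate import tabulate

rows = [{"Engine": "vllm", "Tput (req/s)": 42.0}]
table_with_headers = tabulate(rows, headers="keys", tablefmt="pipe", showindex=False)

lines = table_with_headers.split("\n")
table_without_header = "\n".join(lines[2:])  # drop header and separator rows
print(table_without_header)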
@@ -64,9 +64,11 @@
            "disable_log_requests": "",
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
            "num_speculative_tokens": 4,
            "speculative_draft_tensor_parallel_size": 1
            "speculative_config": {
                "model": "turboderp/Qwama-0.5B-Instruct",
                "num_speculative_tokens": 4,
                "draft_tensor_parallel_size": 1
            }
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
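This hunk changes only the shape of the benchmark config: the flat speculative_* server parameters become a nested "speculative_config" block, with the key names shown in the diff. A hypothetical helper (not part of vLLM) sketching that translation for an old-style parameter dict:

def migrate_speculative_params(params: dict) -> dict:
    # Move flat speculative_* keys into a nested speculative_config block.
    params = dict(params)
    spec_model = params.pop("speculative_model", None)
    if spec_model is not None:
        params["speculative_config"] = {
            "model": spec_model,
            "num_speculative_tokens": params.pop("num_speculative_tokens", None),
            "draft_tensor_parallel_size": params.pop(
                "speculative_draft_tensor_parallel_size", None),
        }
    return params

old = {
    "tensor_parallel_size": 4,
    "speculative_model": "turboderp/Qwama-0.5B-Instruct",
    "num_speculative_tokens": 4,
    "speculative_draft_tensor_parallel_size": 1,
}
print(migrate_speculative_params(old))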

.buildkite/pyproject.toml (new file, 46 lines)
@@ -0,0 +1,46 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed

[tool.ruff]
line-length = 88

[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]

[tool.ruff.lint]
select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
    "SIM",
    # isort
    "I",
    # flake8-logging-format
    "G",
]
ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
]

[tool.ruff.format]
docstring-code-format = true
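A minimal sketch of applying this local config, assuming ruff is installed; ruff discovers the nearest pyproject.toml for the files it processes, so running it from .buildkite/ picks up these rules rather than the repository-wide ones:

import subprocess

# Lint with autofix, then apply the formatter, scoped to the .buildkite directory.
subprocess.run(["ruff", "check", "--fix", "."], cwd=".buildkite", check=False)
subprocess.run(["ruff", "format", "."], cwd=".buildkite", check=False)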
@@ -1,23 +1,23 @@
steps:
  - label: "Build wheel - CUDA 12.4"
  - label: "Build wheel - CUDA 12.8"
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/upload-wheels.sh"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

  - label: "Build wheel - CUDA 12.1"
  - label: "Build wheel - CUDA 12.6"
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/upload-wheels.sh"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

@@ -31,10 +31,10 @@ steps:
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/upload-wheels.sh"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

@@ -48,7 +48,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  - label: "Build and publish TPU release image"
@@ -57,12 +57,14 @@ steps:
    agents:
      queue: tpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
      - "yes | docker system prune -a"
      - "git fetch --all"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
    plugins:
      - docker-login#v3.0.0:
          username: vllm
          username: vllmbot
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"
@@ -82,7 +84,22 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"

  - block: "Build Neuron release image"
    key: block-neuron-release-image-build
    depends_on: ~

  - label: "Build and publish Neuron release image"
    depends_on: block-neuron-release-image-build
    agents:
      queue: neuron-postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"

@@ -1,40 +0,0 @@
#!/bin/bash

set -e

# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .

# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
    && python3 -m pip install pytest \
    && python3 -m pip install lm_eval[api]==0.4.4 \
    && export VLLM_USE_V1=1 \
    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
    && echo TEST_1 \
    && pytest /workspace/vllm/tests/tpu/test_compilation.py \
    && echo TEST_2 \
    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
    && echo TEST_3 \
    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
    && echo TEST_4 \
    && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
    && echo TEST_5 \
    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
    && echo TEST_6 \
    && pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py" \


# TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

@@ -3,6 +3,9 @@
# This script runs test inside the corresponding ROCm docker container.
set -o pipefail

# Export Python path
export PYTHONPATH=".."

# Print ROCm version
echo "--- Confirming Clean Initial State"
while true; do
@@ -74,50 +77,102 @@ HF_MOUNT="/root/.cache/huggingface"

commands=$@
echo "Commands:$commands"

if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
fi

if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi

if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
fi

if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi

#ignore certain kernels tests
if [[ $commands == *" kernels "* ]]; then
if [[ $commands == *" kernels/core"* ]]; then
  commands="${commands} \
  --ignore=kernels/test_attention_selector.py \
  --ignore=kernels/test_blocksparse_attention.py \
  --ignore=kernels/test_causal_conv1d.py \
  --ignore=kernels/test_cutlass.py \
  --ignore=kernels/test_encoder_decoder_attn.py \
  --ignore=kernels/test_flash_attn.py \
  --ignore=kernels/test_flashinfer.py \
  --ignore=kernels/test_int8_quant.py \
  --ignore=kernels/test_machete_gemm.py \
  --ignore=kernels/test_mamba_ssm.py \
  --ignore=kernels/test_marlin_gemm.py \
  --ignore=kernels/test_moe.py \
  --ignore=kernels/test_prefix_prefill.py \
  --ignore=kernels/test_rand.py \
  --ignore=kernels/test_sampler.py \
  --ignore=kernels/test_cascade_flash_attn.py \
  --ignore=kernels/test_mamba_mixer2.py \
  --ignore=kernels/test_aqlm.py \
  --ignore=kernels/test_machete_mm.py \
  --ignore=kernels/test_mha_attn.py \
  --ignore=kernels/test_block_fp8.py \
  --ignore=kernels/test_permute_cols.py"
  --ignore=kernels/core/test_fused_quant_layernorm.py \
  --ignore=kernels/core/test_permute_cols.py"
fi

if [[ $commands == *" kernels/attention"* ]]; then
  commands="${commands} \
  --ignore=kernels/attention/stest_attention_selector.py \
  --ignore=kernels/attention/test_blocksparse_attention.py \
  --ignore=kernels/attention/test_encoder_decoder_attn.py \
  --ignore=kernels/attention/test_attention_selector.py \
  --ignore=kernels/attention/test_flash_attn.py \
  --ignore=kernels/attention/test_flashinfer.py \
  --ignore=kernels/attention/test_prefix_prefill.py \
  --ignore=kernels/attention/test_cascade_flash_attn.py \
  --ignore=kernels/attention/test_mha_attn.py \
  --ignore=kernels/attention/test_lightning_attn.py \
  --ignore=kernels/attention/test_attention.py"
fi

if [[ $commands == *" kernels/quantization"* ]]; then
  commands="${commands} \
  --ignore=kernels/quantization/test_int8_quant.py \
  --ignore=kernels/quantization/test_aqlm.py \
  --ignore=kernels/quantization/test_machete_mm.py \
  --ignore=kernels/quantization/test_block_fp8.py \
  --ignore=kernels/quantization/test_block_int8.py \
  --ignore=kernels/quantization/test_marlin_gemm.py \
  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
  --ignore=kernels/quantization/test_int8_kernel.py"
fi

if [[ $commands == *" kernels/mamba"* ]]; then
  commands="${commands} \
  --ignore=kernels/mamba/test_mamba_mixer2.py \
  --ignore=kernels/mamba/test_causal_conv1d.py \
  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi

if [[ $commands == *" kernels/moe"* ]]; then
  commands="${commands} \
  --ignore=kernels/moe/test_moe.py \
  --ignore=kernels/moe/test_cutlass_moe.py \
  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi

#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
  --ignore=entrypoints/openai/test_audio.py \
  --ignore=entrypoints/openai/test_chat.py \
  --ignore=entrypoints/openai/test_shutdown.py \
  --ignore=entrypoints/openai/test_completion.py \
  --ignore=entrypoints/openai/test_sleep.py \
  --ignore=entrypoints/openai/test_models.py \
  --ignore=entrypoints/openai/test_lora_adapters.py \
  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
  --ignore=entrypoints/openai/test_root_path.py \
  --ignore=entrypoints/openai/test_tokenization.py \
  --ignore=entrypoints/openai/test_prompt_validation.py "}
fi

#ignore certain Entrypoints/llm tests
if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
if [[ $commands == *" entrypoints/llm "* ]]; then
  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
  --ignore=entrypoints/llm/test_chat.py \
  --ignore=entrypoints/llm/test_accuracy.py \
  --ignore=entrypoints/llm/test_init.py \
  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
  --ignore=entrypoints/llm/test_prompt_validation.py "}
fi

#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi

# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
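For illustration only, a rough Python equivalent of the rewriting above: when a test-suite path shows up in the incoming command string (with the leading space the bash glob requires), the ROCm-specific --ignore flags for that suite are appended before the command runs. The mapping below carries just one suite as an example; the real script hard-codes several.

ROCM_IGNORES = {
    " kernels/mamba": [
        "kernels/mamba/test_mamba_mixer2.py",
        "kernels/mamba/test_causal_conv1d.py",
        "kernels/mamba/test_mamba_ssm_ssd.py",
    ],
}

def patch_commands(commands: str) -> str:
    for marker, ignores in ROCM_IGNORES.items():
        if marker in commands:
            commands += " " + " ".join(f"--ignore={path}" for path in ignores)
    return commands

print(patch_commands("pytest -v -s kernels/mamba"))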
@@ -126,6 +181,8 @@ fi


PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."

# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
  # assign job count as the number of shards used
@@ -134,9 +191,10 @@ if [[ $commands == *"--shard-id="* ]]; then
    # assign shard-id for each shard
    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
    echo "Shard ${GPU} commands:$commands_gpu"
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    docker run \
        --device /dev/kfd --device /dev/dri \
        --network host \
        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
        --network=host \
        --shm-size=16gb \
        --rm \
        -e HIP_VISIBLE_DEVICES="${GPU}" \
@@ -145,6 +203,7 @@ if [[ $commands == *"--shard-id="* ]]; then
        -e AWS_SECRET_ACCESS_KEY \
        -v "${HF_CACHE}:${HF_MOUNT}" \
        -e "HF_HOME=${HF_MOUNT}" \
        -e "PYTHONPATH=${MYPYTHONPATH}" \
        --name "${container_name}_${GPU}" \
        "${image_name}" \
        /bin/bash -c "${commands_gpu}" \
@@ -163,9 +222,10 @@ if [[ $commands == *"--shard-id="* ]]; then
    fi
  done
else
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
          --device /dev/kfd --device /dev/dri \
          --network host \
          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
          --network=host \
          --shm-size=16gb \
          --rm \
          -e HIP_VISIBLE_DEVICES=0 \
@@ -174,6 +234,7 @@ else
          -e AWS_SECRET_ACCESS_KEY \
          -v "${HF_CACHE}:${HF_MOUNT}" \
          -e "HF_HOME=${HF_MOUNT}" \
          -e "PYTHONPATH=${MYPYTHONPATH}" \
          --name "${container_name}" \
          "${image_name}" \
          /bin/bash -c "${commands}"

.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh (new executable file, 48 lines)
@@ -0,0 +1,48 @@
#!/bin/bash

# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Setup cleanup
remove_docker_container() {
  if [[ -n "$container_id" ]]; then
      podman rm -f "$container_id" || true
  fi
  podman system prune -f
}
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .

# Run the image
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)

function cpu_tests() {

  # offline inference
  podman exec -it "$container_id" bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run basic model test
  podman exec -it "$container_id" bash -c "
    set -e
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator
    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
}

# All of CPU tests are expected to be finished less than 40 mins.

export container_id
export -f cpu_tests
timeout 40m bash -c cpu_tests

@@ -10,5 +10,4 @@ trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .

docker build -t cpu-test -f docker/Dockerfile.s390x .
@@ -8,15 +8,19 @@ set -ex
CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}

# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .

# Setup cleanup
remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
remove_docker_container() {
    set -e;
    docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
    docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
 --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
@@ -36,8 +40,6 @@ function cpu_tests() {
  # Run basic model test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pip install -r vllm/requirements/test.txt
    pip install -r vllm/requirements/cpu.txt
    pytest -v -s tests/kernels/test_cache.py -m cpu_model
    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
    pytest -v -s tests/models/decoder_only/language -m cpu_model
@@ -9,6 +9,7 @@ python3 use_existing_torch.py

# Try building the docker image
DOCKER_BUILDKIT=1 docker build . \
  --file docker/Dockerfile \
  --target vllm-openai \
  --platform "linux/arm64" \
  -t gh200-test \
@@ -5,20 +5,22 @@
set -ex

# Try building the docker image
docker build -t hpu-test-env -f Dockerfile.hpu .
docker build -t hpu-test-env -f docker/Dockerfile.hpu .

# Setup cleanup
# certain versions of HPU software stack have a bug that can
# override the exit code of the script, so we need to use
# separate remove_docker_container and remove_docker_container_and_exit
# separate remove_docker_containers and remove_docker_containers_and_exit
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE=1
remove_docker_container() { docker rm -f hpu-test || true; }
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
trap remove_docker_container_and_exit EXIT
remove_docker_container
remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; }
remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; }
trap remove_docker_containers_and_exit EXIT
remove_docker_containers

# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2

EXITCODE=$?
@@ -11,13 +11,14 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)

NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws

# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
@@ -35,7 +36,7 @@ else
    date "+%s" > /tmp/neuron-docker-build-timestamp
fi

docker build -t "${image_name}" -f Dockerfile.neuron .
docker build -t "${image_name}" -f docker/Dockerfile.neuron .

# Setup cleanup
remove_docker_container() {
@@ -47,8 +48,16 @@ trap remove_docker_container EXIT
docker run --rm -it --device=/dev/neuron0 --network bridge \
       -v "${HF_CACHE}:${HF_MOUNT}" \
       -e "HF_HOME=${HF_MOUNT}" \
       -e "HF_TOKEN=${HF_TOKEN}" \
       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
       --name "${container_name}" \
       ${image_name} \
       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
       /bin/bash -c "
            python3 /workspace/vllm/examples/offline_inference/neuron.py;
            python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
            for f in /workspace/vllm/tests/neuron/2_core/*.py; do
                echo 'Running test file: '$f;
                python3 -m pytest \$f -v --capture=tee-sys;
            done
       "

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh (new executable file, 103 lines)
@@ -0,0 +1,103 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -xu
 | 
			
		||||
 | 
			
		||||
# Build the docker image.
 | 
			
		||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 | 
			
		||||
 | 
			
		||||
# Set up cleanup.
 | 
			
		||||
remove_docker_container() { docker rm -f tpu-test || true; }
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
# Remove the container that might not be cleaned up in the previous run.
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# For HF_TOKEN.
 | 
			
		||||
source /etc/environment
 | 
			
		||||
# Run a simple end-to-end example.
 | 
			
		||||
docker run --privileged --net host --shm-size=16G -it \
 | 
			
		||||
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
 | 
			
		||||
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
 | 
			
		||||
    && python3 -m pip install pytest pytest-asyncio tpu-info \
 | 
			
		||||
    && python3 -m pip install lm_eval[api]==0.4.4 \
 | 
			
		||||
    && export VLLM_XLA_CACHE_PATH= \
 | 
			
		||||
    && export VLLM_USE_V1=1 \
 | 
			
		||||
    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
 | 
			
		||||
    && echo HARDWARE \
 | 
			
		||||
    && tpu-info \
 | 
			
		||||
    && { \
 | 
			
		||||
        echo TEST_0: Running test_perf.py; \
 | 
			
		||||
        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
 | 
			
		||||
        echo TEST_0_EXIT_CODE: \$?; \
 | 
			
		||||
    } & \
 | 
			
		||||
    { \
 | 
			
		||||
        echo TEST_1: Running test_compilation.py; \
 | 
			
		||||
        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
 | 
			
		||||
        echo TEST_1_EXIT_CODE: \$?; \
 | 
			
		||||
    } & \
 | 
			
		||||
    { \
 | 
			
		||||
        echo TEST_2: Running test_basic.py; \
 | 
			
		||||
        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
 | 
			
		||||
        echo TEST_2_EXIT_CODE: \$?; \
 | 
			
		||||
    } & \
 | 
			
		||||
    { \
 | 
			
		||||
        echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
 | 
			
		||||
        python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
 | 
			
		||||
        echo TEST_3_EXIT_CODE: \$?; \
 | 
			
		||||
    } & \
 | 
			
		||||
    { \
 | 
			
		||||
        echo TEST_4: Running test_quantization_accuracy.py; \
 | 
			
		||||
        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
 | 
			
		||||
        echo TEST_4_EXIT_CODE: \$?; \
 | 
			
		||||
    } & \
 | 
			
		||||
    { \
 | 
			
		||||
        echo TEST_5: Running examples/offline_inference/tpu.py; \
 | 
			
		||||
        python3 /workspace/vllm/examples/offline_inference/tpu.py; \
 | 
			
		||||
        echo TEST_5_EXIT_CODE: \$?; \
 | 
			
		||||
    } & \
 | 
			
		||||
    { \
 | 
			
		||||
        echo TEST_6: Running test_tpu_model_runner.py; \
 | 
			
		||||
        python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
 | 
			
		||||
        echo TEST_6_EXIT_CODE: \$?; \
 | 
			
		||||
    } & \
 | 
			
		||||
    { \
 | 
			
		||||
        echo TEST_7: Running test_sampler.py; \
 | 
			
		||||
        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
 | 
			
		||||
        echo TEST_7_EXIT_CODE: \$?; \
 | 
			
		||||
    } & \
 | 
			
		||||
    { \
 | 
			
		||||
        echo TEST_8: Running test_topk_topp_sampler.py; \
 | 
			
		||||
        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
 | 
			
		||||
        echo TEST_8_EXIT_CODE: \$?; \
 | 
			
		||||
    } & \
 | 
			
		||||
    { \
 | 
			
		||||
        echo TEST_9: Running test_multimodal.py; \
 | 
			
		||||
        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
 | 
			
		||||
        echo TEST_9_EXIT_CODE: \$?; \
 | 
			
		||||
    } & \
 | 
			
		||||
    { \
 | 
			
		||||
        echo TEST_10: Running test_pallas.py; \
 | 
			
		||||
        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
 | 
			
		||||
        echo TEST_10_EXIT_CODE: \$?; \
 | 
			
		||||
    } & \
 | 
			
		||||
    { \
 | 
			
		||||
        echo TEST_11: Running test_struct_output_generate.py; \
 | 
			
		||||
        python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
 | 
			
		||||
        echo TEST_11_EXIT_CODE: \$?; \
 | 
			
		||||
    } & \
 | 
			
		||||
    { \
 | 
			
		||||
        echo TEST_12: Running test_moe_pallas.py; \
 | 
			
		||||
        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
 | 
			
		||||
        echo TEST_12_EXIT_CODE: \$?; \
 | 
			
		||||
    } & \
 | 
			
		||||
    # Disable the TPU LoRA tests until the feature is activated
 | 
			
		||||
    # & { \
 | 
			
		||||
    #     echo TEST_13: Running test_moe_pallas.py; \
 | 
			
		||||
    #     python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
 | 
			
		||||
    #     echo TEST_13_EXIT_CODE: \$?; \
 | 
			
		||||
    # } & \
 | 
			
		||||
    wait \
 | 
			
		||||
    && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
 | 
			
		||||
"
 | 
			
		||||
 | 
			
		||||
# TODO: This test fails because it uses RANDOM_SEED sampling
 | 
			
		||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
 | 
			
		||||
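The new TPU v1 script runs each test suite as its own background job, prints a per-suite exit code, and waits for all of them rather than chaining everything with &&. A Python sketch of the same pattern, launching suites concurrently and recording exit codes individually; the paths are illustrative, and the real script runs these inside the vllm-tpu container:

import subprocess
from concurrent.futures import ThreadPoolExecutor

TESTS = {
    "TEST_1": ["python3", "-m", "pytest", "-s", "-v", "tests/tpu/test_compilation.py"],
    "TEST_2": ["python3", "-m", "pytest", "-s", "-v", "tests/v1/tpu/test_basic.py"],
}

def run(name, cmd):
    # Each suite runs to completion; its exit code is reported but does not stop the rest.
    code = subprocess.run(cmd).returncode
    print(f"{name}_EXIT_CODE: {code}")
    return code

with ThreadPoolExecutor() as pool:
    futures = [pool.submit(run, name, cmd) for name, cmd in TESTS.items()]
    exit_codes = [f.result() for f in futures]

print("All tests have attempted to run. Check logs for individual test statuses and exit codes.")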
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

# Try building the docker image
docker build -t ${image_name} -f Dockerfile.xpu .
docker build -t ${image_name} -f docker/Dockerfile.xpu .

# Setup cleanup
remove_docker_container() {
@@ -5,8 +5,8 @@
set -ex
set -o pipefail

# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."
# cd 2 levels into the working directory
cd "$(dirname "${BASH_SOURCE[0]}")/../.."

(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

@@ -3,7 +3,7 @@
set -euox pipefail

if [[ $# -lt 4 ]]; then
    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
    exit 1
fi

@@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then
    # if $normal_wheel matches cu121, do not upload the index.html
    echo "Skipping index files for cu121 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
else
    # only upload index.html for cu124 wheels (default wheels)
    # only upload index.html for cu128 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi
@@ -66,12 +66,13 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then
    # if $normal_wheel matches cu121, do not upload the index.html
    echo "Skipping index files for cu121 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
else
    # only upload index.html for cu124 wheels (default wheels)
    # only upload index.html for cu128 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi

aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
@ -8,6 +8,7 @@
 | 
			
		||||
# Documentation
 | 
			
		||||
# label(str): the name of the test. emoji allowed.
 | 
			
		||||
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 | 
			
		||||
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
 | 
			
		||||
# fast_check_only(bool): run this test on fastcheck pipeline only
 | 
			
		||||
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
 | 
			
		||||
# command(str): the single command to run for tests. incompatible with commands.
 | 
			
		||||
@@ -31,16 +32,17 @@ steps:
##### fast check tests  #####

- label: Documentation Build # 2min
  working_dir: "/vllm-workspace/test_docs/docs"
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/test_docs"
  fast_check: true
  no_gpu: True
  commands:
  - pip install -r ../../requirements/docs.txt
  - SPHINXOPTS=\"-W\" make html
  # Check API reference (if it fails, you may have missing mock imports)
  - grep \"sig sig-object py\" build/html/api/inference_params.html
  - pip install -r ../requirements/docs.txt
  # TODO: add `--strict` once warnings in docstrings are fixed
  - mkdocs build

- label: Async Engine, Inputs, Utils, Worker Test # 24min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/mq_llm_engine
@@ -56,11 +58,13 @@ steps:
  - pytest -v -s async_engine # AsyncLLMEngine
  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s multimodal
  - pytest -v -s test_utils.py # Utils
  - pytest -v -s worker # Worker

- label: Python-only Installation Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - tests/standalone_tests/python_only_compile.sh
  - setup.py
@@ -68,8 +72,9 @@ steps:
  - bash standalone_tests/python_only_compile.sh

- label: Basic Correctness Test # 30min
  #mirror_hardwares: [amd]
  mirror_hardwares: [amdexperimental, amdproduction]
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_basic_correctness
@@ -84,6 +89,7 @@ steps:
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Chunked Prefill Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_chunked_prefill
@@ -92,7 +98,7 @@ steps:
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py

- label: Core Test # 10min
  mirror_hardwares: [amd]
  mirror_hardwares: [amdexperimental, amdproduction]
  fast_check: true
  source_file_dependencies:
  - vllm/core
@@ -102,9 +108,10 @@ steps:
  - pytest -v -s core

- label: Entrypoints Test # 40min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  mirror_hardwares: [amd]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
@@ -118,11 +125,12 @@ steps:
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py  --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
  - pytest -v -s entrypoints/test_chat_utils.py
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Distributed Tests (4 GPUs) # 10min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@@ -130,29 +138,36 @@ steps:
  - vllm/core/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
  - tests/spec_decode/e2e/test_integration_dist_tp4
  - tests/compile/test_basic_correctness
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/test_async_llm_dp.py
  commands:
  # test with tp=2 and external_dp=2
  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with tp=2 and pp=2
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
  - VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
  - VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd

- label: Metrics, Tracing Test # 10min
  mirror_hardwares: [amdexperimental, amdproduction]
  num_gpus: 2
  source_file_dependencies:
  - vllm/
@@ -160,18 +175,13 @@ steps:
  - tests/tracing
  commands:
  - pytest -v -s metrics
  - "pip install \
      'opentelemetry-sdk>=1.26.0,<1.27.0' \
      'opentelemetry-api>=1.26.0,<1.27.0' \
      'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
      'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
  - pytest -v -s tracing

##### fast check tests  #####
#####  1 GPU test  #####

- label: Regression Test # 5min
  mirror_hardwares: [amd]
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/
  - tests/test_regression
@@ -181,7 +191,7 @@ steps:
  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 10min
  mirror_hardwares: [amd]
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/
  - tests/engine
@@ -195,22 +205,24 @@ steps:
  - pytest -v -s tokenization

- label: V1 Test
  #mirror_hardwares: [amd]
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
    - vllm/
    - tests/v1
  commands:
    # split the test to avoid interference
    - pytest -v -s v1/core
    - pytest -v -s v1/entrypoints
    - pytest -v -s v1/engine
    - pytest -v -s v1/entrypoints
    - pytest -v -s v1/sample
    - pytest -v -s v1/worker
    - pytest -v -s v1/structured_output
    - pytest -v -s v1/test_stats.py
    - pytest -v -s v1/spec_decode
    - pytest -v -s v1/kv_connector/unit
    - pytest -v -s v1/test_serial_utils.py
    - pytest -v -s v1/test_utils.py
    - pytest -v -s v1/test_oracle.py
    - pytest -v -s v1/test_metrics_reader.py
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
    - pytest -v -s v1/e2e
@@ -219,8 +231,8 @@ steps:
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: Examples Test # 25min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/examples"
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/entrypoints
  - examples/
@@ -235,7 +247,7 @@ steps:
    - python3 offline_inference/vision_language.py --seed 0
    - python3 offline_inference/vision_language_embedding.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder.py
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
@@ -244,7 +256,7 @@ steps:
    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

- label: Prefix Caching Test # 9min
  mirror_hardwares: [amd]
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/
  - tests/prefix_caching
@@ -252,6 +264,7 @@ steps:
    - pytest -v -s prefix_caching

- label: Samplers Test # 36min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
@@ -262,7 +275,7 @@ steps:
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers

- label: LogitsProcessor Test # 5min
  mirror_hardwares: [amd]
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/model_executor/guided_decoding
@@ -273,6 +286,7 @@ steps:
    - pytest -v -s model_executor/test_guided_processors.py

- label: Speculative decoding tests # 40min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/spec_decode
  - tests/spec_decode
@@ -283,14 +297,29 @@ steps:
    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py

- label: LoRA Test %N # 15min each
  mirror_hardwares: [amd]
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py  --ignore=lora/test_transfomers_model.py
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
  parallelism: 4

- label: PyTorch Compilation Unit Tests
  mirror_hardwares: [amdexperimental, amdproduction]
  torch_nightly: true
  source_file_dependencies:
    - vllm/
    - tests/compile
  commands:
    - pytest -v -s compile/test_pass_manager.py
    - pytest -v -s compile/test_fusion.py
    - pytest -v -s compile/test_silu_mul_quant_fusion.py
    - pytest -v -s compile/test_sequence_parallelism.py
    - pytest -v -s compile/test_async_tp.py

- label: PyTorch Fullgraph Smoke Test # 9min
  mirror_hardwares: [amdexperimental, amdproduction]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
@@ -299,61 +328,112 @@ steps:
  # these tests need to be separated, cannot combine
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py
  - pytest -v -s compile/test_pass_manager.py

- label: PyTorch Fullgraph Test # 18min
  mirror_hardwares: [amdexperimental, amdproduction]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py

- label: Kernels Test %N # 1h each
  mirror_hardwares: [amd]
- label: Kernels Core Operation Test
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - csrc/
  - vllm/attention
  - tests/kernels
  - tests/kernels/core
  commands:
    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4
    - pytest -v -s kernels/core

- label: Kernels Attention Test %N
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - csrc/attention/
  - vllm/attention
  - vllm/v1/attention
  - tests/kernels/attention
  commands:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels Quantization Test %N
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization
  commands:
    - pytest -v -s kernels/quantization  --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels MoE Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  commands:
    - pytest -v -s kernels/moe

- label: Kernels Mamba Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
  commands:
    - pytest -v -s kernels/mamba

- label: Tensorizer Test # 11min
  mirror_hardwares: [amd]
  mirror_hardwares: [amdexperimental, amdproduction]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader
  - tests/tensorizer_loader
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s tensorizer_loader
    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

- label: Benchmarks # 9min
  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/.buildkite"
  mirror_hardwares: [amd]
  source_file_dependencies:
  - benchmarks/
  commands:
  - bash run-benchmarks.sh
  - bash scripts/run-benchmarks.sh

- label: Quantization Test # 33min
- label: Benchmarks CLI Test # 10min
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/
  - tests/benchmarks/
  commands:
  - pytest -v -s benchmarks/

- label: Quantization Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
  commands:
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

- label: LM Eval Small Models # 53min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-small.txt -t 1
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

- label: OpenAI API correctness
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
@@ -362,6 +442,7 @@ steps:
  - pytest -s entrypoints/openai/correctness/

- label: Encoder Decoder tests # 5min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/encoder_decoder
@@ -369,98 +450,117 @@ steps:
    - pytest -v -s encoder_decoder

- label: OpenAI-Compatible Tool Use # 20 min
  mirror_hardwares: [amdexperimental]
  fast_check: false
  mirror_hardwares: [ amd ]
  source_file_dependencies:
    - vllm/
    - tests/tool_use
    - tests/mistral_tool_use
  commands:
    - pytest -v -s tool_use
    - pytest -v -s mistral_tool_use

#####  models test  #####

- label: Basic Models Test # 24min
  mirror_hardwares: [amdexperimental, amdproduction]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models
  commands:
    - pytest -v -s models/test_transformers.py
    - pytest -v -s models/test_registry.py
    # V1 Test: https://github.com/vllm-project/vllm/issues/14531
    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
    - pytest -v -s models/test_utils.py
    - pytest -v -s models/test_vision.py
    - pytest -v -s models/test_initialization.py

- label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
- label: Language Models Test (Standard)
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/language
  - tests/models/embedding/language
  - tests/models/encoder_decoder/language
  - tests/models/language
  commands:
    - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
    - pytest -v -s models/embedding/language -m core_model
    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
    - pip freeze | grep -E 'torch'
    - pytest -v -s models/language -m core_model

- label: Language Models Test (Extended) # 1h10min
- label: Language Models Test (Extended Generation) # 1hr20min
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/language
  - tests/models/embedding/language
  - tests/models/encoder_decoder/language
  - tests/models/language/generation
  commands:
    - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
    - pytest -v -s models/embedding/language -m 'not core_model'
    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
    - pytest -v -s models/language/generation -m 'not core_model'

- label: Multi-Modal Models Test (Standard) # 40min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/audio_language
  - tests/models/decoder_only/vision_language
  - tests/models/embedding/vision_language
  - tests/models/encoder_decoder/audio_language
  - tests/models/encoder_decoder/vision_language
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal
    - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
    - pytest -v -s models/embedding/vision_language -m core_model
    - pytest -v -s models/encoder_decoder/audio_language -m core_model
    - pytest -v -s models/encoder_decoder/language -m core_model
    - pytest -v -s models/encoder_decoder/vision_language -m core_model

- label: Multi-Modal Models Test (Extended) 1 # 48m
- label: Language Models Test (Extended Pooling)  # 36min
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/audio_language
  - tests/models/decoder_only/vision_language
  - tests/models/embedding/vision_language
  - tests/models/encoder_decoder/vision_language
  - tests/models/language/pooling
  commands:
    - pytest -v -s models/language/pooling -m 'not core_model'

- label: Multi-Modal Models Test (Standard)
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
    # HACK - run phi3v tests separately to sidestep this transformers bug
    # https://github.com/huggingface/transformers/issues/34307
    - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
    - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
    - pytest -v -s models/embedding/vision_language -m 'not core_model'
    - pytest -v -s models/encoder_decoder/language -m 'not core_model'
    - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
    - pip freeze | grep -E 'torch'
    - pytest -v -s models/multimodal/processing
    - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
    - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

- label: Multi-Modal Models Test (Extended) 2 # 38m
- label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/vision_language
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
    - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'

- label: Multi-Modal Models Test (Extended) 2
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

- label: Multi-Modal Models Test (Extended) 3
  mirror_hardwares: [amdexperimental, amdproduction]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

- label: Quantized Models Test
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
  - tests/models/quantization
  commands:
    - pytest -v -s models/quantization

# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
  mirror_hardwares: [amdexperimental, amdproduction]
  optional: true
  commands:
    - echo 'Testing custom models...'
@@ -472,6 +572,7 @@ steps:
#####  multi gpus test  #####

- label: Distributed Comm Ops Test # 7min
  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@@ -482,6 +583,7 @@ steps:
  - pytest -v -s distributed/test_shm_broadcast.py

- label: 2 Node Tests (4 GPUs in total) # 16min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
@@ -500,7 +602,7 @@ steps:
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'

- label: Distributed Tests (2 GPUs) # 40min
  #mirror_hardwares: [amd]
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@@ -514,31 +616,37 @@ steps:
  - vllm/worker/worker.py
  - vllm/worker/model_runner.py
  - entrypoints/llm/test_collective_rpc.py
  - tests/v1/test_async_llm_dp.py
  - vllm/v1/engine/
  commands:
  - VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
  # test sequence parallel
  - pytest -v -s distributed/test_sequence_parallel.py
  # this test fails consistently.
  # TODO: investigate and fix
  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown

- label: Plugin Tests (2 GPUs) # 40min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
  commands:
  # begin platform plugin tests, all the code in-between runs on dummy platform
  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
@@ -549,8 +657,10 @@ steps:
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

- label: Multi-step Tests (4 GPUs) # 36min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@@ -571,6 +681,7 @@ steps:
  - pytest -v -s multi_step/test_correctness_llm.py

- label: Pipeline Parallelism Test # 45min
  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@@ -584,6 +695,7 @@ steps:
  - pytest -v -s distributed/test_pipeline_parallel.py

- label: LoRA TP Test (Distributed)
  mirror_hardwares: [amdexperimental, amdproduction]
  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
@@ -592,17 +704,14 @@ steps:
    # FIXIT: find out which code initialize cuda before running the test
    # before the fix, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # This test runs llama 13B, so it is required to run on 4 GPUs.
    - pytest -v -s -x lora/test_long_context.py
    # There is some Tensor Parallelism related processing logic in LoRA that
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_minicpmv_tp.py
    - pytest -v -s -x lora/test_transfomers_model.py


- label: Weight Loading Multiple GPU Test  # 33min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@@ -612,6 +721,7 @@ steps:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

- label: Weight Loading Multiple GPU Test - Large Models # optional
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
@@ -650,4 +760,4 @@ steps:
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-large.txt -t 4
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

.github/CODEOWNERS (vendored) | 7 changes
@@ -12,6 +12,8 @@
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
CMakeLists.txt @tlrmchlsmth

# vLLM V1
@@ -39,3 +41,8 @@ CMakeLists.txt @tlrmchlsmth
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
/tests/v1/structured_output @mgoin @russellb
/tests/weight_loading @mgoin @youkaichao
/tests/lora @jeejeelee

# Docs
/docs @hmellor
mkdocs.yaml @hmellor

.github/ISSUE_TEMPLATE/200-installation.yml (vendored) | 2 changes
@@ -14,7 +14,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```

.github/ISSUE_TEMPLATE/300-usage.yml (vendored) | 2 changes
@@ -14,7 +14,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```

.github/ISSUE_TEMPLATE/400-bug-report.yml (vendored) | 12 changes
@@ -14,14 +14,14 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      <details>
      <summary>The output of `python collect_env.py`</summary>
      <summary>The output of <code>python collect_env.py</code></summary>

      ```text
      Your output of `python collect_env.py` here
@@ -75,20 +75,20 @@ body:
      ```

      ```
      The error message you got, with the full traceback.
      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
      ```
  validations:
    required: true
- type: markdown
  attributes:
    value: >
      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
    value: |
      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:

      - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).

      - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.

      Thanks for contributing 🎉!
      Thanks for reporting 🙏!
- type: checkboxes
  id: askllm
  attributes:
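As context for the `value: >` to `value: |` change above: YAML's folded style (`>`) joins lines into a single paragraph, while the literal style (`|`) keeps line breaks, which the bulleted markdown in this template relies on. A minimal illustrative sketch (keys and text are placeholders, not part of this diff):

```yaml
# Illustrative only: folded vs. literal block scalars.
folded: >
  first line
  second line
# parses as "first line second line\n"
literal: |
  first line
  second line
# parses as "first line\nsecond line\n"
```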

.github/ISSUE_TEMPLATE/450-ci-failure.yml (vendored, new file) | 69 changes
@@ -0,0 +1,69 @@
name: 🧪 CI failure report
description: Report a failing test.
title: "[CI Failure]: "
labels: ["ci-failure"]

body:
- type: markdown
  attributes:
    value: >
      #### Include the name of the failing Buildkite step and test file in the title.
- type: input
  attributes:
    label: Name of failing test
    description: |
      Paste in the fully-qualified name of the failing test from the logs.
    placeholder: |
      `path/to/test_file.py::test_name[params]`
  validations:
    required: true
- type: checkboxes
  attributes:
    label: Basic information
    description: Select all items that apply to the failing test.
    options:
      - label: Flaky test
      - label: Can reproduce locally
      - label: Caused by external libraries (e.g. bug in `transformers`)
- type: textarea
  attributes:
    label: 🧪 Describe the failing test
    description: |
      Please provide a clear and concise description of the failing test.
    placeholder: |
      A clear and concise description of the failing test.

      ```
      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
      ```
  validations:
    required: true
- type: textarea
  attributes:
    label: 📝 History of failing test
    description: |
      Since when did the test start to fail?
      You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).

      If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:

      - Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.

      - Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.

      - Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
    placeholder: |
      Approximate timeline and/or problematic PRs

      A link to the Buildkite analytics of the failing test (if available)
  validations:
    required: true
- type: textarea
  attributes:
    label: CC List.
    description: >
      The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
- type: markdown
  attributes:
    value: >
      Thanks for reporting 🙏!

.github/ISSUE_TEMPLATE/600-new-model.yml (vendored) | 2 changes
@@ -9,7 +9,7 @@ body:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
- type: textarea
  attributes:
    label: The model to consider.

@@ -35,7 +35,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```

.github/PULL_REQUEST_TEMPLATE.md (vendored) | 2 changes
@@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
FIX #xxxx (*link existing issues this PR will resolve*)

<!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)

.github/mergify.yml (vendored) | 75 changes
@@ -19,7 +19,7 @@ pull_request_rules:
      - files~=\.buildkite/
      - files~=^cmake/
      - files=CMakeLists.txt
      - files~=^Dockerfile
      - files~=^docker/Dockerfile
      - files~=^requirements.*\.txt
      - files=setup.py
  actions:
@@ -55,11 +55,19 @@ pull_request_rules:
  description: Automatically apply structured-output label
  conditions:
    - or:
      - files~=^benchmarks/structured_schemas/
      - files=benchmarks/benchmark_serving_structured_output.py
      - files=benchmarks/run_structured_output_benchmark.sh
      - files=docs/features/structured_outputs.md
      - files=examples/offline_inference/structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
      - files~=^vllm/model_executor/guided_decoding/
      - files=tests/model_executor/test_guided_processors.py
      - files=tests/entrypoints/llm/test_guided_generate.py
      - files=benchmarks/benchmark_serving_guided.py
      - files=benchmarks/benchmark_guided.py
      - files~=^tests/v1/structured_output/
      - files=tests/v1/entrypoints/llm/test_guided_generate.py
      - files~=^vllm/v1/structured_output/
  actions:
    label:
      add:
@@ -88,6 +96,56 @@ pull_request_rules:
      add:
        - v1

- name: label-tpu
  description: Automatically apply tpu label
  # Keep this list in sync with `label-tpu-remove` conditions
  conditions:
    - or:
      - files~=tpu.py
      - files~=_tpu
      - files~=tpu_
      - files~=/tpu/
      - files~=pallas
  actions:
    label:
      add:
        - tpu

- name: label-tpu-remove
  description: Automatically remove tpu label
  # Keep this list in sync with `label-tpu` conditions
  conditions:
    - and:
      - -files~=tpu.py
      - -files~=_tpu
      - -files~=tpu_
      - -files~=/tpu/
      - -files~=pallas
  actions:
    label:
      remove:
        - tpu

- name: label-tool-calling
  description: Automatically add tool-calling label
  conditions:
    - or:
      - files~=^tests/tool_use/
      - files~=^tests/mistral_tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
      - files~=^vllm/entrypoints/openai/tool_parsers/
      - files=docs/features/tool_calling.md
      - files~=^examples/tool_chat_*
      - files=examples/offline_inference/chat_with_tools.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
      - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools.py
  actions:
    label:
      add:
        - tool-calling

- name: ping author on conflicts and add 'needs-rebase' label
  conditions:
      - conflict
@@ -103,6 +161,17 @@ pull_request_rules:

       https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork

- name: assign reviewer for tensorizer changes
  conditions:
      - files~=^vllm/model_executor/model_loader/tensorizer.py
      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
      - files~=^tests/tensorizer_loader/
  actions:
    assign:
      users:
        - "sangstar"

- name: remove 'needs-rebase' label when conflict is resolved
  conditions:
      - -conflict

.github/scripts/cleanup_pr_body.sh (vendored) | 2 changes
@@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"

# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
python3 - <<EOF
import re
import regex as re

with open("${NEW}", "r") as file:
    content = file.read()

.github/workflows/add_label_automerge.yml (vendored) | 2 changes
@@ -1,4 +1,6 @@
name: Add label on auto-merge enabled
permissions:
    pull-requests: write
on:
    pull_request_target:
        types:

7 .github/workflows/cleanup_pr_body.yml vendored
@ -20,7 +20,12 @@ jobs:
        with:
          python-version: '3.12'

      - name: Install Python dependencies
        run: |
          python3 -m pip install --upgrade pip
          python3 -m pip install regex

      - name: Update PR description
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
        run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"

5 .github/workflows/lint-and-deploy.yaml vendored
@ -2,6 +2,9 @@ name: Lint and Deploy Charts

on: pull_request

permissions:
  contents: read

jobs:
  lint-and-deploy:
    runs-on: ubuntu-latest
@ -50,7 +53,7 @@ jobs:
        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0

      - name: Build the Docker image vllm cpu
        run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .

      - name: Configuration of docker images, network and namespace for the kind cluster
        run: |

3 .github/workflows/pre-commit.yml vendored
@ -5,6 +5,9 @@ on:
  push:
    branches: [main]

permissions:
  contents: read

jobs:
  pre-commit:
    runs-on: ubuntu-latest

2 .github/workflows/reminder_comment.yml vendored
@ -1,4 +1,6 @@
name: PR Reminder Comment Bot
permissions:
  pull-requests: write
on:
  pull_request_target:
    types: [opened]

9 .gitignore vendored
@ -3,7 +3,6 @@

# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
!vllm/vllm_flash_attn/fa_utils.py

# Byte-compiled / optimized / DLL files
__pycache__/
@ -78,10 +77,6 @@ instance/
# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
docs/source/getting_started/examples/

# PyBuilder
.pybuilder/
target/
@ -151,6 +146,7 @@ venv.bak/

# mkdocs documentation
/site
docs/examples

# mypy
.mypy_cache/
@ -203,3 +199,6 @@ benchmarks/**/*.json
# Linting
actionlint
shellcheck*/

# Ignore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*

@ -1,3 +1,6 @@
default_install_hook_types:
  - pre-commit
  - commit-msg
default_stages:
  - pre-commit # Run locally
  - manual # Run in CI
@ -8,43 +11,45 @@ repos:
  hooks:
  - id: yapf
    args: [--in-place, --verbose]
    additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit
  rev: v0.9.3
  rev: v0.11.7
  hooks:
  - id: ruff
    args: [--output-format, github, --fix]
  - id: ruff-format
    files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/codespell-project/codespell
  rev: v2.4.0
  rev: v2.4.1
  hooks:
  - id: codespell
    additional_dependencies: ['tomli']
    args: ['--toml', 'pyproject.toml']
- repo: https://github.com/PyCQA/isort
  rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
  rev: 6.0.1
  hooks:
  - id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
  rev: v19.1.7
  rev: v20.1.3
  hooks:
  - id: clang-format
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
    types_or: [c++, cuda]
    args: [--style=file, --verbose]
- repo: https://github.com/jackdewinter/pymarkdown
  rev: v0.9.27
  rev: v0.9.29
  hooks:
  - id: pymarkdown
    exclude: '.*\.inc\.md'
    args: [fix]
- repo: https://github.com/rhysd/actionlint
  rev: v1.7.7
  hooks:
  - id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit
  rev: 0.6.2
  rev: 0.6.17
  hooks:
    - id: pip-compile
      args: [requirements/test.in, -o, requirements/test.txt]
      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
      files: ^requirements/test\.(in|txt)$
- repo: local
  hooks:
@ -99,8 +104,8 @@ repos:
    args:
      - -c
      - |
        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
        fi
    language: system
    verbose: true
@ -119,6 +124,25 @@ repos:
    language: system
    always_run: true
    pass_filenames: false
  - id: update-dockerfile-graph
    name: Update Dockerfile dependency graph
    entry: tools/update-dockerfile-graph.sh
    language: script
  - id: enforce-import-regex-instead-of-re
    name: Enforce import regex as re
    entry: python tools/enforce_regex_import.py
    language: python
    types: [python]
    pass_filenames: false
    additional_dependencies: [regex]
  # forbid directly import triton
  - id: forbid-direct-triton-import
    name: "Forbid direct 'import triton'"
    entry: python tools/check_triton_import.py
    language: python
    types: [python]
    pass_filenames: false
    additional_dependencies: [regex]
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion

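The `enforce-import-regex-instead-of-re` hook added above delegates to `tools/enforce_regex_import.py`, which is not part of this diff. Below is a hypothetical sketch of what such a checker might do; the directory it walks, the pattern, and the messages are assumptions, not the actual tool.

```python
# Hypothetical checker sketch (not the real tools/enforce_regex_import.py):
# flag Python files that import the stdlib 're' module instead of 'regex'.
import sys
from pathlib import Path

import regex as re

# Matches 'import re' / 'from re import ...' but not 'import regex as re'.
PLAIN_RE_IMPORT = re.compile(r"^\s*(?:import re\b|from re\s+import\b)", re.MULTILINE)


def main() -> int:
    offenders = []
    for path in Path("vllm").rglob("*.py"):  # 'vllm' is an assumed search root
        if PLAIN_RE_IMPORT.search(path.read_text(encoding="utf-8")):
            offenders.append(path)
    for path in offenders:
        print(f"{path}: use 'import regex as re' instead of 'import re'")
    return 1 if offenders else 0


if __name__ == "__main__":
    sys.exit(main())
```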
@ -8,12 +8,8 @@ build:
  tools:
    python: "3.12"

sphinx:
  configuration: docs/source/conf.py
  fail_on_warning: true

# If using Sphinx, optionally build your docs in additional formats such as PDF
formats: []
mkdocs:
  configuration: mkdocs.yaml

# Optionally declare the Python requirements required to build your docs
python:

214 CMakeLists.txt
@ -15,7 +15,6 @@ project(vllm_extensions LANGUAGES CXX)
 | 
			
		||||
 | 
			
		||||
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 | 
			
		||||
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 | 
			
		||||
 | 
			
		||||
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 | 
			
		||||
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
 | 
			
		||||
 | 
			
		||||
@ -30,11 +29,8 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
 | 
			
		||||
#
 | 
			
		||||
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 | 
			
		||||
 | 
			
		||||
# Supported NVIDIA architectures.
 | 
			
		||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 | 
			
		||||
 | 
			
		||||
# Supported AMD GPU architectures.
 | 
			
		||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
 | 
			
		||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Supported/expected torch versions for CUDA/ROCm.
 | 
			
		||||
@ -44,10 +40,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
 | 
			
		||||
#
 | 
			
		||||
# Note: the CUDA torch version is derived from pyproject.toml and various
 | 
			
		||||
# requirements.txt files and should be kept consistent.  The ROCm torch
 | 
			
		||||
# versions are derived from Dockerfile.rocm
 | 
			
		||||
# versions are derived from docker/Dockerfile.rocm
 | 
			
		||||
#
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Try to find python package with an executable that exactly matches
 | 
			
		||||
@ -80,6 +76,15 @@ endif()
 | 
			
		||||
#
 | 
			
		||||
find_package(Torch REQUIRED)
 | 
			
		||||
 | 
			
		||||
# Supported NVIDIA architectures.
 | 
			
		||||
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 | 
			
		||||
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
 | 
			
		||||
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
 | 
			
		||||
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 | 
			
		||||
else()
 | 
			
		||||
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
 | 
			
		||||
endif()
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Forward the non-CUDA device extensions to external CMake scripts.
 | 
			
		||||
#
 | 
			
		||||
@ -227,28 +232,34 @@ endif()
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
set(VLLM_EXT_SRC
 | 
			
		||||
  "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
 | 
			
		||||
  "csrc/mamba/causal_conv1d/causal_conv1d.cu"
 | 
			
		||||
  "csrc/cache_kernels.cu"
 | 
			
		||||
  "csrc/attention/paged_attention_v1.cu"
 | 
			
		||||
  "csrc/attention/paged_attention_v2.cu"
 | 
			
		||||
  "csrc/attention/merge_attn_states.cu"
 | 
			
		||||
  "csrc/attention/vertical_slash_index.cu"
 | 
			
		||||
  "csrc/pos_encoding_kernels.cu"
 | 
			
		||||
  "csrc/activation_kernels.cu"
 | 
			
		||||
  "csrc/layernorm_kernels.cu"
 | 
			
		||||
  "csrc/layernorm_quant_kernels.cu"
 | 
			
		||||
  "csrc/cuda_view.cu"
 | 
			
		||||
  "csrc/quantization/gptq/q_gemm.cu"
 | 
			
		||||
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
 | 
			
		||||
  "csrc/quantization/fp8/common.cu"
 | 
			
		||||
  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
 | 
			
		||||
  "csrc/quantization/gguf/gguf_kernel.cu"
 | 
			
		||||
  "csrc/quantization/activation_kernels.cu"
 | 
			
		||||
  "csrc/cuda_utils_kernels.cu"
 | 
			
		||||
  "csrc/prepare_inputs/advance_step.cu"
 | 
			
		||||
  "csrc/custom_all_reduce.cu"
 | 
			
		||||
  "csrc/torch_bindings.cpp")
 | 
			
		||||
 | 
			
		||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 | 
			
		||||
 | 
			
		||||
  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
 | 
			
		||||
  # Please keep this in sync with FetchContent_Declare line below.
 | 
			
		||||
  set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
 | 
			
		||||
  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
 | 
			
		||||
  set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
 | 
			
		||||
 | 
			
		||||
  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
 | 
			
		||||
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
 | 
			
		||||
@ -266,7 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
        cutlass
 | 
			
		||||
        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
 | 
			
		||||
        # Please keep this in sync with CUTLASS_REVISION line above.
 | 
			
		||||
        GIT_TAG v3.8.0
 | 
			
		||||
        GIT_TAG ${CUTLASS_REVISION}
 | 
			
		||||
        GIT_PROGRESS TRUE
 | 
			
		||||
 | 
			
		||||
        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
 | 
			
		||||
@ -278,17 +289,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  FetchContent_MakeAvailable(cutlass)
 | 
			
		||||
 | 
			
		||||
  list(APPEND VLLM_EXT_SRC
 | 
			
		||||
    "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
 | 
			
		||||
    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
 | 
			
		||||
    "csrc/quantization/aqlm/gemm_kernels.cu"
 | 
			
		||||
    "csrc/quantization/awq/gemm_kernels.cu"
 | 
			
		||||
    "csrc/custom_all_reduce.cu"
 | 
			
		||||
    "csrc/permute_cols.cu"
 | 
			
		||||
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
 | 
			
		||||
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
 | 
			
		||||
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
 | 
			
		||||
    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
 | 
			
		||||
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
 | 
			
		||||
    "csrc/cutlass_extensions/common.cpp")
 | 
			
		||||
    "csrc/cutlass_extensions/common.cpp"
 | 
			
		||||
    "csrc/attention/mla/cutlass_mla_entry.cu")
 | 
			
		||||
 | 
			
		||||
  set_gencode_flags_for_srcs(
 | 
			
		||||
    SRCS "${VLLM_EXT_SRC}"
 | 
			
		||||
@ -297,10 +307,55 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  # Only build Marlin kernels if we are building for at least some compatible archs.
 | 
			
		||||
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
 | 
			
		||||
  # are not supported by Machete yet.
 | 
			
		||||
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
 | 
			
		||||
  # 9.0 for latest bf16 atomicAdd PTX
 | 
			
		||||
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
 | 
			
		||||
  if (MARLIN_ARCHS)
 | 
			
		||||
 | 
			
		||||
    #
 | 
			
		||||
    # For the Marlin kernels we automatically generate sources for various
 | 
			
		||||
    # preselected input type pairs and schedules.
 | 
			
		||||
    # Generate sources:
 | 
			
		||||
    set(MARLIN_GEN_SCRIPT
 | 
			
		||||
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
 | 
			
		||||
    file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
 | 
			
		||||
 | 
			
		||||
    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
 | 
			
		||||
    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
 | 
			
		||||
 | 
			
		||||
    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
 | 
			
		||||
        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
 | 
			
		||||
      execute_process(
 | 
			
		||||
        COMMAND ${CMAKE_COMMAND} -E env
 | 
			
		||||
        PYTHONPATH=$PYTHONPATH
 | 
			
		||||
          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
 | 
			
		||||
        RESULT_VARIABLE marlin_generation_result
 | 
			
		||||
        OUTPUT_VARIABLE marlin_generation_result
 | 
			
		||||
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
 | 
			
		||||
        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
 | 
			
		||||
      )
 | 
			
		||||
 | 
			
		||||
      if (NOT marlin_generation_result EQUAL 0)
 | 
			
		||||
        message(FATAL_ERROR "Marlin generation failed."
 | 
			
		||||
                            " Result: \"${marlin_generation_result}\""
 | 
			
		||||
                            "\nCheck the log for details: "
 | 
			
		||||
                            "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
 | 
			
		||||
      else()
 | 
			
		||||
        set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
 | 
			
		||||
            CACHE STRING "Last run Marlin generate script hash" FORCE)
 | 
			
		||||
        message(STATUS "Marlin generation completed successfully.")
 | 
			
		||||
      endif()
 | 
			
		||||
    else()
 | 
			
		||||
      message(STATUS "Marlin generation script has not changed, skipping generation.")
 | 
			
		||||
    endif()
 | 
			
		||||
 | 
			
		||||
    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
 | 
			
		||||
      CUDA_ARCHS "${MARLIN_ARCHS}")
 | 
			
		||||
 | 
			
		||||
    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
 | 
			
		||||
 | 
			
		||||
    set(MARLIN_SRCS
 | 
			
		||||
       "csrc/quantization/fp8/fp8_marlin.cu"
 | 
			
		||||
       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
 | 
			
		||||
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
 | 
			
		||||
       "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
 | 
			
		||||
@ -372,6 +427,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
    set(SRCS
 | 
			
		||||
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
 | 
			
		||||
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
 | 
			
		||||
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
 | 
			
		||||
    )
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${SRCS}"
 | 
			
		||||
@ -396,8 +452,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  #
 | 
			
		||||
  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
 | 
			
		||||
  # kernels for the remaining archs that are not already built for 3x.
 | 
			
		||||
  # (Build 8.9 for FP8)
 | 
			
		||||
  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
 | 
			
		||||
    "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
 | 
			
		||||
    "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
 | 
			
		||||
  # subtract out the archs that are already built for 3x
 | 
			
		||||
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
 | 
			
		||||
  if (SCALED_MM_2X_ARCHS)
 | 
			
		||||
@ -448,7 +505,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
 | 
			
		||||
    set(SRCS
 | 
			
		||||
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
 | 
			
		||||
      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
 | 
			
		||||
      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
 | 
			
		||||
      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
 | 
			
		||||
      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${SRCS}"
 | 
			
		||||
      CUDA_ARCHS "${FP4_ARCHS}")
 | 
			
		||||
@ -461,6 +520,52 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
    set(FP4_ARCHS)
 | 
			
		||||
  endif()
 | 
			
		||||
 | 
			
		||||
  # CUTLASS MLA Archs and flags
 | 
			
		||||
  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
 | 
			
		||||
    set(SRCS
 | 
			
		||||
      "csrc/attention/mla/cutlass_mla_kernels.cu")
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${SRCS}"
 | 
			
		||||
      CUDA_ARCHS "${MLA_ARCHS}")
 | 
			
		||||
    list(APPEND VLLM_EXT_SRC "${SRCS}")
 | 
			
		||||
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
 | 
			
		||||
    # Add MLA-specific include directories only to MLA source files
 | 
			
		||||
    set_source_files_properties(${SRCS}
 | 
			
		||||
      PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
 | 
			
		||||
    message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
 | 
			
		||||
  else()
 | 
			
		||||
    message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
 | 
			
		||||
    # clear MLA_ARCHS
 | 
			
		||||
    set(MLA_ARCHS)
 | 
			
		||||
  endif()
 | 
			
		||||
 | 
			
		||||
  # CUTLASS MoE kernels
 | 
			
		||||
 | 
			
		||||
  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
 | 
			
		||||
  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
 | 
			
		||||
  # to compile MoE kernels that use its output.
 | 
			
		||||
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
 | 
			
		||||
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
 | 
			
		||||
             "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${SRCS}"
 | 
			
		||||
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
 | 
			
		||||
    list(APPEND VLLM_EXT_SRC "${SRCS}")
 | 
			
		||||
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
 | 
			
		||||
    message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
 | 
			
		||||
  else()
 | 
			
		||||
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
 | 
			
		||||
      message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
 | 
			
		||||
                     "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
 | 
			
		||||
                     "if you intend on running FP8 quantized MoE models on Hopper.")
 | 
			
		||||
    else()
 | 
			
		||||
      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
 | 
			
		||||
                     "in CUDA target architectures")
 | 
			
		||||
    endif()
 | 
			
		||||
  endif()
 | 
			
		||||
 | 
			
		||||
  #
 | 
			
		||||
  # Machete kernels
 | 
			
		||||
 | 
			
		||||
@ -578,23 +683,54 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
    CUDA_ARCHS "${CUDA_ARCHS}")
 | 
			
		||||
 | 
			
		||||
  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
 | 
			
		||||
  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
 | 
			
		||||
  # 9.0 for latest bf16 atomicAdd PTX
 | 
			
		||||
  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
 | 
			
		||||
  if (MARLIN_MOE_ARCHS)
 | 
			
		||||
    set(MARLIN_MOE_SRC
 | 
			
		||||
        "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
 | 
			
		||||
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
 | 
			
		||||
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
 | 
			
		||||
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
 | 
			
		||||
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
 | 
			
		||||
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
 | 
			
		||||
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
 | 
			
		||||
        "csrc/moe/marlin_moe_ops.cu")
 | 
			
		||||
 | 
			
		||||
    #
 | 
			
		||||
    # For the Marlin MOE kernels we automatically generate sources for various
 | 
			
		||||
    # preselected input type pairs and schedules.
 | 
			
		||||
    # Generate sources:
 | 
			
		||||
    set(MOE_MARLIN_GEN_SCRIPT
 | 
			
		||||
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
 | 
			
		||||
    file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
 | 
			
		||||
 | 
			
		||||
    message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
 | 
			
		||||
    message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
 | 
			
		||||
 | 
			
		||||
    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
 | 
			
		||||
        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
 | 
			
		||||
      execute_process(
 | 
			
		||||
        COMMAND ${CMAKE_COMMAND} -E env
 | 
			
		||||
        PYTHONPATH=$PYTHONPATH
 | 
			
		||||
          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
 | 
			
		||||
        RESULT_VARIABLE moe_marlin_generation_result
 | 
			
		||||
        OUTPUT_VARIABLE moe_marlin_generation_output
 | 
			
		||||
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
 | 
			
		||||
        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
 | 
			
		||||
      )
 | 
			
		||||
 | 
			
		||||
      if (NOT moe_marlin_generation_result EQUAL 0)
 | 
			
		||||
        message(FATAL_ERROR "Marlin MOE generation failed."
 | 
			
		||||
                            " Result: \"${moe_marlin_generation_result}\""
 | 
			
		||||
                            "\nCheck the log for details: "
 | 
			
		||||
                            "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
 | 
			
		||||
      else()
 | 
			
		||||
        set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
 | 
			
		||||
            CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
 | 
			
		||||
        message(STATUS "Marlin MOE generation completed successfully.")
 | 
			
		||||
      endif()
 | 
			
		||||
    else()
 | 
			
		||||
      message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
 | 
			
		||||
    endif()
 | 
			
		||||
 | 
			
		||||
    file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${MARLIN_MOE_SRC}"
 | 
			
		||||
      SRCS "${MOE_WNAA16_MARLIN_SRC}"
 | 
			
		||||
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
 | 
			
		||||
 | 
			
		||||
    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
 | 
			
		||||
    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
 | 
			
		||||
 | 
			
		||||
    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
 | 
			
		||||
  else()
 | 
			
		||||
    message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
 | 
			
		||||
@ -602,6 +738,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  endif()
 | 
			
		||||
endif()
 | 
			
		||||
 | 
			
		||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  set(MOE_PERMUTE_SRC
 | 
			
		||||
      "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
 | 
			
		||||
      "csrc/moe/moe_permute_unpermute_op.cu")
 | 
			
		||||
 | 
			
		||||
  set_gencode_flags_for_srcs(
 | 
			
		||||
    SRCS "${MARLIN_PERMUTE_SRC}"
 | 
			
		||||
    CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
 | 
			
		||||
 | 
			
		||||
  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
 | 
			
		||||
endif()
 | 
			
		||||
message(STATUS "Enabling moe extension.")
 | 
			
		||||
define_gpu_extension_target(
 | 
			
		||||
  _moe_C
 | 
			
		||||
@ -610,6 +757,8 @@ define_gpu_extension_target(
 | 
			
		||||
  SOURCES ${VLLM_MOE_EXT_SRC}
 | 
			
		||||
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
 | 
			
		||||
  ARCHITECTURES ${VLLM_GPU_ARCHES}
 | 
			
		||||
  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
 | 
			
		||||
  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
 | 
			
		||||
  USE_SABI 3
 | 
			
		||||
  WITH_SOABI)
 | 
			
		||||
 | 
			
		||||
@ -619,6 +768,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
 | 
			
		||||
  #
 | 
			
		||||
  set(VLLM_ROCM_EXT_SRC
 | 
			
		||||
    "csrc/rocm/torch_bindings.cpp"
 | 
			
		||||
    "csrc/rocm/skinny_gemms.cu"
 | 
			
		||||
    "csrc/rocm/attention.cu")
 | 
			
		||||
 | 
			
		||||
  define_gpu_extension_target(
 | 
			
		||||
 | 
			
		||||
@ -1,3 +1,3 @@
# Contributing to vLLM

You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).

@ -1,69 +0,0 @@
 | 
			
		||||
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
 | 
			
		||||
 | 
			
		||||
FROM ubuntu:22.04 AS cpu-test-1
 | 
			
		||||
 | 
			
		||||
ENV CCACHE_DIR=/root/.cache/ccache
 | 
			
		||||
 | 
			
		||||
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
 | 
			
		||||
 | 
			
		||||
RUN --mount=type=cache,target=/var/cache/apt \
 | 
			
		||||
    apt-get update -y \
 | 
			
		||||
    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
 | 
			
		||||
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
 | 
			
		||||
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 | 
			
		||||
 | 
			
		||||
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
 | 
			
		||||
# intel-openmp provides additional performance improvement vs. openmp
 | 
			
		||||
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/pip \
 | 
			
		||||
    pip install intel-openmp==2025.0.1
 | 
			
		||||
 | 
			
		||||
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
 | 
			
		||||
 | 
			
		||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
 | 
			
		||||
 | 
			
		||||
RUN pip install intel_extension_for_pytorch==2.6.0
 | 
			
		||||
 | 
			
		||||
WORKDIR /workspace
 | 
			
		||||
 | 
			
		||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 | 
			
		||||
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/pip \
 | 
			
		||||
    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
 | 
			
		||||
    pip install --upgrade pip && \
 | 
			
		||||
    pip install -r requirements/build.txt
 | 
			
		||||
 | 
			
		||||
FROM cpu-test-1 AS build
 | 
			
		||||
 | 
			
		||||
WORKDIR /workspace/vllm
 | 
			
		||||
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/pip \
 | 
			
		||||
    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
 | 
			
		||||
    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
 | 
			
		||||
    pip install -v -r requirements/cpu.txt
 | 
			
		||||
 | 
			
		||||
COPY . .
 | 
			
		||||
ARG GIT_REPO_CHECK=0
 | 
			
		||||
RUN --mount=type=bind,source=.git,target=.git \
 | 
			
		||||
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 | 
			
		||||
 | 
			
		||||
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
 | 
			
		||||
ARG VLLM_CPU_DISABLE_AVX512
 | 
			
		||||
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
 | 
			
		||||
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/pip \
 | 
			
		||||
    --mount=type=cache,target=/root/.cache/ccache \
 | 
			
		||||
    --mount=type=bind,source=.git,target=.git \
 | 
			
		||||
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
 | 
			
		||||
    pip install dist/*.whl && \
 | 
			
		||||
    rm -rf dist
 | 
			
		||||
 | 
			
		||||
WORKDIR /workspace/
 | 
			
		||||
 | 
			
		||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 | 
			
		||||
 | 
			
		||||
# install development dependencies (for testing)
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/pip \
 | 
			
		||||
    pip install -e tests/vllm_test_utils
 | 
			
		||||
 | 
			
		||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 | 
			
		||||
							
								
								
									
36 README.md
@ -1,7 +1,7 @@
 | 
			
		||||
<p align="center">
 | 
			
		||||
  <picture>
 | 
			
		||||
    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
 | 
			
		||||
    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
 | 
			
		||||
    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
 | 
			
		||||
    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-light.png" width=55%>
 | 
			
		||||
  </picture>
 | 
			
		||||
</p>
 | 
			
		||||
 | 
			
		||||
@ -10,29 +10,26 @@ Easy, fast, and cheap LLM serving for everyone
 | 
			
		||||
</h3>
 | 
			
		||||
 | 
			
		||||
<p align="center">
 | 
			
		||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 | 
			
		||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 | 
			
		||||
</p>
 | 
			
		||||
 | 
			
		||||
---
 | 
			
		||||
 | 
			
		||||
[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
 | 
			
		||||
 | 
			
		||||
[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
 | 
			
		||||
 | 
			
		||||
---
 | 
			
		||||
 | 
			
		||||
*Latest News* 🔥
 | 
			
		||||
 | 
			
		||||
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 | 
			
		||||
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 | 
			
		||||
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
 | 
			
		||||
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
 | 
			
		||||
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
 | 
			
		||||
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
 | 
			
		||||
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
 | 
			
		||||
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
 | 
			
		||||
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 | 
			
		||||
 | 
			
		||||
<details>
 | 
			
		||||
<summary>Previous News</summary>
 | 
			
		||||
 | 
			
		||||
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
 | 
			
		||||
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 | 
			
		||||
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 | 
			
		||||
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
 | 
			
		||||
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
 | 
			
		||||
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 | 
			
		||||
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
 | 
			
		||||
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
 | 
			
		||||
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
 | 
			
		||||
@ -61,7 +58,7 @@ vLLM is fast with:
 | 
			
		||||
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 | 
			
		||||
- Continuous batching of incoming requests
 | 
			
		||||
- Fast model execution with CUDA/HIP graph
 | 
			
		||||
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
 | 
			
		||||
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8.
 | 
			
		||||
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
 | 
			
		||||
- Speculative decoding
 | 
			
		||||
- Chunked prefill
 | 
			
		||||
@ -77,7 +74,7 @@ vLLM is flexible and easy to use with:
 | 
			
		||||
- OpenAI-compatible API server
 | 
			
		||||
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
 | 
			
		||||
- Prefix caching support
 | 
			
		||||
- Multi-lora support
 | 
			
		||||
- Multi-LoRA support
 | 
			
		||||
 | 
			
		||||
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
 | 
			
		||||
- Transformer-like LLMs (e.g., Llama)
 | 
			
		||||
@ -103,14 +100,14 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
 | 
			
		||||
## Contributing
 | 
			
		||||
 | 
			
		||||
We welcome and value any contributions and collaborations.
 | 
			
		||||
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
 | 
			
		||||
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved.
 | 
			
		||||
 | 
			
		||||
## Sponsors
 | 
			
		||||
 | 
			
		||||
vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
 | 
			
		||||
 | 
			
		||||
<!-- Note: Please sort them in alphabetical order. -->
 | 
			
		||||
<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
 | 
			
		||||
<!-- Note: Please keep these consistent with docs/community/sponsors.md -->
 | 
			
		||||
Cash Donations:
 | 
			
		||||
- a16z
 | 
			
		||||
- Dropbox
 | 
			
		||||
@ -126,6 +123,7 @@ Compute Resources:
 | 
			
		||||
- Databricks
 | 
			
		||||
- DeepInfra
 | 
			
		||||
- Google Cloud
 | 
			
		||||
- Intel
 | 
			
		||||
- Lambda Lab
 | 
			
		||||
- Nebius
 | 
			
		||||
- Novita AI
 | 
			
		||||
 | 
			
		||||
@ -41,29 +41,39 @@ become available.
 | 
			
		||||
      <td><code>synthetic</code></td>
 | 
			
		||||
    </tr>
 | 
			
		||||
    <tr>
 | 
			
		||||
      <td><strong>HuggingFace</strong></td>
 | 
			
		||||
      <td style="text-align: center;">🟡</td>
 | 
			
		||||
      <td style="text-align: center;">🟡</td>
 | 
			
		||||
      <td>Specify your dataset path on HuggingFace</td>
 | 
			
		||||
      <td><strong>HuggingFace-VisionArena</strong></td>
 | 
			
		||||
      <td style="text-align: center;">✅</td>
 | 
			
		||||
      <td style="text-align: center;">✅</td>
 | 
			
		||||
      <td><code>lmarena-ai/VisionArena-Chat</code></td>
 | 
			
		||||
    </tr>
 | 
			
		||||
    <tr>
 | 
			
		||||
      <td><strong>VisionArena</strong></td>
 | 
			
		||||
      <td><strong>HuggingFace-InstructCoder</strong></td>
 | 
			
		||||
      <td style="text-align: center;">✅</td>
 | 
			
		||||
      <td style="text-align: center;">✅</td>
 | 
			
		||||
      <td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
 | 
			
		||||
      <td><code>likaixin/InstructCoder</code></td>
 | 
			
		||||
    </tr>
 | 
			
		||||
      <tr>
 | 
			
		||||
      <td><strong>HuggingFace-AIMO</strong></td>
 | 
			
		||||
      <td style="text-align: center;">✅</td>
 | 
			
		||||
      <td style="text-align: center;">✅</td>
 | 
			
		||||
      <td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
 | 
			
		||||
    </tr>
 | 
			
		||||
    <tr>
 | 
			
		||||
      <td><strong>HuggingFace-Other</strong></td>
 | 
			
		||||
      <td style="text-align: center;">✅</td>
 | 
			
		||||
      <td style="text-align: center;">✅</td>
 | 
			
		||||
      <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
 | 
			
		||||
    </tr>
 | 
			
		||||
  </tbody>
 | 
			
		||||
</table>
 | 
			
		||||
 | 
			
		||||
✅: supported
 | 
			
		||||
 | 
			
		||||
🟡: Partial support
 | 
			
		||||
 | 
			
		||||
🚧: to be supported
 | 
			
		||||
 | 
			
		||||
🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
 | 
			
		||||
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
 | 
			
		||||
If you need support for other dataset formats, please consider contributing.
 | 
			
		||||
 | 
			
		||||
**Note**: VisionArena’s `dataset-name` should be set to `hf`
 | 
			
		||||
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
 | 
			
		||||
 | 
			
		||||
---
 | 
			
		||||
## Example - Online Benchmark
 | 
			
		||||
@ -71,8 +81,7 @@ If you need support for other dataset formats, please consider contributing.
 | 
			
		||||
First start serving your model
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
 | 
			
		||||
vllm serve ${MODEL_NAME} --disable-log-requests
 | 
			
		||||
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Then run the benchmarking script
 | 
			
		||||
@ -80,12 +89,13 @@ Then run the benchmarking script
 | 
			
		||||
```bash
 | 
			
		||||
# download dataset
 | 
			
		||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 | 
			
		||||
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
 | 
			
		||||
NUM_PROMPTS=10
 | 
			
		||||
BACKEND="vllm"
 | 
			
		||||
DATASET_NAME="sharegpt"
 | 
			
		||||
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
 | 
			
		||||
python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
 | 
			
		||||
python3 vllm/benchmarks/benchmark_serving.py \
 | 
			
		||||
  --backend vllm \
 | 
			
		||||
  --model NousResearch/Hermes-3-Llama-3.1-8B \
 | 
			
		||||
  --endpoint /v1/completions \
 | 
			
		||||
  --dataset-name sharegpt \
 | 
			
		||||
  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
 | 
			
		||||
  --num-prompts 10
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
If successful, you will see the following output
 | 
			
		||||
@ -122,88 +132,104 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
 | 
			
		||||
NUM_PROMPTS=10
 | 
			
		||||
BACKEND="openai-chat"
 | 
			
		||||
DATASET_NAME="hf"
 | 
			
		||||
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
 | 
			
		||||
DATASET_SPLIT='train'
 | 
			
		||||
 | 
			
		||||
python3 vllm/benchmarks/benchmark_serving.py \
 | 
			
		||||
  --backend "${BACKEND}" \
 | 
			
		||||
  --model "${MODEL_NAME}" \
 | 
			
		||||
  --endpoint "/v1/chat/completions" \
 | 
			
		||||
  --dataset-name "${DATASET_NAME}" \
 | 
			
		||||
  --dataset-path "${DATASET_PATH}" \
 | 
			
		||||
  --hf-split "${DATASET_SPLIT}" \
 | 
			
		||||
  --num-prompts "${NUM_PROMPTS}"
 | 
			
		||||
  --backend openai-chat \
 | 
			
		||||
  --model Qwen/Qwen2-VL-7B-Instruct \
 | 
			
		||||
  --endpoint /v1/chat/completions \
 | 
			
		||||
  --dataset-name hf \
 | 
			
		||||
  --dataset-path lmarena-ai/VisionArena-Chat \
 | 
			
		||||
  --hf-split train \
 | 
			
		||||
  --num-prompts 1000
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### HuggingFaceDataset Examples
 | 
			
		||||
### InstructCoder Benchmark with Speculative Decoding
 | 
			
		||||
 | 
			
		||||
Currently, HuggingFaceDataset only supports dataset formats
 | 
			
		||||
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
 | 
			
		||||
formats, please consider contributing.
 | 
			
		||||
``` bash
 | 
			
		||||
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
 | 
			
		||||
    --ngram_prompt_lookup_min 2 \
 | 
			
		||||
    --ngram-prompt-lookup-max 5 \
 | 
			
		||||
    --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}'
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
python3 benchmarks/benchmark_serving.py \
 | 
			
		||||
    --model meta-llama/Meta-Llama-3-8B-Instruct \
 | 
			
		||||
    --dataset-name hf \
 | 
			
		||||
    --dataset-path likaixin/InstructCoder \
 | 
			
		||||
    --num-prompts 2048
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### Other HuggingFaceDataset Examples
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
# need a model with vision capability here
 | 
			
		||||
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
**`lmms-lab/LLaVA-OneVision-Data`**
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
 | 
			
		||||
NUM_PROMPTS=10
 | 
			
		||||
BACKEND="openai-chat"
 | 
			
		||||
DATASET_NAME="hf"
 | 
			
		||||
DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
 | 
			
		||||
DATASET_SPLIT='train'
 | 
			
		||||
DATASET_SUBSET='chart2text(cauldron)'
 | 
			
		||||
python3 vllm/benchmarks/benchmark_serving.py \
 | 
			
		||||
  --backend "${BACKEND}" \
 | 
			
		||||
  --model "${MODEL_NAME}" \
 | 
			
		||||
  --endpoint "/v1/chat/completions" \
 | 
			
		||||
  --dataset-name "${DATASET_NAME}" \
 | 
			
		||||
  --dataset-path "${DATASET_PATH}" \
 | 
			
		||||
  --hf-split "${DATASET_SPLIT}" \
 | 
			
		||||
  --num-prompts "${NUM_PROMPTS}" \
 | 
			
		||||
  --hf-subset "${DATASET_SUBSET}"
 | 
			
		||||
  --backend openai-chat \
 | 
			
		||||
  --model Qwen/Qwen2-VL-7B-Instruct \
 | 
			
		||||
  --endpoint /v1/chat/completions \
 | 
			
		||||
  --dataset-name hf \
 | 
			
		||||
  --dataset-path lmms-lab/LLaVA-OneVision-Data \
 | 
			
		||||
  --hf-split train \
 | 
			
		||||
  --hf-subset "chart2text(cauldron)" \
 | 
			
		||||
  --num-prompts 10
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
**`Aeala/ShareGPT_Vicuna_unfiltered`**
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
 | 
			
		||||
NUM_PROMPTS=10
 | 
			
		||||
BACKEND="openai-chat"
 | 
			
		||||
DATASET_NAME="hf"
 | 
			
		||||
DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
 | 
			
		||||
DATASET_SPLIT='train'
 | 
			
		||||
python3 vllm/benchmarks/benchmark_serving.py \
 | 
			
		||||
  --backend "${BACKEND}" \
 | 
			
		||||
  --model "${MODEL_NAME}" \
 | 
			
		||||
  --endpoint "/v1/chat/completions" \
 | 
			
		||||
  --dataset-name "${DATASET_NAME}" \
 | 
			
		||||
  --dataset-path "${DATASET_PATH}" \
 | 
			
		||||
  --hf-split "${DATASET_SPLIT}" \
 | 
			
		||||
  --num-prompts "${NUM_PROMPTS}" \
 | 
			
		||||
  --backend openai-chat \
 | 
			
		||||
  --model Qwen/Qwen2-VL-7B-Instruct \
 | 
			
		||||
  --endpoint /v1/chat/completions \
 | 
			
		||||
  --dataset-name hf \
 | 
			
		||||
  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
 | 
			
		||||
  --hf-split train \
 | 
			
		||||
  --num-prompts 10
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
**`AI-MO/aimo-validation-aime`**
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
python3 vllm/benchmarks/benchmark_serving.py \
 | 
			
		||||
    --model Qwen/QwQ-32B \
 | 
			
		||||
    --dataset-name hf \
 | 
			
		||||
    --dataset-path AI-MO/aimo-validation-aime \
 | 
			
		||||
    --num-prompts 10 \
 | 
			
		||||
    --seed 42
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### Running With Sampling Parameters
 | 
			
		||||
 | 
			
		||||
When using OpenAI-compatible backends such as `vllm`, optional sampling
 | 
			
		||||
parameters can be specified. Example client command:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
python3 vllm/benchmarks/benchmark_serving.py \
 | 
			
		||||
  --backend vllm \
 | 
			
		||||
  --model NousResearch/Hermes-3-Llama-3.1-8B \
 | 
			
		||||
  --endpoint /v1/completions \
 | 
			
		||||
  --dataset-name sharegpt \
 | 
			
		||||
  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
 | 
			
		||||
  --top-k 10 \
 | 
			
		||||
  --top-p 0.9 \
 | 
			
		||||
  --temperature 0.5 \
 | 
			
		||||
  --num-prompts 10
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
---

## Example - Offline Throughput Benchmark

```bash
python3 vllm/benchmarks/benchmark_throughput.py \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --dataset-name sonnet \
  --dataset-path vllm/benchmarks/sonnet.txt \
  --num-prompts 10
```

If successful, you will see the following output

@ -217,19 +243,13 @@ Total num output tokens:  1500

### VisionArena Benchmark for Vision Language Models

```bash
python3 vllm/benchmarks/benchmark_throughput.py \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --backend vllm-chat \
  --dataset-name hf \
  --dataset-path lmarena-ai/VisionArena-Chat \
  --num-prompts 1000 \
  --hf-split train
```

The `num prompt tokens` now includes image token counts:

```
Total num prompt tokens:  14527
Total num output tokens:  1280
```

### InstructCoder Benchmark with Speculative Decoding

```bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
python3 vllm/benchmarks/benchmark_throughput.py \
    --dataset-name=hf \
    --dataset-path=likaixin/InstructCoder \
    --model=meta-llama/Meta-Llama-3-8B-Instruct \
    --input-len=1000 \
    --output-len=100 \
    --num-prompts=2048 \
    --async-engine \
    --ngram-prompt-lookup-min=2 \
    --ngram-prompt-lookup-max=5 \
    --speculative-config '{"model": "[ngram]", "num_speculative_tokens": 5}'
```

```
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
Total num prompt tokens:  261136
Total num output tokens:  204800
```

### Other HuggingFaceDataset Examples

**`lmms-lab/LLaVA-OneVision-Data`**

```bash
python3 vllm/benchmarks/benchmark_throughput.py \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --backend vllm-chat \
  --dataset-name hf \
  --dataset-path lmms-lab/LLaVA-OneVision-Data \
  --hf-split train \
  --hf-subset "chart2text(cauldron)" \
  --num-prompts 10
```

**`Aeala/ShareGPT_Vicuna_unfiltered`**

```bash
python3 vllm/benchmarks/benchmark_throughput.py \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --backend vllm-chat \
  --dataset-name hf \
  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
  --hf-split train \
  --num-prompts 10
```

**`AI-MO/aimo-validation-aime`**

```bash
python3 benchmarks/benchmark_throughput.py \
  --model Qwen/QwQ-32B \
  --backend vllm \
  --dataset-name hf \
  --dataset-path AI-MO/aimo-validation-aime \
  --hf-split train \
  --num-prompts 10
```

### Benchmark with LoRA Adapters

```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_throughput.py \
  --model meta-llama/Llama-2-7b-hf \
  --backend vllm \
  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
  --dataset-name sharegpt \
  --num-prompts 10 \
  --max-loras 2 \
  --max-lora-rank 8 \
  --enable-lora \
  --lora-path yard1/llama-2-7b-sql-lora-test
```
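
For reference, the same base model and adapter can also be exercised online; a hedged sketch of serving them together (the adapter name `sql-lora` is an arbitrary label, not taken from this repo):

```bash
vllm serve meta-llama/Llama-2-7b-hf \
  --enable-lora \
  --max-loras 2 \
  --max-lora-rank 8 \
  --lora-modules sql-lora=yard1/llama-2-7b-sql-lora-test
```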

benchmarks/auto_tune.sh (new file, 212 lines)

@ -0,0 +1,212 @@
#!/bin/bash

# This script aims to tune the best server parameter combinations to maximize throughput for a given requirement.
# The server parameters currently tuned are max_num_seqs and max_num_batched_tokens.
# It also supports additional requirements: e2e latency and prefix cache hit rate.

# Prerequisites:
# 1. Check out your branch and install/update the correct running env. For TPU, activate the conda env and install the corresponding torch and xla versions.
# 2. If the model is customized, replace the MODEL's config with the customized config.
# 3. Set the variables (ALL REQUIRED):
#   BASE: your directory for the vllm repo
#   MODEL: the model served by vllm
#   DOWNLOAD_DIR: directory to download and load model weights
#   INPUT_LEN: request input length
#   OUTPUT_LEN: request output length
#   MIN_CACHE_HIT_PCT: prefix cache hit rate
#   MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000.
# 4. Run the script. It might take a long time; you can use tmux to avoid the script stopping if you get disconnected.
# 5. The final result will be saved in the RESULT file.


# Example use cases
# 1. Given input_len=1800 and output_len=20, what are the best max_num_seqs and max_num_batched_tokens for the highest throughput?
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
# 2. If we require latency to be lower than 500 ms, what are the best server parameters?
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
# 3. If we want to reach a 60% prefix cache hit rate, what are the best server parameters?
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500

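# A hedged usage sketch (values below are placeholders, not recommendations):
#   Edit the variables below, e.g. BASE="$HOME", MODEL="meta-llama/Llama-3.1-8B-Instruct",
#   INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500,
#   then run:  bash benchmarks/auto_tune.sh  (ideally inside tmux), and read
#   "$BASE/auto-benchmark/<timestamp>/result.txt" when it finishes.
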
TAG=$(date +"%Y_%m_%d_%H_%M")
 | 
			
		||||
BASE=""
 | 
			
		||||
MODEL="meta-llama/Llama-3.1-8B-Instruct"
 | 
			
		||||
DOWNLOAD_DIR=""
 | 
			
		||||
INPUT_LEN=4000
 | 
			
		||||
OUTPUT_LEN=16
 | 
			
		||||
MIN_CACHE_HIT_PCT=0
 | 
			
		||||
MAX_LATENCY_ALLOWED_MS=100000000000
 | 
			
		||||
 | 
			
		||||
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 | 
			
		||||
RESULT="$LOG_FOLDER/result.txt"
 | 
			
		||||
 | 
			
		||||
echo "result file$ $RESULT"
 | 
			
		||||
echo "model: $MODEL"
 | 
			
		||||
echo
 | 
			
		||||
 | 
			
		||||
rm -rf $LOG_FOLDER
 | 
			
		||||
mkdir -p $LOG_FOLDER
 | 
			
		||||
 | 
			
		||||
cd "$BASE/vllm"
 | 
			
		||||
# create sonnet-4x.txt so that we can sample 2048 tokens for input
 | 
			
		||||
echo "" > benchmarks/sonnet_4x.txt
 | 
			
		||||
for _ in {1..4}
 | 
			
		||||
do
 | 
			
		||||
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
 | 
			
		||||
done
 | 
			
		||||
 | 
			
		||||
pip install datasets
 | 
			
		||||
 | 
			
		||||
current_hash=$(git rev-parse HEAD)
 | 
			
		||||
echo "hash:$current_hash" >> "$RESULT"
 | 
			
		||||
echo "current_hash: $current_hash"
 | 
			
		||||
 | 
			
		||||
best_throughput=0
 | 
			
		||||
best_max_num_seqs=0
 | 
			
		||||
best_num_batched_tokens=0
 | 
			
		||||
best_goodput=0
 | 
			
		||||
run_benchmark() {
 | 
			
		||||
    local max_num_seqs=$1
 | 
			
		||||
    local max_num_batched_tokens=$2
 | 
			
		||||
    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
 | 
			
		||||
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
 | 
			
		||||
    echo "vllm_log: $vllm_log"
 | 
			
		||||
    echo
 | 
			
		||||
    rm -f $vllm_log
 | 
			
		||||
 | 
			
		||||
    # start the server
 | 
			
		||||
    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
 | 
			
		||||
        --disable-log-requests \
 | 
			
		||||
        --port 8004 \
 | 
			
		||||
        --gpu-memory-utilization 0.98 \
 | 
			
		||||
        --max-num-seqs $max_num_seqs \
 | 
			
		||||
        --max-num-batched-tokens $max_num_batched_tokens \
 | 
			
		||||
        --tensor-parallel-size 1 \
 | 
			
		||||
        --enable-prefix-caching \
 | 
			
		||||
        --load-format dummy \
 | 
			
		||||
        --download-dir $DOWNLOAD_DIR \
 | 
			
		||||
        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
 | 
			
		||||
    echo "wait for 10 minutes.."
 | 
			
		||||
    echo
 | 
			
		||||
    # wait for 10 minutes...
 | 
			
		||||
    server_started=0
 | 
			
		||||
    for i in {1..60}; do        
 | 
			
		||||
        if grep -Fq "Application startup complete" "$vllm_log"; then
 | 
			
		||||
            echo "Application started"
 | 
			
		||||
            server_started=1
 | 
			
		||||
            break
 | 
			
		||||
        else
 | 
			
		||||
            # echo "wait for 10 seconds..."
 | 
			
		||||
            sleep 10
 | 
			
		||||
        fi
 | 
			
		||||
    done
 | 
			
		||||
 
 | 
			
		||||
    if (( ! server_started )); then
 | 
			
		||||
        echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
 | 
			
		||||
        echo "pkill -f vllm"
 | 
			
		||||
        echo
 | 
			
		||||
        pkill vllm
 | 
			
		||||
        sleep 10
 | 
			
		||||
        return 1
 | 
			
		||||
    fi
 | 
			
		||||
    
 | 
			
		||||
    echo "run benchmark test..."
 | 
			
		||||
    echo
 | 
			
		||||
    meet_latency_requirement=0
 | 
			
		||||
    # get a basic qps by using request-rate inf
 | 
			
		||||
    bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
 | 
			
		||||
    prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
 | 
			
		||||
    python benchmarks/benchmark_serving.py \
 | 
			
		||||
        --backend vllm \
 | 
			
		||||
        --model $MODEL  \
 | 
			
		||||
        --dataset-name sonnet \
 | 
			
		||||
        --dataset-path benchmarks/sonnet_4x.txt \
 | 
			
		||||
        --sonnet-input-len $INPUT_LEN \
 | 
			
		||||
        --sonnet-output-len $OUTPUT_LEN \
 | 
			
		||||
        --ignore-eos \
 | 
			
		||||
        --disable-tqdm \
 | 
			
		||||
        --request-rate inf \
 | 
			
		||||
        --percentile-metrics ttft,tpot,itl,e2el \
 | 
			
		||||
        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
 | 
			
		||||
        --num-prompts 100 \
 | 
			
		||||
        --sonnet-prefix-len $prefix_len \
 | 
			
		||||
        --port 8004 > "$bm_log"
 | 
			
		||||
    through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
 | 
			
		||||
    e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
 | 
			
		||||
    goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
 | 
			
		||||
 | 
			
		||||
    if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
 | 
			
		||||
        meet_latency_requirement=1
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    if (( ! meet_latency_requirement )); then
 | 
			
		||||
    # start from request-rate as int(through_put) + 1
 | 
			
		||||
        request_rate=$((${through_put%.*} + 1))
 | 
			
		||||
        while ((request_rate > 0)); do
 | 
			
		||||
            # clear prefix cache
 | 
			
		||||
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
 | 
			
		||||
            sleep 5
 | 
			
		||||
            bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
 | 
			
		||||
            python benchmarks/benchmark_serving.py \
 | 
			
		||||
                --backend vllm \
 | 
			
		||||
                --model $MODEL  \
 | 
			
		||||
                --dataset-name sonnet \
 | 
			
		||||
                --dataset-path benchmarks/sonnet_4x.txt \
 | 
			
		||||
                --sonnet-input-len $INPUT_LEN \
 | 
			
		||||
                --sonnet-output-len $OUTPUT_LEN \
 | 
			
		||||
                --ignore-eos \
 | 
			
		||||
                --disable-tqdm \
 | 
			
		||||
                --request-rate $request_rate \
 | 
			
		||||
                --percentile-metrics ttft,tpot,itl,e2el \
 | 
			
		||||
                --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
 | 
			
		||||
                --num-prompts 100 \
 | 
			
		||||
                --sonnet-prefix-len $prefix_len \
 | 
			
		||||
                --port 8004 > "$bm_log"
 | 
			
		||||
            through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
 | 
			
		||||
            e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
 | 
			
		||||
            goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
 | 
			
		||||
            if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
 | 
			
		||||
                meet_latency_requirement=1
 | 
			
		||||
                break
 | 
			
		||||
            fi
 | 
			
		||||
            request_rate=$((request_rate-1))
 | 
			
		||||
        done
 | 
			
		||||
    fi
 | 
			
		||||
    # write the results and update the best result.
 | 
			
		||||
    if ((meet_latency_requirement)); then
 | 
			
		||||
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
 | 
			
		||||
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
 | 
			
		||||
        if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
 | 
			
		||||
            best_throughput=$through_put
 | 
			
		||||
            best_max_num_seqs=$max_num_seqs
 | 
			
		||||
            best_num_batched_tokens=$max_num_batched_tokens
 | 
			
		||||
            best_goodput=$goodput
 | 
			
		||||
        fi
 | 
			
		||||
    else
 | 
			
		||||
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
 | 
			
		||||
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
 | 
			
		||||
 | 
			
		||||
    echo "pkill -f vllm"
 | 
			
		||||
    echo
 | 
			
		||||
    pkill vllm
 | 
			
		||||
    sleep 10
 | 
			
		||||
    rm -f $vllm_log
 | 
			
		||||
    printf '=%.0s' $(seq 1 20)
 | 
			
		||||
    return 0
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
num_seqs_list="128 256"
 | 
			
		||||
num_batched_tokens_list="512 1024 2048 4096"
 | 
			
		||||
for num_seqs in $num_seqs_list; do
 | 
			
		||||
    for num_batched_tokens in $num_batched_tokens_list; do
 | 
			
		||||
        run_benchmark $num_seqs $num_batched_tokens
 | 
			
		||||
    done
 | 
			
		||||
done
 | 
			
		||||
echo "finish permutations"
 | 
			
		||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
 | 
			
		||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"
 | 
			
		||||
 | 
			
		||||
@ -1,5 +1,6 @@
 | 
			
		||||
# SPDX-License-Identifier: Apache-2.0
 | 
			
		||||
 | 
			
		||||
import io
 | 
			
		||||
import json
 | 
			
		||||
import os
 | 
			
		||||
import sys
 | 
			
		||||
@ -11,8 +12,7 @@ from typing import Optional, Union
 | 
			
		||||
import aiohttp
 | 
			
		||||
import huggingface_hub.constants
 | 
			
		||||
from tqdm.asyncio import tqdm
 | 
			
		||||
from transformers import (AutoTokenizer, PreTrainedTokenizer,
 | 
			
		||||
                          PreTrainedTokenizerFast)
 | 
			
		||||
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
 | 
			
		||||
 | 
			
		||||
# NOTE(simon): do not import vLLM here so the benchmark script
 | 
			
		||||
# can run without vLLM installed.
 | 
			
		||||
@ -32,6 +32,7 @@ class RequestFuncInput:
 | 
			
		||||
    extra_body: Optional[dict] = None
 | 
			
		||||
    multi_modal_content: Optional[dict] = None
 | 
			
		||||
    ignore_eos: bool = False
 | 
			
		||||
    language: Optional[str] = None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
 | 
			
		||||
@ -41,8 +42,7 @@ class RequestFuncOutput:
 | 
			
		||||
    latency: float = 0.0
 | 
			
		||||
    output_tokens: int = 0
 | 
			
		||||
    ttft: float = 0.0  # Time to first token
 | 
			
		||||
    itl: list[float] = field(
 | 
			
		||||
        default_factory=list)  # list of inter-token latencies
 | 
			
		||||
    itl: list[float] = field(default_factory=list)  # list of inter-token latencies
 | 
			
		||||
    tpot: float = 0.0  # avg next-token latencies
 | 
			
		||||
    prompt_len: int = 0
 | 
			
		||||
    error: str = ""
 | 
			
		||||
@ -55,8 +55,9 @@ async def async_request_tgi(
 | 
			
		||||
    api_url = request_func_input.api_url
 | 
			
		||||
    assert api_url.endswith("generate_stream")
 | 
			
		||||
 | 
			
		||||
    async with aiohttp.ClientSession(trust_env=True,
 | 
			
		||||
                                     timeout=AIOHTTP_TIMEOUT) as session:
 | 
			
		||||
    async with aiohttp.ClientSession(
 | 
			
		||||
        trust_env=True, timeout=AIOHTTP_TIMEOUT
 | 
			
		||||
    ) as session:
 | 
			
		||||
        params = {
 | 
			
		||||
            "max_new_tokens": request_func_input.output_len,
 | 
			
		||||
            "do_sample": True,
 | 
			
		||||
@ -103,8 +104,7 @@ async def async_request_tgi(
 | 
			
		||||
 | 
			
		||||
                        # Decoding phase
 | 
			
		||||
                        else:
 | 
			
		||||
                            output.itl.append(timestamp -
 | 
			
		||||
                                              most_recent_timestamp)
 | 
			
		||||
                            output.itl.append(timestamp - most_recent_timestamp)
 | 
			
		||||
 | 
			
		||||
                        most_recent_timestamp = timestamp
 | 
			
		||||
 | 
			
		||||
@ -131,8 +131,9 @@ async def async_request_trt_llm(
 | 
			
		||||
    api_url = request_func_input.api_url
 | 
			
		||||
    assert api_url.endswith("generate_stream")
 | 
			
		||||
 | 
			
		||||
    async with aiohttp.ClientSession(trust_env=True,
 | 
			
		||||
                                     timeout=AIOHTTP_TIMEOUT) as session:
 | 
			
		||||
    async with aiohttp.ClientSession(
 | 
			
		||||
        trust_env=True, timeout=AIOHTTP_TIMEOUT
 | 
			
		||||
    ) as session:
 | 
			
		||||
        payload = {
 | 
			
		||||
            "accumulate_tokens": True,
 | 
			
		||||
            "text_input": request_func_input.prompt,
 | 
			
		||||
@ -157,8 +158,7 @@ async def async_request_trt_llm(
 | 
			
		||||
                        if not chunk_bytes:
 | 
			
		||||
                            continue
 | 
			
		||||
 | 
			
		||||
                        chunk = chunk_bytes.decode("utf-8").removeprefix(
 | 
			
		||||
                            "data:")
 | 
			
		||||
                        chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
 | 
			
		||||
 | 
			
		||||
                        data = json.loads(chunk)
 | 
			
		||||
                        output.generated_text += data["text_output"]
 | 
			
		||||
@ -170,8 +170,7 @@ async def async_request_trt_llm(
 | 
			
		||||
 | 
			
		||||
                        # Decoding phase
 | 
			
		||||
                        else:
 | 
			
		||||
                            output.itl.append(timestamp -
 | 
			
		||||
                                              most_recent_timestamp)
 | 
			
		||||
                            output.itl.append(timestamp - most_recent_timestamp)
 | 
			
		||||
 | 
			
		||||
                        most_recent_timestamp = timestamp
 | 
			
		||||
 | 
			
		||||
@ -195,15 +194,23 @@ async def async_request_deepspeed_mii(
 | 
			
		||||
    request_func_input: RequestFuncInput,
 | 
			
		||||
    pbar: Optional[tqdm] = None,
 | 
			
		||||
) -> RequestFuncOutput:
 | 
			
		||||
    async with aiohttp.ClientSession(trust_env=True,
 | 
			
		||||
                                     timeout=AIOHTTP_TIMEOUT) as session:
 | 
			
		||||
    api_url = request_func_input.api_url
 | 
			
		||||
    assert api_url.endswith(("completions", "profile")), (
 | 
			
		||||
        "OpenAI Completions API URL must end with 'completions' or 'profile'."
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    async with aiohttp.ClientSession(
 | 
			
		||||
        trust_env=True, timeout=AIOHTTP_TIMEOUT
 | 
			
		||||
    ) as session:
 | 
			
		||||
        payload = {
 | 
			
		||||
            "model": request_func_input.model,
 | 
			
		||||
            "prompt": request_func_input.prompt,
 | 
			
		||||
            "max_tokens": request_func_input.output_len,
 | 
			
		||||
            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
 | 
			
		||||
            "top_p": 1.0,
 | 
			
		||||
        }
 | 
			
		||||
        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
 | 
			
		||||
 | 
			
		||||
        output = RequestFuncOutput()
 | 
			
		||||
        output.prompt_len = request_func_input.prompt_len
 | 
			
		||||
 | 
			
		||||
@ -214,12 +221,22 @@ async def async_request_deepspeed_mii(
 | 
			
		||||
 | 
			
		||||
        st = time.perf_counter()
 | 
			
		||||
        try:
 | 
			
		||||
            async with session.post(url=request_func_input.api_url,
 | 
			
		||||
                                    json=payload) as response:
 | 
			
		||||
            async with session.post(
 | 
			
		||||
                url=api_url, json=payload, headers=headers
 | 
			
		||||
            ) as response:
 | 
			
		||||
                if response.status == 200:
 | 
			
		||||
                    parsed_resp = await response.json()
 | 
			
		||||
                    output.latency = time.perf_counter() - st
 | 
			
		||||
                    output.generated_text = parsed_resp["text"][0]
 | 
			
		||||
                    if "choices" in parsed_resp:
 | 
			
		||||
                        output.generated_text = parsed_resp["choices"][0]["text"]
 | 
			
		||||
                    elif "text" in parsed_resp:
 | 
			
		||||
                        output.generated_text = parsed_resp["text"][0]
 | 
			
		||||
                    else:
 | 
			
		||||
                        output.error = (
 | 
			
		||||
                            "Unexpected response format: "
 | 
			
		||||
                            "neither 'choices' nor 'text' found"
 | 
			
		||||
                        )
 | 
			
		||||
                        output.success = False
 | 
			
		||||
                    output.success = True
 | 
			
		||||
                else:
 | 
			
		||||
                    output.error = response.reason or ""
 | 
			
		||||
@ -239,17 +256,20 @@ async def async_request_openai_completions(
 | 
			
		||||
    pbar: Optional[tqdm] = None,
 | 
			
		||||
) -> RequestFuncOutput:
 | 
			
		||||
    api_url = request_func_input.api_url
 | 
			
		||||
    assert api_url.endswith(
 | 
			
		||||
        ("completions", "profile")
 | 
			
		||||
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
 | 
			
		||||
    assert api_url.endswith(("completions", "profile")), (
 | 
			
		||||
        "OpenAI Completions API URL must end with 'completions' or 'profile'."
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    async with aiohttp.ClientSession(trust_env=True,
 | 
			
		||||
                                     timeout=AIOHTTP_TIMEOUT) as session:
 | 
			
		||||
    async with aiohttp.ClientSession(
 | 
			
		||||
        trust_env=True, timeout=AIOHTTP_TIMEOUT
 | 
			
		||||
    ) as session:
 | 
			
		||||
        payload = {
 | 
			
		||||
            "model": request_func_input.model_name \
 | 
			
		||||
                if request_func_input.model_name else request_func_input.model,
 | 
			
		||||
            "model": request_func_input.model_name
 | 
			
		||||
            if request_func_input.model_name
 | 
			
		||||
            else request_func_input.model,
 | 
			
		||||
            "prompt": request_func_input.prompt,
 | 
			
		||||
            "temperature": 0.0,
 | 
			
		||||
            "repetition_penalty": 1.0,
 | 
			
		||||
            "max_tokens": request_func_input.output_len,
 | 
			
		||||
            "logprobs": request_func_input.logprobs,
 | 
			
		||||
            "stream": True,
 | 
			
		||||
@ -261,9 +281,7 @@ async def async_request_openai_completions(
 | 
			
		||||
            payload["ignore_eos"] = request_func_input.ignore_eos
 | 
			
		||||
        if request_func_input.extra_body:
 | 
			
		||||
            payload.update(request_func_input.extra_body)
 | 
			
		||||
        headers = {
 | 
			
		||||
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
 | 
			
		||||
        }
 | 
			
		||||
        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
 | 
			
		||||
 | 
			
		||||
        output = RequestFuncOutput()
 | 
			
		||||
        output.prompt_len = request_func_input.prompt_len
 | 
			
		||||
@ -272,8 +290,9 @@ async def async_request_openai_completions(
 | 
			
		||||
        st = time.perf_counter()
 | 
			
		||||
        most_recent_timestamp = st
 | 
			
		||||
        try:
 | 
			
		||||
            async with session.post(url=api_url, json=payload,
 | 
			
		||||
                                    headers=headers) as response:
 | 
			
		||||
            async with session.post(
 | 
			
		||||
                url=api_url, json=payload, headers=headers
 | 
			
		||||
            ) as response:
 | 
			
		||||
                if response.status == 200:
 | 
			
		||||
                    first_chunk_received = False
 | 
			
		||||
                    async for chunk_bytes in response.content:
 | 
			
		||||
@ -281,8 +300,7 @@ async def async_request_openai_completions(
 | 
			
		||||
                        if not chunk_bytes:
 | 
			
		||||
                            continue
 | 
			
		||||
 | 
			
		||||
                        chunk = chunk_bytes.decode("utf-8").removeprefix(
 | 
			
		||||
                            "data: ")
 | 
			
		||||
                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
 | 
			
		||||
                        if chunk != "[DONE]":
 | 
			
		||||
                            data = json.loads(chunk)
 | 
			
		||||
 | 
			
		||||
@ -302,21 +320,20 @@ async def async_request_openai_completions(
 | 
			
		||||
 | 
			
		||||
                                # Decoding phase
 | 
			
		||||
                                else:
 | 
			
		||||
                                    output.itl.append(timestamp -
 | 
			
		||||
                                                      most_recent_timestamp)
 | 
			
		||||
                                    output.itl.append(timestamp - most_recent_timestamp)
 | 
			
		||||
 | 
			
		||||
                                most_recent_timestamp = timestamp
 | 
			
		||||
                                generated_text += text or ""
 | 
			
		||||
                            elif usage := data.get("usage"):
 | 
			
		||||
                                output.output_tokens = usage.get(
 | 
			
		||||
                                    "completion_tokens")
 | 
			
		||||
                                output.output_tokens = usage.get("completion_tokens")
 | 
			
		||||
                    if first_chunk_received:
 | 
			
		||||
                        output.success = True
 | 
			
		||||
                    else:
 | 
			
		||||
                        output.success = False
 | 
			
		||||
                        output.error = (
 | 
			
		||||
                            "Never received a valid chunk to calculate TTFT."
 | 
			
		||||
                            "This response will be marked as failed!")
 | 
			
		||||
                            "This response will be marked as failed!"
 | 
			
		||||
                        )
 | 
			
		||||
                    output.generated_text = generated_text
 | 
			
		||||
                    output.latency = most_recent_timestamp - st
 | 
			
		||||
                else:
 | 
			
		||||
@ -337,23 +354,22 @@ async def async_request_openai_chat_completions(
 | 
			
		||||
    pbar: Optional[tqdm] = None,
 | 
			
		||||
) -> RequestFuncOutput:
 | 
			
		||||
    api_url = request_func_input.api_url
 | 
			
		||||
    assert api_url.endswith(
 | 
			
		||||
        ("chat/completions", "profile")
 | 
			
		||||
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 | 
			
		||||
    assert api_url.endswith(("chat/completions", "profile")), (
 | 
			
		||||
        "OpenAI Chat Completions API URL must end with 'chat/completions'."
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    async with aiohttp.ClientSession(trust_env=True,
 | 
			
		||||
                                     timeout=AIOHTTP_TIMEOUT) as session:
 | 
			
		||||
    async with aiohttp.ClientSession(
 | 
			
		||||
        trust_env=True, timeout=AIOHTTP_TIMEOUT
 | 
			
		||||
    ) as session:
 | 
			
		||||
        content = [{"type": "text", "text": request_func_input.prompt}]
 | 
			
		||||
        if request_func_input.multi_modal_content:
 | 
			
		||||
            content.append(request_func_input.multi_modal_content)
 | 
			
		||||
        payload = {
 | 
			
		||||
            "model": request_func_input.model_name \
 | 
			
		||||
                if request_func_input.model_name else request_func_input.model,
 | 
			
		||||
            "model": request_func_input.model_name
 | 
			
		||||
            if request_func_input.model_name
 | 
			
		||||
            else request_func_input.model,
 | 
			
		||||
            "messages": [
 | 
			
		||||
                {
 | 
			
		||||
                    "role": "user",
 | 
			
		||||
                    "content": content
 | 
			
		||||
                },
 | 
			
		||||
                {"role": "user", "content": content},
 | 
			
		||||
            ],
 | 
			
		||||
            "temperature": 0.0,
 | 
			
		||||
            "max_completion_tokens": request_func_input.output_len,
 | 
			
		||||
@ -379,16 +395,16 @@ async def async_request_openai_chat_completions(
 | 
			
		||||
        st = time.perf_counter()
 | 
			
		||||
        most_recent_timestamp = st
 | 
			
		||||
        try:
 | 
			
		||||
            async with session.post(url=api_url, json=payload,
 | 
			
		||||
                                    headers=headers) as response:
 | 
			
		||||
            async with session.post(
 | 
			
		||||
                url=api_url, json=payload, headers=headers
 | 
			
		||||
            ) as response:
 | 
			
		||||
                if response.status == 200:
 | 
			
		||||
                    async for chunk_bytes in response.content:
 | 
			
		||||
                        chunk_bytes = chunk_bytes.strip()
 | 
			
		||||
                        if not chunk_bytes:
 | 
			
		||||
                            continue
 | 
			
		||||
 | 
			
		||||
                        chunk = chunk_bytes.decode("utf-8").removeprefix(
 | 
			
		||||
                            "data: ")
 | 
			
		||||
                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
 | 
			
		||||
                        if chunk != "[DONE]":
 | 
			
		||||
                            timestamp = time.perf_counter()
 | 
			
		||||
                            data = json.loads(chunk)
 | 
			
		||||
@ -402,13 +418,11 @@ async def async_request_openai_chat_completions(
 | 
			
		||||
 | 
			
		||||
                                # Decoding phase
 | 
			
		||||
                                else:
 | 
			
		||||
                                    output.itl.append(timestamp -
 | 
			
		||||
                                                      most_recent_timestamp)
 | 
			
		||||
                                    output.itl.append(timestamp - most_recent_timestamp)
 | 
			
		||||
 | 
			
		||||
                                generated_text += content or ""
 | 
			
		||||
                            elif usage := data.get("usage"):
 | 
			
		||||
                                output.output_tokens = usage.get(
 | 
			
		||||
                                    "completion_tokens")
 | 
			
		||||
                                output.output_tokens = usage.get("completion_tokens")
 | 
			
		||||
 | 
			
		||||
                            most_recent_timestamp = timestamp
 | 
			
		||||
 | 
			
		||||
@ -428,8 +442,115 @@ async def async_request_openai_chat_completions(
 | 
			
		||||
    return output
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
async def async_request_openai_audio(
 | 
			
		||||
    request_func_input: RequestFuncInput,
 | 
			
		||||
    pbar: Optional[tqdm] = None,
 | 
			
		||||
) -> RequestFuncOutput:
 | 
			
		||||
    # Lazy import without PlaceholderModule to avoid vllm dep.
 | 
			
		||||
    import soundfile
 | 
			
		||||
 | 
			
		||||
    api_url = request_func_input.api_url
 | 
			
		||||
    assert api_url.endswith(("transcriptions", "translations")), (
 | 
			
		||||
        "OpenAI Chat Completions API URL must end with 'transcriptions' "
 | 
			
		||||
    )
 | 
			
		||||
    "or `translations`."
 | 
			
		||||
 | 
			
		||||
    async with aiohttp.ClientSession(
 | 
			
		||||
        trust_env=True, timeout=AIOHTTP_TIMEOUT
 | 
			
		||||
    ) as session:
 | 
			
		||||
        content = [{"type": "text", "text": request_func_input.prompt}]
 | 
			
		||||
        payload = {
 | 
			
		||||
            "model": request_func_input.model_name
 | 
			
		||||
            if request_func_input.model_name
 | 
			
		||||
            else request_func_input.model,
 | 
			
		||||
            "temperature": 0.0,
 | 
			
		||||
            "max_completion_tokens": request_func_input.output_len,
 | 
			
		||||
            "stream": True,
 | 
			
		||||
            "language": "en",
 | 
			
		||||
            # Flattened due to multipart/form-data
 | 
			
		||||
            "stream_include_usage": True,
 | 
			
		||||
            "stream_continuous_usage_stats": True,
 | 
			
		||||
        }
 | 
			
		||||
        if request_func_input.extra_body:
 | 
			
		||||
            payload.update(request_func_input.extra_body)
 | 
			
		||||
        headers = {
 | 
			
		||||
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        # Send audio file
 | 
			
		||||
        def to_bytes(y, sr):
 | 
			
		||||
            buffer = io.BytesIO()
 | 
			
		||||
            soundfile.write(buffer, y, sr, format="WAV")
 | 
			
		||||
            buffer.seek(0)
 | 
			
		||||
            return buffer
 | 
			
		||||
 | 
			
		||||
        with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
 | 
			
		||||
            form = aiohttp.FormData()
 | 
			
		||||
            form.add_field("file", f, content_type="audio/wav")
 | 
			
		||||
            for key, value in payload.items():
 | 
			
		||||
                form.add_field(key, str(value))
 | 
			
		||||
 | 
			
		||||
            output = RequestFuncOutput()
 | 
			
		||||
            output.prompt_len = request_func_input.prompt_len
 | 
			
		||||
 | 
			
		||||
            generated_text = ""
 | 
			
		||||
            ttft = 0.0
 | 
			
		||||
            st = time.perf_counter()
 | 
			
		||||
            most_recent_timestamp = st
 | 
			
		||||
            try:
 | 
			
		||||
                async with session.post(
 | 
			
		||||
                    url=api_url, data=form, headers=headers
 | 
			
		||||
                ) as response:
 | 
			
		||||
                    if response.status == 200:
 | 
			
		||||
                        async for chunk_bytes in response.content:
 | 
			
		||||
                            chunk_bytes = chunk_bytes.strip()
 | 
			
		||||
                            if not chunk_bytes:
 | 
			
		||||
                                continue
 | 
			
		||||
 | 
			
		||||
                            chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
 | 
			
		||||
                            if chunk != "[DONE]":
 | 
			
		||||
                                timestamp = time.perf_counter()
 | 
			
		||||
                                data = json.loads(chunk)
 | 
			
		||||
 | 
			
		||||
                                if choices := data.get("choices"):
 | 
			
		||||
                                    content = choices[0]["delta"].get("content")
 | 
			
		||||
                                    # First token
 | 
			
		||||
                                    if ttft == 0.0:
 | 
			
		||||
                                        ttft = timestamp - st
 | 
			
		||||
                                        output.ttft = ttft
 | 
			
		||||
 | 
			
		||||
                                    # Decoding phase
 | 
			
		||||
                                    else:
 | 
			
		||||
                                        output.itl.append(
 | 
			
		||||
                                            timestamp - most_recent_timestamp
 | 
			
		||||
                                        )
 | 
			
		||||
 | 
			
		||||
                                    generated_text += content or ""
 | 
			
		||||
                                elif usage := data.get("usage"):
 | 
			
		||||
                                    output.output_tokens = usage.get(
 | 
			
		||||
                                        "completion_tokens"
 | 
			
		||||
                                    )
 | 
			
		||||
 | 
			
		||||
                                most_recent_timestamp = timestamp
 | 
			
		||||
 | 
			
		||||
                        output.generated_text = generated_text
 | 
			
		||||
                        output.success = True
 | 
			
		||||
                        output.latency = most_recent_timestamp - st
 | 
			
		||||
                    else:
 | 
			
		||||
                        output.error = response.reason or ""
 | 
			
		||||
                        output.success = False
 | 
			
		||||
            except Exception:
 | 
			
		||||
                output.success = False
 | 
			
		||||
                exc_info = sys.exc_info()
 | 
			
		||||
                output.error = "".join(traceback.format_exception(*exc_info))
 | 
			
		||||
 | 
			
		||||
        if pbar:
 | 
			
		||||
            pbar.update(1)
 | 
			
		||||
        return output
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_model(pretrained_model_name_or_path: str) -> str:
 | 
			
		||||
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
 | 
			
		||||
    if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true":
 | 
			
		||||
        from modelscope import snapshot_download
 | 
			
		||||
 | 
			
		||||
        from vllm.model_executor.model_loader.weight_utils import get_lock
 | 
			
		||||
@ -440,7 +561,8 @@ def get_model(pretrained_model_name_or_path: str) -> str:
 | 
			
		||||
            model_path = snapshot_download(
 | 
			
		||||
                model_id=pretrained_model_name_or_path,
 | 
			
		||||
                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
 | 
			
		||||
                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
 | 
			
		||||
                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
            return model_path
 | 
			
		||||
    return pretrained_model_name_or_path
 | 
			
		||||
@ -453,23 +575,23 @@ def get_tokenizer(
 | 
			
		||||
    **kwargs,
 | 
			
		||||
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
 | 
			
		||||
    if pretrained_model_name_or_path is not None and not os.path.exists(
 | 
			
		||||
            pretrained_model_name_or_path):
 | 
			
		||||
        pretrained_model_name_or_path = get_model(
 | 
			
		||||
            pretrained_model_name_or_path)
 | 
			
		||||
        pretrained_model_name_or_path
 | 
			
		||||
    ):
 | 
			
		||||
        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
 | 
			
		||||
    if tokenizer_mode == "slow":
 | 
			
		||||
        if kwargs.get("use_fast", False):
 | 
			
		||||
            raise ValueError(
 | 
			
		||||
                "Cannot use the fast tokenizer in slow tokenizer mode.")
 | 
			
		||||
            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
 | 
			
		||||
        kwargs["use_fast"] = False
 | 
			
		||||
    if tokenizer_mode == "mistral":
 | 
			
		||||
        try:
 | 
			
		||||
            from vllm.transformers_utils.tokenizer import MistralTokenizer
 | 
			
		||||
        except ImportError as e:
 | 
			
		||||
            raise ImportError("MistralTokenizer requires vllm package.\n"
 | 
			
		||||
                              "Please install it with `pip install vllm` "
 | 
			
		||||
                              "to use mistral tokenizer mode.") from e
 | 
			
		||||
        return MistralTokenizer.from_pretrained(
 | 
			
		||||
            str(pretrained_model_name_or_path))
 | 
			
		||||
            raise ImportError(
 | 
			
		||||
                "MistralTokenizer requires vllm package.\n"
 | 
			
		||||
                "Please install it with `pip install vllm` "
 | 
			
		||||
                "to use mistral tokenizer mode."
 | 
			
		||||
            ) from e
 | 
			
		||||
        return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
 | 
			
		||||
    else:
 | 
			
		||||
        return AutoTokenizer.from_pretrained(
 | 
			
		||||
            pretrained_model_name_or_path,
 | 
			
		||||
@ -485,7 +607,14 @@ ASYNC_REQUEST_FUNCS = {
 | 
			
		||||
    "deepspeed-mii": async_request_deepspeed_mii,
 | 
			
		||||
    "openai": async_request_openai_completions,
 | 
			
		||||
    "openai-chat": async_request_openai_chat_completions,
 | 
			
		||||
    "openai-audio": async_request_openai_audio,
 | 
			
		||||
    "tensorrt-llm": async_request_trt_llm,
 | 
			
		||||
    "scalellm": async_request_openai_completions,
 | 
			
		||||
    "sglang": async_request_openai_completions,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
OPENAI_COMPATIBLE_BACKENDS = [
 | 
			
		||||
    k
 | 
			
		||||
    for k, v in ASYNC_REQUEST_FUNCS.items()
 | 
			
		||||
    if v in (async_request_openai_completions, async_request_openai_chat_completions)
 | 
			
		||||
]
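
The `openai-audio` backend registered above drives the OpenAI-style audio endpoints with a multipart/form-data body (an audio file plus the flattened payload fields). A hedged curl sketch of the kind of request it issues; the server address, model name, and `sample.wav` are placeholders:

```bash
curl -N http://localhost:8000/v1/audio/transcriptions \
  -H "Authorization: Bearer ${OPENAI_API_KEY}" \
  -F "file=@sample.wav;type=audio/wav" \
  -F "model=openai/whisper-large-v3" \
  -F "language=en" \
  -F "stream=true"
```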
 | 
			
		||||
 | 
			
		||||
@ -23,7 +23,8 @@ from abc import ABC, abstractmethod
 | 
			
		||||
from collections.abc import Mapping
 | 
			
		||||
from dataclasses import dataclass
 | 
			
		||||
from functools import cache
 | 
			
		||||
from typing import Any, Optional, Union
 | 
			
		||||
from io import BytesIO
 | 
			
		||||
from typing import Any, Callable, Optional, Union
 | 
			
		||||
 | 
			
		||||
import numpy as np
 | 
			
		||||
import pandas as pd
 | 
			
		||||
@ -34,6 +35,7 @@ from transformers import PreTrainedTokenizerBase
 | 
			
		||||
from vllm.lora.request import LoRARequest
 | 
			
		||||
from vllm.lora.utils import get_adapter_absolute_path
 | 
			
		||||
from vllm.multimodal import MultiModalDataDict
 | 
			
		||||
from vllm.multimodal.image import convert_image_mode
 | 
			
		||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
@ -63,6 +65,7 @@ class SampleRequest:
 | 
			
		||||
 | 
			
		||||
class BenchmarkDataset(ABC):
 | 
			
		||||
    DEFAULT_SEED = 0
 | 
			
		||||
    IS_MULTIMODAL = False
 | 
			
		||||
 | 
			
		||||
    def __init__(
 | 
			
		||||
        self,
 | 
			
		||||
@ -80,14 +83,12 @@ class BenchmarkDataset(ABC):
 | 
			
		||||
        self.dataset_path = dataset_path
 | 
			
		||||
        # Set the random seed, ensuring that a None value is replaced with the
 | 
			
		||||
        # default seed.
 | 
			
		||||
        self.random_seed = (random_seed
 | 
			
		||||
                            if random_seed is not None else self.DEFAULT_SEED)
 | 
			
		||||
        self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
 | 
			
		||||
        self.data = None
 | 
			
		||||
 | 
			
		||||
    def apply_multimodal_chat_transformation(
 | 
			
		||||
            self,
 | 
			
		||||
            prompt: str,
 | 
			
		||||
            mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
 | 
			
		||||
        self, prompt: str, mm_content: Optional[MultiModalDataDict] = None
 | 
			
		||||
    ) -> list[dict]:
 | 
			
		||||
        """
 | 
			
		||||
        Transform a prompt and optional multimodal content into a chat format.
 | 
			
		||||
        This method is used for chat models that expect a specific conversation
 | 
			
		||||
@ -109,8 +110,7 @@ class BenchmarkDataset(ABC):
 | 
			
		||||
            NotImplementedError: If a subclass does not implement this method.
 | 
			
		||||
        """
 | 
			
		||||
        # TODO (jenniferzhao): add support for downloading data
 | 
			
		||||
        raise NotImplementedError(
 | 
			
		||||
            "load_data must be implemented in subclasses.")
 | 
			
		||||
        raise NotImplementedError("load_data must be implemented in subclasses.")
 | 
			
		||||
 | 
			
		||||
    def get_random_lora_request(
 | 
			
		||||
        self,
 | 
			
		||||
@ -156,8 +156,9 @@ class BenchmarkDataset(ABC):
 | 
			
		||||
        return lora_request, lora_tokenizer_cache[lora_id] or tokenizer
 | 
			
		||||
 | 
			
		||||
    @abstractmethod
 | 
			
		||||
    def sample(self, tokenizer: PreTrainedTokenizerBase,
 | 
			
		||||
               num_requests: int) -> list[SampleRequest]:
 | 
			
		||||
    def sample(
 | 
			
		||||
        self, tokenizer: PreTrainedTokenizerBase, num_requests: int
 | 
			
		||||
    ) -> list[SampleRequest]:
 | 
			
		||||
        """
 | 
			
		||||
        Abstract method to generate sample requests from the dataset.
 | 
			
		||||
 | 
			
		||||
@ -175,8 +176,9 @@ class BenchmarkDataset(ABC):
 | 
			
		||||
        """
 | 
			
		||||
        raise NotImplementedError("sample must be implemented in subclasses.")
 | 
			
		||||
 | 
			
		||||
    def maybe_oversample_requests(self, requests: list[SampleRequest],
 | 
			
		||||
                                  num_requests: int) -> None:
 | 
			
		||||
    def maybe_oversample_requests(
 | 
			
		||||
        self, requests: list[SampleRequest], num_requests: int
 | 
			
		||||
    ) -> None:
 | 
			
		||||
        """
 | 
			
		||||
        Oversamples the list of requests if its size is less than the desired
 | 
			
		||||
        number.
 | 
			
		||||
@ -187,11 +189,9 @@ class BenchmarkDataset(ABC):
 | 
			
		||||
        """
 | 
			
		||||
        if len(requests) < num_requests:
 | 
			
		||||
            random.seed(self.random_seed)
 | 
			
		||||
            additional = random.choices(requests,
 | 
			
		||||
                                        k=num_requests - len(requests))
 | 
			
		||||
            additional = random.choices(requests, k=num_requests - len(requests))
 | 
			
		||||
            requests.extend(additional)
 | 
			
		||||
            logger.info("Oversampled requests to reach %d total samples.",
 | 
			
		||||
                        num_requests)
 | 
			
		||||
            logger.info("Oversampled requests to reach %d total samples.", num_requests)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
@ -216,14 +216,14 @@ def is_valid_sequence(
 | 
			
		||||
    """
 | 
			
		||||
    # Check for invalid conditions
 | 
			
		||||
    prompt_too_short = prompt_len < min_len
 | 
			
		||||
    output_too_short = (not skip_min_output_len_check) and (output_len
 | 
			
		||||
                                                            < min_len)
 | 
			
		||||
    output_too_short = (not skip_min_output_len_check) and (output_len < min_len)
 | 
			
		||||
    prompt_too_long = prompt_len > max_prompt_len
 | 
			
		||||
    combined_too_long = (prompt_len + output_len) > max_total_len
 | 
			
		||||
 | 
			
		||||
    # Return True if none of the invalid conditions are met
 | 
			
		||||
    return not (prompt_too_short or output_too_short or prompt_too_long
 | 
			
		||||
                or combined_too_long)
 | 
			
		||||
    return not (
 | 
			
		||||
        prompt_too_short or output_too_short or prompt_too_long or combined_too_long
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@cache
 | 
			
		||||
@ -239,41 +239,44 @@ def process_image(image: Any) -> Mapping[str, Any]:
 | 
			
		||||
    """
 | 
			
		||||
    Process a single image input and return a multimedia content dictionary.
 | 
			
		||||
 | 
			
		||||
    For a PIL.Image.Image input:
 | 
			
		||||
      - Converts the image to RGB.
 | 
			
		||||
      - Saves the image as a JPEG in-memory.
 | 
			
		||||
      - Encodes the JPEG data as a base64 string.
 | 
			
		||||
      - Returns a dictionary with the image as a base64 data URL.
 | 
			
		||||
    Supports three input types:
 | 
			
		||||
 | 
			
		||||
    For a string input:
 | 
			
		||||
      - Treats the string as a URL or file path.
 | 
			
		||||
      - Prepends "file://" if the string doesn't start with "http://" or
 | 
			
		||||
        "file://".
 | 
			
		||||
      - Returns a dictionary with the image URL.
 | 
			
		||||
    1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
 | 
			
		||||
       containing raw image data.  - Loads the bytes as a PIL.Image.Image.
 | 
			
		||||
 | 
			
		||||
    2. PIL.Image.Image input: - Converts the image to RGB.  - Saves the image as
 | 
			
		||||
       a JPEG in memory.  - Encodes the JPEG data as a base64 string.  - Returns
 | 
			
		||||
       a dictionary with the image as a base64 data URL.
 | 
			
		||||
 | 
			
		||||
    3. String input: - Treats the string as a URL or local file path.  -
 | 
			
		||||
       Prepends "file://" if the string doesn't start with "http://" or
 | 
			
		||||
       "file://".  - Returns a dictionary with the image URL.
 | 
			
		||||
 | 
			
		||||
    Raises:
 | 
			
		||||
      ValueError: If the input is neither a PIL.Image.Image nor a string.
 | 
			
		||||
        ValueError: If the input is not a supported type.
 | 
			
		||||
    """
 | 
			
		||||
    if isinstance(image, dict) and "bytes" in image:
 | 
			
		||||
        image = Image.open(BytesIO(image["bytes"]))
 | 
			
		||||
    if isinstance(image, Image.Image):
 | 
			
		||||
        image = image.convert("RGB")
 | 
			
		||||
        image = convert_image_mode(image, "RGB")
 | 
			
		||||
        with io.BytesIO() as image_data:
 | 
			
		||||
            image.save(image_data, format="JPEG")
 | 
			
		||||
            image_base64 = base64.b64encode(
 | 
			
		||||
                image_data.getvalue()).decode("utf-8")
 | 
			
		||||
            image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
 | 
			
		||||
        return {
 | 
			
		||||
            "type": "image_url",
 | 
			
		||||
            "image_url": {
 | 
			
		||||
                "url": f"data:image/jpeg;base64,{image_base64}"
 | 
			
		||||
            },
 | 
			
		||||
            "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
    if isinstance(image, str):
 | 
			
		||||
        image_url = (image if image.startswith(
 | 
			
		||||
            ("http://", "file://")) else f"file://{image}")
 | 
			
		||||
        image_url = (
 | 
			
		||||
            image if image.startswith(("http://", "file://")) else f"file://{image}"
 | 
			
		||||
        )
 | 
			
		||||
        return {"type": "image_url", "image_url": {"url": image_url}}
 | 
			
		||||
 | 
			
		||||
    raise ValueError(
 | 
			
		||||
        f"Invalid image input {image}. Must be a PIL.Image.Image or str.")
 | 
			
		||||
        f"Invalid image input {image}. Must be a PIL.Image.Image"
 | 
			
		||||
        " or str or dictionary with raw image bytes."
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
@ -284,7 +287,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
 | 
			
		||||
class RandomDataset(BenchmarkDataset):
    # Default values copied from benchmark_serving.py for the random dataset.
    DEFAULT_PREFIX_LEN = 0
    DEFAULT_RANGE_RATIO = 1.0
    DEFAULT_RANGE_RATIO = 0.0
    DEFAULT_INPUT_LEN = 1024
    DEFAULT_OUTPUT_LEN = 128

@@ -304,35 +307,62 @@ class RandomDataset(BenchmarkDataset):
        output_len: int = DEFAULT_OUTPUT_LEN,
        **kwargs,
    ) -> list[SampleRequest]:
        # Enforce range_ratio < 1
        assert range_ratio < 1.0, (
            "random_range_ratio must be < 1.0 to ensure a valid sampling range"
        )

        vocab_size = tokenizer.vocab_size
        num_special_tokens = tokenizer.num_special_tokens_to_add()
        real_input_len = input_len - num_special_tokens

        prefix_token_ids = (np.random.randint(
            0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
        prefix_token_ids = (
            np.random.randint(0, vocab_size, size=prefix_len).tolist()
            if prefix_len > 0
            else []
        )

        input_low = int(input_len * range_ratio)
        output_low = int(output_len * range_ratio)
        # New sampling logic: [X * (1 - b), X * (1 + b)]
        input_low = int(real_input_len * (1 - range_ratio))
        input_high = int(real_input_len * (1 + range_ratio))
        output_low = int(output_len * (1 - range_ratio))
        output_high = int(output_len * (1 + range_ratio))

        input_lens = np.random.randint(input_low,
                                       input_len + 1,
                                       size=num_requests)
        output_lens = np.random.randint(output_low,
                                        output_len + 1,
                                        size=num_requests)
        # Add logging for debugging
        logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
        logger.info("Sampling output_len from [%s, %s]", output_low, output_high)

        input_lens = np.random.randint(input_low, input_high + 1, size=num_requests)
        output_lens = np.random.randint(output_low, output_high + 1, size=num_requests)
        offsets = np.random.randint(0, vocab_size, size=num_requests)

        requests = []
        for i in range(num_requests):
            inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) %
                         vocab_size).tolist()
            inner_seq = (
                (offsets[i] + i + np.arange(input_lens[i])) % vocab_size
            ).tolist()
            token_sequence = prefix_token_ids + inner_seq
            prompt = tokenizer.decode(token_sequence)
            # After decoding the prompt we have to encode and decode it again.
            # This is done because in some cases N consecutive tokens
            # give a string tokenized into != N number of tokens.
            # For example for GPT2Tokenizer:
            # [6880, 6881] -> ['Ġcalls', 'here'] ->
            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
            # To avoid uncontrolled change of the prompt length,
            # the encoded sequence is truncated before being decode again.
            re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
                : input_lens[i]
            ]
            prompt = tokenizer.decode(re_encoded_sequence)
            total_input_len = prefix_len + int(input_lens[i])
            requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=total_input_len,
                    expected_output_len=int(output_lens[i]),
                ))
                )
            )
        return requests

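For illustration only (not from the repository): a minimal standalone sketch of the re-tokenization issue described in the comment above, assuming the transformers GPT-2 tokenizer is available.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
token_ids = [6880, 6881]                       # the GPT2Tokenizer example from the comment
prompt = tokenizer.decode(token_ids)
re_encoded = tokenizer.encode(prompt, add_special_tokens=False)
print(len(token_ids), len(re_encoded))         # the decode/encode round trip can change the count
stable = re_encoded[: len(token_ids)]          # truncate, as sample() does above
print(tokenizer.decode(stable))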
@@ -359,7 +389,8 @@ class ShareGPTDataset(BenchmarkDataset):
 | 
			
		||||
            self.data = json.load(f)
 | 
			
		||||
        # Filter entries with at least two conversation turns.
 | 
			
		||||
        self.data = [
 | 
			
		||||
            entry for entry in self.data
 | 
			
		||||
            entry
 | 
			
		||||
            for entry in self.data
 | 
			
		||||
            if "conversations" in entry and len(entry["conversations"]) >= 2
 | 
			
		||||
        ]
 | 
			
		||||
        random.seed(self.random_seed)
 | 
			
		||||
@@ -385,27 +416,28 @@ class ShareGPTDataset(BenchmarkDataset):
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
            lora_request, tokenizer = self.get_random_lora_request(
 | 
			
		||||
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
 | 
			
		||||
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
 | 
			
		||||
            )
 | 
			
		||||
            prompt_ids = tokenizer(prompt).input_ids
 | 
			
		||||
            completion_ids = tokenizer(completion).input_ids
 | 
			
		||||
            prompt_len = len(prompt_ids)
 | 
			
		||||
            new_output_len = (len(completion_ids)
 | 
			
		||||
                              if output_len is None else output_len)
 | 
			
		||||
            if not is_valid_sequence(prompt_len,
 | 
			
		||||
                                     new_output_len,
 | 
			
		||||
                                     skip_min_output_len_check=output_len
 | 
			
		||||
                                     is not None):
 | 
			
		||||
            new_output_len = len(completion_ids) if output_len is None else output_len
 | 
			
		||||
            if not is_valid_sequence(
 | 
			
		||||
                prompt_len,
 | 
			
		||||
                new_output_len,
 | 
			
		||||
                skip_min_output_len_check=output_len is not None,
 | 
			
		||||
            ):
 | 
			
		||||
                continue
 | 
			
		||||
            if enable_multimodal_chat:
 | 
			
		||||
                prompt = self.apply_multimodal_chat_transformation(
 | 
			
		||||
                    prompt, None)
 | 
			
		||||
                prompt = self.apply_multimodal_chat_transformation(prompt, None)
 | 
			
		||||
            samples.append(
 | 
			
		||||
                SampleRequest(
 | 
			
		||||
                    prompt=prompt,
 | 
			
		||||
                    prompt_len=prompt_len,
 | 
			
		||||
                    expected_output_len=new_output_len,
 | 
			
		||||
                    lora_request=lora_request,
 | 
			
		||||
                ))
 | 
			
		||||
                )
 | 
			
		||||
            )
 | 
			
		||||
        self.maybe_oversample_requests(samples, num_requests)
 | 
			
		||||
        return samples
 | 
			
		||||
 | 
			
		||||
@@ -451,42 +483,45 @@ class SonnetDataset(BenchmarkDataset):
 | 
			
		||||
    ) -> list:
 | 
			
		||||
        # Calculate average token length for a poem line.
 | 
			
		||||
        tokenized_lines = [tokenizer(line).input_ids for line in self.data]
 | 
			
		||||
        avg_len = sum(len(tokens)
 | 
			
		||||
                      for tokens in tokenized_lines) / len(tokenized_lines)
 | 
			
		||||
        avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines)
 | 
			
		||||
 | 
			
		||||
        # Build the base prompt.
 | 
			
		||||
        base_prompt = "Pick as many lines as you can from these poem lines:\n"
 | 
			
		||||
        base_msg = [{"role": "user", "content": base_prompt}]
 | 
			
		||||
        base_fmt = tokenizer.apply_chat_template(base_msg,
 | 
			
		||||
                                                 add_generation_prompt=True,
 | 
			
		||||
                                                 tokenize=False)
 | 
			
		||||
        base_fmt = tokenizer.apply_chat_template(
 | 
			
		||||
            base_msg, add_generation_prompt=True, tokenize=False
 | 
			
		||||
        )
 | 
			
		||||
        base_offset = len(tokenizer(base_fmt).input_ids)
 | 
			
		||||
        if input_len <= base_offset:
 | 
			
		||||
            raise ValueError(
 | 
			
		||||
                f"'input_len' must be higher than the base prompt length "
 | 
			
		||||
                f"({base_offset}).")
 | 
			
		||||
                f"({base_offset})."
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
        # Determine how many poem lines to use.
 | 
			
		||||
        num_input_lines = round((input_len - base_offset) / avg_len)
 | 
			
		||||
        num_prefix_lines = round((prefix_len - base_offset) / avg_len)
 | 
			
		||||
        num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
 | 
			
		||||
        prefix_lines = self.data[:num_prefix_lines]
 | 
			
		||||
 | 
			
		||||
        samples = []
 | 
			
		||||
        for _ in range(num_requests):
 | 
			
		||||
            extra_lines = random.choices(self.data,
 | 
			
		||||
                                         k=num_input_lines - num_prefix_lines)
 | 
			
		||||
        while len(samples) < num_requests:
 | 
			
		||||
            extra_lines = random.choices(
 | 
			
		||||
                self.data, k=num_input_lines - num_prefix_lines
 | 
			
		||||
            )
 | 
			
		||||
            prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
 | 
			
		||||
            msg = [{"role": "user", "content": prompt}]
 | 
			
		||||
            prompt_formatted = tokenizer.apply_chat_template(
 | 
			
		||||
                msg, add_generation_prompt=True, tokenize=False)
 | 
			
		||||
                msg, add_generation_prompt=True, tokenize=False
 | 
			
		||||
            )
 | 
			
		||||
            prompt_len = len(tokenizer(prompt_formatted).input_ids)
 | 
			
		||||
            samples.append(
 | 
			
		||||
                SampleRequest(
 | 
			
		||||
                    prompt=prompt_formatted
 | 
			
		||||
                    if return_prompt_formatted else prompt,
 | 
			
		||||
                    prompt_len=prompt_len,
 | 
			
		||||
                    expected_output_len=output_len,
 | 
			
		||||
                ))
 | 
			
		||||
            if prompt_len <= input_len:
 | 
			
		||||
                samples.append(
 | 
			
		||||
                    SampleRequest(
 | 
			
		||||
                        prompt=prompt_formatted if return_prompt_formatted else prompt,
 | 
			
		||||
                        prompt_len=prompt_len,
 | 
			
		||||
                        expected_output_len=output_len,
 | 
			
		||||
                    )
 | 
			
		||||
                )
 | 
			
		||||
        return samples
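A back-of-the-envelope illustration (not part of the diff) of the line-count arithmetic above, with invented numbers.

# Hypothetical values: target prompt length, shared prefix length,
# chat-template overhead in tokens, and average tokens per poem line.
input_len, prefix_len, base_offset, avg_len = 550, 200, 50, 10.0

num_input_lines = round((input_len - base_offset) / avg_len)            # 50 lines per prompt
num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)  # 15 shared prefix lines
extra = num_input_lines - num_prefix_lines                              # 35 lines sampled per request
print(num_input_lines, num_prefix_lines, extra)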
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -506,7 +541,9 @@ class BurstGPTDataset(BenchmarkDataset):
 | 
			
		||||
        super().__init__(**kwargs)
 | 
			
		||||
        self.load_data()
 | 
			
		||||
 | 
			
		||||
    def load_data(self, ):
 | 
			
		||||
    def load_data(
 | 
			
		||||
        self,
 | 
			
		||||
    ):
 | 
			
		||||
        if self.dataset_path is None:
 | 
			
		||||
            raise ValueError("dataset_path must be provided for loading data.")
 | 
			
		||||
 | 
			
		||||
@@ -520,8 +557,7 @@ class BurstGPTDataset(BenchmarkDataset):
 | 
			
		||||
 | 
			
		||||
    def _sample_loaded_data(self, num_requests: int) -> list:
 | 
			
		||||
        if num_requests <= len(self.data):
 | 
			
		||||
            data = self.data.sample(n=num_requests,
 | 
			
		||||
                                    random_state=self.random_seed)
 | 
			
		||||
            data = self.data.sample(n=num_requests, random_state=self.random_seed)
 | 
			
		||||
        else:
 | 
			
		||||
            data = self.data.sample(
 | 
			
		||||
                n=num_requests,
 | 
			
		||||
@@ -545,7 +581,8 @@ class BurstGPTDataset(BenchmarkDataset):
 | 
			
		||||
            input_len = int(data[i][2])
 | 
			
		||||
            output_len = int(data[i][3])
 | 
			
		||||
            lora_req, tokenizer = self.get_random_lora_request(
 | 
			
		||||
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
 | 
			
		||||
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
 | 
			
		||||
            )
 | 
			
		||||
            vocab_size = tokenizer.vocab_size
 | 
			
		||||
            # Generate a synthetic prompt: a list of token IDs computed as (i +
 | 
			
		||||
            # j) modulo vocab_size.
 | 
			
		||||
@@ -557,64 +594,71 @@ class BurstGPTDataset(BenchmarkDataset):
 | 
			
		||||
                    prompt_len=input_len,
 | 
			
		||||
                    expected_output_len=output_len,
 | 
			
		||||
                    lora_request=lora_req,
 | 
			
		||||
                ))
 | 
			
		||||
                )
 | 
			
		||||
            )
 | 
			
		||||
        return samples
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
# HuggingFace Dataset Implementation
 | 
			
		||||
# HuggingFace Dataset Base Implementation
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class HuggingFaceDataset(BenchmarkDataset):
 | 
			
		||||
    """
 | 
			
		||||
    Dataset class for processing a HuggingFace dataset with conversation data
 | 
			
		||||
    and optional images.
 | 
			
		||||
    """
 | 
			
		||||
    """Base class for datasets hosted on HuggingFace."""
 | 
			
		||||
 | 
			
		||||
    SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
 | 
			
		||||
 | 
			
		||||
    def __init__(
 | 
			
		||||
        self,
 | 
			
		||||
        dataset_path: str,
 | 
			
		||||
        dataset_split: str,
 | 
			
		||||
        dataset_subset: Optional[str] = None,
 | 
			
		||||
        **kwargs,
 | 
			
		||||
    ) -> None:
 | 
			
		||||
        super().__init__(**kwargs)
 | 
			
		||||
        super().__init__(dataset_path=dataset_path, **kwargs)
 | 
			
		||||
 | 
			
		||||
        self.dataset_split = dataset_split
 | 
			
		||||
        self.dataset_subset = dataset_subset
 | 
			
		||||
 | 
			
		||||
        self.load_data()
 | 
			
		||||
 | 
			
		||||
    def load_data(self) -> None:
 | 
			
		||||
        if not self.dataset_path:
 | 
			
		||||
            raise ValueError("dataset_path must be provided for loading data.")
 | 
			
		||||
 | 
			
		||||
        """Load data from HuggingFace datasets."""
 | 
			
		||||
        self.data = load_dataset(
 | 
			
		||||
            self.dataset_path,
 | 
			
		||||
            name=self.dataset_subset,
 | 
			
		||||
            split=self.dataset_split,
 | 
			
		||||
            streaming=True,
 | 
			
		||||
        )
 | 
			
		||||
        if self.data.features is None or "conversations" \
 | 
			
		||||
            not in self.data.features:
 | 
			
		||||
            raise ValueError(
 | 
			
		||||
                "HuggingFaceDataset currently only supports datasets with "
 | 
			
		||||
                "a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
 | 
			
		||||
                "Please consider contributing if you would like to add "
 | 
			
		||||
                "support for additional dataset formats.")
 | 
			
		||||
        # Shuffle and filter examples with at least 2 conversations.
 | 
			
		||||
        self.data = self.data.shuffle(seed=self.random_seed).filter(
 | 
			
		||||
            lambda x: len(x["conversations"]) >= 2)
 | 
			
		||||
        self.data = self.data.shuffle(seed=self.random_seed)
 | 
			
		||||
 | 
			
		||||
    def sample(self,
 | 
			
		||||
               tokenizer: PreTrainedTokenizerBase,
 | 
			
		||||
               num_requests: int,
 | 
			
		||||
               output_len: Optional[int] = None,
 | 
			
		||||
               enable_multimodal_chat: bool = False,
 | 
			
		||||
               **kwargs) -> list:
 | 
			
		||||
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
# Conversation Dataset Implementation
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ConversationDataset(HuggingFaceDataset):
 | 
			
		||||
    """Dataset for conversation data with multimodal support."""
 | 
			
		||||
 | 
			
		||||
    SUPPORTED_DATASET_PATHS = {
 | 
			
		||||
        "lmms-lab/LLaVA-OneVision-Data",
 | 
			
		||||
        "Aeala/ShareGPT_Vicuna_unfiltered",
 | 
			
		||||
    }
 | 
			
		||||
    IS_MULTIMODAL = True
 | 
			
		||||
 | 
			
		||||
    def sample(
 | 
			
		||||
        self,
 | 
			
		||||
        tokenizer: PreTrainedTokenizerBase,
 | 
			
		||||
        num_requests: int,
 | 
			
		||||
        output_len: Optional[int] = None,
 | 
			
		||||
        enable_multimodal_chat: bool = False,
 | 
			
		||||
        **kwargs,
 | 
			
		||||
    ) -> list:
 | 
			
		||||
        # Filter examples with at least 2 conversations
 | 
			
		||||
        filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
 | 
			
		||||
        sampled_requests = []
 | 
			
		||||
        dynamic_output = output_len is None
 | 
			
		||||
 | 
			
		||||
        for item in self.data:
 | 
			
		||||
        for item in filtered_data:
 | 
			
		||||
            if len(sampled_requests) >= num_requests:
 | 
			
		||||
                break
 | 
			
		||||
            conv = item["conversations"]
 | 
			
		||||
@@ -626,24 +670,22 @@ class HuggingFaceDataset(BenchmarkDataset):
 | 
			
		||||
            completion_len = len(completion_ids)
 | 
			
		||||
            output_len = completion_len if dynamic_output else output_len
 | 
			
		||||
            assert isinstance(output_len, int) and output_len > 0
 | 
			
		||||
            if dynamic_output and not is_valid_sequence(
 | 
			
		||||
                    prompt_len, completion_len):
 | 
			
		||||
            if dynamic_output and not is_valid_sequence(prompt_len, completion_len):
 | 
			
		||||
                continue
 | 
			
		||||
            mm_content = process_image(
 | 
			
		||||
                item["image"]) if "image" in item else None
 | 
			
		||||
            mm_content = process_image(item["image"]) if "image" in item else None
 | 
			
		||||
            if enable_multimodal_chat:
 | 
			
		||||
                # Note: when chat is enabled the request prompt_len is no longer
 | 
			
		||||
                # accurate and we will be using request output to count the
 | 
			
		||||
                # actual prompt len and output len
 | 
			
		||||
                prompt = self.apply_multimodal_chat_transformation(
 | 
			
		||||
                    prompt, mm_content)
 | 
			
		||||
                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
 | 
			
		||||
            sampled_requests.append(
 | 
			
		||||
                SampleRequest(
 | 
			
		||||
                    prompt=prompt,
 | 
			
		||||
                    prompt_len=prompt_len,
 | 
			
		||||
                    expected_output_len=output_len,
 | 
			
		||||
                    multi_modal_data=mm_content,
 | 
			
		||||
                ))
 | 
			
		||||
                )
 | 
			
		||||
            )
 | 
			
		||||
        self.maybe_oversample_requests(sampled_requests, num_requests)
 | 
			
		||||
        return sampled_requests
 | 
			
		||||
 | 
			
		||||
@@ -659,29 +701,11 @@ class VisionArenaDataset(HuggingFaceDataset):
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    DEFAULT_OUTPUT_LEN = 128
 | 
			
		||||
    VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"
 | 
			
		||||
 | 
			
		||||
    def __init__(
 | 
			
		||||
        self,
 | 
			
		||||
        **kwargs,
 | 
			
		||||
    ) -> None:
 | 
			
		||||
        super().__init__(**kwargs)
 | 
			
		||||
        if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
 | 
			
		||||
            raise ValueError(f"Only support Vision Arena dataset.\
 | 
			
		||||
                    This data path {self.dataset_path} is not valid.")
 | 
			
		||||
        if self.dataset_subset is None and self.dataset_split != "train":
 | 
			
		||||
            raise ValueError("Dataset split must be 'train'.")
 | 
			
		||||
 | 
			
		||||
        self.load_data()
 | 
			
		||||
 | 
			
		||||
    def load_data(self) -> None:
 | 
			
		||||
        dataset = load_dataset(
 | 
			
		||||
            self.dataset_path,
 | 
			
		||||
            name=self.dataset_subset,
 | 
			
		||||
            split=self.dataset_split,
 | 
			
		||||
            streaming=True,
 | 
			
		||||
        )
 | 
			
		||||
        self.data = dataset.shuffle(seed=self.random_seed)
 | 
			
		||||
    SUPPORTED_DATASET_PATHS = {
 | 
			
		||||
        "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"],
 | 
			
		||||
        "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"],
 | 
			
		||||
    }
 | 
			
		||||
    IS_MULTIMODAL = True
 | 
			
		||||
 | 
			
		||||
    def sample(
 | 
			
		||||
        self,
 | 
			
		||||
@@ -691,27 +715,356 @@ class VisionArenaDataset(HuggingFaceDataset):
 | 
			
		||||
        enable_multimodal_chat: bool = False,
 | 
			
		||||
        **kwargs,
 | 
			
		||||
    ) -> list:
 | 
			
		||||
        output_len = (output_len
 | 
			
		||||
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
 | 
			
		||||
        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
 | 
			
		||||
        sampled_requests = []
 | 
			
		||||
        for item in self.data:
 | 
			
		||||
            if len(sampled_requests) >= num_requests:
 | 
			
		||||
                break
 | 
			
		||||
            prompt = item["turns"][0][0]["content"]
 | 
			
		||||
            parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
 | 
			
		||||
            if parser_fn is None:
 | 
			
		||||
                raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
 | 
			
		||||
            prompt = parser_fn(item)
 | 
			
		||||
            mm_content = process_image(item["images"][0])
 | 
			
		||||
            prompt_len = len(tokenizer(prompt).input_ids)
 | 
			
		||||
            if enable_multimodal_chat:
 | 
			
		||||
                # Note: when chat is enabled the request prompt_len is no longer
 | 
			
		||||
                # accurate and we will be using request output to count the
 | 
			
		||||
                # actual prompt len
 | 
			
		||||
                prompt = self.apply_multimodal_chat_transformation(
 | 
			
		||||
                    prompt, mm_content)
 | 
			
		||||
                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
 | 
			
		||||
            sampled_requests.append(
 | 
			
		||||
                SampleRequest(
 | 
			
		||||
                    prompt=prompt,
 | 
			
		||||
                    prompt_len=prompt_len,
 | 
			
		||||
                    expected_output_len=output_len,
 | 
			
		||||
                    multi_modal_data=mm_content,
 | 
			
		||||
                ))
 | 
			
		||||
                )
 | 
			
		||||
            )
 | 
			
		||||
        self.maybe_oversample_requests(sampled_requests, num_requests)
 | 
			
		||||
        return sampled_requests
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
# Instruct Coder Dataset Implementation
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class InstructCoderDataset(HuggingFaceDataset):
 | 
			
		||||
    """
 | 
			
		||||
    InstructCoder Dataset.
 | 
			
		||||
    https://huggingface.co/datasets/likaixin/InstructCoder
 | 
			
		||||
 | 
			
		||||
    InstructCoder is the dataset designed for general code editing.  It consists
 | 
			
		||||
    of 114,239 instruction-input-output triplets, and covers multiple distinct
 | 
			
		||||
    code editing scenario.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    DEFAULT_OUTPUT_LEN = 200  # this is the average default output length
 | 
			
		||||
    SUPPORTED_DATASET_PATHS = {
 | 
			
		||||
        "likaixin/InstructCoder",
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    def sample(
 | 
			
		||||
        self,
 | 
			
		||||
        tokenizer: PreTrainedTokenizerBase,
 | 
			
		||||
        num_requests: int,
 | 
			
		||||
        output_len: Optional[int] = None,
 | 
			
		||||
        enable_multimodal_chat: bool = False,
 | 
			
		||||
        **kwargs,
 | 
			
		||||
    ) -> list:
 | 
			
		||||
        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
 | 
			
		||||
        sampled_requests = []
 | 
			
		||||
        for item in self.data:
 | 
			
		||||
            if len(sampled_requests) >= num_requests:
 | 
			
		||||
                break
 | 
			
		||||
            prompt = f"{item['instruction']}:\n{item['input']}"
 | 
			
		||||
            prompt_len = len(tokenizer(prompt).input_ids)
 | 
			
		||||
            sampled_requests.append(
 | 
			
		||||
                SampleRequest(
 | 
			
		||||
                    prompt=prompt,
 | 
			
		||||
                    prompt_len=prompt_len,
 | 
			
		||||
                    expected_output_len=output_len,
 | 
			
		||||
                )
 | 
			
		||||
            )
 | 
			
		||||
        self.maybe_oversample_requests(sampled_requests, num_requests)
 | 
			
		||||
        return sampled_requests
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
# MT-Bench Dataset Implementation
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class MTBenchDataset(HuggingFaceDataset):
 | 
			
		||||
    """
 | 
			
		||||
    MT-Bench Dataset.
 | 
			
		||||
    https://huggingface.co/datasets/philschmid/mt-bench
 | 
			
		||||
 | 
			
		||||
    We create a single turn dataset for MT-Bench.
 | 
			
		||||
    This is similar to Spec decoding benchmark setup in vLLM
 | 
			
		||||
    https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
 | 
			
		||||
    """  # noqa: E501
 | 
			
		||||
 | 
			
		||||
    DEFAULT_OUTPUT_LEN = 256  # avg len used in SD bench in vLLM
 | 
			
		||||
    SUPPORTED_DATASET_PATHS = {
 | 
			
		||||
        "philschmid/mt-bench",
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    def sample(
 | 
			
		||||
        self,
 | 
			
		||||
        tokenizer: PreTrainedTokenizerBase,
 | 
			
		||||
        num_requests: int,
 | 
			
		||||
        output_len: Optional[int] = None,
 | 
			
		||||
        enable_multimodal_chat: bool = False,
 | 
			
		||||
        **kwargs,
 | 
			
		||||
    ) -> list:
 | 
			
		||||
        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
 | 
			
		||||
        sampled_requests = []
 | 
			
		||||
 | 
			
		||||
        for item in self.data:
 | 
			
		||||
            if len(sampled_requests) >= num_requests:
 | 
			
		||||
                break
 | 
			
		||||
            prompt = item["turns"][0]
 | 
			
		||||
 | 
			
		||||
            # apply template
 | 
			
		||||
            prompt = tokenizer.apply_chat_template(
 | 
			
		||||
                [{"role": "user", "content": prompt}],
 | 
			
		||||
                add_generation_prompt=True,
 | 
			
		||||
                tokenize=False,
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
            prompt_len = len(tokenizer(prompt).input_ids)
 | 
			
		||||
            sampled_requests.append(
 | 
			
		||||
                SampleRequest(
 | 
			
		||||
                    prompt=prompt,
 | 
			
		||||
                    prompt_len=prompt_len,
 | 
			
		||||
                    expected_output_len=output_len,
 | 
			
		||||
                )
 | 
			
		||||
            )
 | 
			
		||||
        self.maybe_oversample_requests(sampled_requests, num_requests)
 | 
			
		||||
        return sampled_requests
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
# AIMO Dataset Implementation
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class AIMODataset(HuggingFaceDataset):
 | 
			
		||||
    """
 | 
			
		||||
    Dataset class for processing a AIMO dataset with reasoning questions.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    SUPPORTED_DATASET_PATHS = {
 | 
			
		||||
        "AI-MO/aimo-validation-aime",
 | 
			
		||||
        "AI-MO/NuminaMath-1.5",
 | 
			
		||||
        "AI-MO/NuminaMath-CoT",
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    def sample(
 | 
			
		||||
        self,
 | 
			
		||||
        tokenizer: PreTrainedTokenizerBase,
 | 
			
		||||
        num_requests: int,
 | 
			
		||||
        output_len: Optional[int] = None,
 | 
			
		||||
        **kwargs,
 | 
			
		||||
    ) -> list:
 | 
			
		||||
        sampled_requests = []
 | 
			
		||||
        dynamic_output = output_len is None
 | 
			
		||||
 | 
			
		||||
        for item in self.data:
 | 
			
		||||
            if len(sampled_requests) >= num_requests:
 | 
			
		||||
                break
 | 
			
		||||
            prompt, completion = item["problem"], item["solution"]
 | 
			
		||||
 | 
			
		||||
            prompt_ids = tokenizer(prompt).input_ids
 | 
			
		||||
            completion_ids = tokenizer(completion).input_ids
 | 
			
		||||
            prompt_len = len(prompt_ids)
 | 
			
		||||
            completion_len = len(completion_ids)
 | 
			
		||||
            output_len = completion_len if dynamic_output else output_len
 | 
			
		||||
            assert isinstance(output_len, int) and output_len > 0
 | 
			
		||||
            if dynamic_output and not is_valid_sequence(
 | 
			
		||||
                prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000
 | 
			
		||||
            ):
 | 
			
		||||
                continue
 | 
			
		||||
            sampled_requests.append(
 | 
			
		||||
                SampleRequest(
 | 
			
		||||
                    prompt=prompt,
 | 
			
		||||
                    prompt_len=prompt_len,
 | 
			
		||||
                    expected_output_len=output_len,
 | 
			
		||||
                    multi_modal_data=None,
 | 
			
		||||
                )
 | 
			
		||||
            )
 | 
			
		||||
        self.maybe_oversample_requests(sampled_requests, num_requests)
 | 
			
		||||
        return sampled_requests
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
# Next Edit Prediction Dataset Implementation
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
zeta_prompt = """### Instruction:
 | 
			
		||||
You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.
 | 
			
		||||
 | 
			
		||||
### User Edits:
 | 
			
		||||
 | 
			
		||||
{}
 | 
			
		||||
 | 
			
		||||
### User Excerpt:
 | 
			
		||||
 | 
			
		||||
{}
 | 
			
		||||
 | 
			
		||||
### Response:
 | 
			
		||||
 | 
			
		||||
"""  # noqa: E501
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _format_zeta_prompt(
 | 
			
		||||
    sample: dict, original_start_marker: str = "<|editable_region_start|>"
 | 
			
		||||
) -> dict:
 | 
			
		||||
    """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
 | 
			
		||||
 | 
			
		||||
    This function formats examples from the NEP dataset
 | 
			
		||||
    into prompts and expected outputs. It could be
 | 
			
		||||
    further extended to support more NEP datasets.
 | 
			
		||||
 | 
			
		||||
    Args:
 | 
			
		||||
        sample: The dataset sample containing events,
 | 
			
		||||
            inputs, and outputs.
 | 
			
		||||
        original_start_marker: The marker indicating the
 | 
			
		||||
            start of the editable region. Defaults to
 | 
			
		||||
            "<|editable_region_start|>".
 | 
			
		||||
 | 
			
		||||
    Returns:
 | 
			
		||||
        A dictionary with the formatted prompts and expected outputs.
 | 
			
		||||
    """
 | 
			
		||||
    events = sample["events"]
 | 
			
		||||
    input = sample["input"]
 | 
			
		||||
    output = sample["output"]
 | 
			
		||||
    prompt = zeta_prompt.format(events, input)
 | 
			
		||||
 | 
			
		||||
    # following the original implementation, extract the focused region
 | 
			
		||||
    # from the raw output
 | 
			
		||||
    output_start_index = output.find(original_start_marker)
 | 
			
		||||
    output_focused_region = output[output_start_index:]
 | 
			
		||||
    expected_output = output_focused_region
 | 
			
		||||
 | 
			
		||||
    return {"prompt": prompt, "expected_output": expected_output}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class NextEditPredictionDataset(HuggingFaceDataset):
 | 
			
		||||
    """
 | 
			
		||||
    Dataset class for processing a Next Edit Prediction dataset.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    SUPPORTED_DATASET_PATHS = {
 | 
			
		||||
        "zed-industries/zeta",
 | 
			
		||||
    }
 | 
			
		||||
    MAPPING_PROMPT_FUNCS = {
 | 
			
		||||
        "zed-industries/zeta": _format_zeta_prompt,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, **kwargs):
 | 
			
		||||
        formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path)
 | 
			
		||||
        if formatting_prompt_func is None:
 | 
			
		||||
            raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
 | 
			
		||||
        samples = []
 | 
			
		||||
        for sample in self.data:
 | 
			
		||||
            sample = formatting_prompt_func(sample)
 | 
			
		||||
            samples.append(
 | 
			
		||||
                SampleRequest(
 | 
			
		||||
                    prompt=sample["prompt"],
 | 
			
		||||
                    prompt_len=len(tokenizer(sample["prompt"]).input_ids),
 | 
			
		||||
                    expected_output_len=len(
 | 
			
		||||
                        tokenizer(sample["expected_output"]).input_ids
 | 
			
		||||
                    ),
 | 
			
		||||
                )
 | 
			
		||||
            )
 | 
			
		||||
            if len(samples) >= num_requests:
 | 
			
		||||
                break
 | 
			
		||||
        self.maybe_oversample_requests(samples, num_requests)
 | 
			
		||||
        return samples
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
# ASR Dataset Implementation
 | 
			
		||||
# -----------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ASRDataset(HuggingFaceDataset):
 | 
			
		||||
    """
 | 
			
		||||
    Dataset class for processing a ASR dataset for transcription.
 | 
			
		||||
    Tested on the following set:
 | 
			
		||||
 | 
			
		||||
    +----------------+----------------------------------------+--------------------------+-----------------------------+
 | 
			
		||||
    | Dataset        | Domain                                 | Speaking Style           | hf-subset                   |
 | 
			
		||||
    +----------------+----------------------------------------+--------------------------+-----------------------------+
 | 
			
		||||
    | TED-LIUM       | TED talks                              | Oratory                  | release1, release2, release3|
 | 
			
		||||
    |                |                                        |                          | release3-speaker-adaptation |
 | 
			
		||||
    | VoxPopuli      | European Parliament                    | Oratory                  | en, de, it, fr,  ...        |
 | 
			
		||||
    | LibriSpeech    | Audiobook                              | Narrated                 | "LIUM/tedlium"              |
 | 
			
		||||
    | GigaSpeech     | Audiobook, podcast, YouTube            | Narrated, spontaneous    | xs, s, m, l, xl, dev, test  |
 | 
			
		||||
    | SPGISpeech     | Financial meetings                     | Oratory, spontaneous     | S, M, L, dev, test          |
 | 
			
		||||
    | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
 | 
			
		||||
    +----------------+----------------------------------------+--------------------------+-----------------------------+
 | 
			
		||||
 | 
			
		||||
    """  # noqa: E501
 | 
			
		||||
 | 
			
		||||
    SUPPORTED_DATASET_PATHS = {
 | 
			
		||||
        "openslr/librispeech_asr",
 | 
			
		||||
        "facebook/voxpopuli",
 | 
			
		||||
        "LIUM/tedlium",
 | 
			
		||||
        "edinburghcstr/ami",
 | 
			
		||||
        "speechcolab/gigaspeech",
 | 
			
		||||
        "kensho/spgispeech",
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    DEFAULT_OUTPUT_LEN = 128
 | 
			
		||||
    IS_MULTIMODAL = True
 | 
			
		||||
 | 
			
		||||
    # TODO Whisper-specific. Abstract interface when more models are supported.
 | 
			
		||||
    TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
 | 
			
		||||
    skip_long_audios: bool = True
 | 
			
		||||
 | 
			
		||||
    def sample(
 | 
			
		||||
        self,
 | 
			
		||||
        tokenizer: PreTrainedTokenizerBase,
 | 
			
		||||
        num_requests: int,
 | 
			
		||||
        output_len: Optional[int] = None,
 | 
			
		||||
        **kwargs,
 | 
			
		||||
    ) -> list:
 | 
			
		||||
        import librosa
 | 
			
		||||
 | 
			
		||||
        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
 | 
			
		||||
        prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
 | 
			
		||||
        prompt_len = len(tokenizer(prompt).input_ids)
 | 
			
		||||
        sampled_requests = []
 | 
			
		||||
        skipped = 0
 | 
			
		||||
        for item in self.data:
 | 
			
		||||
            if len(sampled_requests) >= num_requests:
 | 
			
		||||
                break
 | 
			
		||||
            audio = item["audio"]
 | 
			
		||||
            y, sr = audio["array"], audio["sampling_rate"]
 | 
			
		||||
            duration_s = librosa.get_duration(y=y, sr=sr)
 | 
			
		||||
            # Whisper max supported duration
 | 
			
		||||
            if self.skip_long_audios and duration_s > 30:
 | 
			
		||||
                skipped += 1
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            mm_content = {"audio": (y, sr)}
 | 
			
		||||
            sampled_requests.append(
 | 
			
		||||
                SampleRequest(
 | 
			
		||||
                    prompt=prompt,
 | 
			
		||||
                    prompt_len=prompt_len,
 | 
			
		||||
                    expected_output_len=output_len,
 | 
			
		||||
                    multi_modal_data=mm_content,
 | 
			
		||||
                )
 | 
			
		||||
            )
 | 
			
		||||
        if skipped:
 | 
			
		||||
            logger.warning(
 | 
			
		||||
                "%d samples discarded from dataset due to"
 | 
			
		||||
                " their length being greater than"
 | 
			
		||||
                " what Whisper supports.",
 | 
			
		||||
                skipped,
 | 
			
		||||
            )
 | 
			
		||||
        self.maybe_oversample_requests(sampled_requests, num_requests)
 | 
			
		||||
        return sampled_requests
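A minimal sketch (not part of the diff) of the 30-second gate applied above, assuming librosa and a decoded (waveform, sampling_rate) pair like the HuggingFace audio rows provide.

import numpy as np
import librosa

y = np.zeros(16_000 * 45, dtype=np.float32)    # 45 s of silence at 16 kHz
sr = 16_000
duration_s = librosa.get_duration(y=y, sr=sr)
print(duration_s, duration_s <= 30)            # clips longer than 30 s are skipped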
 | 
			
		||||
 | 
			
		||||
@@ -11,9 +11,9 @@ from typing import Any, Optional
 | 
			
		||||
 | 
			
		||||
import numpy as np
 | 
			
		||||
import torch
 | 
			
		||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 | 
			
		||||
from tqdm import tqdm
 | 
			
		||||
 | 
			
		||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 | 
			
		||||
from vllm import LLM, SamplingParams
 | 
			
		||||
from vllm.engine.arg_utils import EngineArgs
 | 
			
		||||
from vllm.inputs import PromptType
 | 
			
		||||
@@ -21,13 +21,14 @@ from vllm.sampling_params import BeamSearchParams
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
 | 
			
		||||
                                     results: dict[str, Any]) -> None:
 | 
			
		||||
def save_to_pytorch_benchmark_format(
 | 
			
		||||
    args: argparse.Namespace, results: dict[str, Any]
 | 
			
		||||
) -> None:
 | 
			
		||||
    pt_records = convert_to_pytorch_benchmark_format(
 | 
			
		||||
        args=args,
 | 
			
		||||
        metrics={"latency": results["latencies"]},
 | 
			
		||||
        extra_info={k: results[k]
 | 
			
		||||
                    for k in ["avg_latency", "percentiles"]})
 | 
			
		||||
        extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
 | 
			
		||||
    )
 | 
			
		||||
    if pt_records:
 | 
			
		||||
        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
 | 
			
		||||
        write_to_json(pt_file, pt_records)
 | 
			
		||||
@@ -42,9 +43,11 @@ def main(args: argparse.Namespace):
 | 
			
		||||
    # the engine will automatically process the request in multiple batches.
 | 
			
		||||
    llm = LLM(**dataclasses.asdict(engine_args))
 | 
			
		||||
    assert llm.llm_engine.model_config.max_model_len >= (
 | 
			
		||||
        args.input_len +
 | 
			
		||||
        args.output_len), ("Please ensure that max_model_len is greater than"
 | 
			
		||||
                           " the sum of input_len and output_len.")
 | 
			
		||||
        args.input_len + args.output_len
 | 
			
		||||
    ), (
 | 
			
		||||
        "Please ensure that max_model_len is greater than"
 | 
			
		||||
        " the sum of input_len and output_len."
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    sampling_params = SamplingParams(
 | 
			
		||||
        n=args.n,
 | 
			
		||||
@@ -55,18 +58,16 @@ def main(args: argparse.Namespace):
 | 
			
		||||
        detokenize=not args.disable_detokenize,
 | 
			
		||||
    )
 | 
			
		||||
    print(sampling_params)
 | 
			
		||||
    dummy_prompt_token_ids = np.random.randint(10000,
 | 
			
		||||
                                               size=(args.batch_size,
 | 
			
		||||
                                                     args.input_len))
 | 
			
		||||
    dummy_prompts: list[PromptType] = [{
 | 
			
		||||
        "prompt_token_ids": batch
 | 
			
		||||
    } for batch in dummy_prompt_token_ids.tolist()]
 | 
			
		||||
    dummy_prompt_token_ids = np.random.randint(
 | 
			
		||||
        10000, size=(args.batch_size, args.input_len)
 | 
			
		||||
    )
 | 
			
		||||
    dummy_prompts: list[PromptType] = [
 | 
			
		||||
        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
 | 
			
		||||
    ]
 | 
			
		||||
 | 
			
		||||
    def llm_generate():
 | 
			
		||||
        if not args.use_beam_search:
 | 
			
		||||
            llm.generate(dummy_prompts,
 | 
			
		||||
                         sampling_params=sampling_params,
 | 
			
		||||
                         use_tqdm=False)
 | 
			
		||||
            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
 | 
			
		||||
        else:
 | 
			
		||||
            llm.beam_search(
 | 
			
		||||
                dummy_prompts,
 | 
			
		||||
@@ -80,12 +81,13 @@ def main(args: argparse.Namespace):
 | 
			
		||||
    def run_to_completion(profile_dir: Optional[str] = None):
 | 
			
		||||
        if profile_dir:
 | 
			
		||||
            with torch.profiler.profile(
 | 
			
		||||
                    activities=[
 | 
			
		||||
                        torch.profiler.ProfilerActivity.CPU,
 | 
			
		||||
                        torch.profiler.ProfilerActivity.CUDA,
 | 
			
		||||
                    ],
 | 
			
		||||
                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
 | 
			
		||||
                        str(profile_dir)),
 | 
			
		||||
                activities=[
 | 
			
		||||
                    torch.profiler.ProfilerActivity.CPU,
 | 
			
		||||
                    torch.profiler.ProfilerActivity.CUDA,
 | 
			
		||||
                ],
 | 
			
		||||
                on_trace_ready=torch.profiler.tensorboard_trace_handler(
 | 
			
		||||
                    str(profile_dir)
 | 
			
		||||
                ),
 | 
			
		||||
            ) as p:
 | 
			
		||||
                llm_generate()
 | 
			
		||||
            print(p.key_averages().table(sort_by="self_cuda_time_total"))
 | 
			
		||||
@@ -103,8 +105,9 @@ def main(args: argparse.Namespace):
 | 
			
		||||
    if args.profile:
 | 
			
		||||
        profile_dir = args.profile_result_dir
 | 
			
		||||
        if not profile_dir:
 | 
			
		||||
            profile_dir = (Path(".") / "vllm_benchmark_result" /
 | 
			
		||||
                           f"latency_result_{time.time()}")
 | 
			
		||||
            profile_dir = (
 | 
			
		||||
                Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
 | 
			
		||||
            )
 | 
			
		||||
        print(f"Profiling (results will be saved to '{profile_dir}')...")
 | 
			
		||||
        run_to_completion(profile_dir=profile_dir)
 | 
			
		||||
        return
 | 
			
		||||
@@ -135,7 +138,8 @@ def main(args: argparse.Namespace):
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
    parser = FlexibleArgumentParser(
 | 
			
		||||
        description="Benchmark the latency of processing a single batch of "
 | 
			
		||||
        "requests till completion.")
 | 
			
		||||
        "requests till completion."
 | 
			
		||||
    )
 | 
			
		||||
    parser.add_argument("--input-len", type=int, default=32)
 | 
			
		||||
    parser.add_argument("--output-len", type=int, default=128)
 | 
			
		||||
    parser.add_argument("--batch-size", type=int, default=8)
 | 
			
		||||
@@ -152,10 +156,9 @@ if __name__ == "__main__":
 | 
			
		||||
        default=10,
 | 
			
		||||
        help="Number of iterations to run for warmup.",
 | 
			
		||||
    )
 | 
			
		||||
    parser.add_argument("--num-iters",
 | 
			
		||||
                        type=int,
 | 
			
		||||
                        default=30,
 | 
			
		||||
                        help="Number of iterations to run.")
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--num-iters", type=int, default=30, help="Number of iterations to run."
 | 
			
		||||
    )
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--profile",
 | 
			
		||||
        action="store_true",
 | 
			
		||||
@@ -165,8 +168,10 @@ if __name__ == "__main__":
 | 
			
		||||
        "--profile-result-dir",
 | 
			
		||||
        type=str,
 | 
			
		||||
        default=None,
 | 
			
		||||
        help=("path to save the pytorch profiler output. Can be visualized "
 | 
			
		||||
              "with ui.perfetto.dev or Tensorboard."),
 | 
			
		||||
        help=(
 | 
			
		||||
            "path to save the pytorch profiler output. Can be visualized "
 | 
			
		||||
            "with ui.perfetto.dev or Tensorboard."
 | 
			
		||||
        ),
 | 
			
		||||
    )
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--output-json",
 | 
			
		||||
@@ -177,10 +182,15 @@ if __name__ == "__main__":
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--disable-detokenize",
 | 
			
		||||
        action="store_true",
 | 
			
		||||
        help=("Do not detokenize responses (i.e. do not include "
 | 
			
		||||
              "detokenization time in the latency measurement)"),
 | 
			
		||||
        help=(
 | 
			
		||||
            "Do not detokenize responses (i.e. do not include "
 | 
			
		||||
            "detokenization time in the latency measurement)"
 | 
			
		||||
        ),
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    parser = EngineArgs.add_cli_args(parser)
 | 
			
		||||
    # V1 enables prefix caching by default which skews the latency
 | 
			
		||||
    # numbers. We need to disable prefix caching by default.
 | 
			
		||||
    parser.set_defaults(enable_prefix_caching=False)
 | 
			
		||||
    args = parser.parse_args()
 | 
			
		||||
    main(args)
 | 
			
		||||
 | 
			
		||||
@@ -86,20 +86,21 @@ def repeat_prompts(prompts, repeat_count, mode: str):
 | 
			
		||||
        ValueError: If an invalid mode is provided.
 | 
			
		||||
    """
 | 
			
		||||
    print("Repeat mode: ", mode)
 | 
			
		||||
    if mode == 'random':
 | 
			
		||||
    if mode == "random":
 | 
			
		||||
        repeated_prompts = prompts * repeat_count
 | 
			
		||||
        random.shuffle(repeated_prompts)
 | 
			
		||||
        return repeated_prompts
 | 
			
		||||
    elif mode == 'tile':
 | 
			
		||||
    elif mode == "tile":
 | 
			
		||||
        return prompts * repeat_count
 | 
			
		||||
    elif mode == 'interleave':
 | 
			
		||||
    elif mode == "interleave":
 | 
			
		||||
        repeated_prompts = []
 | 
			
		||||
        for prompt in prompts:
 | 
			
		||||
            repeated_prompts.extend([prompt] * repeat_count)
 | 
			
		||||
        return repeated_prompts
 | 
			
		||||
    else:
 | 
			
		||||
        raise ValueError(f"Invalid mode: {mode}, only support "
 | 
			
		||||
                         "'random', 'tile', 'interleave'")
 | 
			
		||||
        raise ValueError(
 | 
			
		||||
            f"Invalid mode: {mode}, only support 'random', 'tile', 'interleave'"
 | 
			
		||||
        )
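A tiny illustration (not part of the diff) of the three repeat modes handled above, using the repeat_prompts helper from this file.

prompts = ["A", "B"]
print(repeat_prompts(prompts, 2, mode="tile"))        # ['A', 'B', 'A', 'B']
print(repeat_prompts(prompts, 2, mode="interleave"))  # ['A', 'A', 'B', 'B']
print(repeat_prompts(prompts, 2, mode="random"))      # the same four prompts, shuffled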
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def main(args):
 | 
			
		||||
@@ -109,16 +110,16 @@ def main(args):
 | 
			
		||||
    # we append the document id at the beginning to avoid any of the document
 | 
			
		||||
    # being the prefix of other documents
 | 
			
		||||
    prompts = [
 | 
			
		||||
        str(i) + ' '.join(['hi'] * args.document_length)
 | 
			
		||||
        str(i) + " ".join(["hi"] * args.document_length)
 | 
			
		||||
        for i in range(args.num_documents)
 | 
			
		||||
    ]
 | 
			
		||||
 | 
			
		||||
    prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)
 | 
			
		||||
 | 
			
		||||
    warmup_prompts = [
 | 
			
		||||
        "This is warm up request " + str(i) + \
 | 
			
		||||
                ' '.join(['hi'] * args.document_length)
 | 
			
		||||
        for i in range(args.num_documents)]
 | 
			
		||||
        "This is warm up request " + str(i) + " ".join(["hi"] * args.document_length)
 | 
			
		||||
        for i in range(args.num_documents)
 | 
			
		||||
    ]
 | 
			
		||||
 | 
			
		||||
    # Create the LLM engine
 | 
			
		||||
    engine_args = EngineArgs.from_cli_args(args)
 | 
			
		||||
@@ -142,42 +143,52 @@ def main(args):
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
    parser = FlexibleArgumentParser(
 | 
			
		||||
        description=
 | 
			
		||||
        'Benchmark the performance with or without automatic prefix caching.')
 | 
			
		||||
        description="Benchmark the performance with or "
 | 
			
		||||
        "without automatic prefix caching."
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        '--document-length',
 | 
			
		||||
        "--document-length",
 | 
			
		||||
        type=int,
 | 
			
		||||
        # Roughly the number of tokens for a system paper,
 | 
			
		||||
        # excluding images
 | 
			
		||||
        default=20000,
 | 
			
		||||
        help='Range of input lengths for sampling prompts,'
 | 
			
		||||
        'specified as "min:max" (e.g., "128:256").')
 | 
			
		||||
        help="Range of input lengths for sampling prompts, "
 | 
			
		||||
        'specified as "min:max" (e.g., "128:256").',
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    parser.add_argument('--num-documents',
 | 
			
		||||
                        type=int,
 | 
			
		||||
                        default=8,
 | 
			
		||||
                        help='Range of input lengths for sampling prompts,'
 | 
			
		||||
                        'specified as "min:max" (e.g., "128:256").')
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--num-documents",
 | 
			
		||||
        type=int,
 | 
			
		||||
        default=8,
 | 
			
		||||
        help="Range of input lengths for sampling prompts, "
 | 
			
		||||
        'specified as "min:max" (e.g., "128:256").',
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    parser.add_argument('--output-len', type=int, default=10)
 | 
			
		||||
    parser.add_argument("--output-len", type=int, default=10)
 | 
			
		||||
 | 
			
		||||
    parser.add_argument('--repeat-count',
 | 
			
		||||
                        type=int,
 | 
			
		||||
                        default=2,
 | 
			
		||||
                        help='Number of times to repeat each prompt')
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--repeat-count",
 | 
			
		||||
        type=int,
 | 
			
		||||
        default=2,
 | 
			
		||||
        help="Number of times to repeat each prompt",
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    parser.add_argument("--repeat-mode",
 | 
			
		||||
                        type=str,
 | 
			
		||||
                        default='random',
 | 
			
		||||
                        help='The mode to repeat prompts. The supported '
 | 
			
		||||
                        'modes are "random", "tile", and "interleave". '
 | 
			
		||||
                        'See repeat_prompts() in the source code for details.')
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--repeat-mode",
 | 
			
		||||
        type=str,
 | 
			
		||||
        default="random",
 | 
			
		||||
        help="The mode to repeat prompts. The supported "
 | 
			
		||||
        'modes are "random", "tile", and "interleave". '
 | 
			
		||||
        "See repeat_prompts() in the source code for details.",
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    parser.add_argument("--shuffle-seed",
 | 
			
		||||
                        type=int,
 | 
			
		||||
                        default=0,
 | 
			
		||||
                        help='Random seed when the repeat mode is "random"')
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--shuffle-seed",
 | 
			
		||||
        type=int,
 | 
			
		||||
        default=0,
 | 
			
		||||
        help='Random seed when the repeat mode is "random"',
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    parser = EngineArgs.add_cli_args(parser)
 | 
			
		||||
    args = parser.parse_args()
 | 
			
		||||
 | 
			
		||||
@@ -63,14 +63,15 @@ class Request:
 | 
			
		||||
    output_len: int
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str:
 | 
			
		||||
def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:
 | 
			
		||||
    vocab = tokenizer.get_vocab()
 | 
			
		||||
    all_special_ids = set(tokenizer.all_special_ids)
 | 
			
		||||
 | 
			
		||||
    # Remove the special tokens.
 | 
			
		||||
    vocab = {
 | 
			
		||||
        k: v
 | 
			
		||||
        for k, v in vocab.items() if k not in tokenizer.all_special_ids
 | 
			
		||||
    }
 | 
			
		||||
    return random.choices(list(vocab.values()), k=length)
 | 
			
		||||
    return random.choices(
 | 
			
		||||
        [v for k, v in vocab.items() if k not in all_special_ids],
 | 
			
		||||
        k=length,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def sample_requests_from_dataset(
 | 
			
		||||
@@ -89,8 +90,10 @@ def sample_requests_from_dataset(
 | 
			
		||||
    # Filter out the conversations with less than 2 turns.
 | 
			
		||||
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
 | 
			
		||||
    # Only keep the first two turns of each conversation.
 | 
			
		||||
    dataset = [(data["conversations"][0]["value"],
 | 
			
		||||
                data["conversations"][1]["value"]) for data in dataset]
 | 
			
		||||
    dataset = [
 | 
			
		||||
        (data["conversations"][0]["value"], data["conversations"][1]["value"])
 | 
			
		||||
        for data in dataset
 | 
			
		||||
    ]
 | 
			
		||||
 | 
			
		||||
    # Shuffle the dataset.
 | 
			
		||||
    random.shuffle(dataset)
 | 
			
		||||
@@ -111,8 +114,9 @@ def sample_requests_from_dataset(
 | 
			
		||||
        completion = dataset[i][1]
 | 
			
		||||
        completion_token_ids = tokenizer(completion).input_ids
 | 
			
		||||
        prompt_len = len(prompt_token_ids)
 | 
			
		||||
        output_len = (len(completion_token_ids)
 | 
			
		||||
                      if fixed_output_len is None else fixed_output_len)
 | 
			
		||||
        output_len = (
 | 
			
		||||
            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
 | 
			
		||||
        )
 | 
			
		||||
        if min_len <= prompt_len <= max_len:
 | 
			
		||||
            filtered_requests.append(Request(prompt, prompt_len, output_len))
 | 
			
		||||
 | 
			
		||||
@ -126,27 +130,27 @@ def sample_requests_from_random(
 | 
			
		||||
    fixed_output_len: Optional[int],
 | 
			
		||||
    prefix_len: int,
 | 
			
		||||
) -> list[Request]:
 | 
			
		||||
 | 
			
		||||
    requests = []
 | 
			
		||||
    prefix_token_ids = sample_tokens(tokenizer, prefix_len)
 | 
			
		||||
    min_len, max_len = input_length_range
 | 
			
		||||
 | 
			
		||||
    for i in range(num_requests):
 | 
			
		||||
        unique_part_token_ids = sample_tokens(
 | 
			
		||||
            tokenizer,
 | 
			
		||||
            random.randint(min_len - prefix_len, max_len - prefix_len))
 | 
			
		||||
            tokenizer, random.randint(min_len - prefix_len, max_len - prefix_len)
 | 
			
		||||
        )
 | 
			
		||||
        prompt_token_ids = prefix_token_ids + unique_part_token_ids
 | 
			
		||||
        prompt = tokenizer.decode(prompt_token_ids)
 | 
			
		||||
        prompt_len = len(prompt_token_ids)
 | 
			
		||||
        assert (min_len <= prompt_len <= max_len
 | 
			
		||||
                ), f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
 | 
			
		||||
        assert min_len <= prompt_len <= max_len, (
 | 
			
		||||
            f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
 | 
			
		||||
        )
 | 
			
		||||
        requests.append(Request(prompt, prompt_len, fixed_output_len))
 | 
			
		||||
    return requests
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
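The random sampler above builds every prompt as a shared `prefix_token_ids` block followed by a per-request unique suffix, which is exactly the shape that benefits from automatic prefix caching. A minimal sketch of the same idea outside the benchmark is shown below; the model name and prompt text are arbitrary placeholders, and `enable_prefix_caching` is the standard vLLM engine flag.

from vllm import LLM, SamplingParams

# Sketch only: requests that share a long common prefix let the engine reuse
# cached KV blocks for that prefix across requests.
llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)  # placeholder model
shared_prefix = "You are a helpful assistant. " * 50
prompts = [shared_prefix + f"Question {i}: summarize item {i}." for i in range(8)]
outputs = llm.generate(prompts, SamplingParams(temperature=0, max_tokens=16))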
def repeat_and_sort_requests(requests: list[Request],
                             repeat_count: int,
                             sort: bool = False) -> list[str]:
def repeat_and_sort_requests(
    requests: list[Request], repeat_count: int, sort: bool = False
) -> list[str]:
    repeated_requests = requests * repeat_count
    if sort:
        repeated_requests.sort(key=lambda x: x[1])
@@ -157,14 +161,14 @@ def repeat_and_sort_requests(requests: list[Request],

def main(args):
    tokenizer = get_tokenizer(args.model, trust_remote_code=True)
    input_length_range = tuple(map(int, args.input_length_range.split(':')))
    input_length_range = tuple(map(int, args.input_length_range.split(":")))
    random.seed(args.seed)
    if args.dataset_path is not None:
        if args.prefix_len > 0:
            raise ValueError("prefix-len is not supported when "
                             "dataset-path is provided.")
        print(f"Start to sample {args.num_prompts} prompts "
              f"from {args.dataset_path}")
            raise ValueError(
                "prefix-len is not supported when dataset-path is provided."
            )
        print(f"Start to sample {args.num_prompts} prompts from {args.dataset_path}")
        filtered_requests = sample_requests_from_dataset(
            dataset_path=args.dataset_path,
            num_requests=args.num_prompts,
@@ -194,14 +198,16 @@ def main(args):

    llm = LLM(**dataclasses.asdict(engine_args))

    sampling_params = SamplingParams(temperature=0,
                                     max_tokens=args.output_len,
                                     detokenize=not args.disable_detokenize)
    sampling_params = SamplingParams(
        temperature=0,
        max_tokens=args.output_len,
        detokenize=not args.disable_detokenize,
    )

    print("Testing filtered requests")
    prompts = repeat_and_sort_requests(filtered_requests,
                                       repeat_count=args.repeat_count,
                                       sort=args.sort)
    prompts = repeat_and_sort_requests(
        filtered_requests, repeat_count=args.repeat_count, sort=args.sort
    )

    print("------start generating------")
    test_prefix(
@@ -213,29 +219,35 @@ def main(args):

if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description=
        'Benchmark the performance with or without automatic prefix caching.')
    parser.add_argument("--dataset-path",
                        type=str,
                        default=None,
                        help="Path to the dataset.")
    parser.add_argument('--output-len', type=int, default=10)
    parser.add_argument('--num-prompts',
                        type=int,
                        required=True,
                        help="Number of the prompts sampled from dataset")
    parser.add_argument('--repeat-count',
                        type=int,
                        default=1,
                        help='Number of times to repeat each prompt')
    parser.add_argument('--sort',
                        action='store_true',
                        help='Sort prompts by input length')
    parser.add_argument('--input-length-range',
                        type=str,
                        required=True,
                        help='Range of input lengths for sampling prompts,'
                        'specified as "min:max" (e.g., "128:256").')
        description="Benchmark the performance with or without "
        "automatic prefix caching."
    )
    parser.add_argument(
        "--dataset-path", type=str, default=None, help="Path to the dataset."
    )
    parser.add_argument("--output-len", type=int, default=10)
    parser.add_argument(
        "--num-prompts",
        type=int,
        required=True,
        help="Number of the prompts sampled from dataset",
    )
    parser.add_argument(
        "--repeat-count",
        type=int,
        default=1,
        help="Number of times to repeat each prompt",
    )
    parser.add_argument(
        "--sort", action="store_true", help="Sort prompts by input length"
    )
    parser.add_argument(
        "--input-length-range",
        type=str,
        required=True,
        help="Range of input lengths for sampling prompts,"
        'specified as "min:max" (e.g., "128:256").',
    )
    parser.add_argument(
        "--prefix-len",
        type=int,
@@ -246,10 +258,12 @@ if __name__ == "__main__":
        "when dataset-path is not provided.",
    )
    parser.add_argument(
        '--disable-detokenize',
        action='store_true',
        help=("Do not detokenize responses (i.e. do not include "
              "detokenization time in the latency measurement)"),
        "--disable-detokenize",
        action="store_true",
        help=(
            "Do not detokenize responses (i.e. do not include "
            "detokenization time in the latency measurement)"
        ),
    )

    parser = EngineArgs.add_cli_args(parser)
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark offline prioritization."""

import argparse
import dataclasses
import json
@@ -13,7 +14,7 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser


#Select a equi-probable random priority
# Select a equi-probable random priority
def get_random_flag():
    return 0 if random.random() < 0.5 else 1

@@ -33,8 +34,10 @@ def sample_requests(
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Only keep the first two turns of each conversation.
    dataset = [(data["conversations"][0]["value"],
                data["conversations"][1]["value"]) for data in dataset]
    dataset = [
        (data["conversations"][0]["value"], data["conversations"][1]["value"])
        for data in dataset
    ]

    # Shuffle the dataset.
    random.shuffle(dataset)
@@ -51,8 +54,9 @@ def sample_requests(
        completion = dataset[i][1]
        completion_token_ids = tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
        output_len = (
            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
        )
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
@@ -74,13 +78,16 @@ def run_vllm(
    disable_detokenize: bool = False,
) -> float:
    from vllm import LLM, SamplingParams

    llm = LLM(**dataclasses.asdict(engine_args))

    assert all(
        llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
        for request in requests), (
            "Please ensure that max_model_len is greater than the sum of"
            " input_len and output_len for all requests.")
        for request in requests
    ), (
        "Please ensure that max_model_len is greater than the sum of"
        " input_len and output_len for all requests."
    )

    # Add the requests to the engine.
    prompts = []
@@ -97,7 +104,8 @@ def run_vllm(
                ignore_eos=True,
                max_tokens=output_len,
                detokenize=not disable_detokenize,
            ))
            )
        )

    start = time.perf_counter()
    llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
@@ -111,26 +119,33 @@ def main(args: argparse.Namespace):

    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code)
        args.tokenizer, trust_remote_code=args.trust_remote_code
    )
    if args.dataset is None:
        # Synthesize a prompt with the given input length.
        prompt = "hi" * (args.input_len - 1)
        requests = [(prompt, args.input_len, args.output_len,
                     get_random_flag()) for _ in range(args.num_prompts)]
        requests = [
            (prompt, args.input_len, args.output_len, get_random_flag())
            for _ in range(args.num_prompts)
        ]
    else:
        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
                                   args.output_len)
        requests = sample_requests(
            args.dataset, args.num_prompts, tokenizer, args.output_len
        )

    if args.backend == "vllm":
        elapsed_time = run_vllm(requests, args.n,
                                EngineArgs.from_cli_args(args),
                                args.disable_detokenize)
        elapsed_time = run_vllm(
            requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
        )
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
    total_num_tokens = sum(prompt_len + output_len
                           for _, prompt_len, output_len, priority in requests)
    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
    total_num_tokens = sum(
        prompt_len + output_len for _, prompt_len, output_len, priority in requests
    )
    print(
        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
        f"{total_num_tokens / elapsed_time:.2f} tokens/s"
    )

    # Output JSON results if specified
    if args.output_json:
@@ -147,41 +162,44 @@ def main(args: argparse.Namespace):

if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument("--backend",
                        type=str,
                        choices=["vllm", "hf", "mii"],
                        default="vllm")
    parser.add_argument("--dataset",
                        type=str,
                        default=None,
                        help="Path to the dataset.")
    parser.add_argument("--input-len",
                        type=int,
                        default=None,
                        help="Input prompt length for each request")
    parser.add_argument("--output-len",
                        type=int,
                        default=None,
                        help="Output length for each request. Overrides the "
                        "output length from the dataset.")
    parser.add_argument("--n",
                        type=int,
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument("--num-prompts",
                        type=int,
                        default=200,
                        help="Number of prompts to process.")
    parser.add_argument(
        '--output-json',
        "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
    )
    parser.add_argument(
        "--dataset", type=str, default=None, help="Path to the dataset."
    )
    parser.add_argument(
        "--input-len",
        type=int,
        default=None,
        help="Input prompt length for each request",
    )
    parser.add_argument(
        "--output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the "
        "output length from the dataset.",
    )
    parser.add_argument(
        "--n", type=int, default=1, help="Number of generated sequences per prompt."
    )
    parser.add_argument(
        "--num-prompts", type=int, default=200, help="Number of prompts to process."
    )
    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help='Path to save the throughput results in JSON format.')
        help="Path to save the throughput results in JSON format.",
    )
    parser.add_argument(
        '--disable-detokenize',
        action='store_true',
        help=("Do not detokenize responses (i.e. do not include "
              "detokenization time in the latency measurement)"),
        "--disable-detokenize",
        action="store_true",
        help=(
            "Do not detokenize responses (i.e. do not include "
            "detokenization time in the latency measurement)"
        ),
    )

    parser = EngineArgs.add_cli_args(parser)
(File diff suppressed because it is too large.)
@@ -5,16 +5,13 @@ On the server side, run one of the following commands:
    (vLLM OpenAI API server)
    vllm serve <your_model> --disable-log-requests

    (TGI backend)
    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>

On the client side, run:
    python benchmarks/benchmark_serving_structured_output.py \
        --backend <backend> \
        --model <your_model> \
        --dataset json \
        --structured-output-ratio 1.0 \
        --structured-output-backend xgrammar \
        --structured-output-backend auto \
        --request-rate 10 \
        --num-prompts 1000

@@ -22,6 +19,7 @@ On the client side, run:
        --endpoint /generate_stream
    to the end of the command above.
"""

import argparse
import asyncio
import copy
@@ -39,11 +37,15 @@ from typing import Optional
import datasets
import numpy as np
import pandas as pd
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
                                  RequestFuncOutput)
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase

from backend_request_func import (
    ASYNC_REQUEST_FUNCS,
    RequestFuncInput,
    RequestFuncOutput,
)

try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError:
@@ -54,8 +56,9 @@ try:
except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser

from vllm.v1.structured_output.utils import (
    has_xgrammar_unsupported_json_features)
from vllm.v1.structured_output.backend_xgrammar import (
    has_xgrammar_unsupported_json_features,
)

MILLISECONDS_TO_SECONDS_CONVERSION = 1000

@@ -101,6 +104,7 @@ class SampleRequest:
        prompt_len: The length of the prompt in tokens.
        expected_output_len: The expected length of the output in tokens.
    """

    prompt: str
    prompt_len: int
    expected_output_len: int
@@ -109,60 +113,61 @@ class SampleRequest:
    completion: str = None


def sample_requests(tokenizer: PreTrainedTokenizerBase,
                    args: argparse.Namespace) -> list[SampleRequest]:
    if args.dataset == 'json' or args.dataset == 'json-unique':
def sample_requests(
    tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace
) -> list[SampleRequest]:
    if args.dataset == "json" or args.dataset == "json-unique":
        if args.json_schema_path is None:
            dir_path = os.path.dirname(os.path.realpath(__file__))
            args.json_schema_path = os.path.join(dir_path,
                                                 "structured_schemas",
                                                 "structured_schema_1.json")
            args.json_schema_path = os.path.join(
                dir_path, "structured_schemas", "structured_schema_1.json"
            )
        json_schemas = []
        with open(args.json_schema_path) as f:
            schema = json.load(f)

        if args.dataset == 'json-unique':
            json_schemas = [
                copy.deepcopy(schema) for _ in range(args.num_prompts)
            ]
        if args.dataset == "json-unique":
            json_schemas = [copy.deepcopy(schema) for _ in range(args.num_prompts)]
            for i in range(len(json_schemas)):
                json_schemas[i]["properties"][
                    f"__optional_field_{uuid.uuid4()}"] = {
                        "type":
                        "string",
                        "description":
                        "An unique optional field to avoid cached schemas"
                    }
                if "properties" not in json_schemas[i]:
                    json_schemas[i]["properties"] = {}
                json_schemas[i]["properties"][f"__optional_field_{uuid.uuid4()}"] = {
                    "type": "string",
                    "description": "An unique optional field to avoid cached schemas",
                }
        else:
            json_schemas = [schema] * args.num_prompts

        def gen_prompt(index: int):
            schema = json_schemas[index % len(json_schemas)]
            return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}"  # noqa: E501
            return f"Generate an example of a brief user profile given the following schema: {json.dumps(get_schema(index))}"  # noqa: E501

        def get_schema(index: int):
            return json_schemas[index % len(json_schemas)]

        requests = [
            SampleRequest(prompt=gen_prompt(i),
                          prompt_len=len(tokenizer(gen_prompt(i)).input_ids),
                          expected_output_len=args.output_len,
                          schema=get_schema(i),
                          structure_type=args.structure_type)
            SampleRequest(
                prompt=gen_prompt(i),
                prompt_len=len(tokenizer(gen_prompt(i)).input_ids),
                expected_output_len=args.output_len,
                schema=get_schema(i),
                structure_type=args.structure_type,
            )
            for i in range(args.num_prompts)
        ]

    elif args.dataset == "grammar":
        schema = """
            ?start: select_statement
        root ::= select_statement

            ?select_statement: "SELECT " column_list " FROM " table_name
        select_statement ::= "SELECT " column " from " table " where " condition

            ?column_list: column_name ("," column_name)*
        column ::= "col_1 " | "col_2 "

            ?table_name: identifier
        table ::= "table_1 " | "table_2 "

            ?column_name: identifier
        condition ::= column "= " number

            ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
        number ::= "1 " | "2 "
        """
        prompt = "Generate an SQL query to show the 'username' \
            and 'email' from the 'users' table."
@@ -170,11 +175,13 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
        input_len = len(tokenizer(prompt).input_ids)
        print(f"Input length of the prompt: {input_len} tokens")
        requests = [
            SampleRequest(prompt=prompt,
                          prompt_len=input_len,
                          expected_output_len=args.output_len,
                          schema=schema,
                          structure_type=args.structure_type)
            SampleRequest(
                prompt=prompt,
                prompt_len=input_len,
                expected_output_len=args.output_len,
                schema=schema,
                structure_type=args.structure_type,
            )
            for _ in range(args.num_prompts)
        ]

@@ -188,11 +195,13 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
        input_len = len(tokenizer(prompt).input_ids)
        print(f"Input length of the prompt: {input_len} tokens")
        requests = [
            SampleRequest(prompt=prompt,
                          prompt_len=input_len,
                          expected_output_len=args.output_len,
                          schema=regex,
                          structure_type=args.structure_type)
            SampleRequest(
                prompt=prompt,
                prompt_len=input_len,
                expected_output_len=args.output_len,
                schema=regex,
                structure_type=args.structure_type,
            )
            for _ in range(args.num_prompts)
        ]

@@ -203,47 +212,55 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
        input_len = len(tokenizer(prompt).input_ids)
        print(f"Input length of the prompt: {input_len} tokens")
        requests = [
            SampleRequest(prompt=prompt,
                          prompt_len=input_len,
                          expected_output_len=args.output_len,
                          schema=choice,
                          structure_type=args.structure_type)
            SampleRequest(
                prompt=prompt,
                prompt_len=input_len,
                expected_output_len=args.output_len,
                schema=choice,
                structure_type=args.structure_type,
            )
            for _ in range(args.num_prompts)
        ]

    elif args.dataset == "xgrammar_bench":
        requests: list[SampleRequest] = []
        dataset = datasets.load_dataset("NousResearch/json-mode-eval",
                                        split="train")
        dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train")
        full_dataset_len = len(dataset)

        def _filter_func(item):
            import json

            schema = json.loads(item["schema"])
            return not has_xgrammar_unsupported_json_features(schema)

        dataset = dataset.filter(_filter_func)
        num_filtered_out = full_dataset_len - len(dataset)
        print(f"dataset has {len(dataset)} entries after filtering "
              f"out {num_filtered_out} entries with unsupported features")
        print(
            f"dataset has {len(dataset)} entries after filtering "
            f"out {num_filtered_out} entries with unsupported features"
        )
        len_dataset = len(dataset)
        for data_point_idx in range(args.num_prompts):
            idx = data_point_idx
            while idx >= len_dataset:
                idx -= len_dataset
            schema = dataset["schema"][idx]
            prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
                                                   tokenize=False)
            prompt = tokenizer.apply_chat_template(
                dataset["prompt"][idx], tokenize=False, add_generation_prompt=True
            )
            input_len = len(tokenizer(prompt).input_ids)
            completion = dataset["completion"][idx]

            requests.append(
                SampleRequest(prompt=prompt,
                              prompt_len=input_len,
                              expected_output_len=args.output_len,
                              schema=schema,
                              structure_type=args.structure_type,
                              completion=completion))
                SampleRequest(
                    prompt=prompt,
                    prompt_len=input_len,
                    expected_output_len=args.output_len,
                    schema=schema,
                    structure_type=args.structure_type,
                    completion=completion,
                )
            )

    return requests

@@ -275,7 +292,8 @@ async def get_request(

    # Calculate scale parameter theta to maintain the desired request_rate.
    assert burstiness > 0, (
        f"A positive burstiness factor is expected, but given {burstiness}.")
        f"A positive burstiness factor is expected, but given {burstiness}."
    )
    theta = 1.0 / (request_rate * burstiness)

    for i, request in enumerate(input_requests):
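The scale `theta = 1.0 / (request_rate * burstiness)` above pairs with gamma-distributed inter-arrival gaps; with the shape parameter equal to the burstiness factor, the mean gap stays at 1 / request_rate, and burstiness == 1.0 degenerates to exponential gaps, i.e. a Poisson arrival process (hence the "Poisson process" label later in the file). A small sketch of that sampling, assuming shape=burstiness:

import numpy as np

def sample_intervals(request_rate: float, burstiness: float, n: int) -> np.ndarray:
    # mean gap = shape * scale = burstiness * 1 / (request_rate * burstiness) = 1 / request_rate
    theta = 1.0 / (request_rate * burstiness)
    return np.random.gamma(shape=burstiness, scale=theta, size=n)

# burstiness < 1 -> burstier traffic (many very short gaps); > 1 -> more uniform gaps.
gaps = sample_intervals(request_rate=10.0, burstiness=1.0, n=1000)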
@@ -317,8 +335,8 @@ def calculate_metrics(
            # multiple output tokens may be bundled together
            # Note : this may inflate the output token count slightly
            output_len = len(
                tokenizer(outputs[i].generated_text,
                          add_special_tokens=False).input_ids)
                tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids
            )
            actual_output_lens.append(output_len)
            total_input += input_requests[i].prompt_len
            tpot = 0
@@ -342,16 +360,19 @@ def calculate_metrics(

        if "ttft" in goodput_config_dict:
            valid_metrics.append(ttfts)
            slo_values.append(goodput_config_dict["ttft"] /
                              MILLISECONDS_TO_SECONDS_CONVERSION)
            slo_values.append(
                goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
            )
        if "tpot" in goodput_config_dict:
            valid_metrics.append(all_tpots)
            slo_values.append(goodput_config_dict["tpot"] /
                              MILLISECONDS_TO_SECONDS_CONVERSION)
            slo_values.append(
                goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
            )
        if "e2el" in goodput_config_dict:
            valid_metrics.append(e2els)
            slo_values.append(goodput_config_dict["e2el"] /
                              MILLISECONDS_TO_SECONDS_CONVERSION)
            slo_values.append(
                goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
            )

        for req_metric in zip(*valid_metrics):
            is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
@@ -362,7 +383,8 @@ def calculate_metrics(
        warnings.warn(
            "All requests failed. This is likely due to a misconfiguration "
            "on the benchmark arguments.",
            stacklevel=2)
            stacklevel=2,
        )
    metrics = BenchmarkMetrics(
        completed=completed,
        total_input=total_input,
@@ -371,27 +393,31 @@ def calculate_metrics(
        request_goodput=good_completed / dur_s,
        output_throughput=sum(actual_output_lens) / dur_s,
        total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
        mean_ttft_ms=np.mean(ttfts or 0) *
        1000,  # ttfts is empty if streaming is not supported by backend
        mean_ttft_ms=np.mean(ttfts or 0)
        * 1000,  # ttfts is empty if streaming is not supported by backend
        std_ttft_ms=np.std(ttfts or 0) * 1000,
        median_ttft_ms=np.median(ttfts or 0) * 1000,
        percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
                             for p in selected_percentiles],
        percentiles_ttft_ms=[
            (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
        ],
        mean_tpot_ms=np.mean(tpots or 0) * 1000,
        std_tpot_ms=np.std(tpots or 0) * 1000,
        median_tpot_ms=np.median(tpots or 0) * 1000,
        percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
                             for p in selected_percentiles],
        percentiles_tpot_ms=[
            (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
        ],
        mean_itl_ms=np.mean(itls or 0) * 1000,
        std_itl_ms=np.std(itls or 0) * 1000,
        median_itl_ms=np.median(itls or 0) * 1000,
        percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
                            for p in selected_percentiles],
        percentiles_itl_ms=[
            (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
        ],
        mean_e2el_ms=np.mean(e2els or 0) * 1000,
        std_e2el_ms=np.std(e2els or 0) * 1000,
        median_e2el_ms=np.median(e2els or 0) * 1000,
        percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
                             for p in selected_percentiles],
        percentiles_e2el_ms=[
            (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
        ],
    )

    return metrics, actual_output_lens
@@ -413,7 +439,6 @@ async def benchmark(
    ignore_eos: bool,
    max_concurrency: Optional[int],
    structured_output_ratio: float,
    structured_output_backend: str,
    goodput_config_dict: Optional[dict[str, float]] = None,
):
    if backend in ASYNC_REQUEST_FUNCS:
@@ -425,18 +450,17 @@ async def benchmark(
        extra_body = {}
        # Add the schema to the extra_body
        extra_body[request.structure_type] = request.schema
        # Add the specific structured_output_backend
        extra_body["guided_decoding_backend"] = structured_output_backend
        return extra_body

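`prepare_extra_body()` above attaches the structured-output constraint under the request's `structure_type` key and, in the old version, also selected the backend via `guided_decoding_backend`. For a JSON-schema request with the xgrammar backend the resulting extra body would look roughly like the sketch below; the schema itself is a made-up placeholder.

extra_body = {
    "guided_json": {  # key comes from request.structure_type
        "type": "object",
        "properties": {"name": {"type": "string"}},
        "required": ["name"],
    },
    "guided_decoding_backend": "xgrammar",  # or "auto", per --structured-output-backend
}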
    print("Starting initial single prompt test run...")
 | 
			
		||||
    structured_output_req_idx = random.sample(
 | 
			
		||||
        range(len(input_requests)),
 | 
			
		||||
        int(len(input_requests) * structured_output_ratio))
 | 
			
		||||
        range(len(input_requests)), int(len(input_requests) * structured_output_ratio)
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    test_request = input_requests[0]
 | 
			
		||||
    test_req_extra_body = (prepare_extra_body(test_request)
 | 
			
		||||
                           if 0 in structured_output_req_idx else None)
 | 
			
		||||
    test_req_extra_body = (
 | 
			
		||||
        prepare_extra_body(test_request) if 0 in structured_output_req_idx else None
 | 
			
		||||
    )
 | 
			
		||||
    test_input = RequestFuncInput(
 | 
			
		||||
        model=model_id,
 | 
			
		||||
        prompt=test_request.prompt,
 | 
			
		||||
@ -450,7 +474,8 @@ async def benchmark(
 | 
			
		||||
    if not test_output.success:
 | 
			
		||||
        raise ValueError(
 | 
			
		||||
            "Initial test run failed - Please make sure benchmark arguments "
 | 
			
		||||
            f"are correctly specified. Error: {test_output.error}")
 | 
			
		||||
            f"are correctly specified. Error: {test_output.error}"
 | 
			
		||||
        )
 | 
			
		||||
    else:
 | 
			
		||||
        print("Initial test run completed. Starting main benchmark run...")
 | 
			
		||||
 | 
			
		||||
@ -469,10 +494,7 @@ async def benchmark(
 | 
			
		||||
        if profile_output.success:
 | 
			
		||||
            print("Profiler started")
 | 
			
		||||
 | 
			
		||||
    if burstiness == 1.0:
 | 
			
		||||
        distribution = "Poisson process"
 | 
			
		||||
    else:
 | 
			
		||||
        distribution = "Gamma distribution"
 | 
			
		||||
    distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
 | 
			
		||||
 | 
			
		||||
    print(f"Traffic request rate: {request_rate}")
 | 
			
		||||
    print(f"Burstiness factor: {burstiness} ({distribution})")
 | 
			
		||||
@ -484,24 +506,21 @@ async def benchmark(
 | 
			
		||||
    # and it will simplify the code in limited_request_func.
 | 
			
		||||
    #    semaphore = (asyncio.Semaphore(max_concurrency)
 | 
			
		||||
    #                 if max_concurrency else contextlib.nullcontext())
 | 
			
		||||
    semaphore = (asyncio.Semaphore(max_concurrency)
 | 
			
		||||
                 if max_concurrency else None)
 | 
			
		||||
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
 | 
			
		||||
 | 
			
		||||
    async def limited_request_func(request_func_input, pbar):
 | 
			
		||||
        if semaphore is None:
 | 
			
		||||
            return await request_func(request_func_input=request_func_input,
 | 
			
		||||
                                      pbar=pbar)
 | 
			
		||||
            return await request_func(request_func_input=request_func_input, pbar=pbar)
 | 
			
		||||
        async with semaphore:
 | 
			
		||||
            return await request_func(request_func_input=request_func_input,
 | 
			
		||||
                                      pbar=pbar)
 | 
			
		||||
            return await request_func(request_func_input=request_func_input, pbar=pbar)
 | 
			
		||||
 | 
			
		||||
    benchmark_start_time = time.perf_counter()
 | 
			
		||||
    tasks: list[asyncio.Task] = []
 | 
			
		||||
    expected: list[str] = []
 | 
			
		||||
    async for i, request in get_request(input_requests, request_rate,
 | 
			
		||||
                                        burstiness):
 | 
			
		||||
        extra_body = prepare_extra_body(
 | 
			
		||||
            request) if i in structured_output_req_idx else None
 | 
			
		||||
    async for i, request in get_request(input_requests, request_rate, burstiness):
 | 
			
		||||
        extra_body = (
 | 
			
		||||
            prepare_extra_body(request) if i in structured_output_req_idx else None
 | 
			
		||||
        )
 | 
			
		||||
        request_func_input = RequestFuncInput(
 | 
			
		||||
            model=model_id,
 | 
			
		||||
            prompt=request.prompt,
 | 
			
		||||
@ -514,8 +533,9 @@ async def benchmark(
 | 
			
		||||
        expected.append(request.completion)
 | 
			
		||||
        tasks.append(
 | 
			
		||||
            asyncio.create_task(
 | 
			
		||||
                limited_request_func(request_func_input=request_func_input,
 | 
			
		||||
                                     pbar=pbar)))
 | 
			
		||||
                limited_request_func(request_func_input=request_func_input, pbar=pbar)
 | 
			
		||||
            )
 | 
			
		||||
        )
 | 
			
		||||
    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
 | 
			
		||||
 | 
			
		||||
    if profile:
 | 
			
		||||
@ -547,54 +567,58 @@ async def benchmark(
 | 
			
		||||
        goodput_config_dict=goodput_config_dict,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
 | 
			
		||||
    print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
 | 
			
		||||
    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
 | 
			
		||||
    print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
 | 
			
		||||
                                    benchmark_duration))
 | 
			
		||||
    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
 | 
			
		||||
    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
 | 
			
		||||
    print("{:<40} {:<10}".format("Total generated tokens:",
 | 
			
		||||
                                 metrics.total_output))
 | 
			
		||||
    print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
 | 
			
		||||
                                    metrics.request_throughput))
 | 
			
		||||
    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
 | 
			
		||||
    print(
 | 
			
		||||
        "{:<40} {:<10.2f}".format(
 | 
			
		||||
            "Request throughput (req/s):", metrics.request_throughput
 | 
			
		||||
        )
 | 
			
		||||
    )
 | 
			
		||||
    if goodput_config_dict:
 | 
			
		||||
        print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
 | 
			
		||||
                                        metrics.request_goodput))
 | 
			
		||||
    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
 | 
			
		||||
                                    metrics.output_throughput))
 | 
			
		||||
    print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
 | 
			
		||||
                                    metrics.total_token_throughput))
 | 
			
		||||
        print(
 | 
			
		||||
            "{:<40} {:<10.2f}".format(
 | 
			
		||||
                "Request goodput (req/s):", metrics.request_goodput
 | 
			
		||||
            )
 | 
			
		||||
        )
 | 
			
		||||
    print(
 | 
			
		||||
        "{:<40} {:<10.2f}".format(
 | 
			
		||||
            "Output token throughput (tok/s):", metrics.output_throughput
 | 
			
		||||
        )
 | 
			
		||||
    )
 | 
			
		||||
    print(
 | 
			
		||||
        "{:<40} {:<10.2f}".format(
 | 
			
		||||
            "Total Token throughput (tok/s):", metrics.total_token_throughput
 | 
			
		||||
        )
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    result = {
 | 
			
		||||
        "duration":
 | 
			
		||||
        benchmark_duration,
 | 
			
		||||
        "completed":
 | 
			
		||||
        metrics.completed,
 | 
			
		||||
        "total_input_tokens":
 | 
			
		||||
        metrics.total_input,
 | 
			
		||||
        "total_output_tokens":
 | 
			
		||||
        metrics.total_output,
 | 
			
		||||
        "request_throughput":
 | 
			
		||||
        metrics.request_throughput,
 | 
			
		||||
        "output_throughput":
 | 
			
		||||
        metrics.output_throughput,
 | 
			
		||||
        "total_token_throughput":
 | 
			
		||||
        metrics.total_token_throughput,
 | 
			
		||||
        "ttft_description":
 | 
			
		||||
        pd.Series([output.ttft for output in outputs]).describe().to_dict(),
 | 
			
		||||
        "tpot_description":
 | 
			
		||||
        pd.Series([output.tpot for output in outputs]).describe().to_dict(),
 | 
			
		||||
        "duration": benchmark_duration,
 | 
			
		||||
        "completed": metrics.completed,
 | 
			
		||||
        "total_input_tokens": metrics.total_input,
 | 
			
		||||
        "total_output_tokens": metrics.total_output,
 | 
			
		||||
        "request_throughput": metrics.request_throughput,
 | 
			
		||||
        "output_throughput": metrics.output_throughput,
 | 
			
		||||
        "total_token_throughput": metrics.total_token_throughput,
 | 
			
		||||
        "ttft_description": pd.Series([output.ttft for output in outputs])
 | 
			
		||||
        .describe()
 | 
			
		||||
        .to_dict(),
 | 
			
		||||
        "tpot_description": pd.Series([output.tpot for output in outputs])
 | 
			
		||||
        .describe()
 | 
			
		||||
        .to_dict(),
 | 
			
		||||
        "input_lens": [output.prompt_len for output in outputs],
 | 
			
		||||
        "output_lens":
 | 
			
		||||
        actual_output_lens,
 | 
			
		||||
        "output_lens": actual_output_lens,
 | 
			
		||||
        "ttfts": [output.ttft for output in outputs],
 | 
			
		||||
        "itls": [output.itl for output in outputs],
 | 
			
		||||
        "errors": [output.error for output in outputs],
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    ret = [{
 | 
			
		||||
        'generated': output.generated_text,
 | 
			
		||||
        'expected': gt
 | 
			
		||||
    } for output, gt in zip(outputs, expected)]
 | 
			
		||||
    ret = [
 | 
			
		||||
        {"generated": output.generated_text, "expected": gt}
 | 
			
		||||
        for output, gt in zip(outputs, expected)
 | 
			
		||||
    ]
 | 
			
		||||
 | 
			
		||||
    def process_one_metric(
 | 
			
		||||
        # E.g., "ttft"
 | 
			
		||||
@ -608,29 +632,35 @@ async def benchmark(
 | 
			
		||||
        # metric.
 | 
			
		||||
        if metric_attribute_name not in selected_percentile_metrics:
 | 
			
		||||
            return
 | 
			
		||||
        print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
 | 
			
		||||
        print("{:<40} {:<10.2f}".format(
 | 
			
		||||
            f"Mean {metric_name} (ms):",
 | 
			
		||||
            getattr(metrics, f"mean_{metric_attribute_name}_ms")))
 | 
			
		||||
        print("{:<40} {:<10.2f}".format(
 | 
			
		||||
            f"Median {metric_name} (ms):",
 | 
			
		||||
            getattr(metrics, f"median_{metric_attribute_name}_ms")))
 | 
			
		||||
        print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
 | 
			
		||||
        print(
 | 
			
		||||
            "{:<40} {:<10.2f}".format(
 | 
			
		||||
                f"Mean {metric_name} (ms):",
 | 
			
		||||
                getattr(metrics, f"mean_{metric_attribute_name}_ms"),
 | 
			
		||||
            )
 | 
			
		||||
        )
 | 
			
		||||
        print(
 | 
			
		||||
            "{:<40} {:<10.2f}".format(
 | 
			
		||||
                f"Median {metric_name} (ms):",
 | 
			
		||||
                getattr(metrics, f"median_{metric_attribute_name}_ms"),
 | 
			
		||||
            )
 | 
			
		||||
        )
 | 
			
		||||
        result[f"mean_{metric_attribute_name}_ms"] = getattr(
 | 
			
		||||
            metrics, f"mean_{metric_attribute_name}_ms")
 | 
			
		||||
            metrics, f"mean_{metric_attribute_name}_ms"
 | 
			
		||||
        )
 | 
			
		||||
        result[f"median_{metric_attribute_name}_ms"] = getattr(
 | 
			
		||||
            metrics, f"median_{metric_attribute_name}_ms")
 | 
			
		||||
            metrics, f"median_{metric_attribute_name}_ms"
 | 
			
		||||
        )
 | 
			
		||||
        result[f"std_{metric_attribute_name}_ms"] = getattr(
 | 
			
		||||
            metrics, f"std_{metric_attribute_name}_ms")
 | 
			
		||||
        for p, value in getattr(metrics,
 | 
			
		||||
                                f"percentiles_{metric_attribute_name}_ms"):
 | 
			
		||||
            metrics, f"std_{metric_attribute_name}_ms"
 | 
			
		||||
        )
 | 
			
		||||
        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
 | 
			
		||||
            p_word = str(int(p)) if int(p) == p else str(p)
 | 
			
		||||
            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
 | 
			
		||||
                                            value))
 | 
			
		||||
            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
 | 
			
		||||
            result[f"p{p_word}_{metric_attribute_name}_ms"] = value
 | 
			
		||||
 | 
			
		||||
    process_one_metric("ttft", "TTFT", "Time to First Token")
 | 
			
		||||
    process_one_metric("tpot", "TPOT",
 | 
			
		||||
                       "Time per Output Token (excl. 1st token)")
 | 
			
		||||
    process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
 | 
			
		||||
    process_one_metric("itl", "ITL", "Inter-token Latency")
 | 
			
		||||
    process_one_metric("e2el", "E2EL", "End-to-end Latency")
 | 
			
		||||
 | 
			
		||||
@ -640,13 +670,13 @@ async def benchmark(
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def evaluate(ret, args):
 | 
			
		||||
 | 
			
		||||
    def _eval_correctness_json(expected, actual):
 | 
			
		||||
        # extract json string from string using regex
 | 
			
		||||
        import re
 | 
			
		||||
        actual = actual.replace('\n', '').replace(' ', '').strip()
 | 
			
		||||
        import regex as re
 | 
			
		||||
 | 
			
		||||
        actual = actual.replace("\n", "").replace(" ", "").strip()
 | 
			
		||||
        try:
 | 
			
		||||
            actual = re.search(r'\{.*\}', actual).group()
 | 
			
		||||
            actual = re.search(r"\{.*\}", actual).group()
 | 
			
		||||
            actual = json.loads(actual)
 | 
			
		||||
        except Exception:
 | 
			
		||||
            return False
 | 
			
		||||
@ -657,29 +687,33 @@ def evaluate(ret, args):
 | 
			
		||||
        return actual in args.choice
 | 
			
		||||
 | 
			
		||||
    def _eval_correctness_regex(expected, actual):
 | 
			
		||||
        import re
 | 
			
		||||
        import regex as re
 | 
			
		||||
 | 
			
		||||
        return re.match(args.regex, actual) is not None
 | 
			
		||||
 | 
			
		||||
    def _eval_correctness(expected, actual):
 | 
			
		||||
        if args.structure_type == 'guided_json':
 | 
			
		||||
        if args.structure_type == "guided_json":
 | 
			
		||||
            return _eval_correctness_json(expected, actual)
 | 
			
		||||
        elif args.structure_type == 'guided_regex':
 | 
			
		||||
        elif args.structure_type == "guided_regex":
 | 
			
		||||
            return _eval_correctness_regex(expected, actual)
 | 
			
		||||
        elif args.structure_type == 'guided_choice':
 | 
			
		||||
        elif args.structure_type == "guided_choice":
 | 
			
		||||
            return _eval_correctness_choice(expected, actual)
 | 
			
		||||
        else:
 | 
			
		||||
            return None
 | 
			
		||||
 | 
			
		||||
    scores = []
 | 
			
		||||
    for res in ret:
 | 
			
		||||
        score = _eval_correctness(res['expected'], res['generated'])
 | 
			
		||||
        res['correctness'] = score
 | 
			
		||||
        score = _eval_correctness(res["expected"], res["generated"])
 | 
			
		||||
        res["correctness"] = score
 | 
			
		||||
        scores.append(score)
 | 
			
		||||
 | 
			
		||||
    not_none_scores = [score for score in scores if score is not None]
 | 
			
		||||
 | 
			
		||||
    return (sum(not_none_scores) / len(not_none_scores) *
 | 
			
		||||
            100) if len(not_none_scores) > 0 else None
 | 
			
		||||
    return (
 | 
			
		||||
        (sum(not_none_scores) / len(not_none_scores) * 100)
 | 
			
		||||
        if len(not_none_scores) > 0
 | 
			
		||||
        else None
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def parse_goodput(slo_pairs):
 | 
			
		||||
@@ -691,9 +725,10 @@ def parse_goodput(slo_pairs):
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            "Invalid format found for service level objectives. "
            "Specify service level objectives for goodput as \"KEY:VALUE\" "
            'Specify service level objectives for goodput as "KEY:VALUE" '
            "pairs, where the key is a metric name, and the value is a "
            "number in milliseconds.") from err
            "number in milliseconds."
        ) from err
    return goodput_config_dict


@@ -707,12 +742,14 @@ def check_goodput_args(args):
                raise ValueError(
                    f"Invalid metric name found, {slo_name}: {slo_val}. "
                    "The service level objective name should be one of "
                    f"{str(VALID_NAMES)}. ")
                    f"{str(VALID_NAMES)}. "
                )
            if slo_val < 0:
                raise ValueError(
                    f"Invalid value found, {slo_name}: {slo_val}. "
                    "The service level objective value should be "
                    "non-negative.")
                    "non-negative."
                )
    return goodput_config_dict


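A minimal sketch of the "KEY:VALUE" goodput format handled by parse_goodput() and check_goodput_args() above; the helper name below is hypothetical, and parse_goodput in the script remains the authoritative implementation.

    def parse_slo_pairs(slo_pairs):
        # e.g. ["ttft:500", "tpot:50"] -> {"ttft": 500.0, "tpot": 50.0}
        config = {}
        for pair in slo_pairs:
            name, value = pair.split(":")
            config[name] = float(value)
        return config
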
@@ -738,19 +775,19 @@ def main(args: argparse.Namespace):
        tokenizer_mode=args.tokenizer_mode,
    )

    if args.dataset == 'grammar':
        args.structure_type = 'guided_grammar'
    elif args.dataset == 'regex':
        args.structure_type = 'guided_regex'
    elif args.dataset == 'choice':
        args.structure_type = 'guided_choice'
    if args.dataset == "grammar":
        args.structure_type = "guided_grammar"
    elif args.dataset == "regex":
        args.structure_type = "guided_regex"
    elif args.dataset == "choice":
        args.structure_type = "guided_choice"
    else:
        args.structure_type = 'guided_json'
        args.structure_type = "guided_json"

    if args.no_structured_output:
        args.structured_output_ratio = 0
    if args.save_results:
        result_file_name = f'{args.structured_output_ratio}guided'
        result_file_name = f"{args.structured_output_ratio}guided"
        result_file_name += f"_{backend}"
        result_file_name += f"_{args.request_rate}qps"
        result_file_name += f"_{args.model.split('/')[-1]}"
@@ -778,37 +815,29 @@ def main(args: argparse.Namespace):
            disable_tqdm=args.disable_tqdm,
            profile=args.profile,
            selected_percentile_metrics=args.percentile_metrics.split(","),
            selected_percentiles=[
                float(p) for p in args.metric_percentiles.split(",")
            ],
            selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
            ignore_eos=args.ignore_eos,
            max_concurrency=args.max_concurrency,
            structured_output_ratio=args.structured_output_ratio,
            structured_output_backend=args.structured_output_backend,
            goodput_config_dict=goodput_config_dict,
        ))
        )
    )

    # Save config and results to json
    score = evaluate(ret, args)
    print("correct_rate(%)", score, '\n')
    print("correct_rate(%)", score, "\n")
    if args.save_results:
        results = {
            "backend":
            backend,
            "model_id":
            model_id,
            "tokenizer_id":
            tokenizer_id,
            "num_prompts":
            args.num_prompts,
            "request_rate":
            args.request_rate if args.request_rate < float("inf") else "inf",
            "burstiness":
            args.burstiness,
            "max_concurrency":
            args.max_concurrency,
            "correct_rate(%)":
            score
            "backend": backend,
            "model_id": model_id,
            "tokenizer_id": tokenizer_id,
            "num_prompts": args.num_prompts,
            "request_rate": args.request_rate
            if args.request_rate < float("inf")
            else "inf",
            "burstiness": args.burstiness,
            "max_concurrency": args.max_concurrency,
            "correct_rate(%)": score,
        }
        results = {"outputs": ret, **results, **benchmark_result}

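The results dictionary above stores request_rate as the string "inf" rather than float("inf"); a short illustration of why, assuming only the standard json module:

    import json

    # float("inf") serializes to the non-standard token Infinity, which strict
    # JSON parsers reject, while the string "inf" round-trips cleanly.
    print(json.dumps({"request_rate": float("inf")}))  # {"request_rate": Infinity}
    print(json.dumps({"request_rate": "inf"}))         # {"request_rate": "inf"}
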
@@ -817,13 +846,14 @@ def main(args: argparse.Namespace):
            result_file_name = args.result_filename
        if args.result_dir:
            result_file_name = os.path.join(args.result_dir, result_file_name)
        with open(result_file_name, "w", encoding='utf-8') as outfile:
        with open(result_file_name, "w", encoding="utf-8") as outfile:
            json.dump(results, outfile, indent=4)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="Benchmark the online serving throughput.")
        description="Benchmark the online serving throughput."
    )
    parser.add_argument(
        "--backend",
        type=str,
@@ -845,16 +875,14 @@ if __name__ == "__main__":
        default="/v1/completions",
        help="API endpoint.",
    )
    parser.add_argument("--dataset",
                        default='json',
                        choices=[
                            'json', 'json-unique', 'grammar', 'regex',
                            'choice', 'xgrammar_bench'
                        ])
    parser.add_argument("--json_schema_path",
                        type=str,
                        default=None,
                        help="Path to json schema.")
    parser.add_argument(
        "--dataset",
        default="json",
        choices=["json", "json-unique", "grammar", "regex", "choice", "xgrammar_bench"],
    )
    parser.add_argument(
        "--json-schema-path", type=str, default=None, help="Path to json schema."
    )
    parser.add_argument(
        "--max-concurrency",
        type=int,
@@ -866,7 +894,8 @@ if __name__ == "__main__":
        "initiated, this argument will control how many are actually allowed "
        "to execute at a time. This means that when used in combination, the "
        "actual request rate may be lower than specified with --request-rate, "
        "if the server is not processing requests fast enough to keep up.")
        "if the server is not processing requests fast enough to keep up.",
    )
    parser.add_argument(
        "--model",
        type=str,
@@ -876,15 +905,13 @@ if __name__ == "__main__":
    parser.add_argument(
        "--tokenizer",
        type=str,
        help=
        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--tokenizer-mode",
        type=str,
        default="auto",
        help=
        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--num-prompts",
@@ -961,50 +988,51 @@ if __name__ == "__main__":
        "--ignore-eos",
        action="store_true",
        help="Set ignore_eos flag when sending the benchmark request."
        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
    )
    parser.add_argument(
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
        help="Comma-seperated list of selected metrics to report percentils. "
        help="Comma-separated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
        "Default value is \"ttft,tpot,itl\".")
        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
        'Default value is "ttft,tpot,itl".',
    )
    parser.add_argument(
        "--metric-percentiles",
        type=str,
        default="99",
        help="Comma-seperated list of percentiles for selected metrics. "
        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
        "Default value is \"99\". "
        "Use \"--percentile-metrics\" to select metrics.",
        help="Comma-separated list of percentiles for selected metrics. "
        'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
        'Default value is "99". '
        'Use "--percentile-metrics" to select metrics.',
    )
    parser.add_argument(
        "--goodput",
        nargs="+",
        required=False,
        help="Specify service level objectives for goodput as \"KEY:VALUE\" "
        help='Specify service level objectives for goodput as "KEY:VALUE" '
        "pairs, where the key is a metric name, and the value is in "
        "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
        'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
        "separated by spaces. Allowed request level metric names are "
        "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
        '"ttft", "tpot", "e2el". For more context on the definition of '
        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
        "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
        "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
    )

    parser.add_argument("--no-structured-output",
                        action='store_true',
                        default=False,
                        help="Whether to disable JSON decoding or not.")
    parser.add_argument("--structured-output-ratio",
                        type=float,
                        default=1.0,
                        help="Ratio of Structured Outputs requests")
    parser.add_argument(
        "--structured-output-backend",
        type=str,
        choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
        default="xgrammar",
        help="Backend to use for structured outputs")
        "--no-structured-output",
        action="store_true",
        default=False,
        help="Whether to disable JSON decoding or not.",
    )
    parser.add_argument(
        "--structured-output-ratio",
        type=float,
        default=1.0,
        help="Ratio of Structured Outputs requests",
    )

    args = parser.parse_args()
    main(args)

@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark offline inference throughput."""

import argparse
import dataclasses
import json
@@ -11,17 +12,25 @@ from typing import Any, Optional, Union

import torch
import uvloop
from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
                               RandomDataset, SampleRequest, ShareGPTDataset,
                               SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase

from benchmark_dataset import (
    AIMODataset,
    BurstGPTDataset,
    ConversationDataset,
    InstructCoderDataset,
    RandomDataset,
    SampleRequest,
    ShareGPTDataset,
    SonnetDataset,
    VisionArenaDataset,
)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
    build_async_engine_client_from_engine_args,
)
from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
@@ -36,23 +45,30 @@ def run_vllm(
    disable_detokenize: bool = False,
) -> tuple[float, Optional[list[RequestOutput]]]:
    from vllm import LLM, SamplingParams

    llm = LLM(**dataclasses.asdict(engine_args))
    assert all(
        llm.llm_engine.model_config.max_model_len >= (
            request.prompt_len + request.expected_output_len)
        for request in requests), (
            "Please ensure that max_model_len is greater than the sum of"
            " prompt_len and expected_output_len for all requests.")
        llm.llm_engine.model_config.max_model_len
        >= (request.prompt_len + request.expected_output_len)
        for request in requests
    ), (
        "Please ensure that max_model_len is greater than the sum of"
        " prompt_len and expected_output_len for all requests."
    )
    # Add the requests to the engine.
    prompts: list[Union[TextPrompt, TokensPrompt]] = []
    sampling_params: list[SamplingParams] = []
    for request in requests:
        prompts.append(
            TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
                       multi_modal_data=request.multi_modal_data)
            if "prompt_token_ids" in request.prompt else \
            TextPrompt(prompt=request.prompt,
                       multi_modal_data=request.multi_modal_data))
            TokensPrompt(
                prompt_token_ids=request.prompt["prompt_token_ids"],
                multi_modal_data=request.multi_modal_data,
            )
            if "prompt_token_ids" in request.prompt
            else TextPrompt(
                prompt=request.prompt, multi_modal_data=request.multi_modal_data
            )
        )
        sampling_params.append(
            SamplingParams(
                n=n,
@@ -61,7 +77,8 @@ def run_vllm(
                ignore_eos=True,
                max_tokens=request.expected_output_len,
                detokenize=not disable_detokenize,
            ))
            )
        )
    lora_requests: Optional[list[LoRARequest]] = None
    if engine_args.enable_lora:
        lora_requests = [request.lora_request for request in requests]
@@ -71,10 +88,9 @@ def run_vllm(
    outputs = None
    if not use_beam_search:
        start = time.perf_counter()
        outputs = llm.generate(prompts,
                               sampling_params,
                               lora_request=lora_requests,
                               use_tqdm=True)
        outputs = llm.generate(
            prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
        )
        end = time.perf_counter()
    else:
        assert lora_requests is None, "BeamSearch API does not support LoRA"
@@ -90,30 +106,35 @@ def run_vllm(
                beam_width=n,
                max_tokens=output_len,
                ignore_eos=True,
            ))
            ),
        )
        end = time.perf_counter()
    return end - start, outputs


def run_vllm_chat(
        requests: list[SampleRequest],
        n: int,
        engine_args: EngineArgs,
        disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
    requests: list[SampleRequest],
    n: int,
    engine_args: EngineArgs,
    disable_detokenize: bool = False,
) -> tuple[float, list[RequestOutput]]:
    """
    Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
    multimodal models as it properly handles multimodal inputs and chat
    formatting. For non-multimodal models, use run_vllm() instead.
    """
    from vllm import LLM, SamplingParams

    llm = LLM(**dataclasses.asdict(engine_args))

    assert all(
        llm.llm_engine.model_config.max_model_len >= (
            request.prompt_len + request.expected_output_len)
        for request in requests), (
            "Please ensure that max_model_len is greater than the sum of "
            "prompt_len and expected_output_len for all requests.")
        llm.llm_engine.model_config.max_model_len
        >= (request.prompt_len + request.expected_output_len)
        for request in requests
    ), (
        "Please ensure that max_model_len is greater than the sum of "
        "prompt_len and expected_output_len for all requests."
    )

    prompts = []
    sampling_params: list[SamplingParams] = []
@@ -127,7 +148,8 @@ def run_vllm_chat(
                ignore_eos=True,
                max_tokens=request.expected_output_len,
                detokenize=not disable_detokenize,
            ))
            )
        )
    start = time.perf_counter()
    outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
    end = time.perf_counter()
@@ -144,13 +166,17 @@ async def run_vllm_async(
    from vllm import SamplingParams

    async with build_async_engine_client_from_engine_args(
            engine_args, disable_frontend_multiprocessing) as llm:
        engine_args, disable_frontend_multiprocessing
    ) as llm:
        model_config = await llm.get_model_config()
        assert all(
            llm.model_config.max_model_len >= (request.prompt_len +
                                               request.expected_output_len)
            for request in requests), (
                "Please ensure that max_model_len is greater than the sum of"
                " prompt_len and expected_output_len for all requests.")
            model_config.max_model_len
            >= (request.prompt_len + request.expected_output_len)
            for request in requests
        ), (
            "Please ensure that max_model_len is greater than the sum of"
            " prompt_len and expected_output_len for all requests."
        )

        # Add the requests to the engine.
        prompts: list[Union[TextPrompt, TokensPrompt]] = []
@@ -158,11 +184,15 @@ async def run_vllm_async(
        lora_requests: list[Optional[LoRARequest]] = []
        for request in requests:
            prompts.append(
                TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
                        multi_modal_data=request.multi_modal_data)
                if "prompt_token_ids" in request.prompt else \
                TextPrompt(prompt=request.prompt,
                           multi_modal_data=request.multi_modal_data))
                TokensPrompt(
                    prompt_token_ids=request.prompt["prompt_token_ids"],
                    multi_modal_data=request.multi_modal_data,
                )
                if "prompt_token_ids" in request.prompt
                else TextPrompt(
                    prompt=request.prompt, multi_modal_data=request.multi_modal_data
                )
            )
            sampling_params.append(
                SamplingParams(
                    n=n,
@@ -171,17 +201,16 @@ async def run_vllm_async(
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
                    detokenize=not disable_detokenize,
                ))
                )
            )
            lora_requests.append(request.lora_request)

        generators = []
        start = time.perf_counter()
        for i, (prompt, sp,
                lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
            generator = llm.generate(prompt,
                                     sp,
                                     lora_request=lr,
                                     request_id=f"test{i}")
        for i, (prompt, sp, lr) in enumerate(
            zip(prompts, sampling_params, lora_requests)
        ):
            generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
            generators.append(generator)
        all_gens = merge_async_iterators(*generators)
        async for i, res in all_gens:
@@ -200,7 +229,8 @@ def run_hf(
    disable_detokenize: bool = False,
) -> float:
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
    )
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.
        tokenizer.pad_token = tokenizer.eos_token
@@ -212,22 +242,26 @@ def run_hf(
    max_prompt_len = 0
    max_output_len = 0
    for i in range(len(requests)):
        prompt, prompt_len, output_len = requests[i]
        prompt = requests[i].prompt
        prompt_len = requests[i].prompt_len
        output_len = requests[i].expected_output_len
        # Add the prompt to the batch.
        batch.append(prompt)
        max_prompt_len = max(max_prompt_len, prompt_len)
        max_output_len = max(max_output_len, output_len)
        if len(batch) < max_batch_size and i != len(requests) - 1:
            # Check if we can add more requests to the batch.
            _, next_prompt_len, next_output_len = requests[i + 1]
            if (max(max_prompt_len, next_prompt_len) +
                    max(max_output_len, next_output_len)) <= 2048:
            next_prompt_len = requests[i + 1].prompt_len
            next_output_len = requests[i + 1].expected_output_len
            if (
                max(max_prompt_len, next_prompt_len)
                + max(max_output_len, next_output_len)
            ) <= 2048:
                # We can add more requests to the batch.
                continue

        # Generate the sequences.
        input_ids = tokenizer(batch, return_tensors="pt",
                              padding=True).input_ids
        input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
            do_sample=True,
@@ -257,6 +291,7 @@ def run_mii(
    output_len: int,
) -> float:
    from mii import client, serve

    llm = serve(model, tensor_parallel=tensor_parallel_size)
    prompts = [request.prompt for request in requests]

@@ -268,8 +303,9 @@ def run_mii(
    return end - start


def save_to_pytorch_benchmark_format(args: argparse.Namespace,
                                     results: dict[str, Any]) -> None:
def save_to_pytorch_benchmark_format(
    args: argparse.Namespace, results: dict[str, Any]
) -> None:
    pt_records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics={
@@ -277,9 +313,9 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
            "tokens_per_second": [results["tokens_per_second"]],
        },
        extra_info={
            k: results[k]
            for k in ["elapsed_time", "num_requests", "total_num_tokens"]
        })
            k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
        },
    )
    if pt_records:
        # Don't use json suffix here as we don't want CI to pick it up
        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
@@ -300,6 +336,7 @@ def get_requests(args, tokenizer):
        "input_len": args.input_len,
        "output_len": args.output_len,
    }

    if args.dataset_path is None or args.dataset_name == "random":
        sample_kwargs["range_ratio"] = args.random_range_ratio
        sample_kwargs["prefix_len"] = args.prefix_len
@@ -310,25 +347,31 @@ def get_requests(args, tokenizer):
            sample_kwargs["enable_multimodal_chat"] = True
    elif args.dataset_name == "sonnet":
        assert tokenizer.chat_template or tokenizer.default_chat_template, (
            "Tokenizer/model must have chat template for sonnet dataset.")
            "Tokenizer/model must have chat template for sonnet dataset."
        )
        dataset_cls = SonnetDataset
        sample_kwargs["prefix_len"] = args.prefix_len
        sample_kwargs["return_prompt_formatted"] = True
    elif args.dataset_name == "burstgpt":
        dataset_cls = BurstGPTDataset
    elif args.dataset_name == "hf":
        if args.backend != "vllm-chat":
            raise ValueError(
                "hf datasets only are supported by vllm-chat backend")
        # Choose between VisionArenaDataset and HuggingFaceDataset based on
        # provided parameters.
        dataset_cls = (VisionArenaDataset if args.dataset_path
                       == VisionArenaDataset.VISION_ARENA_DATASET_PATH
                       and args.hf_subset is None else HuggingFaceDataset)
        common_kwargs['dataset_subset'] = args.hf_subset
        common_kwargs['dataset_split'] = args.hf_split
        sample_kwargs["enable_multimodal_chat"] = True

        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = VisionArenaDataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
            sample_kwargs["enable_multimodal_chat"] = True
        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = InstructCoderDataset
            common_kwargs["dataset_split"] = "train"
        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = ConversationDataset
            common_kwargs["dataset_subset"] = args.hf_subset
            common_kwargs["dataset_split"] = args.hf_split
            sample_kwargs["enable_multimodal_chat"] = True
        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = AIMODataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
    else:
        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
    # Remove None values
@@ -343,10 +386,10 @@ def main(args: argparse.Namespace):
    random.seed(args.seed)
    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code)
        args.tokenizer, trust_remote_code=args.trust_remote_code
    )
    requests = get_requests(args, tokenizer)
    is_multi_modal = any(request.multi_modal_data is not None
                         for request in requests)
    is_multi_modal = any(request.multi_modal_data is not None for request in requests)
    request_outputs: Optional[list[RequestOutput]] = None
    if args.backend == "vllm":
        if args.async_engine:
@@ -357,23 +400,34 @@ def main(args: argparse.Namespace):
                    AsyncEngineArgs.from_cli_args(args),
                    args.disable_frontend_multiprocessing,
                    args.disable_detokenize,
                ))
                )
            )
        else:
            elapsed_time, request_outputs = run_vllm(
                requests, args.n, EngineArgs.from_cli_args(args),
                args.disable_detokenize)
                requests,
                args.n,
                EngineArgs.from_cli_args(args),
                args.disable_detokenize,
            )
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
                              args.hf_max_batch_size, args.trust_remote_code,
                              args.disable_detokenize)
        elapsed_time = run_hf(
            requests,
            args.model,
            tokenizer,
            args.n,
            args.hf_max_batch_size,
            args.trust_remote_code,
            args.disable_detokenize,
        )
    elif args.backend == "mii":
        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                               args.output_len)
        elapsed_time = run_mii(
            requests, args.model, args.tensor_parallel_size, args.output_len
        )
    elif args.backend == "vllm-chat":
        elapsed_time, request_outputs = run_vllm_chat(
            requests, args.n, EngineArgs.from_cli_args(args),
            args.disable_detokenize)
            requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
        )
    else:
        raise ValueError(f"Unknown backend: {args.backend}")

@@ -385,28 +439,31 @@ def main(args: argparse.Namespace):
        for ro in request_outputs:
            if not isinstance(ro, RequestOutput):
                continue
            total_prompt_tokens += len(
                ro.prompt_token_ids) if ro.prompt_token_ids else 0
            total_output_tokens += sum(
                len(o.token_ids) for o in ro.outputs if o)
            total_prompt_tokens += (
                len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
            )
            total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
        total_num_tokens = total_prompt_tokens + total_output_tokens
    else:
        total_num_tokens = sum(r.prompt_len + r.expected_output_len
                               for r in requests)
        total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
        total_output_tokens = sum(r.expected_output_len for r in requests)
        total_prompt_tokens = total_num_tokens - total_output_tokens

    if is_multi_modal and args.backend != "vllm-chat":
        print("\033[91mWARNING\033[0m: Multi-modal request with "
              f"{args.backend} backend detected. The "
              "following metrics are not accurate because image tokens are not"
              " counted. See vllm-project/vllm/issues/9778 for details.")
        print(
            "\033[91mWARNING\033[0m: Multi-modal request with "
            f"{args.backend} backend detected. The "
            "following metrics are not accurate because image tokens are not"
            " counted. See vllm-project/vllm/issues/9778 for details."
        )
        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
        # vllm-chat backend counts the image tokens now

    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
          f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
          f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
    print(
        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
        f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
    )
    print(f"Total num prompt tokens:  {total_prompt_tokens}")
    print(f"Total num output tokens:  {total_output_tokens}")

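A small numeric illustration of the throughput line printed above (made-up numbers, not measured results):

    # Illustrative values only, mirroring the f-string above.
    num_requests, total_num_tokens, total_output_tokens, elapsed_time = 1000, 250000, 50000, 50.0
    print(
        f"Throughput: {num_requests / elapsed_time:.2f} requests/s, "
        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
        f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
    )
    # Throughput: 20.00 requests/s, 5000.00 total tokens/s, 1000.00 output tokens/s
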
@@ -434,7 +491,8 @@ def validate_args(args):
        warnings.warn(
            "The '--dataset' argument will be deprecated in the next release. "
            "Please use '--dataset-name' and '--dataset-path' instead.",
            stacklevel=2)
            stacklevel=2,
        )
        args.dataset_path = args.dataset

    if not getattr(args, "tokenizer", None):
@@ -447,9 +505,8 @@ def validate_args(args):

    # === Dataset Configuration ===
    if not args.dataset and not args.dataset_path:
        print(
            "When dataset path is not set, it will default to random dataset")
        args.dataset_name = 'random'
        print("When dataset path is not set, it will default to random dataset")
        args.dataset_name = "random"
        if args.input_len is None:
            raise ValueError("input_len must be provided for a random dataset")

@@ -457,33 +514,55 @@ def validate_args(args):
    # --hf-subset and --hf-split: only used
    # when dataset_name is 'hf'
    if args.dataset_name != "hf" and (
            getattr(args, "hf_subset", None) is not None
            or getattr(args, "hf_split", None) is not None):
        warnings.warn("--hf-subset and --hf-split will be ignored \
        getattr(args, "hf_subset", None) is not None
        or getattr(args, "hf_split", None) is not None
    ):
        warnings.warn(
            "--hf-subset and --hf-split will be ignored \
                since --dataset-name is not 'hf'.",
                      stacklevel=2)
    elif args.dataset_name == "hf" and args.backend != "vllm-chat":
        raise ValueError(
            "When --dataset-name is 'hf', backend must be 'vllm-chat'")
            stacklevel=2,
        )
    elif args.dataset_name == "hf":
        if args.dataset_path in (
            VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
            | ConversationDataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm-chat", (
                f"{args.dataset_path} needs to use vllm-chat as the backend."
            )  # noqa: E501
        elif args.dataset_path in (
            InstructCoderDataset.SUPPORTED_DATASET_PATHS
            | AIMODataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm", (
                f"{args.dataset_path} needs to use vllm as the backend."
            )  # noqa: E501
        else:
            raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")

    # --random-range-ratio: only used when dataset_name is 'random'
    if args.dataset_name != 'random' and args.random_range_ratio is not None:
        warnings.warn("--random-range-ratio will be ignored since \
    if args.dataset_name != "random" and args.random_range_ratio is not None:
        warnings.warn(
            "--random-range-ratio will be ignored since \
                --dataset-name is not 'random'.",
                      stacklevel=2)
            stacklevel=2,
        )

    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
    # set.
    if args.dataset_name not in {"random", "sonnet", None
                                 } and args.prefix_len is not None:
        warnings.warn("--prefix-len will be ignored since --dataset-name\
    if (
        args.dataset_name not in {"random", "sonnet", None}
        and args.prefix_len is not None
    ):
        warnings.warn(
            "--prefix-len will be ignored since --dataset-name\
                 is not 'random', 'sonnet', or not set.",
                      stacklevel=2)
            stacklevel=2,
        )

    # === LoRA Settings ===
    if getattr(args, "enable_lora", False) and args.backend != "vllm":
        raise ValueError(
            "LoRA benchmarking is only supported for vLLM backend")
        raise ValueError("LoRA benchmarking is only supported for vLLM backend")
    if getattr(args, "enable_lora", False) and args.lora_path is None:
        raise ValueError("LoRA path must be provided when enable_lora is True")

@@ -493,8 +572,10 @@ def validate_args(args):
    if args.backend != "hf" and args.hf_max_batch_size is not None:
        raise ValueError("HF max batch size is only for HF backend.")

    if args.backend in {"hf", "mii"} and getattr(args, "quantization",
                                                 None) is not None:
    if (
        args.backend in {"hf", "mii"}
        and getattr(args, "quantization", None) is not None
    ):
        raise ValueError("Quantization is only for vLLM backend.")

    if args.backend == "mii" and args.dtype != "auto":
@@ -502,22 +583,32 @@ def validate_args(args):
    if args.backend == "mii" and args.n != 1:
        raise ValueError("n must be 1 for MII backend.")
    if args.backend == "mii" and args.tokenizer != args.model:
        raise ValueError("Tokenizer must be the same as the model for MII backend.")

    # --data-parallel is not supported currently.
    # https://github.com/vllm-project/vllm/issues/16222
    if args.data_parallel_size > 1:
        raise ValueError(
            "Tokenizer must be the same as the model for MII backend.")
            "Data parallel is not supported in offline benchmark, \
            please use benchmark serving instead"
        )


if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument("--backend",
                        type=str,
                        choices=["vllm", "hf", "mii", "vllm-chat"],
                        default="vllm")
    parser.add_argument(
        "--backend",
        type=str,
        choices=["vllm", "hf", "mii", "vllm-chat"],
        default="vllm",
    )
    parser.add_argument(
        "--dataset-name",
        type=str,
        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
        help="Name of the dataset to benchmark on.",
        default="sharegpt")
        default="sharegpt",
    )
    parser.add_argument(
        "--dataset",
        type=str,
@@ -525,80 +616,104 @@ if __name__ == "__main__":
        help="Path to the ShareGPT dataset, will be deprecated in\
            the next release. The dataset is expected to "
        "be a json in form of list[dict[..., conversations: "
        "list[dict[..., value: <prompt_or_response>]]]]")
    parser.add_argument("--dataset-path",
                        type=str,
                        default=None,
                        help="Path to the dataset")
    parser.add_argument("--input-len",
                        type=int,
                        default=None,
                        help="Input prompt length for each request")
    parser.add_argument("--output-len",
                        type=int,
                        default=None,
                        help="Output length for each request. Overrides the "
                        "output length from the dataset.")
    parser.add_argument("--n",
                        type=int,
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument("--num-prompts",
                        type=int,
                        default=1000,
                        help="Number of prompts to process.")
    parser.add_argument("--hf-max-batch-size",
                        type=int,
                        default=None,
                        help="Maximum batch size for HF backend.")
        "list[dict[..., value: <prompt_or_response>]]]]",
    )
    parser.add_argument(
        '--output-json',
        "--dataset-path", type=str, default=None, help="Path to the dataset"
    )
    parser.add_argument(
        "--input-len",
        type=int,
        default=None,
        help="Input prompt length for each request",
    )
    parser.add_argument(
        "--output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the "
        "output length from the dataset.",
    )
    parser.add_argument(
        "--n", type=int, default=1, help="Number of generated sequences per prompt."
    )
    parser.add_argument(
        "--num-prompts", type=int, default=1000, help="Number of prompts to process."
    )
    parser.add_argument(
        "--hf-max-batch-size",
        type=int,
        default=None,
        help="Maximum batch size for HF backend.",
    )
    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help='Path to save the throughput results in JSON format.')
    parser.add_argument("--async-engine",
                        action='store_true',
                        default=False,
                        help="Use vLLM async engine rather than LLM class.")
    parser.add_argument("--disable-frontend-multiprocessing",
                        action='store_true',
                        default=False,
                        help="Disable decoupled async engine frontend.")
        help="Path to save the throughput results in JSON format.",
    )
    parser.add_argument(
        "--async-engine",
        action="store_true",
        default=False,
        help="Use vLLM async engine rather than LLM class.",
    )
    parser.add_argument(
        "--disable-frontend-multiprocessing",
        action="store_true",
        default=False,
        help="Disable decoupled async engine frontend.",
    )
    parser.add_argument(
        "--disable-detokenize",
        action="store_true",
        help=("Do not detokenize the response (i.e. do not include "
              "detokenization time in the measurement)"))
        help=(
            "Do not detokenize the response (i.e. do not include "
            "detokenization time in the measurement)"
        ),
    )
    # LoRA
    parser.add_argument(
        "--lora-path",
        type=str,
        default=None,
        help="Path to the lora adapters to use. This can be an absolute path, "
        "a relative path, or a Hugging Face model identifier.")
    parser.add_argument("--prefix-len",
                        type=int,
                        default=None,
                        help="Number of prefix tokens per request."
                        "This is for the RandomDataset and SonnetDataset")
        help="Path to the LoRA adapters to use. This can be an absolute path, "
        "a relative path, or a Hugging Face model identifier.",
    )
    parser.add_argument(
        "--prefix-len",
        type=int,
        default=None,
        help=f"Number of prefix tokens to be used in RandomDataset "
        "and SonnetDataset. For RandomDataset, the total input "
        "length is the sum of prefix-len (default: "
        f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
        "sampled from [input_len * (1 - range_ratio), "
        "input_len * (1 + range_ratio)]. For SonnetDataset, "
        f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
        "controls how much of the input is fixed lines versus "
        "random lines, but the total input length remains approximately "
        "input_len tokens.",
    )
    # random dataset
    parser.add_argument(
        "--random-range-ratio",
        type=float,
        default=None,
        help="Range of sampled ratio of input/output length, "
        "used only for RandomDataSet.",
        help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
        "for sampling input/output length, "
        "used only for RandomDataset. Must be in the range [0, 1) to "
        "define a symmetric sampling range "
        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
    )

    # hf dtaset
    parser.add_argument("--hf-subset",
                        type=str,
                        default=None,
                        help="Subset of the HF dataset.")
    parser.add_argument("--hf-split",
                        type=str,
                        default=None,
                        help="Split of the HF dataset.")
    parser.add_argument(
        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
    )
    parser.add_argument(
        "--hf-split", type=str, default=None, help="Split of the HF dataset."
    )

    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()

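A small numeric illustration of the --prefix-len and --random-range-ratio help text above; the values are hypothetical and are not the script's defaults:

    # Hypothetical values: --prefix-len 50, --input-len 200, --random-range-ratio 0.25
    prefix_len, input_len, range_ratio = 50, 200, 0.25
    low = input_len * (1 - range_ratio)   # 150.0
    high = input_len * (1 + range_ratio)  # 250.0
    # For RandomDataset, the total prompt length then falls in
    # [prefix_len + low, prefix_len + high].
    print(prefix_len + low, prefix_len + high)  # 200.0 300.0
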
@@ -7,9 +7,9 @@ import os
from typing import Any


def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
                                        metrics: dict[str, list],
                                        extra_info: dict[str, Any]) -> list:
def convert_to_pytorch_benchmark_format(
    args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
) -> list:
    """
    Save the benchmark results in the format used by PyTorch OSS benchmark with
    one metric per record
@@ -37,12 +37,12 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
            },
        }

        tp = record["benchmark"]["extra_info"]["args"].get(
            "tensor_parallel_size")
        tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
        # Save tensor_parallel_size parameter if it's part of the metadata
        if not tp and "tensor_parallel_size" in extra_info:
            record["benchmark"]["extra_info"]["args"][
                "tensor_parallel_size"] = extra_info["tensor_parallel_size"]
            record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
                extra_info["tensor_parallel_size"]
            )

        records.append(record)

@@ -50,7 +50,6 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,


class InfEncoder(json.JSONEncoder):

    def clear_inf(self, o: Any):
        if isinstance(o, dict):
            return {k: self.clear_inf(v) for k, v in o.items()}

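The InfEncoder hunk above shows only the dict branch of clear_inf; below is a hedged sketch of the full recursive-sanitizer pattern, where the list and float branches are assumptions added purely for illustration.

# Sketch of a recursive inf-sanitizer in the style of InfEncoder.clear_inf above.
# Only the dict branch appears in the hunk; the list and float branches are assumptions.
import json
import math
from typing import Any

def clear_inf(o: Any) -> Any:
    if isinstance(o, dict):
        return {k: clear_inf(v) for k, v in o.items()}
    if isinstance(o, list):
        return [clear_inf(v) for v in o]
    if isinstance(o, float) and math.isinf(o):
        return "inf"  # replace with a JSON-safe sentinel
    return o

print(json.dumps(clear_inf({"ttft": float("inf"), "latencies": [1.0, float("inf")]})))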
@@ -23,8 +23,9 @@ DEFAULT_TP_SIZES = [1]


# bench
def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
             **kwargs) -> TMeasurement:
def bench_fn(
    label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs
) -> TMeasurement:
    min_run_time = 1

    globals = {
@@ -41,16 +42,18 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
    ).blocked_autorange(min_run_time=min_run_time)


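bench_fn above is a thin wrapper over torch.utils.benchmark; the hunk elides its stmt/globals wiring, so here is an illustrative sketch of how such a wrapper is typically assembled (the globals contents are an assumption, not the file's exact code).

# Minimal sketch of a bench_fn-style wrapper around torch.utils.benchmark.
# The exact stmt/globals wiring in the file is elided by the hunk above, so this is
# an illustration of the pattern, not the file's exact code.
from typing import Callable
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement

def bench_fn_sketch(
    label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs
) -> TMeasurement:
    min_run_time = 1
    globals_ = {"args": args, "kwargs": kwargs, "fn": fn}
    return TBenchmark.Timer(
        stmt="fn(*args, **kwargs)",
        globals=globals_,
        label=label,
        sub_label=sub_label,
        description=description,
    ).blocked_autorange(min_run_time=min_run_time)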
def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
               sub_label: str) -> Iterable[TMeasurement]:
def bench_int8(
    dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str
) -> Iterable[TMeasurement]:
    assert dtype == torch.int8
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)

    out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
                                       torch.bfloat16)
    out = ops.cutlass_scaled_sparse_mm(
        a, b_compressed, e, scale_a, scale_b, torch.bfloat16
    )
    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)

    if not torch.allclose(out, out_ref):
@@ -63,54 +66,107 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
    timers = []
    # pytorch impl - bfloat16
    timers.append(
        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
                 torch.mm, a.to(dtype=torch.bfloat16),
                 b.to(dtype=torch.bfloat16)))
        bench_fn(
            label,
            sub_label,
            "pytorch_bf16_bf16_bf16_matmul-no-scales",
            torch.mm,
            a.to(dtype=torch.bfloat16),
            b.to(dtype=torch.bfloat16),
        )
    )

    # pytorch impl - float16
    timers.append(
        bench_fn(label, sub_label,
                 "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
                 a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
        bench_fn(
            label,
            sub_label,
            "pytorch_fp16_fp16_fp16_matmul-no-scales",
            torch.mm,
            a.to(dtype=torch.float16),
            b.to(dtype=torch.float16),
        )
    )

    # cutlass impl
    timers.append(
        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
                 torch.bfloat16))
        bench_fn(
            label,
            sub_label,
            "cutlass_i8_i8_bf16_scaled_mm",
            ops.cutlass_scaled_mm,
            a,
            b,
            scale_a,
            scale_b,
            torch.bfloat16,
        )
    )

    # cutlass with bias
    timers.append(
        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
                 bias))
        bench_fn(
            label,
            sub_label,
            "cutlass_i8_i8_bf16_scaled_mm_bias",
            ops.cutlass_scaled_mm,
            a,
            b,
            scale_a,
            scale_b,
            torch.bfloat16,
            bias,
        )
    )

    # cutlass sparse impl
    timers.append(
        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
                 scale_b, torch.bfloat16))
        bench_fn(
            label,
            sub_label,
            "cutlass_i8_i8_bf16_scaled_sparse_mm",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.bfloat16,
        )
    )

    # cutlass sparse with bias
    timers.append(
        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
                 scale_b, torch.bfloat16, bias))
        bench_fn(
            label,
            sub_label,
            "cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.bfloat16,
            bias,
        )
    )

    return timers


def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
              sub_label: str) -> Iterable[TMeasurement]:
def bench_fp8(
    dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str
) -> Iterable[TMeasurement]:
    assert dtype == torch.float8_e4m3fn
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n,
                                                     k)
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)

    out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
                                       torch.bfloat16)
    out = ops.cutlass_scaled_sparse_mm(
        a, b_compressed, e, scale_a, scale_b, torch.bfloat16
    )
    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)

    if not torch.allclose(out, out_ref):
@@ -124,97 +180,165 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,

    # pytorch impl w. bf16
    timers.append(
        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
                 torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
                 b.to(dtype=torch.bfloat16, device="cuda")))
        bench_fn(
            label,
            sub_label,
            "pytorch_bf16_bf16_bf16_matmul-no-scales",
            torch.mm,
            a.to(dtype=torch.bfloat16, device="cuda"),
            b.to(dtype=torch.bfloat16, device="cuda"),
        )
    )

    # pytorch impl: bf16 output, without fp8 fast accum
    timers.append(
        bench_fn(label,
                 sub_label,
                 "pytorch_fp8_fp8_bf16_scaled_mm",
                 torch._scaled_mm,
                 a,
                 b,
                 scale_a=scale_a,
                 scale_b=scale_b,
                 out_dtype=torch.bfloat16))
        bench_fn(
            label,
            sub_label,
            "pytorch_fp8_fp8_bf16_scaled_mm",
            torch._scaled_mm,
            a,
            b,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
        )
    )

    # pytorch impl: bf16 output, with fp8 fast accum
    timers.append(
        bench_fn(label,
                 sub_label,
                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
                 torch._scaled_mm,
                 a,
                 b,
                 scale_a=scale_a,
                 scale_b=scale_b,
                 out_dtype=torch.bfloat16,
                 use_fast_accum=True))
        bench_fn(
            label,
            sub_label,
            "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
            torch._scaled_mm,
            a,
            b,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
            use_fast_accum=True,
        )
    )

    # pytorch impl: fp16 output, without fp8 fast accum
    timers.append(
        bench_fn(label,
                 sub_label,
                 "pytorch_fp8_fp8_fp16_scaled_mm",
                 torch._scaled_mm,
                 a,
                 b,
                 scale_a=scale_a,
                 scale_b=scale_b,
                 out_dtype=torch.float16))
        bench_fn(
            label,
            sub_label,
            "pytorch_fp8_fp8_fp16_scaled_mm",
            torch._scaled_mm,
            a,
            b,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.float16,
        )
    )

    # pytorch impl: fp16 output, with fp8 fast accum
    timers.append(
        bench_fn(label,
                 sub_label,
                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
                 torch._scaled_mm,
                 a,
                 b,
                 scale_a=scale_a,
                 scale_b=scale_b,
                 out_dtype=torch.float16,
                 use_fast_accum=True))
        bench_fn(
            label,
            sub_label,
            "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
            torch._scaled_mm,
            a,
            b,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.float16,
            use_fast_accum=True,
        )
    )

    # cutlass impl: bf16 output
    timers.append(
        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
                 torch.bfloat16))
        bench_fn(
            label,
            sub_label,
            "cutlass_fp8_fp8_bf16_scaled_mm",
            ops.cutlass_scaled_mm,
            a,
            b,
            scale_a,
            scale_b,
            torch.bfloat16,
        )
    )

    # cutlass impl: bf16 output
    timers.append(
        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
                 scale_b, torch.bfloat16))
        bench_fn(
            label,
            sub_label,
            "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.bfloat16,
        )
    )

    # cutlass impl: fp16 output
    timers.append(
        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
                 scale_b, torch.float16))
        bench_fn(
            label,
            sub_label,
            "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.float16,
        )
    )

    # cutlass impl: bf16 output, with bias
    timers.append(
        bench_fn(label, sub_label,
                 "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
                 scale_b, torch.bfloat16, bias))
        bench_fn(
            label,
            sub_label,
            "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.bfloat16,
            bias,
        )
    )

    # cutlass impl: fp16 output, with bias
    timers.append(
        bench_fn(label, sub_label,
                 "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
                 scale_b, torch.float16, bias.to(dtype=torch.float16)))
        bench_fn(
            label,
            sub_label,
            "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.float16,
            bias.to(dtype=torch.float16),
        )
    )

    return timers


def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
          sub_label: str) -> Iterable[TMeasurement]:
def bench(
    dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str
) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label)
    if dtype == torch.float8_e4m3fn:
@@ -228,12 +352,12 @@ def print_timers(timers: Iterable[TMeasurement]):
    compare.print()


def run(dtype: torch.dtype,
        MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]:
def run(
    dtype: torch.dtype, MKNs: Iterable[tuple[int, int, int]]
) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
                       f"MKN=({m}x{k}x{n})")
        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", f"MKN=({m}x{k}x{n})")
        print_timers(timers)
        results.extend(timers)

@@ -241,10 +365,12 @@ def run(dtype: torch.dtype,


# output makers
def make_output(data: Iterable[TMeasurement],
                MKNs: Iterable[tuple[int, int, int]],
                base_description: str,
                timestamp=None):
def make_output(
    data: Iterable[TMeasurement],
    MKNs: Iterable[tuple[int, int, int]],
    base_description: str,
    timestamp=None,
):
    print(f"== All Results {base_description} ====")
    print_timers(data)

@@ -258,8 +384,7 @@ def make_output(data: Iterable[TMeasurement],


def run_square_bench(args):
    dim_sizes = list(
        range(args.dim_start, args.dim_end + 1, args.dim_increment))
    dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment))
    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
    data = run(args.dtype, MKNs)

@@ -319,7 +444,7 @@ def run_model_bench(args):
        pkl.dump(all_data, f)


if __name__ == '__main__':
if __name__ == "__main__":

    def to_torch_dtype(dt):
        if dt == "int8":
@@ -344,12 +469,15 @@ Benchmark Cutlass GEMM.
    Output:
        - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
            """,  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter)
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument("--dtype",
                        type=to_torch_dtype,
                        required=True,
                        help="Available options are ['int8', 'fp8']")
    parser.add_argument(
        "--dtype",
        type=to_torch_dtype,
        required=True,
        help="Available options are ['int8', 'fp8']",
    )
    subparsers = parser.add_subparsers(dest="cmd")

    square_parser = subparsers.add_parser("square_bench")
@@ -368,19 +496,19 @@ Benchmark Cutlass GEMM.
    range_parser.set_defaults(func=run_range_bench)

    model_parser = subparsers.add_parser("model_bench")
    model_parser.add_argument("--models",
                              nargs="+",
                              type=str,
                              default=DEFAULT_MODELS,
                              choices=WEIGHT_SHAPES.keys())
    model_parser.add_argument("--tp-sizes",
                              nargs="+",
                              type=int,
                              default=DEFAULT_TP_SIZES)
    model_parser.add_argument("--batch-sizes",
                              nargs="+",
                              type=int,
                              default=DEFAULT_BATCH_SIZES)
    model_parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES.keys(),
    )
    model_parser.add_argument(
        "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES
    )
    model_parser.add_argument(
        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
    )
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()

@@ -10,8 +10,9 @@ import vllm._custom_ops as ops

def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
    finfo = torch.finfo(torch.float8_e4m3fn)
    return torch.round(tensor.clamp(
        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
    return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to(
        dtype=torch.float8_e4m3fn
    )


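For context on the to_fp8 helper above: the clamp saturates values to the representable float8_e4m3fn range reported by torch.finfo before casting. A small, self-contained illustration:

# The clamp in to_fp8 saturates values to the representable float8_e4m3fn range
# before casting. A quick illustration of where those bounds come from:
import torch

finfo = torch.finfo(torch.float8_e4m3fn)
print(finfo.min, finfo.max)  # the saturation bounds used by to_fp8

x = torch.tensor([1e6, -1e6, 0.5])
x_fp8 = torch.round(x.clamp(min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
print(x_fp8.to(torch.float32))  # out-of-range values collapse to the fp8 extremes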
def to_int8(tensor: torch.Tensor) -> torch.Tensor:
@@ -26,10 +27,11 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
    return tensor.to(dtype=torch.float16)


def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
                      k: int) -> tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device='cuda') * 5
    b = torch.randn((n, k), device='cuda').t() * 5
def make_rand_tensors(
    dtype: torch.dtype, m: int, n: int, k: int
) -> tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device="cuda") * 5
    b = torch.randn((n, k), device="cuda").t() * 5

    if dtype == torch.int8:
        return to_int8(a), to_int8(b)
@@ -49,9 +51,7 @@ def prune_to_2_4(tensor):
    # Create binary mask
    mask = torch.zeros_like(reshaped)
    mask.scatter_(dim=1,
                  index=indices,
                  src=torch.ones_like(indices, dtype=mask.dtype))
    mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype))

    # Apply mask and reshape back
    pruned = reshaped * mask
@@ -62,10 +62,11 @@ def prune_to_2_4(tensor):
    return pruned.reshape(original_shape)


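prune_to_2_4 above keeps the two largest-magnitude entries in every group of four; the hunks elide the reshape/topk setup, so the sketch below reconstructs the whole routine under that assumption.

# Hedged sketch of 2:4 structured-sparsity pruning in the style of prune_to_2_4.
# The hunks above show only the mask/scatter portion; the reshape and topk steps
# here are reconstructed assumptions about the elided lines.
import torch

def prune_to_2_4_sketch(tensor: torch.Tensor) -> torch.Tensor:
    original_shape = tensor.shape
    reshaped = tensor.reshape(-1, 4)                      # groups of 4 along the last dim
    _, indices = torch.topk(reshaped.abs(), k=2, dim=1)   # keep the 2 largest magnitudes
    mask = torch.zeros_like(reshaped)
    mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype))
    pruned = reshaped * mask
    return pruned.reshape(original_shape)

print(prune_to_2_4_sketch(torch.randn(2, 8)))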
def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
                             k: int) -> tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device='cuda') * 5
    b = torch.randn((n, k), device='cuda').t() * 5
def make_rand_sparse_tensors(
    dtype: torch.dtype, m: int, n: int, k: int
) -> tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device="cuda") * 5
    b = torch.randn((n, k), device="cuda").t() * 5

    b = prune_to_2_4(b.t()).t()

@@ -86,9 +87,9 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
    return b_compressed, e, a, b


def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype,
                        m: int, n: int, k: int) -> \
                        tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
def make_n_rand_sparse_tensors(
    num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int
) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
    ABs = []
    for _ in range(num_tensors):
        b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)

@@ -16,7 +16,8 @@ from weight_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    w8a8_block_fp8_matmul)
    w8a8_block_fp8_matmul,
)
from vllm.utils import FlexibleArgumentParser

DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@@ -25,8 +26,9 @@ DEFAULT_TP_SIZES = [1]


# bench
def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
             **kwargs) -> TMeasurement:
def bench_fn(
    label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs
) -> TMeasurement:
    min_run_time = 1

    globals = {
@@ -44,45 +46,48 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,


def bench_int8(
        dtype: torch.dtype,
        m: int,
        k: int,
        n: int,
        label: str,
        sub_label: str,
        bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
    dtype: torch.dtype,
    m: int,
    k: int,
    n: int,
    label: str,
    sub_label: str,
    bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]:
    """Benchmark INT8-based kernels."""
    assert dtype == torch.int8
    a, b = make_rand_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
    azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
    azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)
    azp = torch.zeros((m,), device="cuda", dtype=torch.int32)
    azp_adj = torch.zeros((n,), device="cuda", dtype=torch.int32)

    bench_fns = {
        "pytorch_bf16_bf16_bf16_matmul-no-scales":
        lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
                         ),
        "pytorch_fp16_fp16_fp16_matmul-no-scales":
        lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
        "cutlass_i8_i8_bf16_scaled_mm":
        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16),
        "cutlass_i8_i8_bf16_scaled_mm_bias":
        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16,
                                      bias),
        "cutlass_i8_i8_bf16_scaled_mm_azp":
        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
                                          bfloat16, azp_adj),
        "cutlass_i8_i8_bf16_scaled_mm_azp_bias":
        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
                                          bfloat16, azp_adj, None, bias),
        "cutlass_i8_i8_bf16_scaled_mm_azp_pt":
        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
                                          bfloat16, azp_adj, azp),
        "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias":
        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
                                          bfloat16, azp_adj, azp, bias),
        "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
        ),
        "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.float16), b.to(dtype=torch.float16)
        ),
        "cutlass_i8_i8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16
        ),
        "cutlass_i8_i8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16, bias
        ),
        "cutlass_i8_i8_bf16_scaled_mm_azp": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj
        ),
        "cutlass_i8_i8_bf16_scaled_mm_azp_bias": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj, None, bias
        ),
        "cutlass_i8_i8_bf16_scaled_mm_azp_pt": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp
        ),
        "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp, bias
        ),
    }

    timers = []
@@ -96,73 +101,73 @@ def bench_int8(


def bench_fp8(
        dtype: torch.dtype,
        m: int,
        k: int,
        n: int,
        label: str,
        sub_label: str,
        bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
    dtype: torch.dtype,
    m: int,
    k: int,
    n: int,
    label: str,
    sub_label: str,
    bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]:
    """Benchmark FP8-based kernels."""
    assert dtype == torch.float8_e4m3fn
    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
    a_cont = a.contiguous()
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    block_scale_a = torch.rand((m, k // 128),
                               device="cuda",
                               dtype=torch.float32)
    block_scale_b = torch.rand((k // 128, n // 128),
                               device="cuda",
                               dtype=torch.float32)

    def ceil_div(x: int, y: int) -> int:
        return (x + y - 1) // y

    block_scale_a = torch.rand(
        (m, ceil_div(k, 128)), device="cuda", dtype=torch.float32
    )
    block_scale_b = torch.rand(
        ceil_div(k, 128), ceil_div(n, 128), device="cuda", dtype=torch.float32
    )
    block_scale_a_M_major = block_scale_a.t().contiguous().t()
    block_scale_b_K_major = block_scale_b.t().contiguous().t()
    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)

    print(m, k, n)

    bench_fns = {
        "pytorch_bf16_bf16_bf16_matmul-no-scales":
        lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
                         ),
        "pytorch_fp16_fp16_fp16_matmul-no-scales":
        lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
        "pytorch_fp8_fp8_fp16_scaled_mm":
        lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.float16),
        "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum":
        lambda: torch._scaled_mm(a,
                                 b,
                                 scale_a,
                                 scale_b,
                                 out_dtype=torch.float16,
                                 use_fast_accum=True),
        "pytorch_fp8_fp8_bf16_scaled_mm":
        lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.bfloat16),
        "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum":
        lambda: torch._scaled_mm(a,
                                 b,
                                 scale_a,
                                 scale_b,
                                 out_dtype=torch.bfloat16,
                                 use_fast_accum=True),
        "cutlass_fp8_fp8_bf16_scaled_mm":
        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16),
        "cutlass_fp8_fp8_fp16_scaled_mm":
        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16),
        "cutlass_fp8_fp8_bf16_scaled_mm_bias":
        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16,
                                      bias),
        "cutlass_fp8_fp8_fp16_scaled_mm_bias":
        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16,
                                      bias.to(dtype=torch.float16)),
        "triton_fp8_fp8_fp16_scaled_mm_blockwise":
        lambda: w8a8_block_fp8_matmul(a_cont, b.t(), block_scale_a,
                                      block_scale_b.t(), (128, 128)),
        "cutlass_fp8_fp8_fp16_scaled_mm_blockwise":
        lambda: ops.cutlass_scaled_mm(a, b, block_scale_a_M_major,
                                      block_scale_b_K_major, torch.float16),
        "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
        ),
        "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.float16), b.to(dtype=torch.float16)
        ),
        "pytorch_fp8_fp8_fp16_scaled_mm": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.float16
        ),
        "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.float16, use_fast_accum=True
        ),
        "pytorch_fp8_fp8_bf16_scaled_mm": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.bfloat16
        ),
        "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.bfloat16, use_fast_accum=True
        ),
        "cutlass_fp8_fp8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16
        ),
        "cutlass_fp8_fp8_fp16_scaled_mm": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.float16
        ),
        "cutlass_fp8_fp8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16, bias
        ),
        "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
        ),
        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
            a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
        ),
        "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
            a, b, block_scale_a_M_major, block_scale_b_K_major, torch.float16
        ),
    }

    timers = []
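The blockwise FP8 entries above rely on one scale per 128-wide block, with ceil_div fixing the scale-tensor shapes; a small shape-only sketch follows (sizes are arbitrary examples, not values from the benchmark).

# Shapes of the per-block scales used by the blockwise FP8 kernels above:
# one scale per row x 128-wide K block for A, and one per 128x128 block for B.
import torch

def ceil_div(x: int, y: int) -> int:
    return (x + y - 1) // y

m, k, n = 16, 4096, 2048
block_scale_a = torch.rand((m, ceil_div(k, 128)), dtype=torch.float32)
block_scale_b = torch.rand((ceil_div(k, 128), ceil_div(n, 128)), dtype=torch.float32)
print(block_scale_a.shape, block_scale_b.shape)  # (16, 32) and (32, 16)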
@@ -175,13 +180,15 @@ def bench_fp8(
    return timers


def bench(dtype: torch.dtype,
          m: int,
          k: int,
          n: int,
          label: str,
          sub_label: str,
          bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
def bench(
    dtype: torch.dtype,
    m: int,
    k: int,
    n: int,
    label: str,
    sub_label: str,
    bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
    if dtype == torch.float8_e4m3fn:
@@ -195,27 +202,33 @@ def print_timers(timers: Iterable[TMeasurement]):
    compare.print()


def run(dtype: torch.dtype,
        MKNs: Iterable[tuple[int, int, int]],
        bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
def run(
    dtype: torch.dtype,
    MKNs: Iterable[tuple[int, int, int]],
    bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
        timers = bench(dtype,
                       m,
                       k,
                       n,
                       f"scaled-{dtype}-gemm",
                       f"MKN=({m}x{k}x{n})",
                       bench_kernels=bench_kernels)
        timers = bench(
            dtype,
            m,
            k,
            n,
            f"scaled-{dtype}-gemm",
            f"MKN=({m}x{k}x{n})",
            bench_kernels=bench_kernels,
        )
        print_timers(timers)
        results.extend(timers)
    return results


def make_output(data: Iterable[TMeasurement],
                MKNs: Iterable[tuple[int, int, int]],
                base_description: str,
                timestamp=None):
def make_output(
    data: Iterable[TMeasurement],
    MKNs: Iterable[tuple[int, int, int]],
    base_description: str,
    timestamp=None,
):
    print(f"== All Results {base_description} ====")
    print_timers(data)

@@ -226,8 +239,7 @@ def make_output(data: Iterable[TMeasurement],


def run_square_bench(args):
    dim_sizes = list(
        range(args.dim_start, args.dim_end + 1, args.dim_increment))
    dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment))
    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
    data = run(args.dtype, MKNs, bench_kernels=args.kernels)
    make_output(data, MKNs, f"square_bench-{args.dtype}")
@@ -285,7 +297,7 @@ def run_model_bench(args):
        pkl.dump(all_data, f)


if __name__ == '__main__':
if __name__ == "__main__":

    def to_torch_dtype(dt):
        if dt == "int8":
@@ -310,19 +322,21 @@ Benchmark Cutlass GEMM.
    Output:
        - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
            """,  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter)
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument("--dtype",
                        type=to_torch_dtype,
                        required=True,
                        help="Available options are ['int8', 'fp8']")
    parser.add_argument(
        "--dtype",
        type=to_torch_dtype,
        required=True,
        help="Available options are ['int8', 'fp8']",
    )
    parser.add_argument(
        "--kernels",
        nargs="+",
        type=str,
        default=None,
        help=
        "Exact names of the kernels to benchmark. If not set, runs all kernels."
        help="Exact names of the kernels to benchmark. If not set, runs all kernels.",
    )

    subparsers = parser.add_subparsers(dest="cmd")
@@ -343,19 +357,19 @@ Benchmark Cutlass GEMM.
    range_parser.set_defaults(func=run_range_bench)

    model_parser = subparsers.add_parser("model_bench")
    model_parser.add_argument("--models",
                              nargs="+",
                              type=str,
                              default=DEFAULT_MODELS,
                              choices=WEIGHT_SHAPES.keys())
    model_parser.add_argument("--tp-sizes",
                              nargs="+",
                              type=int,
                              default=DEFAULT_TP_SIZES)
    model_parser.add_argument("--batch-sizes",
                              nargs="+",
                              type=int,
                              default=DEFAULT_BATCH_SIZES)
    model_parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES.keys(),
    )
    model_parser.add_argument(
        "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES
    )
    model_parser.add_argument(
        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
    )
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()

@@ -12,39 +12,37 @@ app = Quart(__name__)

async def forward_request(url, data):
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }
        async with session.post(url=url, json=data,
                                headers=headers) as response:
        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
        async with session.post(url=url, json=data, headers=headers) as response:
            if response.status == 200:
                # if response.headers.get('Transfer-Encoding') == 'chunked':
                if True:
                    async for chunk_bytes in response.content.iter_chunked(
                            1024):
                    async for chunk_bytes in response.content.iter_chunked(1024):
                        yield chunk_bytes
                else:
                    content = await response.read()
                    yield content


@app.route('/v1/completions', methods=['POST'])
@app.route("/v1/completions", methods=["POST"])
async def handle_request():
    try:
        original_request_data = await request.get_json()

        prefill_request = original_request_data.copy()
        # change max_tokens = 1 to let it only do prefill
        prefill_request['max_tokens'] = 1
        prefill_request["max_tokens"] = 1

        # finish prefill
        async for _ in forward_request('http://localhost:8100/v1/completions',
                                       prefill_request):
        async for _ in forward_request(
            "http://localhost:8100/v1/completions", prefill_request
        ):
            continue

        # return decode
        generator = forward_request('http://localhost:8200/v1/completions',
                                    original_request_data)
        generator = forward_request(
            "http://localhost:8200/v1/completions", original_request_data
        )
        response = await make_response(generator)
        response.timeout = None

@@ -53,11 +51,12 @@ async def handle_request():
    except Exception as e:
        import sys
        import traceback

        exc_info = sys.exc_info()
        print("Error occurred in disagg prefill proxy server")
        print(e)
        print("".join(traceback.format_exception(*exc_info)))


if __name__ == '__main__':
if __name__ == "__main__":
    app.run(port=8000)

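The disaggregated-prefill proxy above exposes a single /v1/completions endpoint on port 8000 and internally runs prefill (port 8100) before streaming the decode (port 8200). A minimal client sketch against that endpoint; the model name and prompt are placeholders, not values taken from the repository.

# Minimal client sketch for the disaggregated-prefill proxy above.
# It posts one completion request to the proxy on port 8000; the model name
# and prompt are placeholders.
import asyncio
import aiohttp

async def main() -> None:
    payload = {
        "model": "placeholder-model",
        "prompt": "Hello, my name is",
        "max_tokens": 32,
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "http://localhost:8000/v1/completions", json=payload
        ) as resp:
            async for chunk in resp.content.iter_chunked(1024):
                print(chunk.decode(errors="replace"), end="")

if __name__ == "__main__":
    asyncio.run(main())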
@@ -8,7 +8,6 @@ from aiohttp import web


class RoundRobinProxy:

    def __init__(self, target_ports):
        self.target_ports = target_ports
        self.port_cycle = itertools.cycle(self.target_ports)
@@ -21,14 +20,15 @@ class RoundRobinProxy:
            try:
                # Forward the request
                async with session.request(
                        method=request.method,
                        url=target_url,
                        headers=request.headers,
                        data=request.content,
                    method=request.method,
                    url=target_url,
                    headers=request.headers,
                    data=request.content,
                ) as response:
                    # Start sending the response
                    resp = web.StreamResponse(status=response.status,
                                              headers=response.headers)
                    resp = web.StreamResponse(
                        status=response.status, headers=response.headers
                    )
                    await resp.prepare(request)

                    # Stream the response content
@@ -45,11 +45,11 @@ class RoundRobinProxy:
async def main():
    proxy = RoundRobinProxy([8100, 8200])
    app = web.Application()
    app.router.add_route('*', '/{path:.*}', proxy.handle_request)
    app.router.add_route("*", "/{path:.*}", proxy.handle_request)

    runner = web.AppRunner(app)
    await runner.setup()
    site = web.TCPSite(runner, 'localhost', 8000)
    site = web.TCPSite(runner, "localhost", 8000)
    await site.start()

    print("Proxy server started on http://localhost:8000")
@@ -58,5 +58,5 @@ async def main():
    await asyncio.Event().wait()


if __name__ == '__main__':
if __name__ == "__main__":
    asyncio.run(main())

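RoundRobinProxy above alternates between its backend ports via itertools.cycle; the selection logic in isolation looks like this.

# The load-balancing policy used by RoundRobinProxy above, in isolation:
# itertools.cycle yields the target ports in strict alternation.
import itertools

port_cycle = itertools.cycle([8100, 8200])
for _ in range(4):
    target_port = next(port_cycle)
    print(f"forwarding to http://localhost:{target_port}")  # 8100, 8200, 8100, 8200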
@@ -6,43 +6,41 @@ import matplotlib.pyplot as plt
import pandas as pd

if __name__ == "__main__":

    data = []
    for name in ['disagg_prefill', 'chunked_prefill']:
    for name in ["disagg_prefill", "chunked_prefill"]:
        for qps in [2, 4, 6, 8]:
            with open(f"results/{name}-qps-{qps}.json") as f:
                x = json.load(f)
                x['name'] = name
                x['qps'] = qps
                x["name"] = name
                x["qps"] = qps
                data.append(x)

    df = pd.DataFrame.from_dict(data)
    dis_df = df[df['name'] == 'disagg_prefill']
    chu_df = df[df['name'] == 'chunked_prefill']
    dis_df = df[df["name"] == "disagg_prefill"]
    chu_df = df[df["name"] == "chunked_prefill"]

    plt.style.use('bmh')
    plt.rcParams['font.size'] = 20
    plt.style.use("bmh")
    plt.rcParams["font.size"] = 20

    for key in [
            'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms',
            'median_itl_ms', 'p99_itl_ms'
        "mean_ttft_ms",
        "median_ttft_ms",
        "p99_ttft_ms",
        "mean_itl_ms",
        "median_itl_ms",
        "p99_itl_ms",
    ]:

        fig, ax = plt.subplots(figsize=(11, 7))
        plt.plot(dis_df['qps'],
                 dis_df[key],
                 label='disagg_prefill',
                 marker='o',
                 linewidth=4)
        plt.plot(chu_df['qps'],
                 chu_df[key],
                 label='chunked_prefill',
                 marker='o',
                 linewidth=4)
        plt.plot(
            dis_df["qps"], dis_df[key], label="disagg_prefill", marker="o", linewidth=4
        )
        plt.plot(
            chu_df["qps"], chu_df[key], label="chunked_prefill", marker="o", linewidth=4
        )
        ax.legend()

        ax.set_xlabel('QPS')
        ax.set_xlabel("QPS")
        ax.set_ylabel(key)
        ax.set_ylim(bottom=0)
        fig.savefig(f'results/{key}.png')
        fig.savefig(f"results/{key}.png")
        plt.close(fig)

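The plotting script above reads one JSON file per run from results/ and plots the TTFT/ITL metrics listed in its key loop. To dry-run the plot without real benchmark output, a throwaway generator such as the following can be used; the file names and metric keys are taken from the script, while the values are fabricated placeholders.

import json
import os
import random

os.makedirs("results", exist_ok=True)
METRICS = [
    "mean_ttft_ms", "median_ttft_ms", "p99_ttft_ms",
    "mean_itl_ms", "median_itl_ms", "p99_itl_ms",
]
for name in ["disagg_prefill", "chunked_prefill"]:
    for qps in [2, 4, 6, 8]:
        # Placeholder numbers only, so the plotting loop has something to read.
        payload = {key: random.uniform(10.0, 200.0) for key in METRICS}
        with open(f"results/{name}-qps-{qps}.json", "w") as f:
            json.dump(payload, f)
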
@@ -24,10 +24,12 @@ class bench_params_t:
    dtype: torch.dtype

    def description(self):
        return (f'N {self.num_tokens} '
                f'x D {self.hidden_size} '
                f'x R {self.add_residual} '
                f'x DT {self.dtype}')
        return (
            f"N {self.num_tokens} "
            f"x D {self.hidden_size} "
            f"x R {self.add_residual} "
            f"x DT {self.dtype}"
        )


def get_bench_params() -> list[bench_params_t]:
@@ -38,15 +40,19 @@ def get_bench_params() -> list[bench_params_t]:
    DTYPES = [torch.bfloat16, torch.float]

    combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
    bench_params = list(map(lambda x: \
        bench_params_t(x[0], x[1], x[2], x[3]), combinations))
    bench_params = list(
        map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
    )
    return bench_params


# Reference impls
def unfused_int8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,
                      residual: Optional[torch.Tensor],
                      quant_dtype: torch.dtype):
def unfused_int8_impl(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
    residual: Optional[torch.Tensor],
    quant_dtype: torch.dtype,
):
    # Norm
    torch_out = None
    if residual is None:
@@ -58,9 +64,12 @@ def unfused_int8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,
    torch_out, _, _ = ops.scaled_int8_quant(torch_out)


def unfused_fp8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,
                     residual: Optional[torch.Tensor],
                     quant_dtype: torch.dtype):
def unfused_fp8_impl(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
    residual: Optional[torch.Tensor],
    quant_dtype: torch.dtype,
):
    # Norm
    torch_out = None
    if residual is None:
@@ -73,22 +82,27 @@ def unfused_fp8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,


def fused_impl(
        rms_norm_layer: RMSNorm,  # this stores the weights
        x: torch.Tensor,
        residual: Optional[torch.Tensor],
        quant_dtype: torch.dtype):
    out, _ = ops.rms_norm_dynamic_per_token_quant(x,
                                                  rms_norm_layer.weight,
                                                  1e-6,
                                                  quant_dtype,
                                                  residual=residual)
    rms_norm_layer: RMSNorm,  # this stores the weights
    x: torch.Tensor,
    residual: Optional[torch.Tensor],
    quant_dtype: torch.dtype,
):
    out, _ = ops.rms_norm_dynamic_per_token_quant(
        x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
    )


# Bench functions
def bench_fn(rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor,
             quant_dtype: torch.dtype, label: str, sub_label: str,
             fn: Callable, description: str) -> TMeasurement:

def bench_fn(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
    residual: torch.Tensor,
    quant_dtype: torch.dtype,
    label: str,
    sub_label: str,
    fn: Callable,
    description: str,
) -> TMeasurement:
    min_run_time = 1

    globals = {
@@ -106,43 +120,81 @@ def bench_fn(rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor,
        description=description,
    ).blocked_autorange(min_run_time=min_run_time)

def bench(params: bench_params_t, label: str, sub_label: str) \
        -> Iterable[TMeasurement]:

def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasurement]:
    # Make inputs
    layer = RMSNorm(params.hidden_size, 1e-6).to(dtype=params.dtype)
    # Make weights
    layer.weight.data.normal_(mean=1.0, std=0.1)
    # Make inputs
    scale = 1 / params.hidden_size
    x = torch.randn(params.num_tokens,
                    params.hidden_size,
                    dtype=params.dtype,
                    device='cuda') * scale
    residual = (torch.randn_like(x) * scale).to(device='cuda') \
            if params.add_residual else None
    x = (
        torch.randn(
            params.num_tokens, params.hidden_size, dtype=params.dtype, device="cuda"
        )
        * scale
    )
    residual = (
        (torch.randn_like(x) * scale).to(device="cuda") if params.add_residual else None
    )

    timers = []

    # unfused int8 impl.
    timers.append(
        bench_fn(layer, x, residual, torch.int8, label, sub_label,
                 unfused_int8_impl, "unfused_int8_impl"))
        bench_fn(
            layer,
            x,
            residual,
            torch.int8,
            label,
            sub_label,
            unfused_int8_impl,
            "unfused_int8_impl",
        )
    )

    # unfused fp8 impl.
    timers.append(
        bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label,
                 unfused_fp8_impl, "unfused_fp8_impl"))
        bench_fn(
            layer,
            x,
            residual,
            torch.float8_e4m3fn,
            label,
            sub_label,
            unfused_fp8_impl,
            "unfused_fp8_impl",
        )
    )

    # fused int8 impl.
    timers.append(
        bench_fn(layer, x, residual, torch.int8, label, sub_label, fused_impl,
                 "fused_int8_impl"))
        bench_fn(
            layer,
            x,
            residual,
            torch.int8,
            label,
            sub_label,
            fused_impl,
            "fused_int8_impl",
        )
    )

    # fused fp8 impl.
    timers.append(
        bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label,
                 fused_impl, "fused_fp8_impl"))
        bench_fn(
            layer,
            x,
            residual,
            torch.float8_e4m3fn,
            label,
            sub_label,
            fused_impl,
            "fused_fp8_impl",
        )
    )

    print_timers(timers)

@@ -157,13 +209,12 @@ def print_timers(timers: Iterable[TMeasurement]):


def main():
    torch.set_default_device('cuda')
    torch.set_default_device("cuda")
    bench_params = get_bench_params()

    timers = []
    for bp in tqdm(bench_params):
        timers.extend(
            bench(bp, "rms-norm-dynamic-per-token-quant", bp.description()))
        timers.extend(bench(bp, "rms-norm-dynamic-per-token-quant", bp.description()))
    print_timers(timers)

    # pickle all the results
@@ -172,5 +223,5 @@ def main():
        pkl.dump(timers, f)


if __name__ == '__main__':
if __name__ == "__main__":
    main()

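The benchmark above compares an unfused path (RMSNorm followed by a separate dynamic per-token quant) against vLLM's fused rms_norm_dynamic_per_token_quant op. For orientation, here is a rough pure-PyTorch sketch of what the unfused int8 path computes; it is illustrative only and uses plain tensor ops rather than vLLM's custom kernels.

import torch


def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # RMSNorm: scale each token by the reciprocal RMS of its hidden vector.
    variance = x.float().pow(2).mean(dim=-1, keepdim=True)
    return (x.float() * torch.rsqrt(variance + eps)).to(x.dtype) * weight


def per_token_int8_quant_ref(x: torch.Tensor):
    # Dynamic per-token quantization: one scale per row, taken from its abs-max.
    scales = x.abs().amax(dim=-1, keepdim=True).float() / 127.0
    q = torch.clamp(torch.round(x.float() / scales), -128, 127).to(torch.int8)
    return q, scales


x = torch.randn(4, 1024, dtype=torch.bfloat16)
w = torch.ones(1024, dtype=torch.bfloat16)
q, scales = per_token_int8_quant_ref(rms_norm_ref(x, w))
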
@ -9,32 +9,39 @@ import torch.nn.functional as F
 | 
			
		||||
 | 
			
		||||
from vllm import _custom_ops as ops
 | 
			
		||||
from vllm.model_executor.layers.quantization.aqlm import (
 | 
			
		||||
    dequantize_weight, generic_dequantize_gemm, get_int_dtype,
 | 
			
		||||
    optimized_dequantize_gemm)
 | 
			
		||||
    dequantize_weight,
 | 
			
		||||
    generic_dequantize_gemm,
 | 
			
		||||
    get_int_dtype,
 | 
			
		||||
    optimized_dequantize_gemm,
 | 
			
		||||
)
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 | 
			
		||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def torch_mult(
 | 
			
		||||
        input: torch.Tensor,  #  [..., in_features]
 | 
			
		||||
        weights: torch.Tensor,
 | 
			
		||||
        scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
 | 
			
		||||
    # [..., in_features]
 | 
			
		||||
    input: torch.Tensor,
 | 
			
		||||
    weights: torch.Tensor,
 | 
			
		||||
    # [num_out_groups, 1, 1, 1]
 | 
			
		||||
    scales: torch.Tensor,
 | 
			
		||||
) -> torch.Tensor:
 | 
			
		||||
    output = F.linear(input, weights)
 | 
			
		||||
    return output
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def dequant_out_scale(
 | 
			
		||||
    input: torch.Tensor,  #  [..., in_features]
 | 
			
		||||
    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
 | 
			
		||||
    codebooks: torch.
 | 
			
		||||
    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
 | 
			
		||||
    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
 | 
			
		||||
    # [..., in_features]
 | 
			
		||||
    input: torch.Tensor,
 | 
			
		||||
    # [num_out_groups, num_in_groups, num_codebooks]
 | 
			
		||||
    codes: torch.IntTensor,
 | 
			
		||||
    # [num_codebooks, codebook_size, out_group_size, in_group_size]
 | 
			
		||||
    codebooks: torch.Tensor,
 | 
			
		||||
    # [num_out_groups, 1, 1, 1]
 | 
			
		||||
    scales: torch.Tensor,
 | 
			
		||||
    output_partition_sizes: torch.IntTensor,
 | 
			
		||||
    bias: Optional[torch.Tensor],
 | 
			
		||||
) -> torch.Tensor:
 | 
			
		||||
 | 
			
		||||
    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
 | 
			
		||||
 | 
			
		||||
    if bias is None:
 | 
			
		||||
@ -46,40 +53,42 @@ def dequant_out_scale(
 | 
			
		||||
        flattened_output *= b_scales
 | 
			
		||||
        return flattened_output.view(orig_shape)
 | 
			
		||||
    else:
 | 
			
		||||
        b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
 | 
			
		||||
            -1, weights.shape[1])
 | 
			
		||||
        b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1])
 | 
			
		||||
        weights *= b_scales
 | 
			
		||||
        return F.linear(input, weights, bias)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def dequant_weight_scale(
 | 
			
		||||
    input: torch.Tensor,  #  [..., in_features]
 | 
			
		||||
    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
 | 
			
		||||
    codebooks: torch.
 | 
			
		||||
    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
 | 
			
		||||
    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
 | 
			
		||||
    # [..., in_features]
 | 
			
		||||
    input: torch.Tensor,
 | 
			
		||||
    # [num_out_groups, num_in_groups, num_codebooks]
 | 
			
		||||
    codes: torch.IntTensor,
 | 
			
		||||
    # [num_codebooks, codebook_size, out_group_size, in_group_size]
 | 
			
		||||
    codebooks: torch.Tensor,
 | 
			
		||||
    # [num_out_groups, 1, 1, 1]
 | 
			
		||||
    scales: torch.Tensor,
 | 
			
		||||
    output_partition_sizes: torch.IntTensor,
 | 
			
		||||
    bias: Optional[torch.Tensor],
 | 
			
		||||
) -> torch.Tensor:
 | 
			
		||||
 | 
			
		||||
    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
 | 
			
		||||
 | 
			
		||||
    b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
 | 
			
		||||
        -1, weights.shape[1])
 | 
			
		||||
    b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1])
 | 
			
		||||
    weights *= b_scales
 | 
			
		||||
    return F.linear(input, weights, bias)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def dequant_no_scale(
 | 
			
		||||
    input: torch.Tensor,  #  [..., in_features]
 | 
			
		||||
    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
 | 
			
		||||
    codebooks: torch.
 | 
			
		||||
    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
 | 
			
		||||
    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
 | 
			
		||||
    # [..., in_features]
 | 
			
		||||
    input: torch.Tensor,
 | 
			
		||||
    # [num_out_groups, num_in_groups, num_codebooks]
 | 
			
		||||
    codes: torch.IntTensor,
 | 
			
		||||
    # [num_codebooks, codebook_size, out_group_size, in_group_size]
 | 
			
		||||
    codebooks: torch.Tensor,
 | 
			
		||||
    # [num_out_groups, 1, 1, 1]
 | 
			
		||||
    scales: torch.Tensor,
 | 
			
		||||
    output_partition_sizes: torch.IntTensor,
 | 
			
		||||
    bias: Optional[torch.Tensor],
 | 
			
		||||
) -> torch.Tensor:
 | 
			
		||||
 | 
			
		||||
    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
 | 
			
		||||
 | 
			
		||||
    return F.linear(input, weights, bias)
 | 
			
		||||
@ -89,23 +98,26 @@ def dequant_no_scale(
 | 
			
		||||
# the generic pytorch version.
 | 
			
		||||
# Just visual comparison.
 | 
			
		||||
def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
 | 
			
		||||
 | 
			
		||||
    n = int(parts.sum().item())
 | 
			
		||||
 | 
			
		||||
    device = torch.device('cuda:0')
 | 
			
		||||
    device = torch.device("cuda:0")
 | 
			
		||||
 | 
			
		||||
    code_range = (1 << bits) // 2
 | 
			
		||||
    ingroups = 8
 | 
			
		||||
 | 
			
		||||
    codes = torch.randint(-code_range,
 | 
			
		||||
                          code_range,
 | 
			
		||||
                          size=(n, k // ingroups, nbooks),
 | 
			
		||||
                          dtype=get_int_dtype(bits),
 | 
			
		||||
                          device=device)
 | 
			
		||||
    codes = torch.randint(
 | 
			
		||||
        -code_range,
 | 
			
		||||
        code_range,
 | 
			
		||||
        size=(n, k // ingroups, nbooks),
 | 
			
		||||
        dtype=get_int_dtype(bits),
 | 
			
		||||
        device=device,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
 | 
			
		||||
                            dtype=torch.float16,
 | 
			
		||||
                            device=device)
 | 
			
		||||
    codebooks = torch.randn(
 | 
			
		||||
        size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
 | 
			
		||||
        dtype=torch.float16,
 | 
			
		||||
        device=device,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    count = 0
 | 
			
		||||
    for index in range(16):
 | 
			
		||||
@ -138,24 +150,25 @@ def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def main():
 | 
			
		||||
 | 
			
		||||
    parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
 | 
			
		||||
 | 
			
		||||
    # Add arguments
 | 
			
		||||
    parser.add_argument("--nbooks",
 | 
			
		||||
                        type=int,
 | 
			
		||||
                        default=1,
 | 
			
		||||
                        help="Number of codebooks (default: 1)")
 | 
			
		||||
    parser.add_argument("--bits",
 | 
			
		||||
                        type=int,
 | 
			
		||||
                        default=16,
 | 
			
		||||
                        help="Number of bits per code element (default: 16)")
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--nbooks", type=int, default=1, help="Number of codebooks (default: 1)"
 | 
			
		||||
    )
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--bits",
 | 
			
		||||
        type=int,
 | 
			
		||||
        default=16,
 | 
			
		||||
        help="Number of bits per code element (default: 16)",
 | 
			
		||||
    )
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--test",
 | 
			
		||||
        type=bool,
 | 
			
		||||
        default=False,
 | 
			
		||||
        help="Run the decompression/dequant tester rather than benchmarking "
 | 
			
		||||
        "(default: False)")
 | 
			
		||||
        "(default: False)",
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    # Parse the arguments
 | 
			
		||||
    args = parser.parse_args()
 | 
			
		||||
@ -165,7 +178,7 @@ def main():
 | 
			
		||||
    bits = args.bits
 | 
			
		||||
 | 
			
		||||
    if args.test:
 | 
			
		||||
        dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
 | 
			
		||||
        dequant_test(4096, torch.tensor((4096,)), nbooks, bits)
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
    # Otherwise, benchmark.
 | 
			
		||||
@ -184,31 +197,54 @@ def main():
 | 
			
		||||
    with open(filename, "w") as f:
 | 
			
		||||
        sys.stdout = f
 | 
			
		||||
 | 
			
		||||
        print('m | k | n | n parts', end='')
 | 
			
		||||
        print("m | k | n | n parts", end="")
 | 
			
		||||
        for method in methods:
 | 
			
		||||
            print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
 | 
			
		||||
        print('')
 | 
			
		||||
            print(f" | {method.__name__.replace('_', ' ')} (µs)", end="")
 | 
			
		||||
        print("")
 | 
			
		||||
 | 
			
		||||
        # These are reasonable prefill sizes.
 | 
			
		||||
        ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
 | 
			
		||||
                         (4096, (11008, 11008)), (11008, (4096, )))
 | 
			
		||||
        ksandpartions = (
 | 
			
		||||
            (4096, (4096, 4096, 4096)),
 | 
			
		||||
            (4096, (4096,)),
 | 
			
		||||
            (4096, (11008, 11008)),
 | 
			
		||||
            (11008, (4096,)),
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        # reasonable ranges for m.
 | 
			
		||||
        for m in [
 | 
			
		||||
                1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
 | 
			
		||||
                128, 256, 512, 1024, 1536, 2048, 3072, 4096
 | 
			
		||||
            1,
 | 
			
		||||
            2,
 | 
			
		||||
            4,
 | 
			
		||||
            8,
 | 
			
		||||
            10,
 | 
			
		||||
            12,
 | 
			
		||||
            14,
 | 
			
		||||
            16,
 | 
			
		||||
            24,
 | 
			
		||||
            32,
 | 
			
		||||
            48,
 | 
			
		||||
            52,
 | 
			
		||||
            56,
 | 
			
		||||
            64,
 | 
			
		||||
            96,
 | 
			
		||||
            112,
 | 
			
		||||
            128,
 | 
			
		||||
            256,
 | 
			
		||||
            512,
 | 
			
		||||
            1024,
 | 
			
		||||
            1536,
 | 
			
		||||
            2048,
 | 
			
		||||
            3072,
 | 
			
		||||
            4096,
 | 
			
		||||
        ]:
 | 
			
		||||
            print(f'{m}', file=sys.__stdout__)
 | 
			
		||||
            print(f"{m}", file=sys.__stdout__)
 | 
			
		||||
            for ksp in ksandpartions:
 | 
			
		||||
                run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
 | 
			
		||||
                         methods)
 | 
			
		||||
                run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, methods)
 | 
			
		||||
 | 
			
		||||
        sys.stdout = sys.__stdout__
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
 | 
			
		||||
             methods):
 | 
			
		||||
 | 
			
		||||
def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods):
 | 
			
		||||
    # I didn't see visible improvements from increasing these, but feel free :)
 | 
			
		||||
    num_warmup_trials = 1
 | 
			
		||||
    num_trials = 1
 | 
			
		||||
@ -229,7 +265,7 @@ def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    n = parts.sum().item()
 | 
			
		||||
    print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
 | 
			
		||||
    print(f"{m} | {k} | {n} | {parts.tolist()}", end="")
 | 
			
		||||
 | 
			
		||||
    for method in methods:
 | 
			
		||||
        best_time_us = 1e20
 | 
			
		||||
@ -249,32 +285,36 @@ def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
 | 
			
		||||
            if kernel_dur_us < best_time_us:
 | 
			
		||||
                best_time_us = kernel_dur_us
 | 
			
		||||
 | 
			
		||||
        print(f' | {kernel_dur_us:.0f}', end='')
 | 
			
		||||
        print(f" | {kernel_dur_us:.0f}", end="")
 | 
			
		||||
 | 
			
		||||
    print('')
 | 
			
		||||
    print("")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
 | 
			
		||||
               nbooks: int, bits: int, method) -> float:
 | 
			
		||||
 | 
			
		||||
def run_timing(
 | 
			
		||||
    num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method
 | 
			
		||||
) -> float:
 | 
			
		||||
    n = int(parts.sum().item())
 | 
			
		||||
 | 
			
		||||
    device = torch.device('cuda:0')
 | 
			
		||||
    device = torch.device("cuda:0")
 | 
			
		||||
 | 
			
		||||
    input = torch.randn((1, m, k), dtype=torch.float16, device=device)
 | 
			
		||||
 | 
			
		||||
    code_range = (1 << bits) // 2
 | 
			
		||||
    ingroups = 8
 | 
			
		||||
 | 
			
		||||
    codes = torch.randint(-code_range,
 | 
			
		||||
                          code_range,
 | 
			
		||||
                          size=(n, k // ingroups, nbooks),
 | 
			
		||||
                          dtype=get_int_dtype(bits),
 | 
			
		||||
                          device=device)
 | 
			
		||||
    codes = torch.randint(
 | 
			
		||||
        -code_range,
 | 
			
		||||
        code_range,
 | 
			
		||||
        size=(n, k // ingroups, nbooks),
 | 
			
		||||
        dtype=get_int_dtype(bits),
 | 
			
		||||
        device=device,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
 | 
			
		||||
                            dtype=torch.float16,
 | 
			
		||||
                            device=device)
 | 
			
		||||
    codebooks = torch.randn(
 | 
			
		||||
        size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
 | 
			
		||||
        dtype=torch.float16,
 | 
			
		||||
        device=device,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)

benchmarks/kernels/benchmark_bitblas.py (new file, 241 lines)
@@ -0,0 +1,241 @@
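The new benchmark below builds one MatmulConfig per (M, K, N) shape from a shared argument list and times it with Matmul.profile_latency(). A stripped-down sketch of the same pattern for a single shape is given here; it assumes a CUDA GPU and a bitblas install that passes the version check in the script, and it simply mirrors the positional argument layout used there.

from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target

# (M, K, N) followed by the shared arguments used by the full script:
# A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size,
# with_scaling, with_zeros, zeros_mode.
shape = (1, 16384, 16384)
shared_args = ["float16", "int4", "float16", "float16", "nt",
               False, None, False, False, None]

config = MatmulConfig(*shape, *shared_args)
matmul = Matmul(config, target=auto_detect_nvidia_target(), enable_tuning=True)
print("Time cost is: {:.3f} ms".format(matmul.profile_latency()))
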
# SPDX-License-Identifier: Apache-2.0
 | 
			
		||||
# Copyright (c) Microsoft Corporation.
 | 
			
		||||
# Licensed under the MIT License.
 | 
			
		||||
 | 
			
		||||
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
 | 
			
		||||
    MINIMUM_BITBLAS_VERSION,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
try:
 | 
			
		||||
    import bitblas
 | 
			
		||||
 | 
			
		||||
    if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
 | 
			
		||||
        raise ImportError(
 | 
			
		||||
            "bitblas version is wrong. Please "
 | 
			
		||||
            f"install bitblas>={MINIMUM_BITBLAS_VERSION}"
 | 
			
		||||
        )
 | 
			
		||||
except ImportError as e:
 | 
			
		||||
    bitblas_import_exception = e
 | 
			
		||||
    raise ValueError(
 | 
			
		||||
        "Trying to use the bitblas backend, but could not import"
 | 
			
		||||
        f"with the following error: {bitblas_import_exception}. "
 | 
			
		||||
        "Please install bitblas through the following command: "
 | 
			
		||||
        f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`"
 | 
			
		||||
    ) from bitblas_import_exception
 | 
			
		||||
 | 
			
		||||
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
 | 
			
		||||
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
parser = FlexibleArgumentParser(
 | 
			
		||||
    description="Benchmark BitBLAS int4 on a specific target."
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
# Add arguments to the parser
 | 
			
		||||
parser.add_argument(
 | 
			
		||||
    "--target",
 | 
			
		||||
    type=str,
 | 
			
		||||
    default=auto_detect_nvidia_target(),
 | 
			
		||||
    help="Specify the target device for benchmarking.",
 | 
			
		||||
)
 | 
			
		||||
parser.add_argument(
 | 
			
		||||
    "--group_size", type=int, default=None, help="Group size for grouped quantization."
 | 
			
		||||
)
 | 
			
		||||
parser.add_argument(
 | 
			
		||||
    "--A_dtype",
 | 
			
		||||
    type=str,
 | 
			
		||||
    default="float16",
 | 
			
		||||
    choices=["float16", "float32", "float64", "int32", "int8"],
 | 
			
		||||
    help="Data type of activation A.",
 | 
			
		||||
)
 | 
			
		||||
parser.add_argument(
 | 
			
		||||
    "--W_dtype",
 | 
			
		||||
    type=str,
 | 
			
		||||
    default="int4",
 | 
			
		||||
    choices=[
 | 
			
		||||
        "float16",
 | 
			
		||||
        "float32",
 | 
			
		||||
        "float64",
 | 
			
		||||
        "int32",
 | 
			
		||||
        "int8",
 | 
			
		||||
        "int4",
 | 
			
		||||
        "int2",
 | 
			
		||||
        "int1",
 | 
			
		||||
        "nf4",
 | 
			
		||||
        "fp4_e2m1",
 | 
			
		||||
    ],
 | 
			
		||||
    help="Data type of weight W.",
 | 
			
		||||
)
 | 
			
		||||
parser.add_argument(
 | 
			
		||||
    "--accum_dtype",
 | 
			
		||||
    type=str,
 | 
			
		||||
    default="float16",
 | 
			
		||||
    choices=["float16", "int32"],
 | 
			
		||||
    help="Data type for accumulation.",
 | 
			
		||||
)
 | 
			
		||||
parser.add_argument(
 | 
			
		||||
    "--out_dtype",
 | 
			
		||||
    type=str,
 | 
			
		||||
    default="float16",
 | 
			
		||||
    choices=["float16", "float32", "int32", "int8"],
 | 
			
		||||
    help="Data type for output.",
 | 
			
		||||
)
 | 
			
		||||
parser.add_argument(
 | 
			
		||||
    "--layout",
 | 
			
		||||
    type=str,
 | 
			
		||||
    default="nt",
 | 
			
		||||
    choices=["nt", "nn"],
 | 
			
		||||
    help="Matrix layout, 'nt' for non-transpose A and transpose W.",
 | 
			
		||||
)
 | 
			
		||||
parser.add_argument(
 | 
			
		||||
    "--with_bias", action="store_true", help="Include bias in the benchmark."
 | 
			
		||||
)
 | 
			
		||||
parser.add_argument(
 | 
			
		||||
    "--with_scaling",
 | 
			
		||||
    action="store_true",
 | 
			
		||||
    help="Include scaling factor in the quantization.",
 | 
			
		||||
)
 | 
			
		||||
parser.add_argument(
 | 
			
		||||
    "--with_zeros", action="store_true", help="Include zeros in the quantization."
 | 
			
		||||
)
 | 
			
		||||
parser.add_argument(
 | 
			
		||||
    "--zeros_mode",
 | 
			
		||||
    type=str,
 | 
			
		||||
    default=None,
 | 
			
		||||
    choices=["original", "rescale", "quantized"],
 | 
			
		||||
    help="Specify the mode for calculating zeros.",
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
# Parse the arguments
 | 
			
		||||
args = parser.parse_args()
 | 
			
		||||
 | 
			
		||||
# Assign arguments to variables
 | 
			
		||||
target = args.target
 | 
			
		||||
A_dtype = args.A_dtype
 | 
			
		||||
W_dtype = args.W_dtype
 | 
			
		||||
accum_dtype = args.accum_dtype
 | 
			
		||||
out_dtype = args.out_dtype
 | 
			
		||||
layout = args.layout
 | 
			
		||||
with_bias = args.with_bias
 | 
			
		||||
group_size = args.group_size
 | 
			
		||||
with_scaling = args.with_scaling
 | 
			
		||||
with_zeros = args.with_zeros
 | 
			
		||||
zeros_mode = args.zeros_mode
 | 
			
		||||
 | 
			
		||||
# Define a list of shared arguments that repeat in every config
 | 
			
		||||
shared_args = [
 | 
			
		||||
    A_dtype,
 | 
			
		||||
    W_dtype,
 | 
			
		||||
    out_dtype,
 | 
			
		||||
    accum_dtype,
 | 
			
		||||
    layout,
 | 
			
		||||
    with_bias,
 | 
			
		||||
    group_size,
 | 
			
		||||
    with_scaling,
 | 
			
		||||
    with_zeros,
 | 
			
		||||
    zeros_mode,
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
# Define just the (M, K, N) shapes in a more compact list
 | 
			
		||||
shapes = [
 | 
			
		||||
    # square test
 | 
			
		||||
    (1, 16384, 16384),
 | 
			
		||||
    # BLOOM-176B
 | 
			
		||||
    (1, 43008, 14336),
 | 
			
		||||
    (1, 14336, 14336),
 | 
			
		||||
    (1, 57344, 14336),
 | 
			
		||||
    (1, 14336, 57344),
 | 
			
		||||
    # OPT-65B
 | 
			
		||||
    (1, 9216, 9216),
 | 
			
		||||
    (1, 36864, 9216),
 | 
			
		||||
    (1, 9216, 36864),
 | 
			
		||||
    (1, 22016, 8192),
 | 
			
		||||
    # LLAMA-70B/65B
 | 
			
		||||
    (1, 8192, 22016),
 | 
			
		||||
    (1, 8192, 8192),
 | 
			
		||||
    (1, 28672, 8192),
 | 
			
		||||
    (1, 8192, 28672),
 | 
			
		||||
    # square test
 | 
			
		||||
    (16384, 16384, 16384),
 | 
			
		||||
    # BLOOM-176B
 | 
			
		||||
    (8192, 43008, 14336),
 | 
			
		||||
    (8192, 14336, 14336),
 | 
			
		||||
    (8192, 57344, 14336),
 | 
			
		||||
    (8192, 14336, 57344),
 | 
			
		||||
    # OPT-65B
 | 
			
		||||
    (8192, 9216, 9216),
 | 
			
		||||
    (8192, 36864, 9216),
 | 
			
		||||
    (8192, 9216, 36864),
 | 
			
		||||
    (8192, 22016, 8192),
 | 
			
		||||
    # LLAMA-70B/65B
 | 
			
		||||
    (8192, 8192, 22016),
 | 
			
		||||
    (8192, 8192, 8192),
 | 
			
		||||
    (8192, 28672, 8192),
 | 
			
		||||
    (8192, 8192, 28672),
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
# Build test shapes with all the shared arguments
 | 
			
		||||
test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) for shape in shapes]
 | 
			
		||||
 | 
			
		||||
benchmark_sets = []
 | 
			
		||||
benchmark_sets.extend(test_shapes)
 | 
			
		||||
 | 
			
		||||
benchmark_results = {}
 | 
			
		||||
for config_class, operator, input_args in benchmark_sets:
 | 
			
		||||
    config = config_class(*input_args)
 | 
			
		||||
    matmul = operator(config, target=target, enable_tuning=True)
 | 
			
		||||
    kernel_latency = matmul.profile_latency()
 | 
			
		||||
 | 
			
		||||
    print("Time cost is: {:.3f} ms".format(kernel_latency))
 | 
			
		||||
 | 
			
		||||
    profile_config = {
 | 
			
		||||
        f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": {
 | 
			
		||||
            "BitBLAS_top20_latency": kernel_latency,
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    benchmark_results.update(profile_config)
 | 
			
		||||
 | 
			
		||||
# Define headers for the table
 | 
			
		||||
headers = [
 | 
			
		||||
    "PrimFunc",
 | 
			
		||||
    "Input Arguments",
 | 
			
		||||
    "BitBLAS Top20 Latency",
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
# Calculate column widths for pretty printing
 | 
			
		||||
col_widths = [0, 0, 0]
 | 
			
		||||
for config_key, values in benchmark_results.items():
 | 
			
		||||
    args_split = config_key.split("-")
 | 
			
		||||
    func_name = args_split[0]
 | 
			
		||||
    input_args_str = "-".join(args_split[1:])
 | 
			
		||||
    col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2)
 | 
			
		||||
    col_widths[1] = max(col_widths[1], len(input_args_str) + 2, len(headers[1]) + 2)
 | 
			
		||||
    col_widths[2] = max(
 | 
			
		||||
        col_widths[2],
 | 
			
		||||
        len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2,
 | 
			
		||||
        len(headers[2]) + 2,
 | 
			
		||||
    )
 | 
			
		||||
    # break only if you want to measure widths from a single example;
 | 
			
		||||
    # otherwise, let it loop over all items.
 | 
			
		||||
 | 
			
		||||
# Print header
 | 
			
		||||
for i, header in enumerate(headers):
 | 
			
		||||
    headers[i] = header.ljust(col_widths[i])
 | 
			
		||||
print("".join(headers))
 | 
			
		||||
print("-" * sum(col_widths))
 | 
			
		||||
 | 
			
		||||
# Print rows
 | 
			
		||||
for config_key, values in benchmark_results.items():
 | 
			
		||||
    args_split = config_key.split("-")
 | 
			
		||||
    func_name = args_split[0]
 | 
			
		||||
    input_args_str = "-".join(args_split[1:])
 | 
			
		||||
    row = [
 | 
			
		||||
        func_name,
 | 
			
		||||
        input_args_str,
 | 
			
		||||
        f"{values['BitBLAS_top20_latency']:.3f} ms",
 | 
			
		||||
    ]
 | 
			
		||||
    row_str = "".join(
 | 
			
		||||
        [str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)]
 | 
			
		||||
    )
 | 
			
		||||
    print(row_str)

benchmarks/kernels/benchmark_cutlass_fp4_moe.py (new file, 489 lines)
@@ -0,0 +1,489 @@
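Both timing paths in the MoE benchmark that follows use CUDA graph capture and replay to take Python launch overhead out of the measurement. The snippet below is a minimal, generic illustration of that capture/replay pattern with a plain matmul; it is not taken from the file and assumes a CUDA device is available.

import torch

assert torch.cuda.is_available()
a = torch.randn(1024, 1024, device="cuda", dtype=torch.half)
b = torch.randn(1024, 1024, device="cuda", dtype=torch.half)

# Warm up on a side stream before capture, as PyTorch recommends.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        a @ b
torch.cuda.current_stream().wait_stream(s)

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    out = a @ b  # captured once; graph.replay() re-executes the recorded work

for _ in range(10):
    graph.replay()
torch.cuda.synchronize()
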
# SPDX-License-Identifier: Apache-2.0
 | 
			
		||||
"""
 | 
			
		||||
Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
 | 
			
		||||
kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
 | 
			
		||||
activations. The triton_moe kernel takes in fp8 weights(tensor scaled to fp8)
 | 
			
		||||
and 16-bit activations.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
import nvtx
 | 
			
		||||
import torch
 | 
			
		||||
import torch.utils.benchmark as benchmark
 | 
			
		||||
 | 
			
		||||
from vllm import _custom_ops as ops
 | 
			
		||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 | 
			
		||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 | 
			
		||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 | 
			
		||||
from vllm.scalar_type import scalar_types
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
WEIGHT_SHAPES_MOE = {
 | 
			
		||||
    "nvidia/DeepSeek-R1-FP4": [
 | 
			
		||||
        [256, 8, 2048, 7168],
 | 
			
		||||
    ],
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
DEFAULT_MODELS = [
 | 
			
		||||
    "nvidia/DeepSeek-R1-FP4",
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
 | 
			
		||||
DEFAULT_TP_SIZES = [1]
 | 
			
		||||
 | 
			
		||||
PER_ACT_TOKEN_OPTS = [False]
 | 
			
		||||
PER_OUT_CH_OPTS = [False]
 | 
			
		||||
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
 | 
			
		||||
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def to_fp8(tensor: torch.Tensor):
 | 
			
		||||
    finfo = torch.finfo(torch.float8_e4m3fn)
 | 
			
		||||
    return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to(
 | 
			
		||||
        dtype=torch.float8_e4m3fn
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def bench_run(
 | 
			
		||||
    results: list[benchmark.Measurement],
 | 
			
		||||
    model: str,
 | 
			
		||||
    num_experts: int,
 | 
			
		||||
    topk: int,
 | 
			
		||||
    per_act_token: bool,
 | 
			
		||||
    per_out_ch: bool,
 | 
			
		||||
    mkn: tuple[int, int, int],
 | 
			
		||||
):
 | 
			
		||||
    label = "NVFP4 Blockscaled CUTLASS MOE vs FP8 Tensor Scaled Triton"
 | 
			
		||||
 | 
			
		||||
    sub_label = (
 | 
			
		||||
        "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, MKN=({})".format(
 | 
			
		||||
            model, num_experts, topk, per_act_token, per_out_ch, mkn
 | 
			
		||||
        )
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    print(f"Testing: {sub_label}")
 | 
			
		||||
 | 
			
		||||
    (m, k, n) = mkn
 | 
			
		||||
 | 
			
		||||
    dtype = torch.half
 | 
			
		||||
    device = "cuda"
 | 
			
		||||
    a = torch.randn((m, k), device=device, dtype=dtype) / 10
 | 
			
		||||
    w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
 | 
			
		||||
    w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10
 | 
			
		||||
 | 
			
		||||
    _, a_fp8_scale = ops.scaled_fp8_quant(a)
 | 
			
		||||
 | 
			
		||||
    w1_fp8q = torch.empty(
 | 
			
		||||
        (num_experts, 2 * n, k), device=device, dtype=torch.float8_e4m3fn
 | 
			
		||||
    )
 | 
			
		||||
    w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=torch.float8_e4m3fn)
 | 
			
		||||
    w1_fp8scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
 | 
			
		||||
    w2_fp8scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
 | 
			
		||||
 | 
			
		||||
    for expert in range(num_experts):
 | 
			
		||||
        w1_fp8q[expert], w1_fp8scale[expert] = ops.scaled_fp8_quant(w1[expert])
 | 
			
		||||
        w2_fp8q[expert], w2_fp8scale[expert] = ops.scaled_fp8_quant(w2[expert])
 | 
			
		||||
 | 
			
		||||
    w1_fp8q_notransp = w1_fp8q.clone()
 | 
			
		||||
    w2_fp8q_notransp = w2_fp8q.clone()
 | 
			
		||||
    w1_fp8q = w1_fp8q.transpose(1, 2)
 | 
			
		||||
    w2_fp8q = w2_fp8q.transpose(1, 2)
 | 
			
		||||
 | 
			
		||||
    score = torch.randn((m, num_experts), device=device, dtype=dtype)
 | 
			
		||||
 | 
			
		||||
    topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
 | 
			
		||||
 | 
			
		||||
    quant_blocksize = 16
 | 
			
		||||
    w1_blockscale = torch.empty(
 | 
			
		||||
        (num_experts, 2 * n, k // quant_blocksize),
 | 
			
		||||
        device=device,
 | 
			
		||||
        dtype=torch.float8_e4m3fn,
 | 
			
		||||
    )
 | 
			
		||||
    w2_blockscale = torch.empty(
 | 
			
		||||
        (num_experts, k, n // quant_blocksize), device=device, dtype=torch.float8_e4m3fn
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    # n_b_scales = 2 * n if per_out_ch else 1
 | 
			
		||||
    # k_b_scales = k if per_out_ch else 1
 | 
			
		||||
    w1_fp4 = torch.empty((num_experts, 2 * n, k // 2), device=device, dtype=torch.uint8)
 | 
			
		||||
    w2_fp4 = torch.empty((num_experts, k, n // 2), device=device, dtype=torch.uint8)
 | 
			
		||||
 | 
			
		||||
    w1_gs = torch.empty((num_experts,), device=device, dtype=torch.float32)
 | 
			
		||||
    w2_gs = torch.empty((num_experts,), device=device, dtype=torch.float32)
 | 
			
		||||
    a1_gs = torch.ones((num_experts,), device=device, dtype=torch.float32)
 | 
			
		||||
    a2_gs = torch.ones((num_experts,), device=device, dtype=torch.float32)
 | 
			
		||||
 | 
			
		||||
    for expert in range(num_experts):
 | 
			
		||||
        w1_e = w1[expert]
 | 
			
		||||
        w2_e = w2[expert]
 | 
			
		||||
        w1_amax = torch.abs(w1_e).max().to(torch.float32)
 | 
			
		||||
        w2_amax = torch.abs(w2_e).max().to(torch.float32)
 | 
			
		||||
        w1_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w1_amax
 | 
			
		||||
        w2_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w2_amax
 | 
			
		||||
 | 
			
		||||
        w1_fp4[expert], w1_blockscale[expert] = ops.scaled_fp4_quant(
 | 
			
		||||
            w1_e, w1_gs[expert]
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        w2_fp4[expert], w2_blockscale[expert] = ops.scaled_fp4_quant(
 | 
			
		||||
            w2_e, w2_gs[expert]
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def run_triton_moe(
 | 
			
		||||
        a: torch.Tensor,
 | 
			
		||||
        w1: torch.Tensor,
 | 
			
		||||
        w2: torch.Tensor,
 | 
			
		||||
        topk_weights: torch.Tensor,
 | 
			
		||||
        topk_ids: torch.Tensor,
 | 
			
		||||
        w1_scale: torch.Tensor,
 | 
			
		||||
        w2_scale: torch.Tensor,
 | 
			
		||||
        a_fp8_scale: torch.Tensor,
 | 
			
		||||
        num_repeats: int,
 | 
			
		||||
    ):
 | 
			
		||||
        for _ in range(num_repeats):
 | 
			
		||||
            fused_experts(
 | 
			
		||||
                a,
 | 
			
		||||
                w1,
 | 
			
		||||
                w2,
 | 
			
		||||
                topk_weights,
 | 
			
		||||
                topk_ids,
 | 
			
		||||
                use_fp8_w8a8=True,
 | 
			
		||||
                w1_scale=w1_scale,
 | 
			
		||||
                w2_scale=w2_scale,
 | 
			
		||||
                a1_scale=a_fp8_scale,
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    def run_cutlass_moe_fp4(
 | 
			
		||||
        a: torch.Tensor,
 | 
			
		||||
        w1_fp4: torch.Tensor,
 | 
			
		||||
        w2_fp4: torch.Tensor,
 | 
			
		||||
        w1_blockscale: torch.Tensor,
 | 
			
		||||
        w2_blockscale: torch.Tensor,
 | 
			
		||||
        w1_gs: torch.Tensor,
 | 
			
		||||
        w2_gs: torch.Tensor,
 | 
			
		||||
        a1_gs: torch.Tensor,
 | 
			
		||||
        a2_gs: torch.Tensor,
 | 
			
		||||
        topk_weights: torch.Tensor,
 | 
			
		||||
        topk_ids: torch.Tensor,
 | 
			
		||||
        m: int,
 | 
			
		||||
        n: int,
 | 
			
		||||
        k: int,
 | 
			
		||||
        e: int,
 | 
			
		||||
        device: torch.device,
 | 
			
		||||
        num_repeats: int,
 | 
			
		||||
    ):
 | 
			
		||||
        for _ in range(num_repeats):
 | 
			
		||||
            with nvtx.annotate("cutlass_moe_fp4", color="green"):
 | 
			
		||||
                cutlass_moe_fp4(
 | 
			
		||||
                    a=a,
 | 
			
		||||
                    a1_gscale=a1_gs,
 | 
			
		||||
                    a2_gscale=a2_gs,
 | 
			
		||||
                    w1_fp4=w1_fp4,
 | 
			
		||||
                    w1_blockscale=w1_blockscale,
 | 
			
		||||
                    w1_alphas=w1_gs,
 | 
			
		||||
                    w2_fp4=w2_fp4,
 | 
			
		||||
                    w2_blockscale=w2_blockscale,
 | 
			
		||||
                    w2_alphas=w2_gs,
 | 
			
		||||
                    topk_weights=topk_weights,
 | 
			
		||||
                    topk_ids=topk_ids,
 | 
			
		||||
                    m=m,
 | 
			
		||||
                    n=n,
 | 
			
		||||
                    k=k,
 | 
			
		||||
                    e=num_experts,
 | 
			
		||||
                    device=device,
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
    def run_cutlass_from_graph(
 | 
			
		||||
        a: torch.Tensor,
 | 
			
		||||
        a1_gscale: torch.Tensor,
 | 
			
		||||
        w1_fp4: torch.Tensor,
 | 
			
		||||
        w1_blockscale: torch.Tensor,
 | 
			
		||||
        w1_alphas: torch.Tensor,
 | 
			
		||||
        a2_gscale: torch.Tensor,
 | 
			
		||||
        w2_fp4: torch.Tensor,
 | 
			
		||||
        w2_blockscale: torch.Tensor,
 | 
			
		||||
        w2_alphas: torch.Tensor,
 | 
			
		||||
        topk_weights: torch.Tensor,
 | 
			
		||||
        topk_ids: torch.Tensor,
 | 
			
		||||
        m: int,
 | 
			
		||||
        n: int,
 | 
			
		||||
        k: int,
 | 
			
		||||
        e: int,
 | 
			
		||||
        device: torch.device,
 | 
			
		||||
    ):
 | 
			
		||||
        with set_current_vllm_config(
 | 
			
		||||
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
 | 
			
		||||
        ):
 | 
			
		||||
            return cutlass_moe_fp4(
 | 
			
		||||
                a=a,
 | 
			
		||||
                a1_gscale=a1_gs,
 | 
			
		||||
                w1_fp4=w1_fp4,
 | 
			
		||||
                w1_blockscale=w1_blockscale,
 | 
			
		||||
                w1_alphas=w1_alphas,
 | 
			
		||||
                a2_gscale=a2_gs,
 | 
			
		||||
                w2_fp4=w2_fp4,
 | 
			
		||||
                w2_blockscale=w2_blockscale,
 | 
			
		||||
                w2_alphas=w2_alphas,
 | 
			
		||||
                topk_weights=topk_weights,
 | 
			
		||||
                topk_ids=topk_ids,
 | 
			
		||||
                m=m,
 | 
			
		||||
                n=n,
 | 
			
		||||
                k=k,
 | 
			
		||||
                e=num_experts,
 | 
			
		||||
                device=device,
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    def run_triton_from_graph(
 | 
			
		||||
        a: torch.Tensor,
 | 
			
		||||
        w1: torch.Tensor,
 | 
			
		||||
        w2: torch.Tensor,
 | 
			
		||||
        topk_weights: torch.Tensor,
 | 
			
		||||
        topk_ids: torch.Tensor,
 | 
			
		||||
        w1_scale: torch.Tensor,
 | 
			
		||||
        w2_scale: torch.Tensor,
 | 
			
		||||
        a_fp8_scale: torch.Tensor,
 | 
			
		||||
    ):
 | 
			
		||||
        with set_current_vllm_config(
 | 
			
		||||
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
 | 
			
		||||
        ):
 | 
			
		||||
            return fused_experts(
 | 
			
		||||
                a,
 | 
			
		||||
                w1,
 | 
			
		||||
                w2,
 | 
			
		||||
                topk_weights,
 | 
			
		||||
                topk_ids,
 | 
			
		||||
                use_fp8_w8a8=True,
 | 
			
		||||
                w1_scale=w1_scale,
 | 
			
		||||
                w2_scale=w2_scale,
 | 
			
		||||
                a1_scale=a_fp8_scale,
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    def replay_graph(graph, num_repeats):
 | 
			
		||||
        for _ in range(num_repeats):
 | 
			
		||||
            graph.replay()
 | 
			
		||||
        torch.cuda.synchronize()
 | 
			
		||||
 | 
			
		||||
    cutlass_stream = torch.cuda.Stream()
 | 
			
		||||
    cutlass_graph = torch.cuda.CUDAGraph()
 | 
			
		||||
    with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
 | 
			
		||||
        run_cutlass_from_graph(
 | 
			
		||||
            a=a,
 | 
			
		||||
            a1_gscale=a1_gs,
 | 
			
		||||
            w1_fp4=w1_fp4,
 | 
			
		||||
            w1_blockscale=w1_blockscale,
 | 
			
		||||
            w1_alphas=w1_gs,
 | 
			
		||||
            a2_gscale=a2_gs,
 | 
			
		||||
            w2_fp4=w2_fp4,
 | 
			
		||||
            w2_blockscale=w2_blockscale,
 | 
			
		||||
            w2_alphas=w2_gs,
 | 
			
		||||
            topk_weights=topk_weights,
 | 
			
		||||
            topk_ids=topk_ids,
 | 
			
		||||
            m=m,
 | 
			
		||||
            n=n,
 | 
			
		||||
            k=k,
 | 
			
		||||
            e=num_experts,
 | 
			
		||||
            device=device,
 | 
			
		||||
        )
 | 
			
		||||
    torch.cuda.synchronize()
 | 
			
		||||
 | 
			
		||||
    triton_stream = torch.cuda.Stream()
 | 
			
		||||
    triton_graph = torch.cuda.CUDAGraph()
 | 
			
		||||
    with torch.cuda.graph(triton_graph, stream=triton_stream):
 | 
			
		||||
        run_triton_from_graph(
 | 
			
		||||
            a,
 | 
			
		||||
            w1_fp8q_notransp,
 | 
			
		||||
            w2_fp8q_notransp,
 | 
			
		||||
            topk_weights,
 | 
			
		||||
            topk_ids,
 | 
			
		||||
            w1_fp8scale,
 | 
			
		||||
            w2_fp8scale,
 | 
			
		||||
            a_fp8_scale,
 | 
			
		||||
        )
 | 
			
		||||
    torch.cuda.synchronize()
 | 
			
		||||
 | 
			
		||||
    min_run_time = 5
 | 
			
		||||
    num_warmup = 5
 | 
			
		||||
    num_runs = 25
 | 
			
		||||
 | 
			
		||||
    globals = {
 | 
			
		||||
        # Baseline params
 | 
			
		||||
        "w1": w1,
 | 
			
		||||
        "w2": w2,
 | 
			
		||||
        "score": score,
 | 
			
		||||
        "topk": topk,
 | 
			
		||||
        "w1_fp8q_notransp": w1_fp8q_notransp,
 | 
			
		||||
        "w2_fp8q_notransp": w2_fp8q_notransp,
 | 
			
		||||
        "w1_fp8scale": w1_fp8scale,
 | 
			
		||||
        "w2_fp8scale": w2_fp8scale,
 | 
			
		||||
        "a_fp8_scale": a_fp8_scale,
 | 
			
		||||
        # Cutlass params
 | 
			
		||||
        "a": a,
 | 
			
		||||
        "a1_gscale": a1_gs,
 | 
			
		||||
        "w1_fp4": w1_fp4,
 | 
			
		||||
        "w1_blockscale": w1_blockscale,
 | 
			
		||||
        "w1_alphas": w1_gs,
 | 
			
		||||
        "a2_gscale": a2_gs,
 | 
			
		||||
        "w2_fp4": w2_fp4,
 | 
			
		||||
        "w2_blockscale": w2_blockscale,
 | 
			
		||||
        "w2_alphas": w2_gs,
 | 
			
		||||
        "topk_weights": topk_weights,
 | 
			
		||||
        "topk_ids": topk_ids,
 | 
			
		||||
        "m": m,
 | 
			
		||||
        "n": n,
 | 
			
		||||
        "k": k,
 | 
			
		||||
        "e": num_experts,
 | 
			
		||||
        "device": device,
 | 
			
		||||
        # cuda graph params
 | 
			
		||||
        "cutlass_graph": cutlass_graph,
 | 
			
		||||
        "triton_graph": triton_graph,
 | 
			
		||||
        # Gen params
 | 
			
		||||
        "num_runs": num_runs,
 | 
			
		||||
        # Kernels
 | 
			
		||||
        "run_triton_moe": run_triton_moe,
 | 
			
		||||
        "run_cutlass_moe_fp4": run_cutlass_moe_fp4,
 | 
			
		||||
        "replay_graph": replay_graph,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    # Warmup
 | 
			
		||||
    run_triton_moe(
 | 
			
		||||
        a,
 | 
			
		||||
        w1_fp8q_notransp,
 | 
			
		||||
        w2_fp8q_notransp,
 | 
			
		||||
        topk_weights,
 | 
			
		||||
        topk_ids,
 | 
			
		||||
        w1_fp8scale,
 | 
			
		||||
        w2_fp8scale,
 | 
			
		||||
        a_fp8_scale,
 | 
			
		||||
        num_warmup,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    results.append(
 | 
			
		||||
        benchmark.Timer(
 | 
			
		||||
            stmt="run_triton_moe(a, w1_fp8q_notransp, w2_fp8q_notransp, topk_weights, topk_ids, w1_fp8scale, w2_fp8scale, a_fp8_scale, num_runs)",  # noqa: E501
 | 
			
		||||
            globals=globals,
 | 
			
		||||
            label=label,
 | 
			
		||||
            sub_label=sub_label,
 | 
			
		||||
            description="triton_moe",
 | 
			
		||||
        ).blocked_autorange(min_run_time=min_run_time)
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    # Warmup
 | 
			
		||||
    replay_graph(triton_graph, num_warmup)
 | 
			
		||||
 | 
			
		||||
    results.append(
 | 
			
		||||
        benchmark.Timer(
 | 
			
		||||
            stmt="replay_graph(triton_graph, num_runs)",
 | 
			
		||||
            globals=globals,
 | 
			
		||||
            label=label,
 | 
			
		||||
            sub_label=sub_label,
 | 
			
		||||
            description="triton_moe_cuda_graphs",
 | 
			
		||||
        ).blocked_autorange(min_run_time=min_run_time)
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    # Warmup
 | 
			
		||||
 | 
			
		||||
    run_cutlass_moe_fp4(
 | 
			
		||||
        a,
 | 
			
		||||
        w1_fp4,
 | 
			
		||||
        w2_fp4,
 | 
			
		||||
        w1_blockscale,
 | 
			
		||||
        w2_blockscale,
 | 
			
		||||
        w1_gs,
 | 
			
		||||
        w2_gs,
 | 
			
		||||
        a1_gs,
 | 
			
		||||
        a2_gs,
 | 
			
		||||
        topk_weights,
 | 
			
		||||
        topk_ids,
 | 
			
		||||
        m,
 | 
			
		||||
        n,
 | 
			
		||||
        k,
 | 
			
		||||
        num_experts,
 | 
			
		||||
        device,
 | 
			
		||||
        num_warmup,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    results.append(
 | 
			
		||||
        benchmark.Timer(
 | 
			
		||||
            stmt="run_cutlass_moe_fp4(a, w1_fp4, w2_fp4, w1_blockscale, w2_blockscale, w1_alphas, w2_alphas, a1_gscale, a2_gscale, topk_weights, topk_ids, m, n, k, e, device, num_runs)",  # noqa: E501
 | 
			
		||||
            globals=globals,
 | 
			
		||||
            label=label,
 | 
			
		||||
            sub_label=sub_label,
 | 
			
		||||
            description="cutlass_moe_fp4",
 | 
			
		||||
        ).blocked_autorange(min_run_time=min_run_time)
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    # Warmup
 | 
			
		||||
    replay_graph(cutlass_graph, num_warmup)
 | 
			
		||||
 | 
			
		||||
    results.append(
 | 
			
		||||
        benchmark.Timer(
 | 
			
		||||
            stmt="replay_graph(cutlass_graph, num_runs)",
 | 
			
		||||
            globals=globals,
 | 
			
		||||
            label=label,
 | 
			
		||||
            sub_label=sub_label,
 | 
			
		||||
            description="cutlass_moe_fp4_cuda_graphs",
 | 
			
		||||
        ).blocked_autorange(min_run_time=min_run_time)
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def main(args):
 | 
			
		||||
    print("Benchmarking models:")
 | 
			
		||||
    for i, model in enumerate(args.models):
 | 
			
		||||
        print(f"[{i}]  {model}")
 | 
			
		||||
 | 
			
		||||
    results: list[benchmark.Measurement] = []
 | 
			
		||||
 | 
			
		||||
    for model in args.models:
 | 
			
		||||
        for tp in args.tp_sizes:
 | 
			
		||||
            for layer in WEIGHT_SHAPES_MOE[model]:
 | 
			
		||||
                num_experts = layer[0]
 | 
			
		||||
                topk = layer[1]
 | 
			
		||||
                size_k = layer[2]
 | 
			
		||||
                size_n = layer[3] // tp
 | 
			
		||||
 | 
			
		||||
                if len(args.limit_k) > 0 and size_k not in args.limit_k:
 | 
			
		||||
                    continue
 | 
			
		||||
 | 
			
		||||
                if len(args.limit_n) > 0 and size_n not in args.limit_n:
 | 
			
		||||
                    continue
 | 
			
		||||
 | 
			
		||||
                for per_act_token in PER_ACT_TOKEN_OPTS:
 | 
			
		||||
                    for per_out_ch in PER_OUT_CH_OPTS:
 | 
			
		||||
                        for size_m in args.batch_sizes:
 | 
			
		||||
                            mkn = (size_m, size_k, size_n)
 | 
			
		||||
                            bench_run(
 | 
			
		||||
                                results,
 | 
			
		||||
                                model,
 | 
			
		||||
                                num_experts,
 | 
			
		||||
                                topk,
 | 
			
		||||
                                per_act_token,
 | 
			
		||||
                                per_out_ch,
 | 
			
		||||
                                mkn,
 | 
			
		||||
                            )
 | 
			
		||||
 | 
			
		||||
    compare = benchmark.Compare(results)
 | 
			
		||||
    compare.print()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
    parser = FlexibleArgumentParser(
 | 
			
		||||
        description="Benchmark NVFP4 CUTLASS MOE across specified models/shapes/batches"
 | 
			
		||||
    )
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--models",
 | 
			
		||||
        nargs="+",
 | 
			
		||||
        type=str,
 | 
			
		||||
        default=DEFAULT_MODELS,
 | 
			
		||||
        choices=WEIGHT_SHAPES_MOE.keys(),
 | 
			
		||||
    )
 | 
			
		||||
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
 | 
			
		||||
    )
 | 
			
		||||
    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
 | 
			
		||||
    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
 | 
			
		||||
    parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[])
 | 
			
		||||
    parser.add_argument("--limit-per-act-token", nargs="+", type=int, default=[])
 | 
			
		||||
    parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[])
 | 
			
		||||
 | 
			
		||||
    args = parser.parse_args()
 | 
			
		||||
    main(args)
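
# Note on the Timer metadata above: benchmark.Compare (called in main) groups
# measurements by label, sub_label, and description, so "triton_moe",
# "triton_moe_cuda_graphs", "cutlass_moe_fp4", and "cutlass_moe_fp4_cuda_graphs"
# appear as separate columns of one table, with one row per model/shape sub_label.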

 416  benchmarks/kernels/benchmark_grouped_gemm_cutlass.py  Normal file
@@ -0,0 +1,416 @@
# SPDX-License-Identifier: Apache-2.0

import torch
import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES_MOE

from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.fused_moe import (
    cutlass_moe_fp8,
    fused_experts,
    fused_topk,
)
from vllm.utils import FlexibleArgumentParser

DEFAULT_MODELS = [
    "nm-testing/Mixtral-8x7B-Instruct-v0.1",
    "nm-testing/deepseekv2-lite",
    "ibm-granite/granite-3.0-1b-a400m",
    "ibm-granite/granite-3.0-3b-a800m",
]
DEFAULT_BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]

PER_ACT_TOKEN_OPTS = [False]
PER_OUT_CH_OPTS = [False]


def to_fp8(tensor: torch.Tensor):
    finfo = torch.finfo(torch.float8_e4m3fn)
    return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to(
        dtype=torch.float8_e4m3fn
    )
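
# to_fp8 is a minimal helper: it clamps values to the float8_e4m3fn representable
# range, rounds, and casts, with no per-tensor scale. It is only meant for
# producing fp8-typed test data; the benchmark below uses ops.scaled_fp8_quant
# wherever a proper quantization scale is needed.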


def bench_run(
    results: list[benchmark.Measurement],
    model: str,
    num_experts: int,
    topk: int,
    per_act_token: bool,
    per_out_ch: bool,
    mkn: tuple[int, int, int],
):
    label = "Quant Matmul"

    sub_label = (
        "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, MKN=({})".format(
            model, num_experts, topk, per_act_token, per_out_ch, mkn
        )
    )

    print(f"Testing: {sub_label}")

    (m, k, n) = mkn

    dtype = torch.half

    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
    w1 = torch.randn((num_experts, 2 * n, k), device="cuda", dtype=dtype) / 10
    w2 = torch.randn((num_experts, k, n), device="cuda", dtype=dtype) / 10

    _, a_scale = ops.scaled_fp8_quant(a)

    w1_q = torch.empty(
        (num_experts, 2 * n, k), device="cuda", dtype=torch.float8_e4m3fn
    )
    w2_q = torch.empty((num_experts, k, n), device="cuda", dtype=torch.float8_e4m3fn)
    w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
    w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)

    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)

    for expert in range(num_experts):
        w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
        w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
    w1_q_notransp = w1_q.clone()
    w2_q_notransp = w2_q.clone()
    w1_q = w1_q.transpose(1, 2)
    w2_q = w2_q.transpose(1, 2)
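
    # Both layouts are kept deliberately: the *_notransp clones above are what the
    # Triton fused_experts path receives, while the transposed w1_q / w2_q (views
    # with the last two dims swapped) are what the CUTLASS grouped-GEMM path
    # (cutlass_moe_fp8) is handed further down.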

    score = torch.randn((m, num_experts), device="cuda", dtype=dtype)

    topk_weights, topk_ids, token_expert_indices = fused_topk(
        a, score, topk, renormalize=False
    )

    def run_triton_moe(
        a: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        a_scale: torch.Tensor,
        num_repeats: int,
    ):
        for _ in range(num_repeats):
            fused_experts(
                a,
                w1,
                w2,
                topk_weights,
                topk_ids,
                use_fp8_w8a8=True,
                w1_scale=w1_scale,
                w2_scale=w2_scale,
                a1_scale=a_scale,
            )

    def run_cutlass_moe(
        a: torch.Tensor,
        a_scale: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        ab_strides1: torch.Tensor,
        c_strides1: torch.Tensor,
        ab_strides2: torch.Tensor,
        c_strides2: torch.Tensor,
        num_repeats: int,
    ):
        for _ in range(num_repeats):
            cutlass_moe_fp8(
                a,
                w1,
                w2,
                w1_scale,
                w2_scale,
                topk_weights,
                topk_ids,
                ab_strides1,
                c_strides1,
                ab_strides2,
                c_strides2,
                a1_scale=a_scale,
            )

    def run_cutlass_from_graph(
        a: torch.Tensor,
        a_scale: torch.Tensor,
        w1_q: torch.Tensor,
        w2_q: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        ab_strides1: torch.Tensor,
        c_strides1: torch.Tensor,
        ab_strides2: torch.Tensor,
        c_strides2: torch.Tensor,
    ):
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
        ):
            return cutlass_moe_fp8(
                a,
                w1_q,
                w2_q,
                w1_scale,
                w2_scale,
                topk_weights,
                topk_ids,
                ab_strides1,
                c_strides1,
                ab_strides2,
                c_strides2,
                a1_scale=a_scale,
            )

    def run_triton_from_graph(
        a: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        a_scale: torch.Tensor,
    ):
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
        ):
            return fused_experts(
                a,
                w1,
                w2,
                topk_weights,
                topk_ids,
                use_fp8_w8a8=True,
                w1_scale=w1_scale,
                w2_scale=w2_scale,
                a1_scale=a_scale,
            )

    def replay_graph(graph, num_repeats):
        for _ in range(num_repeats):
            graph.replay()
        torch.cuda.synchronize()
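
    # CUDA graph setup: each *_from_graph wrapper is run once under
    # torch.cuda.graph(...) capture below, and replay_graph() simply replays the
    # captured graph num_repeats times. The "*_cuda_graphs" timings therefore
    # re-issue the recorded kernel launches with most per-call Python and launch
    # overhead removed.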

    cutlass_stream = torch.cuda.Stream()
    cutlass_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
        run_cutlass_from_graph(
            a,
            a_scale,
            w1_q,
            w2_q,
            w1_scale,
            w2_scale,
            topk_weights,
            topk_ids,
            ab_strides1,
            c_strides1,
            ab_strides2,
            c_strides2,
        )
    torch.cuda.synchronize()

    triton_stream = torch.cuda.Stream()
    triton_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(triton_graph, stream=triton_stream):
        run_triton_from_graph(
            a,
            w1_q_notransp,
            w2_q_notransp,
            topk_weights,
            topk_ids,
            w1_scale,
            w2_scale,
            a_scale,
        )
    torch.cuda.synchronize()

    min_run_time = 5
    num_warmup = 5
    num_runs = 25

    globals = {
        # Baseline params
        "w1": w1,
        "w2": w2,
        "score": score,
        "topk": topk,
        "w1_q_notransp": w1_q_notransp,
        "w2_q_notransp": w2_q_notransp,
        # Cutlass params
        "a_scale": a_scale,
        "w1_q": w1_q,
        "w2_q": w2_q,
        "w1_scale": w1_scale,
        "w2_scale": w2_scale,
        "ab_strides1": ab_strides1,
        "c_strides1": c_strides1,
        "ab_strides2": ab_strides2,
        "c_strides2": c_strides2,
        # cuda graph params
        "cutlass_graph": cutlass_graph,
        "triton_graph": triton_graph,
        # Gen params
        "a": a,
        "topk_weights": topk_weights,
        "topk_ids": topk_ids,
        "num_runs": num_runs,
        # Kernels
        "run_triton_moe": run_triton_moe,
        "run_cutlass_moe": run_cutlass_moe,
        "replay_graph": replay_graph,
    }
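
    # benchmark.Timer evaluates its stmt string with this dict as the globals
    # namespace, so every tensor, graph, and helper referenced by the stmt strings
    # below must be listed here. blocked_autorange(min_run_time=min_run_time) then
    # re-runs each stmt in blocks until roughly min_run_time (5 s) of measurements
    # have been collected.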

    # Warmup
    run_triton_moe(
        a,
        w1_q_notransp,
        w2_q_notransp,
        topk_weights,
        topk_ids,
        w1_scale,
        w2_scale,
        a_scale,
        num_warmup,
    )

    results.append(
        benchmark.Timer(
            stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="triton_moe",
        ).blocked_autorange(min_run_time=min_run_time)
    )

    # Warmup
    replay_graph(triton_graph, num_warmup)

    results.append(
        benchmark.Timer(
            stmt="replay_graph(triton_graph, num_runs)",
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="triton_moe_cuda_graphs",
        ).blocked_autorange(min_run_time=min_run_time)
    )

    # Warmup
    run_cutlass_moe(
        a,
        a_scale,
        w1_q,
        w2_q,
        w1_scale,
        w2_scale,
        topk_weights,
        topk_ids,
        ab_strides1,
        c_strides1,
        ab_strides2,
        c_strides2,
        num_warmup,
    )

    results.append(
        benchmark.Timer(
            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="grouped_gemm_moe",
        ).blocked_autorange(min_run_time=min_run_time)
    )

    # Warmup
    replay_graph(cutlass_graph, num_warmup)

    results.append(
        benchmark.Timer(
            stmt="replay_graph(cutlass_graph, num_runs)",
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="grouped_gemm_moe_cuda_graphs",
        ).blocked_autorange(min_run_time=min_run_time)
    )


def main(args):
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}]  {model}")

    results: list[benchmark.Measurement] = []

    for model in args.models:
        for tp in args.tp_sizes:
            for layer in WEIGHT_SHAPES_MOE[model]:
                num_experts = layer[0]
                topk = layer[1]
                size_k = layer[2]
                size_n = layer[3] // tp

                if len(args.limit_k) > 0 and size_k not in args.limit_k:
                    continue

                if len(args.limit_n) > 0 and size_n not in args.limit_n:
                    continue

                for per_act_token in PER_ACT_TOKEN_OPTS:
                    for per_out_ch in PER_OUT_CH_OPTS:
                        # --batch-sizes defaults to DEFAULT_BATCH_SIZES via argparse
                        for size_m in args.batch_sizes:
                            mkn = (size_m, size_k, size_n)
                            bench_run(
                                results,
                                model,
                                num_experts,
                                topk,
                                per_act_token,
                                per_out_ch,
                                mkn,
                            )

    compare = benchmark.Compare(results)
    compare.print()


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="Benchmark grouped GEMM CUTLASS MOE across specified models/shapes/batches"
    )
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES_MOE.keys(),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
    parser.add_argument(
        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
    )
    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
    parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[])
    parser.add_argument("--limit-per-act-token", nargs="+", type=int, default=[])
    parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[])

    args = parser.parse_args()
    main(args)
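
# Example invocation (illustrative; any WEIGHT_SHAPES_MOE model key and batch
# sizes can be substituted):
#   python benchmarks/kernels/benchmark_grouped_gemm_cutlass.py \
#       --models nm-testing/Mixtral-8x7B-Instruct-v0.1 \
#       --tp-sizes 1 --batch-sizes 1 16 64 256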