Initial stubs for P/D scheduling changes

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-10-25 18:17:48 +08:00 · 2025-04-18 16:42:49 -04:00
2683 changed files with 129759 additions and 328333 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@ -1,20 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 import sys
 import zipfile
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 800 MiB quota, please use it wisely.
+# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/6326 .
+# See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
 def print_top_10_largest_files(zip_file):
    """Print the top 10 largest files in the given zip file."""
-    with zipfile.ZipFile(zip_file, "r") as z:
+    with zipfile.ZipFile(zip_file, 'r') as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:
@ -29,18 +28,14 @@ def check_wheel_size(directory):
                wheel_path = os.path.join(root, file_name)
                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
                if wheel_size_mb > VLLM_MAX_SIZE_MB:
-                    print(
+                    print(f"Not allowed: Wheel {wheel_path} is larger "
-                        f"Not allowed: Wheel {wheel_path} is larger "
+                          f"({wheel_size_mb:.2f} MB) than the limit "
-                        f"({wheel_size_mb:.2f} MB) than the limit "
+                          f"({VLLM_MAX_SIZE_MB} MB).")
                        f"({VLLM_MAX_SIZE_MB} MB)."
                    )
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
-                    print(
+                    print(f"Wheel {wheel_path} is within the allowed size "
-                        f"Wheel {wheel_path} is within the allowed size "
+                          f"({wheel_size_mb:.2f} MB).")
                        f"({wheel_size_mb:.2f} MB)."
                    )
    return 0
@ -50,4 +45,4 @@ if __name__ == "__main__":
        sys.exit(1)
    directory = sys.argv[1]
-    sys.exit(check_wheel_size(directory))
+    sys.exit(check_wheel_size(directory))
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import os
@ -8,8 +7,7 @@ template = """<!DOCTYPE html>
 <html>
    <body>
    <h1>Links for vLLM</h1/>
-        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
+        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
    </body>
 </html>
 """
@ -22,25 +20,7 @@ filename = os.path.basename(args.wheel)
 with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
    if "x86_64" in filename:
        x86_wheel = filename
        arm_wheel = filename.replace("x86_64", "aarch64").replace(
            "manylinux1", "manylinux2014"
        )
    elif "aarch64" in filename:
        x86_wheel = filename.replace("aarch64", "x86_64").replace(
            "manylinux2014", "manylinux1"
        )
        arm_wheel = filename
    else:
        raise ValueError(f"Unsupported wheel: {filename}")
    # cloudfront requires escaping the '+' character
    f.write(
-        template.format(
+        template.format(wheel=filename,
-            x86_wheel=x86_wheel,
+                        wheel_html_escaped=filename.replace("+", "%2B")))
            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
            arm_wheel=arm_wheel,
            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
        )
    )
--- a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
+++ b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
 model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
@ -1,4 +1,3 @@
 # For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
 model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
@ -1,4 +1,3 @@
 # For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
@ -1,11 +1,11 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
-model_name: "Qwen/Qwen2.5-1.5B-Instruct"
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
-    value: 0.54
+    value: 0.419
  - name: "exact_match,flexible-extract"
-    value: 0.59
+    value: 0.416
-limit: 1319
+limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml
@ -1,11 +0,0 @@
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
 model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.335
  - name: "exact_match,flexible-extract"
    value: 0.323
 limit: 1319
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
 model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
 model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
 model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
 model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml
@ -1,12 +1,11 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
 model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
-    value: 0.30
+    value: 0.31
  - name: "exact_match,flexible-extract"
-    value: 0.465
+    value: 0.47
 limit: 1319
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
 model_name: "Qwen/Qwen2-57B-A14B-Instruct"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@ -1,11 +0,0 @@
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.47
  - name: "exact_match,flexible-extract"
    value: 0.64
 limit: 1319
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
+++ b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
 model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@ -1,6 +1,10 @@
-Qwen2.5-1.5B-Instruct.yaml
+Meta-Llama-3-8B-Instruct.yaml
 Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
-Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
 Qwen1.5-MoE-W4A16-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-FP8W8.yaml
 Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/conftest.py
+++ b/.buildkite/lm-eval-harness/conftest.py
@ -1,44 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
 import pytest
 def pytest_addoption(parser):
    parser.addoption(
        "--config-list-file",
        action="store",
        help="Path to the file listing model config YAMLs (one per line)",
    )
    parser.addoption(
        "--tp-size",
        action="store",
        default="1",
        help="Tensor parallel size to use for evaluation",
    )
@pytest.fixture(scope="session")
 def config_list_file(pytestconfig, config_dir):
    rel_path = pytestconfig.getoption("--config-list-file")
    return config_dir / rel_path
@pytest.fixture(scope="session")
 def tp_size(pytestconfig):
    return pytestconfig.getoption("--tp-size")
 def pytest_generate_tests(metafunc):
    if "config_filename" in metafunc.fixturenames:
        rel_path = metafunc.config.getoption("--config-list-file")
        config_list_file = Path(rel_path).resolve()
        config_dir = config_list_file.parent
        with open(config_list_file, encoding="utf-8") as f:
            configs = [
                config_dir / line.strip()
                for line in f
                if line.strip() and not line.startswith("#")
            ]
        metafunc.parametrize("config_filename", configs)
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install lm-eval==0.4.4
 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install lm-eval==0.4.4
 usage() {
    echo``
@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done
 lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
  --batch_size "$BATCH_SIZE"
--- a/.buildkite/lm-eval-harness/run-tests.sh
+++ b/.buildkite/lm-eval-harness/run-tests.sh
@ -0,0 +1,59 @@
 #!/bin/bash
 usage() {
    echo``
    echo "Runs lm eval harness on GSM8k using vllm and compares to "
    echo "precomputed baseline (measured by HF transformers.)"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
    echo "  -t    - tensor parallel size"
    echo
 }
 SUCCESS=0
 while getopts "c:t:" OPT; do
  case ${OPT} in
    c ) 
        CONFIG="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
 done
 # Parse list of configs.
 IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
 for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
 do
    LOCAL_SUCCESS=0
    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
    export LM_EVAL_TP_SIZE=$TP_SIZE
    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
    if [[ $LOCAL_SUCCESS == 0 ]]; then
        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
    else
        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
    fi
    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
 done
 if [ "${SUCCESS}" -eq "0" ]; then
    exit 0
 else
    exit 1
 fi
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -1,57 +1,69 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
-pytest -s -v test_lm_eval_correctness.py \
+* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
-    --config-list-file=configs/models-small.txt \
+* export LM_EVAL_TP_SIZE=4 
-    --tp-size=1
+* pytest -s test_lm_eval_correctness.py
 """
 import os
 from pathlib import Path
 import lm_eval
-import numpy as np
+import numpy
 import pytest
 import yaml
-RTOL = 0.08
+RTOL = 0.05
 TEST_DATA_FILE = os.environ.get(
    "LM_EVAL_TEST_DATA_FILE",
    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
 TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
-def launch_lm_eval(eval_config, tp_size):
+def launch_lm_eval(eval_config):
-    trust_remote_code = eval_config.get("trust_remote_code", False)
+    trust_remote_code = eval_config.get('trust_remote_code', False)
-    max_model_len = eval_config.get("max_model_len", 4096)
+
-    model_args = (
+    model_args = f"pretrained={eval_config['model_name']}," \
-        f"pretrained={eval_config['model_name']},"
+                 f"tensor_parallel_size={TP_SIZE}," \
-        f"tensor_parallel_size={tp_size},"
+                 f"add_bos_token=true," \
-        f"enforce_eager=true,"
+                 f"trust_remote_code={trust_remote_code}"
-        f"add_bos_token=true,"
+
        f"trust_remote_code={trust_remote_code},"
        f"max_model_len={max_model_len}"
    )
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
-        batch_size="auto",
+        batch_size="auto")
-    )
+
    return results
-def test_lm_eval_correctness_param(config_filename, tp_size):
+def test_lm_eval_correctness():
-    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
+    eval_config = yaml.safe_load(
        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
-    results = launch_lm_eval(eval_config, tp_size)
+    if eval_config[
            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
        pytest.skip("FBGEMM is currently failing on main.")
    # Launch eval requests.
    results = launch_lm_eval(eval_config)
    # Confirm scores match ground truth.
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
-            print(
+            print(f'{task["name"]} | {metric["name"]}: '
-                f"{task['name']} | {metric['name']}: "
+                  f'ground_truth={ground_truth} | measured={measured_value}')
-                f"ground_truth={ground_truth} | measured={measured_value}"
+            success = success and numpy.isclose(
-            )
+                ground_truth, measured_value, rtol=RTOL)
            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
    # Assert at the end, print all scores even on failure for debugging.
    assert success
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@ -7,11 +7,11 @@ This directory contains two sets of benchmark for vllm.
 - Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
 - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
-See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
 ## Performance benchmark quick overview
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.
 **Benchmarking Duration**: about 1hr.
@ -28,34 +28,16 @@ See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName
 ## Trigger the benchmark
 Performance benchmark will be triggered when:
 - A PR being merged into vllm.
 - Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
 Manually Trigger the benchmark
 ```bash
 bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
 ```
 Runtime environment variables:
 - `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
 - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
 - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
 - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
 - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
 - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
 Nightly benchmark will be triggered when:
 - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
 ## Performance benchmark details
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
-> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+
 >
 ### Latency test
 Here is an example of one test inside `latency-tests.json`:
@ -78,7 +60,7 @@ Here is an example of one test inside `latency-tests.json`:
 In this example:
 - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
@ -86,13 +68,13 @@ WARNING: The benchmarking script will save json results by itself, so please do
 ### Throughput test
-The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`.
+The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
 The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
 ### Serving test
-We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
+We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
 ```json
 [
@ -104,6 +86,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
@ -121,8 +104,8 @@ Inside this example:
 - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
 - The `server-parameters` includes the command line arguments for vLLM server.
- The `client-parameters` includes the command line arguments for `vllm bench serve`.
+- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`
+- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
 The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
@ -130,29 +113,12 @@ WARNING: The benchmarking script will save json results by itself, so please do
 ### Visualizing the results
-The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
 The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
 The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
 When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
 `compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.  
 If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
 Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
 `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
 |   | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps  | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        |
 |----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
 | 0  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982                             | 156.526018                             | 1.097396 |
 | 1  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334                             | 294.018783                             | 1.216863 |
 A comparison diagram will be generated below the table.
 Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
 <img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
 ## Nightly test details
 See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
@ -160,9 +126,9 @@ See [nightly-descriptions.md](nightly-descriptions.md) for the detailed descript
 ### Workflow
 - The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
+- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
+- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
+- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
 ### Nightly tests
@ -172,6 +138,6 @@ In [nightly-tests.json](tests/nightly-tests.json), we include the command line a
 The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
-WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
+WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
 WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
--- a/.buildkite/nightly-benchmarks/nightly-annotation.md
+++ b/.buildkite/nightly-benchmarks/nightly-annotation.md
@ -1,4 +1,3 @@
 # Nightly benchmark annotation
 ## Description
@ -14,15 +13,15 @@ Please download the visualization scripts in the post
 - Find the docker we use in `benchmarking pipeline`
 - Deploy the docker, and inside the docker:
-    - Download `nightly-benchmarks.zip`.
+  - Download `nightly-benchmarks.zip`.
-    - In the same folder, run the following code:
+  - In the same folder, run the following code:
-    ```bash
+  ```console
-    export HF_TOKEN=<your HF token>
+  export HF_TOKEN=<your HF token>
-    apt update
+  apt update
-    apt install -y git
+  apt install -y git
-    unzip nightly-benchmarks.zip
+  unzip nightly-benchmarks.zip
-    VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+  VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-    ```
+  ```
 And the results will be inside `./benchmarks/results`.
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@ -13,25 +13,25 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
 ## Setup
 - Docker images:
-    - vLLM: `vllm/vllm-openai:v0.6.2`
+  - vLLM: `vllm/vllm-openai:v0.6.2`
-    - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
+  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
-    - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
+  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
-    - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
+  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
-        - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
+    - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
-    - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
+  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
 - Hardware
-    - 8x Nvidia A100 GPUs
+  - 8x Nvidia A100 GPUs
 - Workload:
-    - Dataset
+  - Dataset
-        - ShareGPT dataset
+    - ShareGPT dataset
-        - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
+    - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
-        - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
+    - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
-        - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
+    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
-    - Models: llama-3 8B, llama-3 70B.
+  - Models: llama-3 8B, llama-3 70B.
-        - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
+    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
-    - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
+  - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
-        - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
+    - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
-    - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
+  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
 ## Known issues
--- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
@ -1,12 +1,10 @@
 # Performance benchmarks descriptions
 ## Latency tests
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 {latency_tests_markdown_table}
@ -16,8 +14,7 @@
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput.
 {throughput_tests_markdown_table}
@ -28,18 +25,12 @@
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
+- We also added a speculative decoding test for llama-3 70B, under QPS 2
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
 - For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.
 {serving_tests_markdown_table}
 ## Platform Information
 {platform_markdown_table}
 ## json version of the benchmarking tables
 This section contains the data of the markdown tables above in JSON format.
--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@ -1,307 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import json
 import os
 from importlib import util
 import pandas as pd
 plotly_found = util.find_spec("plotly.express") is not None
 def compare_data_columns(
    files, name_column, data_column, info_cols, drop_column, debug=False
 ):
    """
    Align concatenation by keys derived from info_cols instead of row order.
    - Pick one canonical key list: subset of info_cols present in ALL files.
    - For each file: set index to those keys, aggregate duplicates
    - (mean for metric, first for names).
    - Concat along axis=1 (indexes align), then reset_index so callers can
    - group by columns.
    - If --debug, add a <file_label>_name column per file.
    """
    print("\ncompare_data_column:", data_column)
    frames = []
    raw_data_cols = []
    compare_frames = []
    # 1) choose a canonical key list from info_cols that exists in ALL files
    cols_per_file = []
    for f in files:
        try:
            df_tmp = pd.read_json(f, orient="records")
        except Exception as err:
            raise ValueError(f"Failed to read {f}") from err
        cols_per_file.append(set(df_tmp.columns))
    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
    if not key_cols:
        # soft fallback: use any info_cols present in the first file
        key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
    if not key_cols:
        raise ValueError(
            "No common key columns found from info_cols across the input files."
        )
    # 2) build a single "meta" block (keys as columns) once, aligned by the key index
    meta_added = False
    for file in files:
        df = pd.read_json(file, orient="records")
        # Keep rows that actually have the compared metric (same as original behavior)
        if drop_column in df.columns:
            df = df.dropna(subset=[drop_column], ignore_index=True)
        # Stabilize numeric key columns (harmless if missing)
        for c in (
            "Input Len",
            "Output Len",
            "TP Size",
            "PP Size",
            "# of max concurrency.",
            "qps",
        ):
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce")
        # Ensure all key columns exist
        for c in key_cols:
            if c not in df.columns:
                df[c] = pd.NA
        # Set index = key_cols and aggregate duplicates → unique MultiIndex
        df_idx = df.set_index(key_cols, drop=False)
        # meta (key columns), unique per key
        meta = df_idx[key_cols]
        if not meta.index.is_unique:
            meta = meta.groupby(level=key_cols, dropna=False).first()
        # metric series for this file, aggregated to one row per key
        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
        s = df_idx[data_column]
        if not s.index.is_unique:
            s = s.groupby(level=key_cols, dropna=False).mean()
        s.name = file_label  # column label like original
        # add meta once (from first file) so keys are the leftmost columns
        if not meta_added:
            frames.append(meta)
            meta_added = True
        # (NEW) debug: aligned test-name column per file
        if debug and name_column in df_idx.columns:
            name_s = df_idx[name_column]
            if not name_s.index.is_unique:
                name_s = name_s.groupby(level=key_cols, dropna=False).first()
            name_s.name = f"{file_label}_name"
            frames.append(name_s)
        frames.append(s)
        raw_data_cols.append(file_label)
        compare_frames.append(s)
        # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
        if len(compare_frames) >= 2:
            base = compare_frames[0]
            current = compare_frames[-1]
            ratio = current / base
            ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
            frames.append(ratio)
    # 4) concat on columns with aligned MultiIndex;
    # then reset_index to return keys as columns
    concat_df = pd.concat(frames, axis=1)
    concat_df = concat_df.reset_index(drop=True).reset_index()
    if "index" in concat_df.columns:
        concat_df = concat_df.drop(columns=["index"])
    # Ensure key/info columns appear first (in your info_cols order)
    front = [c for c in info_cols if c in concat_df.columns]
    rest = [c for c in concat_df.columns if c not in front]
    concat_df = concat_df[front + rest]
    print(raw_data_cols)
    return concat_df, raw_data_cols
 def split_json_by_tp_pp(
    input_file: str = "benchmark_results.json", output_root: str = "."
 ) -> list[str]:
    """
    Split a benchmark JSON into separate folders by (TP Size, PP Size).
    Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
    Returns: list of file paths written.
    """
    # Load JSON data into DataFrame
    with open(input_file, encoding="utf-8") as f:
        data = json.load(f)
    # If the JSON is a dict with a list under common keys, use that list
    if isinstance(data, dict):
        for key in ("results", "serving_results", "benchmarks", "data"):
            if isinstance(data.get(key), list):
                data = data[key]
                break
    df = pd.DataFrame(data)
    # Keep only "serving" tests
    name_col = next(
        (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
    )
    if name_col:
        df = df[
            df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
        ].copy()
    # Handle alias column names
    rename_map = {
        "tp_size": "TP Size",
        "tensor_parallel_size": "TP Size",
        "pp_size": "PP Size",
        "pipeline_parallel_size": "PP Size",
    }
    df.rename(
        columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
    )
    # Ensure TP/PP columns exist (default to 1 if missing)
    if "TP Size" not in df.columns:
        df["TP Size"] = 1
    if "PP Size" not in df.columns:
        df["PP Size"] = 1
    # make sure TP/PP are numeric ints with no NaN
    df["TP Size"] = (
        pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
    )
    df["PP Size"] = (
        pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
    )
    # Split into separate folders
    saved_paths: list[str] = []
    for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
        folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
        os.makedirs(folder_name, exist_ok=True)
        filepath = os.path.join(folder_name, "benchmark_results.json")
        group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
        print(f"Saved: {filepath}")
        saved_paths.append(filepath)
    return saved_paths
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file", action="append", type=str, help="input file name"
    )
    parser.add_argument(
        "--debug", action="store_true", help="show all information for debugging"
    )
    parser.add_argument(
        "--plot",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="plot perf diagrams or not --no-plot --plot",
    )
    parser.add_argument(
        "-x",
        "--xaxis",
        type=str,
        default="# of max concurrency.",
        help="column name to use as X Axis in comparison graph",
    )
    args = parser.parse_args()
    drop_column = "P99"
    name_column = "Test name"
    info_cols = [
        "Model",
        "Dataset Name",
        "Input Len",
        "Output Len",
        "TP Size",
        "PP Size",
        "# of max concurrency.",
        "qps",
    ]
    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
    html_msgs_for_data_cols = [
        "Compare Output Tokens /n",
        "Median TTFT /n",
        "Median TPOT /n",
    ]
    if len(args.file) == 1:
        files = split_json_by_tp_pp(args.file[0], output_root="splits")
        info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
    else:
        files = args.file
    print("comparing : " + ", ".join(files))
    debug = args.debug
    plot = args.plot
    # For Plot feature, assign y axis from one of info_cols
    y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
    with open("perf_comparison.html", "w") as text_file:
        for i in range(len(data_cols_to_compare)):
            output_df, raw_data_cols = compare_data_columns(
                files,
                name_column,
                data_cols_to_compare[i],
                info_cols,
                drop_column,
                debug=debug,
            )
            # For Plot feature, insert y axis from one of info_cols
            raw_data_cols.insert(0, info_cols[y_axis_index])
            filtered_info_cols = info_cols[:-2]
            existing_group_cols = [
                c for c in filtered_info_cols if c in output_df.columns
            ]
            if not existing_group_cols:
                raise ValueError(
                    f"No valid group-by columns  "
                    f"Expected subset: {filtered_info_cols}, "
                    f"but DataFrame has: {list(output_df.columns)}"
                )
            output_df_sorted = output_df.sort_values(by=existing_group_cols)
            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
            for name, group in output_groups:
                html = group.to_html()
                text_file.write(html_msgs_for_data_cols[i])
                text_file.write(html)
                if plot and plotly_found:
                    import plotly.express as px
                    df = group[raw_data_cols]
                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
                    # Melt DataFrame for plotting
                    df_melted = df_sorted.melt(
                        id_vars=info_cols[y_axis_index],
                        var_name="Configuration",
                        value_name=data_cols_to_compare[i],
                    )
                    title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
                    # Create Plotly line chart
                    fig = px.line(
                        df_melted,
                        x=info_cols[y_axis_index],
                        y=data_cols_to_compare[i],
                        color="Configuration",
                        title=title,
                        markers=True,
                    )
                    # Export to HTML
                    text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -1,19 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import json
 import os
 import shlex
 from importlib import util
 from pathlib import Path
 from typing import Any
 import pandas as pd
 import psutil
 import regex as re
 from tabulate import tabulate
 results_folder = Path("results/")
 # latency results and the keys that will be printed into markdown
 latency_results = []
 latency_column_mapping = {
@ -33,39 +28,28 @@ throughput_results = []
 throughput_results_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
-    "num_requests": "# of req.",
+    # "num_requests": "# of req.",
-    "total_num_tokens": "Total # of tokens",
+    # "total_num_tokens": "Total # of tokens",
-    "elapsed_time": "Elapsed time (s)",
+    # "elapsed_time": "Elapsed time (s)",
    "requests_per_second": "Tput (req/s)",
-    "tokens_per_second": "Tput (tok/s)",
+    # "tokens_per_second": "Tput (tok/s)",
 }
 # serving results and the keys that will be printed into markdown
 serving_results = []
 serving_column_mapping = {
    "test_name": "Test name",
    "model_id": "Model",
    "dataset_name": "Dataset Name",
    "input_len": "Input Len",
    "output_len": "Output Len",
    "tp_size": "TP Size",
    "pp_size": "PP Size",
    "dtype": "dtype",
    "gpu_type": "GPU",
-    "completed": "# of req.",
+    # "completed": "# of req.",
    "qps": "qps",
    "max_concurrency": "# of max concurrency.",
    "request_throughput": "Tput (req/s)",
-    "total_token_throughput": "Total Token Tput (tok/s)",
+    # "input_throughput": "Input Tput (tok/s)",
-    "output_throughput": "Output Tput (tok/s)",
+    # "output_throughput": "Output Tput (tok/s)",
    # "total_input_tokens": "Total input tokens",
    # "total_output_tokens": "Total output tokens",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
-    "mean_tpot_ms": "Mean TPOT (ms)",
+    # "mean_tpot_ms": "Mean TPOT (ms)",
-    "median_tpot_ms": "Median",
+    # "median_tpot_ms": "Median",
-    "p99_tpot_ms": "P99",
+    # "p99_tpot_ms": "P99",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",
@ -81,134 +65,24 @@ def read_markdown(file):
 def results_to_json(latency, throughput, serving):
-    return json.dumps(
+    return json.dumps({
-        {
+        'latency': latency.to_dict(),
-            "latency": latency.to_dict(),
+        'throughput': throughput.to_dict(),
-            "throughput": throughput.to_dict(),
+        'serving': serving.to_dict()
-            "serving": serving.to_dict(),
+    })
        }
    )
 def get_size_with_unit(bytes, suffix="B"):
    """
    Scale bytes to its proper format
    e.g:
        1253656 => '1.20MB'
        1253656678 => '1.17GB'
    """
    factor = 1024
    for unit in ["", "K", "M", "G", "T", "P"]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor
 def _coerce(val: str) -> Any:
    """Best-effort type coercion from string to Python types."""
    low = val.lower()
    if low == "null":
        return None
    if low == "true":
        return True
    if low == "false":
        return False
    # integers
    if re.fullmatch(r"[+-]?\d+", val):
        try:
            return int(val)
        except ValueError:
            pass
    # floats (keep 'inf'/'-inf'/'nan' as strings)
    if re.fullmatch(r"[+-]?\d*\.\d+", val):
        try:
            return float(val)
        except ValueError:
            pass
    return val
 def parse_client_command(cmd: str) -> dict[str, Any]:
    """Parse the client_command shell string into {executable, script, args}."""
    toks = shlex.split(cmd)
    if len(toks) < 2:
        raise ValueError("client_command must include an executable and a script")
    executable, script = toks[0], toks[1]
    args: dict[str, Any] = {}
    i = 2
    while i < len(toks):
        t = toks[i]
        if t.startswith("--"):
            # --key=value or --key (value) or boolean flag
            if "=" in t:
                key, val = t.split("=", 1)
                if key == "--metadata":
                    md = {}
                    if val:
                        if "=" in val:
                            k, v = val.split("=", 1)
                            md[k] = _coerce(v)
                        else:
                            md[val] = True
                    args[key] = md
                else:
                    args[key] = _coerce(val)
                i += 1
                continue
            key = t
            # Special: consume metadata k=v pairs until next --flag
            if key == "--metadata":
                i += 1
                md = {}
                while i < len(toks) and not toks[i].startswith("--"):
                    pair = toks[i]
                    if "=" in pair:
                        k, v = pair.split("=", 1)
                        md[k] = _coerce(v)
                    else:
                        md[pair] = True
                    i += 1
                args[key] = md
                continue
            # Standard: check if next token is a value (not a flag)
            if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
                args[key] = _coerce(toks[i + 1])
                i += 2
            else:
                # lone flag -> True
                args[key] = True
                i += 1
        else:
            # unexpected positional; skip
            i += 1
    return {"executable": executable, "script": script, "args": args}
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+
    parser.add_argument(
        "-r",
        "--result",
        type=str,
        default="results",
        help="Folder name for benchmark output results.",
    )
    args = parser.parse_args()
    results_folder = Path(args.result)
    if not results_folder.exists():
        raise FileNotFoundError(f"results folder does not exist: {results_folder}")
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())
        if "serving" in str(test_file):
-            # this result is generated via `vllm bench serve` command
+            # this result is generated via `benchmark_serving.py`
            # attach the benchmarking command to raw_result
            try:
                with open(test_file.with_suffix(".commands")) as f:
@ -216,50 +90,18 @@ if __name__ == "__main__":
            except OSError as e:
                print(e)
                continue
            # Parse Server Command Arg
            out: dict[str, Any] = {
                "server_command": parse_client_command(command["server_command"])
            }
            parse_args = [
                "--tensor-parallel-size",
                "--pipeline-parallel-size",
                "--dtype",
            ]
            col_mapping = ["tp_size", "pp_size", "dtype"]
            for index, arg in enumerate(parse_args):
                if arg in out["server_command"]["args"]:
                    raw_result.update(
                        {col_mapping[index]: out["server_command"]["args"][arg]}
                    )
            # Parse Client Command Arg
            out: dict[str, Any] = {
                "client_command": parse_client_command(command["client_command"])
            }
            parse_args = [
                "--dataset-name",
                "--random-input-len",
                "--random-output-len",
                "--request-rate",
            ]
            col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
            for index, arg in enumerate(parse_args):
                if arg in out["client_command"]["args"]:
                    raw_result.update(
                        {col_mapping[index]: out["client_command"]["args"][arg]}
                    )
            # Add Server, Client command
            raw_result.update(command)
            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})
            # add the result to raw_result
            serving_results.append(raw_result)
            continue
        elif "latency" in f.name:
-            # this result is generated via `vllm bench latency` command
+            # this result is generated via `benchmark_latency.py`
            # attach the benchmarking command to raw_result
            try:
@ -278,8 +120,7 @@ if __name__ == "__main__":
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply 1000 to convert the time unit from s to ms
                raw_result.update(
-                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
+                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
                )
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
            # add the result to raw_result
@ -287,7 +128,7 @@ if __name__ == "__main__":
            continue
        elif "throughput" in f.name:
-            # this result is generated via `vllm bench throughput` command
+            # this result is generated via `benchmark_throughput.py`
            # attach the benchmarking command to raw_result
            try:
@ -312,51 +153,26 @@ if __name__ == "__main__":
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)
-    svmem = psutil.virtual_memory()
+    raw_results_json = results_to_json(latency_results, throughput_results,
-    platform_data = {
+                                       serving_results)
        "Physical cores": [psutil.cpu_count(logical=False)],
        "Total cores": [psutil.cpu_count(logical=True)],
        "Total Memory": [get_size_with_unit(svmem.total)],
    }
    if util.find_spec("numa") is not None:
        from numa import info
        platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()]
    if util.find_spec("cpuinfo") is not None:
        from cpuinfo import get_cpu_info
        platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]]
    platform_results = pd.DataFrame.from_dict(
        platform_data, orient="index", columns=["Platform Info"]
    )
    raw_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )
    # remapping the key, for visualization purpose
    if not latency_results.empty:
-        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
+        latency_results = latency_results[list(
-            columns=latency_column_mapping
+            latency_column_mapping.keys())].rename(
-        )
+                columns=latency_column_mapping)
    if not serving_results.empty:
-        valid_columns = [
+        serving_results = serving_results[list(
-            col for col in serving_column_mapping if col in serving_results.columns
+            serving_column_mapping.keys())].rename(
-        ]
+                columns=serving_column_mapping)
        serving_results = serving_results[valid_columns].rename(
            columns=serving_column_mapping
        )
    if not throughput_results.empty:
-        throughput_results = throughput_results[
+        throughput_results = throughput_results[list(
-            list(throughput_results_column_mapping.keys())
+            throughput_results_column_mapping.keys())].rename(
-        ].rename(columns=throughput_results_column_mapping)
+                columns=throughput_results_column_mapping)
-    processed_results_json = results_to_json(
+    processed_results_json = results_to_json(latency_results,
-        latency_results, throughput_results, serving_results
+                                             throughput_results,
-    )
+                                             serving_results)
    for df in [latency_results, serving_results, throughput_results]:
        if df.empty:
@ -368,45 +184,38 @@ if __name__ == "__main__":
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
        )
    # get markdown tables
-    latency_md_table = tabulate(
+    latency_md_table = tabulate(latency_results,
-        latency_results, headers="keys", tablefmt="pipe", showindex=False
+                                headers='keys',
-    )
+                                tablefmt='pipe',
-    serving_md_table = tabulate(
+                                showindex=False)
-        serving_results, headers="keys", tablefmt="pipe", showindex=False
+    serving_md_table = tabulate(serving_results,
-    )
+                                headers='keys',
-    throughput_md_table = tabulate(
+                                tablefmt='pipe',
-        throughput_results, headers="keys", tablefmt="pipe", showindex=False
+                                showindex=False)
-    )
+    throughput_md_table = tabulate(throughput_results,
-    platform_md_table = tabulate(
+                                   headers='keys',
-        platform_results, headers="keys", tablefmt="pipe", showindex=True
+                                   tablefmt='pipe',
-    )
+                                   showindex=False)
    # document the result
-    md_file = "benchmark_results.md"
+    with open(results_folder / "benchmark_results.md", "w") as f:
-    json_file = "benchmark_results.json"
+
-    with open(results_folder / md_file, "w") as f:
+        results = read_markdown("../.buildkite/nightly-benchmarks/" +
-        results = read_markdown(
+                                "performance-benchmarks-descriptions.md")
            "../.buildkite/nightly-benchmarks/"
            + "performance-benchmarks-descriptions.md"
        )
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
-            platform_markdown_table=platform_md_table,
+            benchmarking_results_in_json_string=processed_results_json)
            benchmarking_results_in_json_string=processed_results_json,
        )
        f.write(results)
    # document benchmarking results in json
-    with open(results_folder / json_file, "w") as f:
+    with open(results_folder / "benchmark_results.json", "w") as f:
-        results = (
+
-            latency_results.to_dict(orient="records")
+        results = latency_results.to_dict(
-            + throughput_results.to_dict(orient="records")
+            orient='records') + throughput_results.to_dict(
-            + serving_results.to_dict(orient="records")
+                orient='records') + serving_results.to_dict(orient='records')
        )
        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
@ -15,12 +14,15 @@ def main(model, cachedir):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
-        description="Download and save Hugging Face tokenizer"
+        description="Download and save Hugging Face tokenizer")
-    )
+    parser.add_argument("--model",
-    parser.add_argument("--model", type=str, required=True, help="Name of the model")
+                        type=str,
-    parser.add_argument(
+                        required=True,
-        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
+                        help="Name of the model")
-    )
+    parser.add_argument("--cachedir",
                        type=str,
                        required=True,
                        help="Directory to save the tokenizer")
    args = parser.parse_args()
    main(args.model, args.cachedir)
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import json
@ -12,33 +11,33 @@ from tabulate import tabulate
 def parse_arguments():
    parser = argparse.ArgumentParser(
-        description="Parse command line arguments for summary-nightly-results script."
+        description=
-    )
+        'Parse command line arguments for summary-nightly-results script.')
-    parser.add_argument(
+    parser.add_argument('--results-folder',
-        "--results-folder",
+                        type=str,
-        type=str,
+                        required=True,
-        required=True,
+                        help='The folder where the results are stored.')
-        help="The folder where the results are stored.",
+    parser.add_argument('--description',
-    )
+                        type=str,
-    parser.add_argument(
+                        required=True,
-        "--description", type=str, required=True, help="Description of the results."
+                        help='Description of the results.')
    )
    args = parser.parse_args()
    return args
 def get_perf(df, method, model, metric):
    means = []
    for qps in [2, 4, 8, 16, "inf"]:
-        target = df["Test name"].str.contains(model)
+        target = df['Test name'].str.contains(model)
-        target = target & df["Engine"].str.contains(method)
+        target = target & df['Engine'].str.contains(method)
-        target = target & df["Test name"].str.contains("qps_" + str(qps))
+        target = target & df['Test name'].str.contains("qps_" + str(qps))
        filtered_df = df[target]
        if filtered_df.empty:
-            means.append(0.0)
+            means.append(0.)
        else:
            means.append(filtered_df[metric].values[0])
@ -46,6 +45,7 @@ def get_perf(df, method, model, metric):
 def get_perf_w_std(df, method, model, metric):
    if metric in ["TTFT", "ITL"]:
        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
        mean = mean.tolist()
@ -60,8 +60,7 @@ def get_perf_w_std(df, method, model, metric):
    else:
        assert metric == "Tput"
        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
-            df, method, model, "Output Tput (tok/s)"
+            df, method, model, "Output Tput (tok/s)")
        )
        mean = mean.tolist()
        std = None
@ -81,17 +80,18 @@ def main(args):
    # generate markdown table
    df = pd.DataFrame.from_dict(results)
-    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
+    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
    with open(args.description) as f:
        description = f.read()
-    description = description.format(nightly_results_benchmarking_table=md_table)
+    description = description.format(
        nightly_results_benchmarking_table=md_table)
    with open("nightly_results.md", "w") as f:
        f.write(description)
-if __name__ == "__main__":
+if __name__ == '__main__':
    args = parse_arguments()
    main(args)
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from lmdeploy.serve.openai.api_client import APIClient
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
    echo "Container: vllm"
    # move to a completely irrelevant directory, to avoid import vllm from current folder
    export CURRENT_LLM_SERVING_ENGINE=vllm
-
+    
    return
  fi
 }
@ -95,14 +95,12 @@ json2args() {
 }
 kill_gpu_processes() {
-  pkill -f '[p]ython'
+  pkill -f python
-  pkill -f '[p]ython3'
+  pkill -f python3
-  pkill -f '[t]ritonserver'
+  pkill -f tritonserver
-  pkill -f '[p]t_main_thread'
+  pkill -f pt_main_thread
-  pkill -f '[t]ext-generation'
+  pkill -f text-generation
-  pkill -f '[l]mdeploy'
+  pkill -f lmdeploy
  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
  pkill -f '[V]LLM'
  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
    sleep 1
@ -127,7 +125,7 @@ ensure_installed() {
 }
 run_serving_tests() {
-  # run serving tests using `vllm bench serve` command
+  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases
  local serving_test_file
@ -227,7 +225,7 @@ run_serving_tests() {
      if [[ "$dataset_name" = "sharegpt" ]]; then
-        client_command="vllm bench serve \
+        client_command="python3 benchmark_serving.py \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
@ -248,7 +246,7 @@ run_serving_tests() {
        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
-        client_command="vllm bench serve \
+        client_command="python3 benchmark_serving.py \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
@ -267,13 +265,13 @@ run_serving_tests() {
          $client_args"
      else
-
+  
        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
        exit 1
      fi
-
+        
      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"
@ -304,7 +302,7 @@ run_serving_tests() {
 }
 run_genai_perf_tests() {
-  # run genai-perf tests
+  # run genai-perf tests 
  # $1: a json file specifying genai-perf test cases
  local genai_perf_test_file
@ -313,14 +311,14 @@ run_genai_perf_tests() {
  # Iterate over genai-perf tests
  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
+    test_name=$(echo "$params" | jq -r '.test_name')    
-
+    
    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi
-
+    
    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
@ -371,10 +369,10 @@ run_genai_perf_tests() {
        qps=$num_prompts
        echo "now qps is $qps"
      fi
-
+    
      new_test_name=$test_name"_qps_"$qps
      backend=$CURRENT_LLM_SERVING_ENGINE
-
+      
      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi
@ -382,7 +380,7 @@ run_genai_perf_tests() {
      client_command="genai-perf profile \
        -m $model \
        --service-kind openai \
-        --backend "$backend" \
+        --backend vllm \
        --endpoint-type chat \
        --streaming \
        --url localhost:$port \
@ -415,7 +413,7 @@ prepare_dataset() {
  do
    cat sonnet.txt >> sonnet_4x.txt
  done
-
+  
 }
 main() {
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -31,20 +31,6 @@ check_gpus() {
  echo "GPU type is $gpu_type"
 }
 check_cpus() {
  # check the number of CPUs and NUMA Node and GPU type.
  declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
  if [[ $numa_count -gt 0 ]]; then
    echo "NUMA found."
    echo $numa_count
  else
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
  fi
  declare -g gpu_type="cpu"
  echo "GPU type is $gpu_type"
 }
 check_hf_token() {
  # check if HF_TOKEN is available and valid
  if [[ -z "$HF_TOKEN" ]]; then
@ -83,22 +69,6 @@ json2args() {
  echo "$args"
 }
 json2envs() {
  # transforms the JSON string to environment variables.
  # example:
  # input: { "VLLM_CPU_KVCACHE_SPACE": 5 }
  # output: VLLM_CPU_KVCACHE_SPACE=5
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map((.key ) + "=" + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
 }
 wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
@ -126,8 +96,7 @@ kill_gpu_processes() {
  ps -aux
  lsof -t -i:8000 | xargs -r kill -9
  pgrep python3 | xargs -r kill -9
-  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+
  pgrep VLLM | xargs -r kill -9
  # wait until GPU memory usage smaller than 1GB
  if command -v nvidia-smi; then
@ -165,7 +134,7 @@ upload_to_buildkite() {
 }
 run_latency_tests() {
-  # run latency tests using `vllm bench latency` command
+  # run latency tests using `benchmark_latency.py`
  # $1: a json file specifying latency test cases
  local latency_test_file
@ -189,26 +158,15 @@ run_latency_tests() {
    # get arguments
    latency_params=$(echo "$params" | jq -r '.parameters')
    latency_args=$(json2args "$latency_params")
    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    latency_envs=$(json2envs "$latency_environment_variables")
    # check if there is enough GPU to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
+    if [[ $gpu_count -lt $tp ]]; then
-      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      world_size=$(($tp*$pp))
+      continue
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi
-    latency_command=" $latency_envs vllm bench latency \
+    latency_command="python3 benchmark_latency.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $latency_args"
@ -234,7 +192,7 @@ run_latency_tests() {
 }
 run_throughput_tests() {
-  # run throughput tests using `vllm bench throughput`
+  # run throughput tests using `benchmark_throughput.py`
  # $1: a json file specifying throughput test cases
  local throughput_test_file
@ -258,26 +216,15 @@ run_throughput_tests() {
    # get arguments
    throughput_params=$(echo "$params" | jq -r '.parameters')
    throughput_args=$(json2args "$throughput_params")
    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    throughput_envs=$(json2envs "$throughput_environment_variables")
    # check if there is enough GPU to run the test
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
+    if [[ $gpu_count -lt $tp ]]; then
-      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      world_size=$(($tp*$pp))
+      continue
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi
-    throughput_command=" $throughput_envs vllm bench throughput \
+    throughput_command="python3 benchmark_throughput.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $throughput_args"
@ -302,7 +249,7 @@ run_throughput_tests() {
 }
 run_serving_tests() {
-  # run serving tests using `vllm bench serve` command
+  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases
  local serving_test_file
@ -325,36 +272,18 @@ run_serving_tests() {
    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.server_parameters')
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')
    server_args=$(json2args "$server_params")
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"
    max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
    if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
        num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
        max_concurrency_list="[$num_prompts]"
    fi
    max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
    echo "Running over max concurrency list $max_concurrency_list"
-    # check if there is enough resources to run the test
+    # check if there is enough GPU to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
+    if [[ $gpu_count -lt $tp ]]; then
-      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      world_size=$(($tp*$pp))
+      continue
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi
    # check if server model and client model is aligned
@ -365,33 +294,23 @@ run_serving_tests() {
      continue
    fi
-    server_command="$server_envs python3 \
+    server_command="python3 \
      -m vllm.entrypoints.openai.api_server \
      $server_args"
    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
-    # support remote vllm server
+    bash -c "$server_command" &
-    client_remote_args=""
+    server_pid=$!
-    if [[ -z "${REMOTE_HOST}" ]]; then
+
-      bash -c "$server_command" &
+    # wait until the server is alive
-      server_pid=$!
+    if wait_for_server; then
-      # wait until the server is alive
+      echo ""
-      if wait_for_server; then
+      echo "vllm server is up and running."
        echo ""
        echo "vLLM server is up and running."
      else
        echo ""
        echo "vLLM failed to start within the timeout period."
      fi
    else
-      server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
+      echo ""
-      if [[ ${REMOTE_PORT} ]]; then
+      echo "vllm failed to start within the timeout period."
        client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT "
      else
        client_remote_args=" --host=$REMOTE_HOST "
      fi
    fi
    # iterate over different QPS
@ -403,39 +322,35 @@ run_serving_tests() {
        echo "now qps is $qps"
      fi
-      # iterate over different max_concurrency
+      new_test_name=$test_name"_qps_"$qps
      for max_concurrency in $max_concurrency_list; do
        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
        echo " new test name $new_test_name"
        # pass the tensor parallel size to the client so that it can be displayed
        # on the benchmark dashboard
        client_command="vllm bench serve \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
          --metadata "tensor_parallel_size=$tp" \
          $client_args $client_remote_args "
-        echo "Running test case $test_name with qps $qps"
+      # pass the tensor parallel size to the client so that it can be displayed
-        echo "Client command: $client_command"
+      # on the benchmark dashboard
      client_command="python3 benchmark_serving.py \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        --metadata "tensor_parallel_size=$tp" \
        $client_args"
-        bash -c "$client_command"
+      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"
-        # record the benchmarking commands
+      bash -c "$client_command"
-        jq_output=$(jq -n \
+
-          --arg server "$server_command" \
+      # record the benchmarking commands
-          --arg client "$client_command" \
+      jq_output=$(jq -n \
-          --arg gpu "$gpu_type" \
+        --arg server "$server_command" \
-          '{
+        --arg client "$client_command" \
-            server_command: $server,
+        --arg gpu "$gpu_type" \
-            client_command: $client,
+        '{
-            gpu_type: $gpu
+          server_command: $server,
-          }')
+          client_command: $client,
-        echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+          gpu_type: $gpu
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
      done
    done
    # clean up
@ -445,14 +360,7 @@ run_serving_tests() {
 }
 main() {
-  local ARCH
+  check_gpus
  ARCH=''
  if [ "$ON_CPU" == "1" ];then
     check_cpus
     ARCH='-cpu'
  else
     check_gpus
  fi
  check_hf_token
  # Set to v1 to run v1 benchmark
@ -465,7 +373,7 @@ main() {
  (which jq) || (apt-get update && apt-get -y install jq)
  (which lsof) || (apt-get update && apt-get install -y lsof)
-  # get the current IP address, required by `vllm bench serve` command
+  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn of the reporting of the status of each request, to clean up the terminal output
  export VLLM_LOGGING_LEVEL="WARNING"
@ -478,9 +386,9 @@ main() {
  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
  # benchmarking
-  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
+  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
-  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
+  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
-  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
+  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
  # postprocess benchmarking results
  pip install tabulate pandas
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import datetime
 import json
@ -35,8 +34,10 @@ serving_column_mapping = {
 }
 if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())
@ -55,16 +56,17 @@ if __name__ == "__main__":
    serving_results = pd.DataFrame.from_dict(serving_results)
    if not serving_results.empty:
-        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+        serving_results = serving_results[list(
-            columns=serving_column_mapping
+            serving_column_mapping.keys())].rename(
-        )
+                columns=serving_column_mapping)
-    serving_md_table_with_headers = tabulate(
+    serving_md_table_with_headers = tabulate(serving_results,
-        serving_results, headers="keys", tablefmt="pipe", showindex=False
+                                             headers='keys',
-    )
+                                             tablefmt='pipe',
                                             showindex=False)
    # remove the first line of header
-    serving_md_table_lines = serving_md_table_with_headers.split("\n")
+    serving_md_table_lines = serving_md_table_with_headers.split('\n')
-    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
+    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
@ -74,9 +76,10 @@ if __name__ == "__main__":
        # document results with header.
        # for those who wants to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
-        f.write("\n")
+        f.write('\n')
    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
-        results = serving_results.to_dict(orient="records")
+
        results = serving_results.to_dict(orient='records')
        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
@ -11,7 +11,9 @@
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
--- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
@ -1,30 +0,0 @@
 [
    {
        "test_name": "latency_llama8B_tp1",
        "environment_variables": {
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    },
    {
        "test_name": "latency_llama8B_tp4",
        "environment_variables": {
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    }
 ]
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@ -35,7 +35,9 @@
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@ -88,7 +90,9 @@
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@ -141,7 +145,9 @@
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@ -191,7 +197,9 @@
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@ -243,7 +251,9 @@
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@ -295,7 +305,9 @@
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
@ -1,610 +0,0 @@
 [
    {
        "test_name": "serving_llama8B_bf16_tp1_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_bf16_tp1_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp1_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp2_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp4_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp1_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp2_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp4_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp1_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp2_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp4_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp1_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp2_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp4_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    }
 ]
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
@ -1,820 +0,0 @@
 [
    {
        "test_name": "serving_llama8B_bf16_pp1_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_bf16_pp3_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_bf16_pp1_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_bf16_pp3_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_pp1_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "pipeline_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp2_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_pp3_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 2,
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_pp1_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "pipeline_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp2_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_pp3_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 2,
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_pp1_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "pipeline_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp2_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_pp3_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 2,
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_pp1_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "pipeline_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp2_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_pp3_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 2,
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    }
 ]
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
@ -1,168 +0,0 @@
 [
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp2_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp4_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp4_random_1024_128",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 1024,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 100
        }
    },
    {
        "test_name": "serving_llama8B_pp6_random_1024_128",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 6,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 1024,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 100
        }
    }
 ]
--- a/.buildkite/nightly-benchmarks/tests/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@ -7,6 +7,7 @@
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
@ -25,6 +26,7 @@
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
@ -43,6 +45,7 @@
            "tensor_parallel_size": 2,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
@ -57,7 +60,8 @@
        "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
        "qps_list": [2],
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", 
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "disable_log_requests": "", 
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "speculative_config": {
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
@ -1,32 +0,0 @@
 [
    {
        "test_name": "throughput_llama8B_tp1",
        "environment_variables": {
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    },
    {
        "test_name": "throughput_llama8B_tp4",
        "environment_variables": {
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    }
 ]
--- a/.buildkite/pyproject.toml
+++ b/.buildkite/pyproject.toml
@ -1,46 +0,0 @@
 # This local pyproject file is part of the migration from yapf to ruff format.
 # It uses the same core rules as the main pyproject.toml file, but with the
 # following differences:
 # - ruff line length is overridden to 88
 # - deprecated typing ignores (UP006, UP035) have been removed
 [tool.ruff]
 line-length = 88
 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]
 "vllm/version.py" = ["F401"]
 "vllm/_version.py" = ["ALL"]
 [tool.ruff.lint]
 select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
    "SIM",
    # isort
    "I",
    # flake8-logging-format
    "G",
 ]
 ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
 ]
 [tool.ruff.format]
 docstring-code-format = true
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,138 +1,76 @@
 steps:
-  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+  - label: "Build wheel - CUDA 12.4"
  - label: "Build arm64 wheel - CUDA 12.9"
    id: build-wheel-arm64-cuda-12-9
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
  - block: "Build CUDA 12.8 wheel"
    key: block-build-cu128-wheel
  - label: "Build wheel - CUDA 12.8"
    depends_on: block-build-cu128-wheel
    id: build-wheel-cuda-12-8
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
-  - block: "Build CUDA 12.6 wheel"
+  - label: "Build wheel - CUDA 12.1"
    key: block-build-cu126-wheel
    depends_on: ~
  - label: "Build wheel - CUDA 12.6"
    depends_on: block-build-cu126-wheel
    id: build-wheel-cuda-12-6
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
-  # x86 + CUDA builds
+  # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
-  - label: "Build wheel - CUDA 12.9"
+  # However, this block can be uncommented to save some compute hours.
-    depends_on: ~
+  # - block: "Build CUDA 11.8 wheel"
-    id: build-wheel-cuda-12-9
+  #   key: block-build-cu118-wheel
  - label: "Build wheel - CUDA 11.8"
    # depends_on: block-build-cu118-wheel
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
-  - label: "Build release image (x86)"
+  - block: "Build release image"
    depends_on: ~
-    id: build-release-image-x86
+    key: block-release-image-build
  - label: "Build release image"
    depends_on: block-release-image-build
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
      # re-tag to default image tag and push, just in case arm64 build fails
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
  - label: "Build release image (arm64)"
    depends_on: ~
    id: build-release-image-arm64
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
  # Add job to create multi-arch manifest
  - label: "Create multi-arch manifest"
    depends_on:
      - build-release-image-x86
      - build-release-image-arm64
    id: create-multi-arch-manifest
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
  - label: "Annotate release workflow"
    depends_on:
      - create-multi-arch-manifest
      - build-wheel-cuda-12-8
      - build-wheel-cuda-12-6
      - build-wheel-cuda-12-9
    id: annotate-release-workflow
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "bash .buildkite/scripts/annotate-release.sh"
  - label: "Build and publish TPU release image"
    depends_on: ~
    if: build.env("NIGHTLY") == "1"
    agents:
      queue: tpu_queue_postmerge
    commands:
      - "yes | docker system prune -a"
      - "git fetch --all"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
    plugins:
      - docker-login#v3.0.0:
-          username: vllmbot
+          username: vllm
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"
  - input: "Provide Release version here"
    id: input-release-version
    fields:
      - text: "What is the release version?"
-        key: release-version
+        key: "release-version"
  - block: "Build CPU release image"
    key: block-cpu-release-image-build
@ -144,30 +82,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
  - label: "Build and publish nightly multi-arch image to DockerHub"
    depends_on:
      - create-multi-arch-manifest
    if: build.env("NIGHTLY") == "1"
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
      - "docker push vllm/vllm-openai:nightly"
      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
      # Clean up old nightly builds (keep only last 14)
      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
    plugins:
      - docker-login#v3.0.0:
          username: vllmbot
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@ -1,31 +0,0 @@
 #!/bin/bash
 set -ex
 # Get release version and strip leading 'v' if present
 RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
 if [ -z "$RELEASE_VERSION" ]; then
  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
  exit 1
 fi
 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel:
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . 
 \`\`\`
 To download and upload the image:
 \`\`\`
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
 docker tag vllm/vllm-openai vllm/vllm-openai:latest
 docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
 docker push vllm/vllm-openai:latest
 docker push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
 EOF 
--- a/.buildkite/scripts/ci-clean-log.sh
+++ b/.buildkite/scripts/ci-clean-log.sh
@ -1,17 +0,0 @@
 #!/bin/bash
 # Usage: ./ci_clean_log.sh ci.log
 # This script strips timestamps and color codes from CI log files.
 # Check if argument is given
 if [ $# -lt 1 ]; then
    echo "Usage: $0 ci.log"
    exit 1
 fi
 INPUT_FILE="$1"
 # Strip timestamps
 sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
 # Strip colorization
 sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
--- a/.buildkite/scripts/cleanup-nightly-builds.sh
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@ -1,97 +0,0 @@
 #!/bin/bash
 set -ex
 # Clean up old nightly builds from DockerHub, keeping only the last 14 builds
 # This script uses DockerHub API to list and delete old tags with "nightly-" prefix
 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
 # Get DockerHub token from environment
 if [ -z "$DOCKERHUB_TOKEN" ]; then
    echo "Error: DOCKERHUB_TOKEN environment variable is not set"
    exit 1
 fi
 # Function to get all tags from DockerHub
 get_all_tags() {
    local page=1
    local all_tags=""
    while true; do
        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
            "$REPO_API_URL?page=$page&page_size=100")
        # Get both last_updated timestamp and tag name, separated by |
        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
        if [ -z "$tags" ]; then
            break
        fi
        all_tags="$all_tags$tags"$'\n'
        page=$((page + 1))
    done
    # Sort by timestamp (newest first) and extract just the tag names
    echo "$all_tags" | sort -r | cut -d'|' -f2
 }
 delete_tag() {
    local tag_name="$1"
    echo "Deleting tag: $tag_name"
    local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
    if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
        echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
    else
        echo "Successfully deleted tag: $tag_name"
    fi
 }
 # Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
 echo "Fetching all tags from DockerHub..."
 all_tags=$(get_all_tags)
 if [ -z "$all_tags" ]; then
    echo "No tags found to clean up"
    exit 0
 fi
 # Count total tags
 total_tags=$(echo "$all_tags" | wc -l)
 echo "Found $total_tags tags"
 # Keep only the last 14 builds (including the current one)
 tags_to_keep=14
 tags_to_delete=$((total_tags - tags_to_keep))
 if [ $tags_to_delete -le 0 ]; then
    echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
    exit 0
 fi
 echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"
 # Get tags to delete (skip the first $tags_to_keep tags)
 tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))
 if [ -z "$tags_to_delete_list" ]; then
    echo "No tags to delete"
    exit 0
 fi
 # Delete old tags
 echo "Deleting old tags..."
 while IFS= read -r tag; do
    if [ -n "$tag" ]; then
        delete_tag "$tag"
        # Add a small delay to avoid rate limiting
        sleep 1
    fi
 done <<< "$tags_to_delete_list"
 echo "Cleanup completed successfully"
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -3,9 +3,6 @@
 # This script runs test inside the corresponding ROCm docker container.
 set -o pipefail
 # Export Python path
 export PYTHONPATH=".."
 # Print ROCm version
 echo "--- Confirming Clean Initial State"
 while true; do
@ -77,70 +74,31 @@ HF_MOUNT="/root/.cache/huggingface"
 commands=$@
 echo "Commands:$commands"
 if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
 fi
 if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi
 if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
 fi
 if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
 if [[ $commands == *"pytest -v -s lora"* ]]; then
  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
 fi
 #ignore certain kernels tests
-if [[ $commands == *" kernels/core"* ]]; then
+if [[ $commands == *" kernels "* ]]; then
  commands="${commands} \
-  --ignore=kernels/core/test_fused_quant_layernorm.py \
+  --ignore=kernels/test_attention_selector.py \
-  --ignore=kernels/core/test_permute_cols.py"
+  --ignore=kernels/test_blocksparse_attention.py \
-fi
+  --ignore=kernels/test_causal_conv1d.py \
-
+  --ignore=kernels/test_cutlass.py \
-if [[ $commands == *" kernels/attention"* ]]; then
+  --ignore=kernels/test_encoder_decoder_attn.py \
-  commands="${commands} \
+  --ignore=kernels/test_flash_attn.py \
-  --ignore=kernels/attention/test_attention_selector.py \
+  --ignore=kernels/test_flashinfer.py \
-  --ignore=kernels/attention/test_encoder_decoder_attn.py \
+  --ignore=kernels/test_int8_quant.py \
-  --ignore=kernels/attention/test_flash_attn.py \
+  --ignore=kernels/test_machete_gemm.py \
-  --ignore=kernels/attention/test_flashinfer.py \
+  --ignore=kernels/test_mamba_ssm.py \
-  --ignore=kernels/attention/test_prefix_prefill.py \
+  --ignore=kernels/test_marlin_gemm.py \
-  --ignore=kernels/attention/test_cascade_flash_attn.py \
+  --ignore=kernels/test_moe.py \
-  --ignore=kernels/attention/test_mha_attn.py \
+  --ignore=kernels/test_prefix_prefill.py \
-  --ignore=kernels/attention/test_lightning_attn.py \
+  --ignore=kernels/test_rand.py \
-  --ignore=kernels/attention/test_attention.py"
+  --ignore=kernels/test_sampler.py \
-fi
+  --ignore=kernels/test_cascade_flash_attn.py \
-
+  --ignore=kernels/test_mamba_mixer2.py \
-if [[ $commands == *" kernels/quantization"* ]]; then
+  --ignore=kernels/test_aqlm.py \
-  commands="${commands} \
+  --ignore=kernels/test_machete_mm.py \
-  --ignore=kernels/quantization/test_int8_quant.py \
+  --ignore=kernels/test_mha_attn.py \
-  --ignore=kernels/quantization/test_machete_mm.py \
+  --ignore=kernels/test_block_fp8.py \
-  --ignore=kernels/quantization/test_block_fp8.py \
+  --ignore=kernels/test_permute_cols.py"
  --ignore=kernels/quantization/test_block_int8.py \
  --ignore=kernels/quantization/test_marlin_gemm.py \
  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
  --ignore=kernels/quantization/test_int8_kernel.py"
 fi
 if [[ $commands == *" kernels/mamba"* ]]; then
  commands="${commands} \
  --ignore=kernels/mamba/test_mamba_mixer2.py \
  --ignore=kernels/mamba/test_causal_conv1d.py \
  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
 fi
 if [[ $commands == *" kernels/moe"* ]]; then
  commands="${commands} \
  --ignore=kernels/moe/test_moe.py \
  --ignore=kernels/moe/test_cutlass_moe.py \
  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
 fi
 #ignore certain Entrypoints/openai tests
@ -164,6 +122,7 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
  --ignore=entrypoints/llm/test_chat.py \
  --ignore=entrypoints/llm/test_accuracy.py \
  --ignore=entrypoints/llm/test_init.py \
  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi
@ -181,8 +140,6 @@ fi
 PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
 if [[ $commands == *"--shard-id="* ]]; then
  # assign job count as the number of shards used   
@ -203,7 +160,6 @@ if [[ $commands == *"--shard-id="* ]]; then
        -e AWS_SECRET_ACCESS_KEY \
        -v "${HF_CACHE}:${HF_MOUNT}" \
        -e "HF_HOME=${HF_MOUNT}" \
        -e "PYTHONPATH=${MYPYTHONPATH}" \
        --name "${container_name}_${GPU}" \
        "${image_name}" \
        /bin/bash -c "${commands_gpu}" \
@ -234,7 +190,6 @@ else
          -e AWS_SECRET_ACCESS_KEY \
          -v "${HF_CACHE}:${HF_MOUNT}" \
          -e "HF_HOME=${HF_MOUNT}" \
          -e "PYTHONPATH=${MYPYTHONPATH}" \
          --name "${container_name}" \
          "${image_name}" \
          /bin/bash -c "${commands}"
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@ -5,13 +5,7 @@
 set -ex
 # Setup cleanup
-remove_docker_container() {
+remove_docker_container() { podman rm -f cpu-test-ubi9-ppc || true; podman system prune -f; }
  if [[ -n "$container_id" ]]; then
      podman stop --all -t0
      podman rm -f "$container_id" || true
  fi
  podman system prune -f
 }
 trap remove_docker_container EXIT
 remove_docker_container
@ -19,31 +13,26 @@ remove_docker_container
 podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
 # Run the image
-container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
+podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test-ubi9-ppc cpu-test-ubi9-ppc
 function cpu_tests() {
  # offline inference
-  podman exec -it "$container_id" bash -c "
+  podman exec cpu-test-ubi9-ppc bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
  # Run basic model test
-  podman exec -it "$container_id" bash -c "
+  podman exec cpu-test-ubi9-ppc bash -c "
    set -e
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator
-    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
+    pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
+    pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
+    pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
 }
 # All of CPU tests are expected to be finished less than 40 mins.
 export container_id
 export -f cpu_tests
 timeout 40m bash -c cpu_tests
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -6,118 +6,89 @@ set -ex
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
 # used for TP/PP E2E test
 OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 export CMAKE_BUILD_PARALLEL_LEVEL=32
 # Setup cleanup
-remove_docker_container() {
+remove_docker_container() { 
-    set -e;
+    set -e; 
-    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
+    docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; 
    docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; 
 }
 trap remove_docker_container EXIT
 remove_docker_container
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
 --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 function cpu_tests() {
  set -e
  export NUMA_NODE=$2
-
+  export BUILDKITE_BUILD_NUMBER=$3
  # list packages
  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
    set -e
    pip list"
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pip list"
  # offline inference
-  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
  # Run kernel tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -v -s tests/kernels/test_onednn.py"
  # Run basic model test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
-    # Note: disable until supports V1
+    pytest -v -s tests/kernels/test_cache.py -m cpu_model
-    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
-    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    pytest -v -s tests/models/decoder_only/language -m cpu_model
-
+    pytest -v -s tests/models/embedding/language -m cpu_model
-    # Note: disable Bart until supports V1
+    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
-    pytest -x -v -s tests/models/language/generation -m cpu_model \
+    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
-                --ignore=tests/models/language/generation/test_bart.py
+    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
                --ignore=tests/models/language/generation/test_bart.py
    pytest -x -v -s tests/models/language/pooling -m cpu_model
    pytest -x -v -s tests/models/multimodal/generation \
                --ignore=tests/models/multimodal/generation/test_mllama.py \
                --ignore=tests/models/multimodal/generation/test_pixtral.py \
                -m cpu_model"
  # Run compressed-tensor test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
-    pytest -x -s -v \
+    pytest -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
  # Note: disable it until supports V1
  # Run AWQ test
-  # docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
-  #   set -e
+    set -e
-  #   VLLM_USE_V1=0 pytest -x -s -v \
+    pytest -s -v \
-  #   tests/quantization/test_ipex_quant.py"
+    tests/quantization/test_ipex_quant.py"
  # Run chunked-prefill and prefix-cache test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v -k cpu_model \
    tests/basic_correctness/test_chunked_prefill.py"  
  # online serving
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    export VLLM_CPU_KVCACHE_SPACE=10 
    export VLLM_CPU_OMP_THREADS_BIND=$1
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
    python3 benchmarks/benchmark_serving.py \
      --backend vllm \
      --dataset-name random \
      --model facebook/opt-125m \
      --num-prompts 20 \
      --endpoint /v1/completions \
      --tokenizer facebook/opt-125m"
  # Run multi-lora tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
-    pytest -x -s -v \
+    pytest -s -v \
    tests/lora/test_qwen2vl.py"
  # online serving: tp+pp
  docker exec cpu-test-"$NUMA_NODE" bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model meta-llama/Llama-3.2-3B-Instruct \
      --num-prompts 20 \
      --endpoint /v1/completions
    kill -s SIGTERM $server_pid &'
  # online serving: tp+dp
  docker exec cpu-test-"$NUMA_NODE" bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model meta-llama/Llama-3.2-3B-Instruct \
      --num-prompts 20 \
      --endpoint /v1/completions
    kill -s SIGTERM $server_pid &'
 }
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
--- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
@ -16,7 +16,8 @@ DOCKER_BUILDKIT=1 docker build . \
  --build-arg max_jobs=66 \
  --build-arg nvcc_threads=2 \
  --build-arg RUN_WHEEL_CHECK=false \
-  --build-arg torch_cuda_arch_list="9.0+PTX"
+  --build-arg torch_cuda_arch_list="9.0+PTX" \
  --build-arg vllm_fa_cmake_gpu_arches="90-real"
 # Setup cleanup
 remove_docker_container() { docker rm -f gh200-test || true; }
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@ -2,55 +2,23 @@
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
-set -exuo pipefail
+set -ex
 # Try building the docker image
-cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
+docker build -t hpu-test-env -f docker/Dockerfile.hpu .
 FROM gaudi-base-image:latest
 COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
 RUN VLLM_TARGET_DEVICE=empty pip install .
 RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 WORKDIR /workspace/
 RUN git clone https://github.com/vllm-project/vllm-gaudi.git
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 EOF
 # Setup cleanup
 # certain versions of HPU software stack have a bug that can
 # override the exit code of the script, so we need to use
-# separate remove_docker_containers and remove_docker_containers_and_exit
+# separate remove_docker_container and remove_docker_container_and_exit
 # functions, while other platforms only need one remove_docker_container
 # function.
 EXITCODE=1
-remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
+remove_docker_container() { docker rm -f hpu-test || true; }
-trap 'remove_docker_containers; exit $EXITCODE;' EXIT
+remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
-remove_docker_containers
+trap remove_docker_container_and_exit EXIT
-
+remove_docker_container
 echo "Running HPU plugin v1 test"
 docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
  -e HABANA_VISIBLE_DEVICES=all \
  hpu-plugin-v1-test-env \
  /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
 # Run the image and launch offline inference
 docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
 EXITCODE=$?
 if [ $EXITCODE -eq 0 ]; then
  echo "Test with basic model passed"
 else
  echo "Test with basic model FAILED with exit code: $EXITCODE" >&2
 fi
 # The trap will handle the container removal and final exit.
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
@ -0,0 +1,54 @@
 #!/bin/bash
 # This script build the Neuron docker image and run the API server inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -e
 set -v
 image_name="neuron/vllm-ci"
 container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
 NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
 mkdir -p "${NEURON_COMPILE_CACHE_URL}"
 NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 # Try building the docker image
 aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
 # prune old image and containers to save disk space, and only once a day
 # by using a timestamp file in tmp.
 if [ -f /tmp/neuron-docker-build-timestamp ]; then
    last_build=$(cat /tmp/neuron-docker-build-timestamp)
    current_time=$(date +%s)
    if [ $((current_time - last_build)) -gt 86400 ]; then
        # Remove dangling images (those that are not tagged and not used by any container)
        docker image prune -f
        # Remove unused volumes / force the system prune for old images as well.
        docker volume prune -f && docker system prune -f
        echo "$current_time" > /tmp/neuron-docker-build-timestamp
    fi
 else
    date "+%s" > /tmp/neuron-docker-build-timestamp
 fi
 docker build -t "${image_name}" -f docker/Dockerfile.neuron .
 # Setup cleanup
 remove_docker_container() {
    docker image rm -f "${image_name}" || true;
 }
 trap remove_docker_container EXIT
 # Run the image
 docker run --rm -it --device=/dev/neuron0 --network bridge \
       -v "${HF_CACHE}:${HF_MOUNT}" \
       -e "HF_HOME=${HF_MOUNT}" \
       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
       --name "${container_name}" \
       ${image_name} \
       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@ -1,167 +0,0 @@
 #!/bin/bash
 set -xu
 remove_docker_container() { 
    docker rm -f tpu-test || true;
 }
 trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container
 # Build the docker image.
 docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 # Set up cleanup.
 cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"
  # Check disk usage of the filesystem where Docker's root directory is located
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
 }
 cleanup_docker
 # For HF_TOKEN.
 source /etc/environment
 docker run --privileged --net host --shm-size=16G -it \
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
    vllm-tpu /bin/bash -c '
 set -e # Exit immediately if a command exits with a non-zero status.
 set -u # Treat unset variables as an error.
 echo "--- Starting script inside Docker container ---"
 # Create results directory
 RESULTS_DIR=$(mktemp -d)
 # If mktemp fails, set -e will cause the script to exit.
 echo "Results will be stored in: $RESULTS_DIR"
 # Install dependencies
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
 echo "Using VLLM V1"
 echo "--- Hardware Information ---"
 # tpu-info
 echo "--- Starting Tests ---"
 set +e
 overall_script_exit_code=0
 # --- Test Definitions ---
 # If a test fails, this function will print logs and will not cause the main script to exit.
 run_test() {
    local test_num=$1
    local test_name=$2
    local test_command=$3
    local log_file="$RESULTS_DIR/test_${test_num}.log"
    local actual_exit_code
    echo "--- TEST_$test_num: Running $test_name ---"
    # Execute the test command.
    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
    actual_exit_code=$?
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
    if [ "$actual_exit_code" -ne 0 ]; then
        echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
        echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
        if [ -f "$log_file" ]; then
            cat "$log_file" >&2
        else
            echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
        fi
        echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
        return "$actual_exit_code" # Return the failure code
    else
        echo "TEST_$test_num ($test_name) PASSED."
        return 0 # Return success
    fi
 }
 # Helper function to call run_test and update the overall script exit code
 run_and_track_test() {
    local test_num_arg="$1"
    local test_name_arg="$2"
    local test_command_arg="$3"
    # Run the test
    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
    local test_specific_exit_code=$?
    # If the test failed, set the overall script exit code to 1
    if [ "$test_specific_exit_code" -ne 0 ]; then
        # No need for extra echo here, run_test already logged the failure.
        overall_script_exit_code=1
    fi
 }
 # --- Actual Test Execution ---
 run_and_track_test 1 "test_struct_output_generate.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
 run_and_track_test 2 "test_moe_pallas.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 3 "test_lora.py" \
    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
 run_and_track_test 4 "test_tpu_qkv_linear.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
 run_and_track_test 5 "test_spmd_model_weight_loading.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
 run_and_track_test 6 "test_kv_cache_update_kernel.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
 run_and_track_test 7 "test_tpu_int8.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py"
 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then
    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
 else
    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
 fi
 exit "$overall_script_exit_code"
 ' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
 # Capture the exit code of the docker run command
 DOCKER_RUN_EXIT_CODE=$?
 # The trap will run for cleanup.
 # Exit the main script with the Docker run command's exit code.
 if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
    echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
    exit "$DOCKER_RUN_EXIT_CODE"
 else
    echo "Docker run command completed successfully."
    exit 0
 fi
 # TODO: This test fails because it uses RANDOM_SEED sampling
 # pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -1,175 +1,49 @@
 #!/bin/bash
-set -xu
+set -xue
 remove_docker_container() { 
    docker rm -f tpu-test || true; 
 }
 trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container
 # Build the docker image.
 docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 # Set up cleanup.
-cleanup_docker() {
+remove_docker_container() { docker rm -f tpu-test || true; }
-  # Get Docker's root directory
+trap remove_docker_container EXIT
-  docker_root=$(docker info -f '{{.DockerRootDir}}')
+# Remove the container that might not be cleaned up in the previous run.
-  if [ -z "$docker_root" ]; then
+remove_docker_container
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"
  # Check disk usage of the filesystem where Docker's root directory is located
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
 }
 cleanup_docker
 # For HF_TOKEN.
 source /etc/environment
-
+# Run a simple end-to-end example.
 docker run --privileged --net host --shm-size=16G -it \
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-    vllm-tpu /bin/bash -c '
+    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
-set -e # Exit immediately if a command exits with a non-zero status.
+    && python3 -m pip install pytest tpu-info \
-set -u # Treat unset variables as an error.
+    && python3 -m pip install lm_eval[api]==0.4.4 \
    && export VLLM_USE_V1=1 \
    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
    && echo HARDWARE \
    && tpu-info \
    && echo TEST_0 \
    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
    && echo TEST_1 \
    && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
    && echo TEST_2 \
    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
    && echo TEST_3 \
    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
    && echo TEST_4 \
    && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
    && echo TEST_5 \
    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
    && echo TEST_6 \
    && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
    && echo TEST_7 \
    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \
    && echo TEST_8 \
    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
    && echo TEST_9 \
    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" \
 echo "--- Starting script inside Docker container ---"
 # Create results directory
 RESULTS_DIR=$(mktemp -d)
 # If mktemp fails, set -e will cause the script to exit.
 echo "Results will be stored in: $RESULTS_DIR"
 # Install dependencies
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
 echo "Using VLLM V1"
 echo "--- Hardware Information ---"
 # tpu-info
 echo "--- Starting Tests ---"
 set +e
 overall_script_exit_code=0
 # --- Test Definitions ---
 # If a test fails, this function will print logs and will not cause the main script to exit.
 run_test() {
    local test_num=$1
    local test_name=$2
    local test_command=$3
    local log_file="$RESULTS_DIR/test_${test_num}.log"
    local actual_exit_code
    echo "--- TEST_$test_num: Running $test_name ---"
    # Execute the test command.
    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
    actual_exit_code=$?
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
    if [ "$actual_exit_code" -ne 0 ]; then
        echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
        echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
        if [ -f "$log_file" ]; then
            cat "$log_file" >&2
        else
            echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
        fi
        echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
        return "$actual_exit_code" # Return the failure code
    else
        echo "TEST_$test_num ($test_name) PASSED."
        return 0 # Return success
    fi
 }
 # Helper function to call run_test and update the overall script exit code
 run_and_track_test() {
    local test_num_arg="$1"
    local test_name_arg="$2"
    local test_command_arg="$3"
    # Run the test
    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
    local test_specific_exit_code=$?
    # If the test failed, set the overall script exit code to 1
    if [ "$test_specific_exit_code" -ne 0 ]; then
        # No need for extra echo here, run_test already logged the failure.
        overall_script_exit_code=1
    fi
 }
 # --- Actual Test Execution ---
 run_and_track_test 0 "test_perf.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
 run_and_track_test 1 "test_compilation.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
 run_and_track_test 2 "test_basic.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
 run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
    "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
 run_and_track_test 4 "test_quantization_accuracy.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
 run_and_track_test 5 "examples/offline_inference/tpu.py" \
    "python3 /workspace/vllm/examples/offline_inference/tpu.py"
 run_and_track_test 6 "test_tpu_model_runner.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
 run_and_track_test 7 "test_sampler.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
 run_and_track_test 8 "test_topk_topp_sampler.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
 run_and_track_test 9 "test_multimodal.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
 run_and_track_test 10 "test_pallas.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then
    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
 else
    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
 fi
 exit "$overall_script_exit_code"
 ' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
 # Capture the exit code of the docker run command
 DOCKER_RUN_EXIT_CODE=$?
 # The trap will run for cleanup.
 # Exit the main script with the Docker run command's exit code.
 if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
    echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
    exit "$DOCKER_RUN_EXIT_CODE"
 else
    echo "Docker run command completed successfully."
    exit 0
 fi
 # TODO: This test fails because it uses RANDOM_SEED sampling
-# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -11,8 +11,8 @@ container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head
 docker build -t ${image_name} -f docker/Dockerfile.xpu .
 # Setup cleanup
-remove_docker_container() {
+remove_docker_container() { 
-  docker rm -f "${container_name}" || true;
+  docker rm -f "${container_name}" || true; 
  docker image rm -f "${image_name}" || true;
  docker system prune -f || true;
 }
@ -23,27 +23,9 @@ docker run \
    --device /dev/dri \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --entrypoint="" \
    -e "HF_TOKEN=${HF_TOKEN}" \
    -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \
    --name "${container_name}" \
    "${image_name}" \
-    bash -c '
+    sh -c '
-    set -e
+    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
-    echo $ZE_AFFINITY_MASK
+    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
    VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
    cd tests
    pytest -v -s v1/core
    pytest -v -s v1/engine
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
    pytest -v -s v1/test_serial_utils.py
    pytest -v -s v1/test_utils.py
    pytest -v -s v1/test_metrics_reader.py
 '
--- a/.buildkite/scripts/rerun-test.sh
+++ b/.buildkite/scripts/rerun-test.sh
@ -1,18 +0,0 @@
 #!/bin/bash
 # Usage: ./rerun_test.sh path/to/test.py::test_name
 # Check if argument is given
 if [ $# -lt 1 ]; then
    echo "Usage: $0 path/to/test.py::test_name"
    echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
    exit 1
 fi
 TEST=$1
 COUNT=1
 while pytest -sv "$TEST"; do
    COUNT=$((COUNT + 1))
    echo "RUN NUMBER ${COUNT}"
 done
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 # run python-based benchmarks and upload the result to buildkite
-vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
+python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?
-vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?
 # run server-based benchmarks and upload the result to buildkite
@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 # wait for server to start, timeout after 600 seconds
 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-vllm bench serve \
+python3 benchmarks/benchmark_serving.py \
    --backend vllm \
    --dataset-name sharegpt \
    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
--- a/.buildkite/scripts/tpu/cleanup_docker.sh
+++ b/.buildkite/scripts/tpu/cleanup_docker.sh
@ -1,24 +0,0 @@
 #!/bin/bash
 set -euo pipefail
 docker_root=$(docker info -f '{{.DockerRootDir}}')
 if [ -z "$docker_root" ]; then
  echo "Failed to determine Docker root directory."
  exit 1
 fi
 echo "Docker root directory: $docker_root"
 # Check disk usage of the filesystem where Docker's root directory is located
 disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
 # Define the threshold
 threshold=70
 if [ "$disk_usage" -gt "$threshold" ]; then
  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
  # Remove dangling images (those that are not tagged and not used by any container)
  docker image prune -f
  # Remove unused volumes / force the system prune for old images as well.
  docker volume prune -f && docker system prune --force --filter "until=24h" --all
  echo "Docker images and volumes cleanup completed."
 else
  echo "Disk usage is below $threshold%. No cleanup needed."
 fi
--- a/.buildkite/scripts/tpu/config_v6e_1.env
+++ b/.buildkite/scripts/tpu/config_v6e_1.env
@ -1,14 +0,0 @@
 # Environment config
 TEST_NAME=llama8b
 CONTAINER_NAME=tpu-test
 # vllm config
 MODEL=meta-llama/Llama-3.1-8B-Instruct
 MAX_NUM_SEQS=256
 MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
 EXPECTED_THROUGHPUT=8.0
 INPUT_LEN=1800
 OUTPUT_LEN=128
--- a/.buildkite/scripts/tpu/docker_run_bm.sh
+++ b/.buildkite/scripts/tpu/docker_run_bm.sh
@ -1,90 +0,0 @@
 #!/bin/bash
 if [ ! -f "$1" ]; then
  echo "Error: The env file '$1' does not exist."
  exit 1  # Exit the script with a non-zero status to indicate an error
 fi
 ENV_FILE=$1
 # For testing on local vm, use `set -a` to export all variables
 source /etc/environment
 source $ENV_FILE
 remove_docker_container() { 
    docker rm -f $CONTAINER_NAME || true;
 }
 trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container
 LOG_ROOT=$(mktemp -d)
 # If mktemp fails, set -e will cause the script to exit.
 echo "Results will be stored in: $LOG_ROOT"
 if [ -z "$HF_TOKEN" ]; then
  echo "Error: HF_TOKEN is not set or is empty."  
  exit 1
 fi
 # Make sure mounted disk or dir exists
 if [ ! -d "$DOWNLOAD_DIR" ]; then
    echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
    exit 1
 fi
 echo "Run model $MODEL"
 echo
 echo "starting docker...$CONTAINER_NAME"
 echo    
 docker run \
 -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
 --env-file $ENV_FILE \
 -e HF_TOKEN="$HF_TOKEN" \
 -e TARGET_COMMIT=$BUILDKITE_COMMIT \
 -e MODEL=$MODEL \
 -e WORKSPACE=/workspace \
 --name $CONTAINER_NAME \
 -d \
 --privileged \
 --network host \
 -v /dev/shm:/dev/shm \
 vllm/vllm-tpu-bm tail -f /dev/null
 echo "run script..."
 echo
 docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"
 echo "copy result back..."
 VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
 BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
 docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG" 
 docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
 throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
 echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
 if [ "$BUILDKITE" = "true" ]; then
  echo "Running inside Buildkite"
  buildkite-agent artifact upload "$VLLM_LOG" 
  buildkite-agent artifact upload "$BM_LOG"
 else
  echo "Not running inside Buildkite"
 fi
 #
 # compare the throughput with EXPECTED_THROUGHPUT 
 # and assert meeting the expectation
 # 
 if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  echo "Failed to get the throughput"
  exit 1
 fi
 if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
  exit 1
 fi
--- a/.buildkite/scripts/tpu/quantized_v6e_1.env
+++ b/.buildkite/scripts/tpu/quantized_v6e_1.env
@ -1,14 +0,0 @@
 # Environment config
 TEST_NAME=llama8bw8a8
 CONTAINER_NAME=tpu-test
 # vllm config
 MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
 MAX_NUM_SEQS=128
 MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
 EXPECTED_THROUGHPUT=10.0
 INPUT_LEN=1800
 OUTPUT_LEN=128
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@ -1,93 +0,0 @@
 #!/bin/bash
 set -euo pipefail
 VLLM_LOG="$WORKSPACE/vllm_log.txt"
 BM_LOG="$WORKSPACE/bm_log.txt"
 if [ -n "$TARGET_COMMIT" ]; then
  head_hash=$(git rev-parse HEAD)
  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
    exit 1
  fi
 fi
 echo "model: $MODEL"
 echo
 #
 # create a log folder
 #
 mkdir "$WORKSPACE/log"
 # TODO: Move to image building.
 pip install pandas
 pip install datasets
 #
 # create sonnet_4x
 #
 echo "Create sonnet_4x.txt"
 echo "" > benchmarks/sonnet_4x.txt
 for _ in {1..4}
 do
  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
 done
 #
 # start vllm service in backend
 #
 echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo
 VLLM_USE_V1=1 vllm serve $MODEL \
 --seed 42 \
 --max-num-seqs $MAX_NUM_SEQS \
 --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
 --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
 --no-enable-prefix-caching \
 --download_dir $DOWNLOAD_DIR \
 --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
 echo "wait for 20 minutes.."
 echo
 # sleep 1200
 # wait for 10 minutes...
 for i in {1..120}; do
    # TODO: detect other type of errors.
    if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
        echo "Detected RuntimeError, exiting."
        exit 1
    elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
        echo "Application started"
        break
    else
        echo "wait for 10 seconds..."
        sleep 10
    fi
 done
 #
 # run test
 #
 echo "run benchmark test..."
 echo "logging to $BM_LOG"
 echo
 vllm bench serve \
    --backend vllm \
    --model $MODEL  \
    --dataset-name sonnet \
    --dataset-path benchmarks/sonnet_4x.txt \
    --sonnet-input-len $INPUT_LEN \
    --sonnet-output-len $OUTPUT_LEN \
    --ignore-eos > "$BM_LOG"
 echo "completed..."
 echo
 throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
 echo "throughput: $throughput"
 echo
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@ -14,19 +14,8 @@ fi
 # Get the single wheel file
 wheel="${wheel_files[0]}"
-# Detect architecture and rename 'linux' to appropriate manylinux version
+# Rename 'linux' to 'manylinux1' in the wheel filename
-arch=$(uname -m)
+new_wheel="${wheel/linux/manylinux1}"
 if [[ $arch == "x86_64" ]]; then
    manylinux_version="manylinux1"
 elif [[ $arch == "aarch64" ]]; then
    manylinux_version="manylinux2014"
 else
    echo "Warning: Unknown architecture $arch, using manylinux1 as default"
    manylinux_version="manylinux1"
 fi
 # Rename 'linux' to the appropriate manylinux version in the wheel filename
 new_wheel="${wheel/linux/$manylinux_version}"
 mv -- "$wheel" "$new_wheel"
 wheel="$new_wheel"
@ -58,15 +47,14 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-if [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu118"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
+    # if $normal_wheel matches cu118, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
+    echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
+elif [[ $normal_wheel == *"cu121"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
+    # if $normal_wheel matches cu121, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
+    echo "Skipping index files for cu121 wheels"
 else
-    # only upload index.html for cu129 wheels (default wheels) as it
+    # only upload index.html for cu124 wheels (default wheels)
    # is available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
@ -75,17 +63,15 @@ fi
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
-if [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu118"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
+    # if $normal_wheel matches cu118, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
+    echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
+elif [[ $normal_wheel == *"cu121"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
+    # if $normal_wheel matches cu121, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
+    echo "Skipping index files for cu121 wheels"
 else
-    # only upload index.html for cu129 wheels (default wheels) as it
+    # only upload index.html for cu124 wheels (default wheels)
    # is available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi
-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
+aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
 aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
--- a/.gemini/config.yaml
+++ b/.gemini/config.yaml
@ -1,6 +0,0 @@
 # https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
 have_fun: false  # Just review the code
 code_review:
  comment_severity_threshold: HIGH  # Reduce quantity of comments
  pull_request_opened:
    summary: false  # Don't summarize the PR in a separate comment
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -5,89 +5,37 @@
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
-/vllm/model_executor/layers/mamba @tdoublep
+/vllm/model_executor/guided_decoding @mgoin @russellb
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96
-/vllm/v1/sample @22quinn @houseroad
+CMakeLists.txt @tlrmchlsmth
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm
 /vllm/entrypoints @aarnphm
 /vllm/compilation @zou3519 @youkaichao @ProExpertProg
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
 /vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
+/vllm/v1/structured_output @mgoin @russellb
 /vllm/v1/spec_decode @benchislett @luccafong
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
 /tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
-/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
+/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
 /tests/kernels @tlrmchlsmth @WoosukKwon
 /tests/model_executor/test_guided_processors.py @mgoin @russellb
 /tests/models @DarkLight1337 @ywang96
 /tests/multi_step @alexm-redhat @comaniac
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
+/tests/quantization @mgoin @robertgshaw2-redhat
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
+/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
-/tests/v1/structured_output @mgoin @russellb @aarnphm
+/tests/v1/structured_output @mgoin @russellb
-/tests/weight_loading @mgoin @youkaichao @yewentao256
+/tests/weight_loading @mgoin @youkaichao
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
 # Docs
 /docs @hmellor
 mkdocs.yaml @hmellor
 # CPU
 /vllm/v1/worker/^cpu @bigPYJ1151
 /csrc/cpu @bigPYJ1151
 /vllm/platforms/cpu.py @bigPYJ1151
 /cmake/cpu_extension.cmake @bigPYJ1151
 /docker/Dockerfile.cpu @bigPYJ1151
 # Intel GPU
 /vllm/v1/worker/^xpu @jikunshang
 /vllm/platforms/xpu.py @jikunshang
 /docker/Dockerfile.xpu @jikunshang
 # Qwen-specific files
 /vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
 /vllm/model_executor/models/qwen* @sighingnow
 # MTP-specific files
 /vllm/model_executor/models/deepseek_mtp.py @luccafong
 # Mistral-specific files
 /vllm/model_executor/models/mistral*.py @patrickvonplaten
 /vllm/model_executor/models/mixtral*.py @patrickvonplaten
 /vllm/model_executor/models/voxtral*.py @patrickvonplaten
 /vllm/model_executor/models/pixtral*.py @patrickvonplaten
 /vllm/transformers_utils/configs/mistral.py @patrickvonplaten
 /vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
 # Kernels
 /vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
 /vllm/attention/ops/triton_unified_attention.py @tdoublep
 # ROCm related: specify owner with write access to notify AMD folks for careful code review
 /docker/Dockerfile.rocm* @gshtras
 /vllm/v1/attention/backends/rocm*.py @gshtras
 /vllm/v1/attention/backends/mla/rocm*.py @gshtras
 /vllm/attention/ops/rocm*.py @gshtras
 /vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@ -8,16 +8,6 @@ body:
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 - type: markdown
  attributes:
    value: |
      ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
      - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
      - Passwords or authentication credentials
      - Private URLs or endpoints
      - Personal or confidential data
      Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
 - type: textarea
  attributes:
    label: Your current environment
@ -31,12 +21,12 @@ body:
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      <details>
-      <summary>The output of <code>python collect_env.py</code></summary>
+      <summary>The output of `python collect_env.py`</summary>
      ```text
      Your output of `python collect_env.py` here
      ```
-
+      
      </details>
  validations:
    required: true
@ -85,20 +75,20 @@ body:
      ```
      ```
-      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
+      The error message you got, with the full traceback.
      ```
  validations:
    required: true
 - type: markdown
  attributes:
-    value: |
+    value: >
-      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
+      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
      - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
      - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
-      Thanks for reporting 🙏!
+      Thanks for contributing 🎉!
 - type: checkboxes
  id: askllm
  attributes:
--- a/.github/ISSUE_TEMPLATE/450-ci-failure.yml
+++ b/.github/ISSUE_TEMPLATE/450-ci-failure.yml
@ -1,69 +0,0 @@
 name: 🧪 CI failure report
 description: Report a failing test.
 title: "[CI Failure]: "
 labels: ["ci-failure"]
 body:
 - type: markdown
  attributes:
    value: >
      #### Include the name of the failing Buildkite step and test file in the title.
 - type: input
  attributes:
    label: Name of failing test
    description: |
      Paste in the fully-qualified name of the failing test from the logs.
    placeholder: |
      `path/to/test_file.py::test_name[params]`
  validations:
    required: true
 - type: checkboxes
  attributes:
    label: Basic information
    description: Select all items that apply to the failing test.
    options:
      - label: Flaky test
      - label: Can reproduce locally
      - label: Caused by external libraries (e.g. bug in `transformers`)
 - type: textarea
  attributes:
    label: 🧪 Describe the failing test
    description: |
      Please provide a clear and concise description of the failing test.
    placeholder: |
      A clear and concise description of the failing test.
      ```
      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
      ```
  validations:
    required: true
 - type: textarea
  attributes:
    label: 📝 History of failing test
    description: |
      Since when did the test start to fail?
      You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
      If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
      - Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
      - Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
      - Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
    placeholder: |
      Approximate timeline and/or problematic PRs
      A link to the Buildkite analytics of the failing test (if available)
  validations:
    required: true
 - type: textarea
  attributes:
    label: CC List.
    description: >
      The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
 - type: markdown
  attributes:
    value: >
      Thanks for reporting 🙏!
--- a/.github/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@ -46,7 +46,7 @@ body:
 - type: markdown
  attributes:
    value: >
-      Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
+      Thanks for contributing 🎉!
 - type: checkboxes
  id: askllm
  attributes:
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -1,21 +1,6 @@
-<!-- markdownlint-disable -->
+FILL IN THE PR DESCRIPTION HERE
 PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED.
-## Purpose
+FIX #xxxx (*link existing issues this PR will resolve*)
-## Test Plan
+<!--- pyml disable-next-line no-emphasis-as-heading -->
-
+**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)
 ## Test Result
 ---
 <details>
 <summary> Essential Elements of an Effective PR Description Checklist </summary>
 - [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
 - [ ] The test plan, such as providing test command.
 - [ ] The test results, such as pasting the results comparison before and after, or e2e results
 - [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
 - [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
 </details>
 **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -27,22 +27,6 @@ pull_request_rules:
      add:
        - ci/build
 - name: label-deepseek
  description: Automatically apply deepseek label
  conditions:
    - or:
      - files~=^examples/.*deepseek.*\.py
      - files~=^tests/.*deepseek.*\.py
      - files~=^vllm/entrypoints/openai/tool_parsers/.*deepseek.*\.py
      - files~=^vllm/model_executor/models/.*deepseek.*\.py
      - files~=^vllm/reasoning/.*deepseek.*\.py
      - files~=^vllm/transformers_utils/.*deepseek.*\.py
      - title~=(?i)DeepSeek
  actions:
    label:
      add:
        - deepseek
 - name: label-frontend
  description: Automatically apply frontend label
  conditions:
@ -52,21 +36,6 @@ pull_request_rules:
      add:
        - frontend
 - name: label-llama
  description: Automatically apply llama label
  conditions:
    - or:
      - files~=^examples/.*llama.*\.py
      - files~=^tests/.*llama.*\.py
      - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
      - files~=^vllm/model_executor/models/.*llama.*\.py
      - files~=^vllm/transformers_utils/configs/.*llama.*\.py
      - title~=(?i)llama
  actions:
    label:
      add:
        - llama
 - name: label-multi-modality
  description: Automatically apply multi-modality label
  conditions:
@ -74,98 +43,23 @@ pull_request_rules:
      - files~=^vllm/multimodal/
      - files~=^tests/multimodal/
      - files~=^tests/models/multimodal/
      - files~=^tests/models/*/audio_language/
      - files~=^tests/models/*/vision_language/
      - files=tests/models/test_vision.py
  actions:
    label:
      add:
        - multi-modality
 - name: label-new-model
  description: Automatically apply new-model label
  conditions:
    - and:
      - files~=^vllm/model_executor/models/
      - files=vllm/model_executor/models/registry.py
  actions:
    label:
      add:
        - new-model
 - name: label-performance
  description: Automatically apply performance label
  conditions:
    - or:
      - files~=^benchmarks/
      - files~=^vllm/benchmarks/
      - files~=^tests/benchmarks/
      - files~=^\.buildkite/nightly-benchmarks/
  actions:
    label:
      add:
        - performance
 - name: label-qwen
  description: Automatically apply qwen label
  conditions:
    - or:
      - files~=^examples/.*qwen.*\.py
      - files~=^tests/.*qwen.*\.py
      - files~=^vllm/model_executor/models/.*qwen.*\.py
      - files~=^vllm/reasoning/.*qwen.*\.py
      - title~=(?i)Qwen
  actions:
    label:
      add:
        - qwen
 - name: label-gpt-oss
  description: Automatically apply gpt-oss label
  conditions:
    - or:
      - files~=^examples/.*gpt[-_]?oss.*\.py
      - files~=^tests/.*gpt[-_]?oss.*\.py
      - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
      - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
      - title~=(?i)gpt[-_]?oss
  actions:
    label:
      add:
        - gpt-oss
 - name: label-rocm
  description: Automatically apply rocm label
  conditions:
    - or:
      - files~=^csrc/rocm/
      - files~=^docker/Dockerfile.rocm
      - files~=^requirements/rocm.*\.txt
      - files~=^vllm/attention/backends/rocm.*\.py
      - files~=^vllm/attention/ops/rocm.*\.py
      - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
      - files~=^vllm/v1/attention/backends/mla/rocm.*\.py
      - files~=^tests/kernels/.*_rocm.*\.py
      - files=vllm/platforms/rocm.py
      - title~=(?i)AMD
      - title~=(?i)ROCm
  actions:
    label:
      add:
        - rocm
 - name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
    - or:
-      - files~=^benchmarks/structured_schemas/
+      - files~=^vllm/model_executor/guided_decoding/
-      - files=benchmarks/benchmark_serving_structured_output.py
+      - files=tests/model_executor/test_guided_processors.py
-      - files=benchmarks/run_structured_output_benchmark.sh
+      - files=tests/entrypoints/llm/test_guided_generate.py
-      - files=docs/features/structured_outputs.md
+      - files=benchmarks/benchmark_serving_guided.py
-      - files=examples/offline_inference/structured_outputs.py
+      - files=benchmarks/benchmark_guided.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
      - files~=^tests/v1/structured_output/
      - files=tests/v1/entrypoints/llm/test_guided_generate.py
      - files~=^vllm/v1/structured_output/
  actions:
    label:
      add:
@ -175,12 +69,9 @@ pull_request_rules:
  description: Automatically apply speculative-decoding label
  conditions:
    - or:
-      - files~=^vllm/v1/spec_decode/
+      - files~=^vllm/spec_decode/
-      - files~=^tests/v1/spec_decode/
+      - files=vllm/model_executor/layers/spec_decode_base_sampler.py
-      - files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
+      - files~=^tests/spec_decode/
      - files~=^vllm/model_executor/models/.*eagle.*\.py
      - files=vllm/model_executor/models/mlp_speculator.py
      - files~=^vllm/transformers_utils/configs/(eagle|medusa|mlp_speculator)\.py
  actions:
    label:
      add:
@ -227,26 +118,6 @@ pull_request_rules:
      remove:
        - tpu
 - name: label-tool-calling
  description: Automatically add tool-calling label
  conditions:
    - or:
      - files~=^tests/tool_use/
      - files~=^tests/mistral_tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
      - files~=^vllm/entrypoints/openai/tool_parsers/
      - files=docs/features/tool_calling.md
      - files~=^examples/tool_chat_*
      - files=examples/offline_inference/chat_with_tools.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
      - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools.py
  actions:
    label:
      add:
        - tool-calling
 - name: ping author on conflicts and add 'needs-rebase' label
  conditions:
      - conflict
@ -262,31 +133,6 @@ pull_request_rules:
       https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
 - name: assign reviewer for tensorizer changes
  conditions:
      - files~=^vllm/model_executor/model_loader/tensorizer.py
      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
      - files~=^tests/tensorizer_loader/
  actions:
    assign:
      users:
        - "sangstar"
 - name: assign reviewer for modelopt changes
  conditions:
    - or:
        - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
        - files~=^vllm/model_executor/layers/quantization/__init__\.py$
        - files~=^tests/models/quantization/test_modelopt\.py$
        - files~=^tests/quantization/test_modelopt\.py$
        - files~=^tests/models/quantization/test_nvfp4\.py$
        - files~=^docs/features/quantization/modelopt\.md$
  actions:
    assign:
      users:
        - "Edwardf0t1"
 - name: remove 'needs-rebase' label when conflict is resolved
  conditions:
      - -conflict
--- a/.github/scale-config.yml
+++ b/.github/scale-config.yml
@ -1,21 +0,0 @@
 # scale-config.yml:
 #   Powers what instance types are available for GHA auto-scaled
 #   runners. Runners listed here will be available as self hosted
 #   runners, configuration is directly pulled from the main branch.
 # runner_types:
 #   runner_label:
 #     instance_type: m4.large
 #     os: linux
 #     # min_available defaults to the global cfg in the ALI Terraform
 #     min_available: undefined
 #     # when max_available value is not defined, no max runners is enforced
 #     max_available: undefined
 #     disk_size: 50
 #     is_ephemeral: true
 runner_types:
  linux.2xlarge:
    disk_size: 150
    instance_type: c5.2xlarge
    is_ephemeral: true
    os: linux
--- a/.github/scripts/cleanup_pr_body.sh
+++ b/.github/scripts/cleanup_pr_body.sh
@ -15,18 +15,18 @@ NEW=/tmp/new_pr_body.txt
 gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
 cp "${OLD}" "${NEW}"
-# Remove markdown comments (like the <!-- markdownlint-disable --> at the start)
+# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
-sed -i '/<!--.*-->$/d' "${NEW}"
+sed -i '/FIX #xxxx.*$/d' "${NEW}"
-# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
+# Remove "FILL IN THE PR DESCRIPTION HERE"
-sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"
+sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
 # Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
 sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
 # Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
 python3 - <<EOF
-import regex as re
+import re
 with open("${NEW}", "r") as file:
    content = file.read()
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@ -1,6 +1,4 @@
 name: Add label on auto-merge enabled
 permissions:
    pull-requests: write
 on:
    pull_request_target:
        types:
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@ -16,16 +16,11 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Set up Python
-        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
        with:
          python-version: '3.12'
      - name: Install Python dependencies
        run: |
          python3 -m pip install --upgrade pip
          python3 -m pip install regex
      - name: Update PR description
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
+        run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@ -1,309 +0,0 @@
 name: Label issues based on keywords
 on:
  issues:
    types: [opened, edited, reopened]
 permissions:
  issues: write          # needed so the workflow can add labels
  contents: read
 concurrency:
  group: issue-labeler-${{ github.event.issue.number }}
  cancel-in-progress: true
 jobs:
  add-labels:
    runs-on: ubuntu-latest
    steps:
      - name: Label issues based on keywords
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea  # v7.0.1
        with:
          script: |
            // Configuration: Add new labels and keywords here
            const labelConfig = {
              rocm: {
                // Keyword search - matches whole words only (with word boundaries)
                keywords: [
                  {
                    term: "composable kernel",
                    searchIn: "both"
                  },
                  {
                    term: "rccl",
                    searchIn: "body"  // only search in body
                  },
                  {
                    term: "migraphx",
                    searchIn: "title"  // only search in title
                  },
                  {
                    term: "hipgraph",
                    searchIn: "both"
                  },
                  {
                    term: "ROCm System Management Interface",
                    searchIn: "body"
                  },
                ],
                // Substring search - matches anywhere in text (partial matches)
                substrings: [
                  {
                    term: "VLLM_ROCM_",
                    searchIn: "both"
                  },
                  {
                    term: "aiter",
                    searchIn: "title"
                  },
                  {
                    term: "rocm",
                    searchIn: "title"
                  },
                  {
                    term: "amd",
                    searchIn: "title"
                  },
                  {
                    term: "hip-",
                    searchIn: "both"
                  },
                  {
                    term: "gfx",
                    searchIn: "both"
                  },
                  {
                    term: "cdna",
                    searchIn: "both"
                  },
                  {
                    term: "rdna",
                    searchIn: "both"
                  },
                  {
                    term: "torch_hip",
                    searchIn: "body"  // only in body
                  },
                  {
                    term: "_hip",
                    searchIn: "both"
                  },
                  {
                    term: "hip_",
                    searchIn: "both"
                  },
                  // ROCm tools and libraries
                  {
                    term: "hipify",
                    searchIn: "both"
                  },
                ],
                // Regex patterns - for complex pattern matching
                regexPatterns: [
                  {
                    pattern: "\\bmi\\d{3}[a-z]*\\b",
                    description: "AMD GPU names (mi + 3 digits + optional letters)",
                    flags: "gi",
                    searchIn: "both"  // "title", "body", or "both"
                  }
                ],
              },
            };
            // Helper function to create regex based on search type
            function createSearchRegex(term, type) {
              // Escape special regex characters in the term
              const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
              switch (type) {
                case 'keyword':
                  // Word boundary search - matches whole words only
                  return new RegExp(`\\b${escapedTerm}\\b`, "gi");
                case 'substring':
                  // Substring search - matches anywhere in the text
                  return new RegExp(escapedTerm, "gi");
                default:
                  throw new Error(`Unknown search type: ${type}`);
              }
            }
            // Helper function to find matching terms in text with line information
            function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
              const matches = [];
              const lines = text.split('\n');
              for (const termConfig of searchTerms) {
                let regex;
                let term, searchIn, pattern, description, flags;
                // Handle different input formats (string or object)
                if (typeof termConfig === 'string') {
                  term = termConfig;
                  searchIn = 'both'; // default
                } else {
                  term = termConfig.term;
                  searchIn = termConfig.searchIn || 'both';
                  pattern = termConfig.pattern;
                  description = termConfig.description;
                  flags = termConfig.flags;
                }
                // Skip if this term shouldn't be searched in the current location
                if (searchIn !== 'both' && searchIn !== searchLocation) {
                  continue;
                }
                // Create appropriate regex
                if (searchType === 'regex') {
                  regex = new RegExp(pattern, flags || "gi");
                } else {
                  regex = createSearchRegex(term, searchType);
                }
                const termMatches = [];
                // Check each line for matches
                lines.forEach((line, lineIndex) => {
                  const lineMatches = line.match(regex);
                  if (lineMatches) {
                    lineMatches.forEach(match => {
                      termMatches.push({
                        match: match,
                        lineNumber: lineIndex + 1,
                        lineContent: line.trim(),
                        searchType: searchType,
                        searchLocation: searchLocation,
                        originalTerm: term || pattern,
                        description: description,
                        // Show context around the match in the line
                        context: line.length > 100 ? 
                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), 
                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' 
                          : line.trim()
                      });
                    });
                  }
                });
                if (termMatches.length > 0) {
                  matches.push({
                    term: term || (description || pattern),
                    searchType: searchType,
                    searchLocation: searchLocation,
                    searchIn: searchIn,
                    pattern: pattern,
                    matches: termMatches,
                    count: termMatches.length
                  });
                }
              }
              return matches;
            }
            // Helper function to check if label should be added
            async function processLabel(labelName, config) {
              const body = context.payload.issue.body || "";
              const title = context.payload.issue.title || "";
              core.notice(`Processing label: ${labelName}`);
              core.notice(`Issue Title: "${title}"`);
              core.notice(`Issue Body length: ${body.length} characters`);
              let shouldAddLabel = false;
              let allMatches = [];
              let reason = '';
              const keywords = config.keywords || [];
              const substrings = config.substrings || [];
              const regexPatterns = config.regexPatterns || [];
              core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
              // Search in title
              if (title.trim()) {
                core.notice(`Searching in title: "${title}"`);
                const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
                const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
                const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
                allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
              }
              // Search in body
              if (body.trim()) {
                core.notice(`Searching in body (${body.length} characters)`);
                const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
                const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
                const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
                allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
              }
              if (allMatches.length > 0) {
                core.notice(`Found ${allMatches.length} matching term(s):`);
                for (const termMatch of allMatches) {
                  const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
                  const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
                  if (termMatch.searchType === 'regex') {
                    core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  } else {
                    core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  }
                  // Show details for each match
                  termMatch.matches.forEach((match, index) => {
                    core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
                    if (match.description) {
                      core.notice(`       Description: ${match.description}`);
                    }
                    core.notice(`       Context: ${match.context}`);
                    if (match.lineContent !== match.context) {
                      core.notice(`       Full line: ${match.lineContent}`);
                    }
                  });
                }
                shouldAddLabel = true;
                const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
                const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
                const bodyMatches = allMatches.filter(t => t.searchLocation === 'body').reduce((sum, t) => sum + t.count, 0);
                const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
                const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
                const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
                reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
              }
              core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
              core.notice(`Reason: ${reason || 'No matching terms found'}`);
              if (shouldAddLabel) {
                const existingLabels = context.payload.issue.labels.map(l => l.name);
                if (!existingLabels.includes(labelName)) {
                  await github.rest.issues.addLabels({
                    owner: context.repo.owner,
                    repo: context.repo.repo,
                    issue_number: context.issue.number,
                    labels: [labelName],
                  });
                  core.notice(`Label "${labelName}" added. ${reason}`);
                  return true;
                }
                core.notice(`Label "${labelName}" already present.`);
                return false;
              }
              core.notice(`No matching terms found for label "${labelName}".`);
              return false;
            }
            // Process all configured labels
            const processLabels = Object.entries(labelConfig)
              .map(([labelName, config]) => processLabel(labelName, config));
            const labelsAdded = await Promise.all(processLabels);
            const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@ -0,0 +1,82 @@
 name: Lint and Deploy Charts
 on: pull_request
 jobs:
  lint-and-deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Set up Helm
        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
        with:
          version: v3.14.4
       #Python is required because ct lint runs Yamale and yamllint which require Python.
      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
        with:
          python-version: '3.13'
      - name: Set up chart-testing
        uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
        with:
          version: v3.10.1
      - name: Run chart-testing (lint)
        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
      - name: Setup minio
        run: |
          docker network create vllm-net
          docker run -d -p 9000:9000 --name minio --net vllm-net \
                     -e "MINIO_ACCESS_KEY=minioadmin" \
                     -e "MINIO_SECRET_KEY=minioadmin" \
                     -v /tmp/data:/data \
                     -v /tmp/config:/root/.minio \
                     minio/minio server /data
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          export AWS_EC2_METADATA_DISABLED=true
          mkdir opt-125m
          cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
          aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
      - name: Create kind cluster
        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
      - name: Build the Docker image vllm cpu
        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
      - name: Configuration of docker images, network and namespace for the kind cluster
        run: |
          docker pull amazon/aws-cli:2.6.4
          kind load docker-image  amazon/aws-cli:2.6.4 --name chart-testing
          kind load docker-image vllm-cpu-env:latest --name chart-testing
          docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
          kubectl create ns ns-vllm
      - name: Run chart-testing (install)
        run: |
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
      - name: curl test
        run: |
          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
          sleep 10
          CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
                  --header "Content-Type: application/json" \
                  --data '{
                          "model": "opt-125m",
                          "prompt": "San Francisco is a",
                          "max_tokens": 7,
                          "temperature": 0
                  }'):$CODE"
          echo "$CODE"
--- a/.github/workflows/matchers/markdownlint.json
+++ b/.github/workflows/matchers/markdownlint.json
@ -1,17 +0,0 @@
 {
  "problemMatcher": [
    {
      "owner": "markdownlint",
      "pattern": [
        {
          "regexp": "^([^:]*):(\\d+):?(\\d+)?\\s([\\w-\\/]*)\\s(.*)$",
          "file": 1,
          "line": 2,
          "column": 3,
          "code": 4,
          "message": 5
        }
      ]
    }
  ]
 }
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -5,23 +5,15 @@ on:
  push:
    branches: [main]
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 permissions:
  contents: read
 jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
      with:
        python-version: "3.12"
    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
    - run: echo "::add-matcher::.github/workflows/matchers/markdownlint.json"
    - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
    - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
      with:
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@ -0,0 +1,111 @@
 # This workflow will upload a Python Package to Release asset
 # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
 name: Create Release
 on:
  push:
    tags:
      - v*
 # Needed to create release and upload assets
 permissions:
  contents: write
 jobs:
  release:
    # Retrieve tag and create release
    name: Create Release
    runs-on: ubuntu-latest
    outputs:
      upload_url: ${{ steps.create_release.outputs.upload_url }}
    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Extract branch info
        shell: bash
        run: |
          echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
      - name: Create Release
        id: create_release
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        env:
          RELEASE_TAG: ${{ env.release_tag }}
        with:
          github-token: "${{ secrets.GITHUB_TOKEN }}"
          script: |
            const script = require('.github/workflows/scripts/create_release.js')
            await script(github, context, core)
  # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow. 
  # wheel:
  #   name: Build Wheel
  #   runs-on: ${{ matrix.os }}
  #   needs: release
  #   strategy:
  #     fail-fast: false
  #     matrix:
  #         os: ['ubuntu-20.04']
  #         python-version: ['3.9', '3.10', '3.11', '3.12']
  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements/cuda.txt.
  #         cuda-version: ['11.8', '12.1']
  #   steps:
  #     - name: Checkout
  #       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
  #     - name: Setup ccache
  #       uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
  #       with:
  #         create-symlink: true
  #         key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
  #     - name: Set up Linux Env
  #       if: ${{ runner.os == 'Linux' }}
  #       run: |
  #         bash -x .github/workflows/scripts/env.sh
  #     - name: Set up Python
  #       uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
  #       with:
  #           python-version: ${{ matrix.python-version }}
  #     - name: Install CUDA ${{ matrix.cuda-version }}
  #       run: |
  #         bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
  #     - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
  #       run: |
  #         bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
  #     - name: Build wheel
  #       shell: bash
  #       env:
  #         CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
  #       run: |
  #         bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
  #         wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
  #         asset_name=${wheel_name//"linux"/"manylinux1"}
  #         echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
  #         echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
  #     - name: Upload Release Asset
  #       uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
  #       env:
  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  #       with:
  #         upload_url: ${{ needs.release.outputs.upload_url }}
  #         asset_path: ./dist/${{ env.wheel_name }}
  #         asset_name: ${{ env.asset_name }}
  #         asset_content_type: application/*
      # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
      # - name: Publish package
      #   uses: pypa/gh-action-pypi-publish@release/v1.8
      #   with:
      #     repository-url: https://test.pypi.org/legacy/
      #     password: ${{ secrets.PYPI_API_TOKEN }}
      #     skip-existing: true
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@ -1,6 +1,4 @@
 name: PR Reminder Comment Bot
 permissions:
  pull-requests: write
 on:
  pull_request_target:
    types: [opened]
@ -12,43 +10,16 @@ jobs:
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        with:
          script: |
-            try {
+            github.rest.issues.createComment({
-              // Get the PR author
+              owner: context.repo.owner,
-              const prAuthor = context.payload.pull_request.user.login;
+              repo: context.repo.repo,
-              
+              issue_number: context.issue.number,
-              // Check if this is the author's first PR in this repository
+              body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
-              // Use GitHub's search API to find all PRs by this author
+                '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
-              const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
+                'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
-                q: `repo:${context.repo.owner}/${context.repo.repo} type:pr author:${prAuthor}`,
+                'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
-                per_page: 100  
+                'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
-              });
+                '🚀'
-              
+            })
              const authorPRCount = searchResults.total_count;
              console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
              // Only post comment if this is the first PR (only one PR by this author)
              if (authorPRCount === 1) {
                console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
                await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
                  '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
                  'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. \n\n' +
                  'You ask your reviewers to trigger select CI tests on top of `fastcheck` CI. \n\n' +
                  'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
                  'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
                  'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.\n\n' +
                  '🚀'
                });
              } else {
                console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
              }
            } catch (error) {
              console.error('Error checking PR history or posting comment:', error);
              // Don't fail the workflow, just log the error
            }
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@ -15,6 +15,7 @@ $python_executable -m pip install -r requirements/build.txt -r requirements/cuda
 export MAX_JOBS=1
 # Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
 bash tools/check_repo.sh
--- a/.gitignore
+++ b/.gitignore
@ -3,9 +3,7 @@
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
-
+!vllm/vllm_flash_attn/fa_utils.py
 # triton jit 
 .triton
 # Byte-compiled / optimized / DLL files
 __pycache__/
@ -80,6 +78,10 @@ instance/
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 docs/source/getting_started/examples/
 # PyBuilder
 .pybuilder/
 target/
@ -149,9 +151,6 @@ venv.bak/
 # mkdocs documentation
 /site
 docs/argparse
 docs/examples/*
 !docs/examples/README.md
 # mypy
 .mypy_cache/
@ -205,8 +204,5 @@ benchmarks/**/*.json
 actionlint
 shellcheck*/
-# Ignore moe/marlin_moe gen code
+# Ingore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*
 # Ignore ep_kernels_workspace folder
 ep_kernels_workspace/
--- a/.markdownlint.yaml
+++ b/.markdownlint.yaml
@ -1,13 +0,0 @@
 MD007:
  indent: 4
 MD013: false
 MD024:
  siblings_only: true
 MD033: false
 MD042: false
 MD045: false
 MD046: false
 MD051: false
 MD052: false
 MD053: false
 MD059: false
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -11,59 +11,51 @@ repos:
  hooks:
  - id: yapf
    args: [--in-place, --verbose]
    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.11.7
+  rev: v0.9.3
  hooks:
  - id: ruff
    args: [--output-format, github, --fix]
-  - id: ruff-format
+- repo: https://github.com/codespell-project/codespell
-    files: ^(.buildkite|benchmarks|examples)/.*
+  rev: v2.4.0
 - repo: https://github.com/crate-ci/typos
  rev: v1.35.5
  hooks:
-  - id: typos
+  - id: codespell
    additional_dependencies: ['tomli']
    args: ['--toml', 'pyproject.toml']
 - repo: https://github.com/PyCQA/isort
-  rev: 6.0.1
+  rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
  hooks:
  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v20.1.3
+  rev: v19.1.7
  hooks:
  - id: clang-format
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
    types_or: [c++, cuda]
    args: [--style=file, --verbose]
- repo: https://github.com/igorshubovych/markdownlint-cli
+- repo: https://github.com/jackdewinter/pymarkdown
-  rev: v0.45.0
+  rev: v0.9.27
  hooks:
-  - id: markdownlint
+  - id: pymarkdown
-    exclude: '.*\.inc\.md'
+    args: [fix]
    stages: [manual] # Only run in CI
 - repo: https://github.com/rhysd/actionlint
  rev: v1.7.7
  hooks:
  - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-  rev: 0.6.17
+  rev: 0.6.2
  hooks:
    - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
+      args: [requirements/test.in, -o, requirements/test.txt]
      files: ^requirements/test\.(in|txt)$
 - repo: local
  hooks:
  - id: format-torch-nightly-test
    name: reformat nightly_torch_test.txt to be in sync with test.in
    language: python
    entry: python tools/generate_nightly_torch_test.py
    files: ^requirements/test\.(in|txt)$
  - id: mypy-local
    name: Run mypy for local Python installation
    entry: tools/mypy.sh 0 "local"
    language: python
    types: [python]
-    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
    stages: [pre-commit] # Don't run in CI
  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.9
@ -109,8 +101,8 @@ repos:
    args:
      - -c
      - |
-        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
+        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
-          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
+          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
        fi
    language: system
    verbose: true
@ -120,11 +112,6 @@ repos:
    entry: python tools/check_spdx_header.py
    language: python
    types: [python]
  - id: check-root-lazy-imports
    name: Check root lazy imports
    entry: python tools/check_init_lazy_imports.py
    language: python
    types: [python]
  - id: check-filenames
    name: Check for spaces in all filenames
    entry: bash
@ -138,39 +125,12 @@ repos:
    name: Update Dockerfile dependency graph
    entry: tools/update-dockerfile-graph.sh
    language: script
-  - id: enforce-import-regex-instead-of-re
+    files: ^docker/Dockerfile$
    name: Enforce import regex as re
    entry: python tools/enforce_regex_import.py
    language: python
    types: [python]
    pass_filenames: false
    additional_dependencies: [regex]
  # forbid directly import triton
  - id: forbid-direct-triton-import
    name: "Forbid direct 'import triton'"
    entry: python tools/check_triton_import.py
    language: python
    types: [python]
    pass_filenames: false
    additional_dependencies: [regex]
  - id: check-pickle-imports
    name: Prevent new pickle/cloudpickle imports
    entry: python tools/check_pickle_imports.py
    language: python
    types: [python]
    pass_filenames: false
    additional_dependencies: [pathspec, regex]
  - id: validate-config
    name: Validate configuration has default values and that each field has a docstring
    entry: python tools/validate_config.py
    language: python
    types: [python]
    pass_filenames: true
    files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion
-    entry: bash -c 'echo "To bypass all the pre-commit hooks, add --no-verify to git commit. To skip a specific hook, prefix the commit command with SKIP=<hook-id>."'
+    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
    language: system
    verbose: true
    pass_filenames: false
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@ -7,12 +7,13 @@ build:
  os: ubuntu-22.04
  tools:
    python: "3.12"
  jobs:
    post_checkout:
      - git fetch --unshallow || true
-mkdocs:
+sphinx:
-  configuration: mkdocs.yaml
+  configuration: docs/source/conf.py
  fail_on_warning: true
 # If using Sphinx, optionally build your docs in additional formats such as PDF
 formats: []
 # Optionally declare the Python requirements required to build your docs
 python:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -15,6 +15,7 @@ project(vllm_extensions LANGUAGES CXX)
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
@ -23,14 +24,14 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")
 # Prevent installation of dependencies (cutlass) by default.
 install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 #
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 # Supported NVIDIA architectures.
 set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
@ -45,8 +46,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
 #
 # Try to find python package with an executable that exactly matches
@ -79,15 +80,6 @@ endif()
 #
 find_package(Torch REQUIRED)
 # Supported NVIDIA architectures.
 # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 else()
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
 endif()
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
@ -171,6 +163,7 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
 #
 # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
 # setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@ -181,6 +174,9 @@ include(FetchContent)
 file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 #
 # Set rocm version dev int.
 #
 if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
@ -188,6 +184,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
  set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
  #
  # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
  # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
@ -230,33 +227,31 @@ endif()
 #
 set(VLLM_EXT_SRC
  "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
  "csrc/cache_kernels.cu"
  "csrc/attention/paged_attention_v1.cu"
  "csrc/attention/paged_attention_v2.cu"
  "csrc/attention/merge_attn_states.cu"
  "csrc/attention/vertical_slash_index.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/layernorm_quant_kernels.cu"
  "csrc/sampler.cu"
  "csrc/cuda_view.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
  "csrc/quantization/fp8/common.cu"
  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/quantization/activation_kernels.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/prepare_inputs/advance_step.cu"
  "csrc/custom_all_reduce.cu"
  "csrc/torch_bindings.cpp")
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
-  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
+  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
-  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
+  # Please keep this in sync with FetchContent_Declare line below.
  set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -274,7 +269,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
        cutlass
        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
        # Please keep this in sync with CUTLASS_REVISION line above.
-        GIT_TAG ${CUTLASS_REVISION}
+        GIT_TAG v3.8.0
        GIT_PROGRESS TRUE
        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@ -286,16 +281,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  FetchContent_MakeAvailable(cutlass)
  list(APPEND VLLM_EXT_SRC
    "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/permute_cols.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
-    "csrc/cutlass_extensions/common.cpp"
+    "csrc/cutlass_extensions/common.cpp")
    "csrc/attention/mla/cutlass_mla_entry.cu"
    "csrc/quantization/fp8/per_token_group_quant.cu")
  set_gencode_flags_for_srcs(
    SRCS "${VLLM_EXT_SRC}"
@ -304,72 +299,20 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Only build Marlin kernels if we are building for at least some compatible archs.
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
  # are not supported by Machete yet.
-  # 9.0 for latest bf16 atomicAdd PTX
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
  if (MARLIN_ARCHS)
    #
    # For the Marlin kernels we automatically generate sources for various
    # preselected input type pairs and schedules.
    # Generate sources:
    set(MARLIN_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
    file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
        PYTHONPATH=$PYTHONPATH
          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
        RESULT_VARIABLE marlin_generation_result
        OUTPUT_VARIABLE marlin_generation_result
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
      )
      if (NOT marlin_generation_result EQUAL 0)
        message(FATAL_ERROR "Marlin generation failed."
                            " Result: \"${marlin_generation_result}\""
                            "\nCheck the log for details: "
                            "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
      else()
        set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
            CACHE STRING "Last run Marlin generate script hash" FORCE)
        message(STATUS "Marlin generation completed successfully.")
      endif()
    else()
      message(STATUS "Marlin generation script has not changed, skipping generation.")
    endif()
    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
      CUDA_ARCHS "${MARLIN_ARCHS}")
    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
      set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
    endif()
    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
    set(MARLIN_SRCS
       "csrc/quantization/fp8/fp8_marlin.cu"
       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
       "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
       "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_SRCS}"
      CUDA_ARCHS "${MARLIN_ARCHS}")
    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
      set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
    endif()
    list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
    message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
  else()
    message(STATUS "Not building Marlin kernels as no compatible archs found"
@ -397,7 +340,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
  # CUDA 12.0 or later
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
    set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
@ -413,7 +356,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
    message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
      message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running FP8 quantized models on "
@ -424,45 +367,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()
-
+  # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
  # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
  # CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM120=1")
    # Let scaled_mm_c2x know it doesn't need to build these arches
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
    message(STATUS "Building scaled_mm_c3x_sm120 for archs: ${SCALED_MM_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
      message(STATUS "Not building scaled_mm_c3x_sm120 as CUDA Compiler version is "
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                     "later if you intend on running FP8 quantized models on "
                     "Blackwell.")
    else()
      message(STATUS "Not building scaled_mm_c3x_120 as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()
  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
  # require CUDA 12.8 or later
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -473,7 +384,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
    message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
      message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                     "later if you intend on running FP8 quantized models on "
@ -487,9 +398,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  #
  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
  # kernels for the remaining archs that are not already built for 3x.
  # (Build 8.9 for FP8)
  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
+    "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
  if (SCALED_MM_2X_ARCHS)
@ -516,7 +426,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
  # require CUDA 12.2 or later (and only work on Hopper).
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -525,7 +435,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
      message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
                     "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
                     "if you intend on running FP8 sparse quantized models on Hopper.")
@ -535,41 +445,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()
  # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
  # CUDA 12.8 or later
  cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${FP4_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
  else()
    message(STATUS "Not building NVFP4 as no compatible archs were found.")
    # clear FP4_ARCHS
    set(FP4_ARCHS)
  endif()
  # FP4 Archs and flags
  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
-      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
+      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${FP4_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM100=1")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
  else()
    message(STATUS "Not building NVFP4 as no compatible archs were found.")
@ -577,35 +463,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(FP4_ARCHS)
  endif()
-  # CUTLASS MLA Archs and flags
+  #
  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
    set(SRCS
      "csrc/attention/mla/cutlass_mla_kernels.cu"
      "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${MLA_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
    # Add MLA-specific include directories only to MLA source files
    set_source_files_properties(${SRCS}
      PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
    message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
  else()
    message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
    # clear MLA_ARCHS
    set(MLA_ARCHS)
  endif()
  # CUTLASS MoE kernels
-  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
+  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
-  # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
+  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
-  # if it's possible to compile MoE kernels that use its output.
+  # to compile MoE kernels that use its output.
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
             "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -619,66 +486,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                     "if you intend on running FP8 quantized MoE models on Hopper.")
    else()
      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
                     "in CUDA target architectures.")
    endif()
  endif()
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
    message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
      message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
                     "if you intend on running FP8 quantized MoE models on Blackwell.")
    else()
      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
                     "in CUDA target architectures.")
    endif()
  endif()
  # moe_data.cu is used by all CUTLASS MoE kernels.
  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    message(STATUS "Building moe_data for archs: ${CUTLASS_MOE_DATA_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
      message(STATUS "Not building moe_data as CUDA Compiler version is "
                     "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
                     "if you intend on running FP8 quantized MoE models on Hopper or Blackwell.")
    else()
      message(STATUS "Not building moe_data as no compatible archs found "
                     "in CUDA target architectures.")
    endif()
  endif()
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
    message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
      message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
                     "if you intend on running FP8 quantized MoE models on Blackwell.")
    else()
      message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()
@ -689,7 +496,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # The machete kernels only work on hopper and require CUDA 12.0 or later.
  # Only build Machete kernels if we are building for something compatible with sm90a
  cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND MACHETE_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
    #
    # For the Machete kernels we automatically generate sources for various
    # preselected input type pairs and schedules.
@ -741,7 +548,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
        AND MACHETE_ARCHS)
      message(STATUS "Not building Machete kernels as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
@ -752,44 +559,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                     "found in CUDA target architectures")
    endif()
  endif()
  # Only build W4A8 kernels if we are building for something compatible with sm90a
  cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
    set(SRCS
       "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${W4A8_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    message(STATUS "Building W4A8 kernels for archs: ${W4A8_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
        AND W4A8_ARCHS)
      message(STATUS "Not building W4A8 kernels as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running w4a16 quantized models on "
                     "Hopper.")
    else()
      message(STATUS "Not building W4A8 kernels as no compatible archs "
                     "found in CUDA target architectures")
    endif()
  endif()
 # if CUDA endif
 endif()
 if (VLLM_GPU_LANG STREQUAL "HIP")
  # Add QuickReduce kernels
  list(APPEND VLLM_EXT_SRC
    "csrc/custom_quickreduce.cu"
  )
 # if ROCM endif
 endif()
 message(STATUS "Enabling C extension.")
 define_gpu_extension_target(
  _C
@ -819,17 +591,7 @@ set(VLLM_MOE_EXT_SRC
  "csrc/moe/topk_softmax_kernels.cu")
 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  list(APPEND VLLM_MOE_EXT_SRC
+  list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
    "csrc/moe/moe_wna16.cu"
    "csrc/moe/grouped_topk_kernels.cu")
 endif()
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  set(MOE_PERMUTE_SRC
      "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
      "csrc/moe/moe_permute_unpermute_op.cu")
  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
 endif()
 set_gencode_flags_for_srcs(
@ -845,8 +607,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    CUDA_ARCHS "${CUDA_ARCHS}")
  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
-  # 9.0 for latest bf16 atomicAdd PTX
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
  if (MARLIN_MOE_ARCHS)
    #
@ -864,7 +625,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
-        PYTHONPATH=$PYTHONPATH
+        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
        RESULT_VARIABLE moe_marlin_generation_result
        OUTPUT_VARIABLE moe_marlin_generation_output
@ -890,10 +651,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set_gencode_flags_for_srcs(
      SRCS "${MOE_WNAA16_MARLIN_SRC}"
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
      set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
    endif()
    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
@ -912,8 +669,6 @@ define_gpu_extension_target(
  SOURCES ${VLLM_MOE_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
  USE_SABI 3
  WITH_SOABI)
@ -923,7 +678,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  set(VLLM_ROCM_EXT_SRC
    "csrc/rocm/torch_bindings.cpp"
    "csrc/rocm/skinny_gemms.cu"
    "csrc/rocm/attention.cu")
  define_gpu_extension_target(
@ -940,7 +694,5 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
    include(cmake/external_projects/flashmla.cmake)
    # vllm-flash-attn should be last as it overwrites some CMake functions
    include(cmake/external_projects/vllm_flash_attn.cmake)
 endif ()
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,3 +1,3 @@
 # Contributing to vLLM
-You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
+You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
--- a/Show More
+++ b/Show More
`@ -1,3 +1,3 @@`
	`# Contributing to vLLM`	`# Contributing to vLLM`

	`You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).`	`You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).`