mirror of https://github.com/vllm-project/vllm.git
synced 2025-10-21 23:48:57 +08:00

Compare commits: skip-lmfe-... → optimize-p...

4 Commits:

- d5bf492f16
- 8c7bab79f5
- 1936d7bab0
- 996cf2de5c
@@ -1,53 +1,36 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 import os
-import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
-# Note that we have 800 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/6326 .
-# Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+MAX_SIZE_MB = 200
 
 
 def print_top_10_largest_files(zip_file):
-    """Print the top 10 largest files in the given zip file."""
-    with zipfile.ZipFile(zip_file, "r") as z:
+    with zipfile.ZipFile(zip_file, 'r') as z:
         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
         file_sizes.sort(key=lambda x: x[1], reverse=True)
         for f, size in file_sizes[:10]:
-            print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.")
+            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
 
 
 def check_wheel_size(directory):
-    """Check the size of .whl files in the given directory."""
     for root, _, files in os.walk(directory):
-        for file_name in files:
-            if file_name.endswith(".whl"):
-                wheel_path = os.path.join(root, file_name)
-                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
-                if wheel_size_mb > VLLM_MAX_SIZE_MB:
+        for f in files:
+            if f.endswith(".whl"):
+                wheel_path = os.path.join(root, f)
+                wheel_size = os.path.getsize(wheel_path)
+                wheel_size_mb = wheel_size / (1024 * 1024)
+                if wheel_size_mb > MAX_SIZE_MB:
                     print(
-                        f"Not allowed: Wheel {wheel_path} is larger "
-                        f"({wheel_size_mb:.2f} MB) than the limit "
-                        f"({VLLM_MAX_SIZE_MB} MB)."
-                    )
+                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
+                        f"compare to the allowed size ({MAX_SIZE_MB} MB).")
                     print_top_10_largest_files(wheel_path)
                     return 1
                 else:
-                    print(
-                        f"Wheel {wheel_path} is within the allowed size "
-                        f"({wheel_size_mb:.2f} MB)."
-                    )
+                    print(f"Wheel {wheel_path} is within the allowed size "
+                          f"({wheel_size_mb} MB).")
     return 0
 
 
 if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print("Usage: python check-wheel-size.py <directory>")
-        sys.exit(1)
-
-    directory = sys.argv[1]
-    sys.exit(check_wheel_size(directory))
+    import sys
+    sys.exit(check_wheel_size(sys.argv[1]))
.buildkite/download-images.sh (new file, 18 lines)
@@ -0,0 +1,18 @@
#!/bin/bash

set -ex
set -o pipefail

(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg

cd -
@@ -1,46 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import os

template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1/>
<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
</body>
</html>
"""

parser = argparse.ArgumentParser()
parser.add_argument("--wheel", help="The wheel path.", required=True)
args = parser.parse_args()

filename = os.path.basename(args.wheel)

with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
    if "x86_64" in filename:
        x86_wheel = filename
        arm_wheel = filename.replace("x86_64", "aarch64").replace(
            "manylinux1", "manylinux2014"
        )
    elif "aarch64" in filename:
        x86_wheel = filename.replace("aarch64", "x86_64").replace(
            "manylinux2014", "manylinux1"
        )
        arm_wheel = filename
    else:
        raise ValueError(f"Unsupported wheel: {filename}")
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(
            x86_wheel=x86_wheel,
            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
            arm_wheel=arm_wheel,
            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
        )
    )
@@ -1,13 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.671
  - name: "exact_match,flexible-extract"
    value: 0.664
limit: 1000
num_fewshot: 5
trust_remote_code: True

@@ -1,12 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.905
  - name: "exact_match,flexible-extract"
    value: 0.905
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.892
  - name: "exact_match,flexible-extract"
    value: 0.892
limit: 250
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.752
  - name: "exact_match,flexible-extract"
    value: 0.754
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.753
  - name: "exact_match,flexible-extract"
    value: 0.753
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.755
  - name: "exact_match,flexible-extract"
    value: 0.755
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.753
  - name: "exact_match,flexible-extract"
    value: 0.753
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.764
  - name: "exact_match,flexible-extract"
    value: 0.764
limit: 250
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.728
  - name: "exact_match,flexible-extract"
    value: 0.728
limit: 250
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.758
  - name: "exact_match,flexible-extract"
    value: 0.759
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.756
  - name: "exact_match,flexible-extract"
    value: 0.752
limit: 250
num_fewshot: 5
@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.335
  - name: "exact_match,flexible-extract"
    value: 0.323
limit: 1319
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.356
  - name: "exact_match,flexible-extract"
    value: 0.358
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.231
  - name: "exact_match,flexible-extract"
    value: 0.22
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.86
  - name: "exact_match,flexible-extract"
    value: 0.86
limit: 250
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.624
  - name: "exact_match,flexible-extract"
    value: 0.624
limit: 250
num_fewshot: 5

@@ -1,12 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.616
  - name: "exact_match,flexible-extract"
    value: 0.632
limit: 250
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.30
  - name: "exact_match,flexible-extract"
    value: 0.465
limit: 1319
num_fewshot: 5
@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.578
  - name: "exact_match,flexible-extract"
    value: 0.585
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.593
  - name: "exact_match,flexible-extract"
    value: 0.588
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.595
  - name: "exact_match,flexible-extract"
    value: 0.582
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.792
  - name: "exact_match,flexible-extract"
    value: 0.824
limit: 250
num_fewshot: 5

@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.54
  - name: "exact_match,flexible-extract"
    value: 0.59
limit: 1319
num_fewshot: 5

@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.47
  - name: "exact_match,flexible-extract"
    value: 0.64
limit: 1319
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.6353
  - name: "exact_match,flexible-extract"
    value: 0.637
limit: null
num_fewshot: null
@@ -1,5 +0,0 @@
Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml

@@ -1,6 +0,0 @@
Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
@@ -1,44 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path

import pytest


def pytest_addoption(parser):
    parser.addoption(
        "--config-list-file",
        action="store",
        help="Path to the file listing model config YAMLs (one per line)",
    )
    parser.addoption(
        "--tp-size",
        action="store",
        default="1",
        help="Tensor parallel size to use for evaluation",
    )


@pytest.fixture(scope="session")
def config_list_file(pytestconfig, config_dir):
    rel_path = pytestconfig.getoption("--config-list-file")
    return config_dir / rel_path


@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    return pytestconfig.getoption("--tp-size")


def pytest_generate_tests(metafunc):
    if "config_filename" in metafunc.fixturenames:
        rel_path = metafunc.config.getoption("--config-list-file")
        config_list_file = Path(rel_path).resolve()
        config_dir = config_list_file.parent
        with open(config_list_file, encoding="utf-8") as f:
            configs = [
                config_dir / line.strip()
                for line in f
                if line.strip() and not line.startswith("#")
            ]
        metafunc.parametrize("config_filename", configs)
@@ -1,46 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]

usage() {
    echo``
    echo "Runs lm eval harness on GSM8k using huggingface transformers."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -b    - batch size to run the evaluation at"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo
}

while getopts "m:b:l:f:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model hf \
  --model_args "pretrained=$MODEL,parallelize=True" \
  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
  --batch_size "$BATCH_SIZE"
@@ -1,51 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]

usage() {
    echo``
    echo "Runs lm eval harness on GSM8k using huggingface transformers."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -b    - batch size to run the evaluation at"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo "  -t    - tensor parallel size to run at"
    echo
}

while getopts "m:b:l:f:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model vllm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
  --batch_size "$BATCH_SIZE"
@@ -1,57 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml

pytest -s -v test_lm_eval_correctness.py \
    --config-list-file=configs/models-small.txt \
    --tp-size=1
"""

import lm_eval
import numpy as np
import yaml

RTOL = 0.08


def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager=true,"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
        f"max_model_len={max_model_len}"
    )
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto",
    )
    return results


def test_lm_eval_correctness_param(config_filename, tp_size):
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))

    results = launch_lm_eval(eval_config, tp_size)

    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
            print(
                f"{task['name']} | {metric['name']}: "
                f"ground_truth={ground_truth} | measured={measured_value}"
            )
            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)

    assert success
@@ -1,177 +0,0 @@
# vLLM benchmark suite

## Introduction

This directory contains two sets of benchmarks for vllm:

- Performance benchmark: benchmark vllm's performance under various workloads, so that **developers** can see clearly whether their PR improves or degrades vllm's performance.
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), so that **the public** knows when to choose vllm.

See the [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and the [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for the latest nightly benchmark results.

## Performance benchmark quick overview

**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on A100 (support for the FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.

**Benchmarking Duration**: about 1 hr.

**For benchmarking developers**: please try your best to keep the benchmark duration to about 1 hr so that it won't take forever to run.

## Nightly benchmark quick overview

**Benchmarking Coverage**: fixed-QPS serving on A100 (support for the FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.

**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.

**Benchmarking Duration**: about 3.5 hrs.

## Trigger the benchmark

The performance benchmark will be triggered when:

- A PR is merged into vllm.
- Every commit is pushed to a PR that has both the `perf-benchmarks` label AND the `ready` label.

Manually trigger the benchmark:

```bash
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```

Runtime environment variables:

- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP of the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port of the remote vLLM service to benchmark. Default value is empty string.

The nightly benchmark will be triggered when:

- Every commit is pushed to a PR that has both the `perf-benchmarks` label and the `nightly-benchmarks` label.

## Performance benchmark details

See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.

> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.

### Latency test

Here is an example of one test inside `latency-tests.json`:

```json
[
    {
        "test_name": "latency_llama8B_tp1",
        "parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    },
]
```

In this example:

- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute controls the command line arguments used for `vllm bench latency`. Note that you should use an underscore `_` instead of a dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` (see the sketch after this list).
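As a rough illustration of that underscore-to-dash conversion (a minimal hypothetical sketch only; the real conversion happens in bash inside `run-performance-benchmarks.sh`), a `parameters` object could be turned into a `vllm bench latency` command line like this:

```python
# Hypothetical sketch: map a "parameters" dict from latency-tests.json onto
# CLI flags for `vllm bench latency`. Keys use underscores; flags use dashes.
import json
import shlex

params = json.loads("""
{
    "model": "meta-llama/Meta-Llama-3-8B",
    "tensor_parallel_size": 1,
    "load_format": "dummy",
    "num_iters_warmup": 5,
    "num_iters": 15
}
""")

args = []
for key, value in params.items():
    flag = "--" + key.replace("_", "-")   # underscore -> dash
    if value == "":
        args.append(flag)                 # flag-only parameter, e.g. "disable_log_stats": ""
    else:
        args.extend([flag, str(value)])

print("vllm bench latency " + " ".join(shlex.quote(a) for a in args))
# vllm bench latency --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 ...
```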
Note that the performance numbers are highly sensitive to the values of these parameters. Please make sure the parameters are set correctly.

WARNING: The benchmarking script saves json results by itself, so please do not configure the `--output-json` parameter in the json file.

### Throughput test

The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are fed forward to `vllm bench throughput`.

The number of prompts in this test is also kept stable -- a slight change in that number can shift the performance numbers by a lot.

### Serving test

We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:

```json
[
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
]
```

Inside this example:

- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server_parameters` attribute includes the command line arguments for the vLLM server.
- The `client_parameters` attribute includes the command line arguments for `vllm bench serve`.
- The `qps_list` attribute controls the list of QPS values for the test. It is used to configure the `--request-rate` parameter in `vllm bench serve` (see the sketch after this list).
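As a rough sketch of how `qps_list` maps onto client runs (an illustration under assumed names only; the actual orchestration lives in `run-performance-benchmarks.sh`), each entry becomes one `vllm bench serve` invocation with a different `--request-rate`:

```python
# Hypothetical sketch: run one `vllm bench serve` client per entry in qps_list,
# reusing the client_parameters from serving-tests.json. Illustration only;
# it assumes a vLLM server started with server_parameters is already running.
import subprocess

qps_list = [1, 4, 16, "inf"]
client_parameters = {
    "model": "meta-llama/Meta-Llama-3-8B",
    "backend": "vllm",
    "dataset_name": "sharegpt",
    "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
    "num_prompts": 200,
}

for qps in qps_list:
    cmd = ["vllm", "bench", "serve", "--request-rate", str(qps)]
    for key, value in client_parameters.items():
        cmd.extend(["--" + key.replace("_", "-"), str(value)])
    print("running:", " ".join(cmd))
    subprocess.run(cmd, check=True)
```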
The numbers from this test are less stable than the latency and throughput benchmarks (due to randomized ShareGPT dataset sampling inside `benchmark_serving.py`), but a large change in these numbers (e.g. a 5% change) still moves the output noticeably.

WARNING: The benchmarking script saves json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.

### Visualizing the results

The `convert-results-json-to-markdown.py` script helps you put the benchmarking results into a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait until the benchmark finishes running.
The json version of the table (together with the json version of the benchmark) is also attached to the markdown file.
The raw benchmarking results (as json files) are in the `Artifacts` tab of the benchmarking run.

The `compare-json-results.py` script helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
When run, the benchmark script generates results under the `benchmark/results` folder, along with `benchmark_results.md` and `benchmark_results.json`.
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratios, e.g. for Output Tput, Median TTFT and Median TPOT.
If only one `benchmark_results.json` is passed, `compare-json-results.py` instead compares the different TP and PP configurations within that file.

Here is an example using the script to compare result_a and result_b by Model, Dataset Name, input/output length, max concurrency and qps:
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`

|    | Model                                 | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|----|---------------------------------------|--------------|-----------|------------|----------------------|-----|----------------------------------|----------------------------------|------------|
| 0  | meta-llama/Meta-Llama-3.1-8B-Instruct | random       | 128       | 128        | 1000                 | 1   | 142.633982                       | 156.526018                       | 1.097396   |
| 1  | meta-llama/Meta-Llama-3.1-8B-Instruct | random       | 128       | 128        | 1000                 | inf | 241.620334                       | 294.018783                       | 1.216863   |

A comparison diagram will be generated below the table.
Here is an example comparing 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3:
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />

## Nightly test details

See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description of the test workloads, models and docker containers used when benchmarking other LLM engines.

### Workflow

- [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for the different LLM serving engines.
- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which probes the serving engine of the current container.
- `scripts/run-nightly-benchmarks.sh` parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and launches the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and upload the results to buildkite.

### Nightly tests

In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for the benchmarking commands, together with the benchmarking test cases. The format is very similar to the performance benchmark.

### Docker containers

The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.

WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.

WARNING: bumping `trt-llm` to the latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
@@ -1,184 +0,0 @@
steps:
  - label: "Wait for container to be ready"
    key: wait-for-container-image
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            containers:
              - image: badouralix/curl-jq
                command:
                  - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh

  - label: "Cleanup H100"
    agents:
      queue: H100
    depends_on: ~
    command: docker system prune -a --volumes --force

  - label: "A100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: A100
    depends_on: wait-for-container-image
    if: build.branch == "main"
    plugins:
      - kubernetes:
          podSpec:
            priorityClassName: perf-benchmark
            containers:
              - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
                command:
                  - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
                resources:
                  limits:
                    nvidia.com/gpu: 8
                volumeMounts:
                  - name: devshm
                    mountPath: /dev/shm
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
            nodeSelector:
              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
            volumes:
              - name: devshm
                emptyDir:
                  medium: Memory

  - label: "H200"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H200
    depends_on: wait-for-container-image
    if: build.branch == "main"
    plugins:
      - docker#v5.12.0:
          image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
          command:
            - bash
            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
          mount-buildkite-agent: true
          propagate-environment: true
          ipc: host
          gpus: 4,5,6,7
          volumes:
            - /data/benchmark-hf-cache:/root/.cache/huggingface
          environment:
            - VLLM_USAGE_SOURCE
            - HF_TOKEN

  #- block: "Run H100 Benchmark"
  #  key: block-h100
  #  depends_on: ~

  - label: "H100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H100
    depends_on: wait-for-container-image
    if: build.branch == "main"
    plugins:
      - docker#v5.12.0:
          image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
          command:
            - bash
            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
          mount-buildkite-agent: true
          propagate-environment: true
          ipc: host
          gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
          volumes:
            - /data/benchmark-hf-cache:/root/.cache/huggingface
          environment:
            - VLLM_USAGE_SOURCE
            - HF_TOKEN

  # Premerge benchmark
  - label: "A100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: A100
    depends_on: wait-for-container-image
    if: build.branch != "main"
    plugins:
      - kubernetes:
          podSpec:
            priorityClassName: perf-benchmark
            containers:
              - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
                command:
                  - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
                resources:
                  limits:
                    nvidia.com/gpu: 8
                volumeMounts:
                  - name: devshm
                    mountPath: /dev/shm
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
            nodeSelector:
              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
            volumes:
              - name: devshm
                emptyDir:
                  medium: Memory

  - label: "H200"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H200
    depends_on: wait-for-container-image
    if: build.branch != "main"
    plugins:
      - docker#v5.12.0:
          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
          command:
            - bash
            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
          mount-buildkite-agent: true
          propagate-environment: true
          ipc: host
          gpus: 4,5,6,7
          volumes:
            - /data/benchmark-hf-cache:/root/.cache/huggingface
          environment:
            - VLLM_USAGE_SOURCE
            - HF_TOKEN

  #- block: "Run H100 Benchmark"
  #  key: block-h100
  #  depends_on: ~

  - label: "H100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H100
    depends_on: wait-for-container-image
    if: build.branch != "main"
    plugins:
      - docker#v5.12.0:
          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
          command:
            - bash
            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
          mount-buildkite-agent: true
          propagate-environment: true
          ipc: host
          gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
          volumes:
            - /data/benchmark-hf-cache:/root/.cache/huggingface
          environment:
            - VLLM_USAGE_SOURCE
            - HF_TOKEN
@@ -1,28 +0,0 @@
# Nightly benchmark annotation

## Description

This file contains the download links for the benchmarking results.

- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
- [benchmarking results](artifact://results.zip)
- [benchmarking code](artifact://nightly-benchmarks.zip)

Please download the visualization scripts in the post.

## Results reproduction

- Find the docker image we use in the `benchmarking pipeline`.
- Deploy the docker image, and inside the docker container:
  - Download `nightly-benchmarks.zip`.
  - In the same folder, run the following code:

    ```bash
    export HF_TOKEN=<your HF token>
    apt update
    apt install -y git
    unzip nightly-benchmarks.zip
    VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
    ```

And the results will be inside `./benchmarks/results`.
@@ -1,39 +0,0 @@
# Nightly benchmark

This benchmark aims to:

- Provide performance clarity: show which engine (vllm, tensorrt-llm, lmdeploy or SGLang) leads in performance in which workload.
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker image by following the reproducing instructions.

Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.

Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)

## Setup

- Docker images:
  - vLLM: `vllm/vllm-openai:v0.6.2`
  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
    - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
  - 8x Nvidia A100 GPUs
- Workload:
  - Dataset
    - ShareGPT dataset
    - Prefill-heavy dataset (on average 462 input tokens, 16 output tokens)
    - Decode-heavy dataset (on average 462 input tokens, 256 output tokens)
    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of the datasets we use.
  - Models: llama-3 8B, llama-3 70B.
    - We do not use llama 3.1, as it is incompatible with trt-llm r24.07 ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
  - Average QPS (queries per second): 2, 4, 8, 16, 32 and inf.
    - Queries are randomly sampled, and arrival patterns are determined via a Poisson process, all with a fixed random seed.
- Evaluation metrics: throughput (higher is better), TTFT (time to the first token, lower is better), ITL (inter-token latency, lower is better).

## Known issues

- TRT-LLM crashes with Llama 3.1 8B ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
- TGI does not support the `ignore-eos` flag.
@@ -1,196 +0,0 @@
common_pod_spec: &common_pod_spec
  priorityClassName: perf-benchmark
  nodeSelector:
    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
  volumes:
    - name: devshm
      emptyDir:
        medium: Memory
    - name: hf-cache
      hostPath:
        path: /root/.cache/huggingface
        type: Directory

common_container_settings: &common_container_settings
  command:
    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
  resources:
    limits:
      nvidia.com/gpu: 8
  volumeMounts:
    - name: devshm
      mountPath: /dev/shm
    - name: hf-cache
      mountPath: /root/.cache/huggingface
  env:
    - name: VLLM_USAGE_SOURCE
      value: ci-test
    - name: HF_HOME
      value: /root/.cache/huggingface
    - name: VLLM_SOURCE_CODE_LOC
      value: /workspace/build/buildkite/vllm/performance-benchmark
    - name: HF_TOKEN
      valueFrom:
        secretKeyRef:
          name: hf-token-secret
          key: token

steps:
  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."

  - label: "A100 vllm step 10"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: vllm/vllm-openai:v0.6.2
                <<: *common_container_settings

  - label: "A100 sglang benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: lmsysorg/sglang:v0.3.2-cu121
                <<: *common_container_settings

  - label: "A100 lmdeploy benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: openmmlab/lmdeploy:v0.6.1-cu12
                <<: *common_container_settings

  - label: "A100 trt llama-8B"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
                <<: *common_container_settings
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_HOME
                    value: /root/.cache/huggingface
                  - name: VLLM_SOURCE_CODE_LOC
                    value: /workspace/build/buildkite/vllm/performance-benchmark
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                  - name: TEST_SELECTOR
                    value: "llama8B"

  - label: "A100 trt llama-70B"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
                <<: *common_container_settings
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_HOME
                    value: /root/.cache/huggingface
                  - name: VLLM_SOURCE_CODE_LOC
                    value: /workspace/build/buildkite/vllm/performance-benchmark
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                  - name: TEST_SELECTOR
                    value: "llama70B"

  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
  # - label: "A100 trt benchmark"
  #   priority: 100
  #   agents:
  #     queue: A100
  #   plugins:
  #     - kubernetes:
  #         podSpec:
  #           <<: *common_pod_spec
  #           containers:
  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
  #               <<: *common_container_settings

  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
  # - label: "A100 tgi benchmark"
  #   priority: 100
  #   agents:
  #     queue: A100
  #   plugins:
  #     - kubernetes:
  #         podSpec:
  #           <<: *common_pod_spec
  #           containers:
  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
  #               <<: *common_container_settings

  - wait

  - label: "Collect the results"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: vllm/vllm-openai:v0.5.0.post1
                command:
                  - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
                resources:
                  limits:
                    nvidia.com/gpu: 8
                volumeMounts:
                  - name: devshm
                    mountPath: /dev/shm
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: VLLM_SOURCE_CODE_LOC
                    value: /workspace/build/buildkite/vllm/performance-benchmark
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token

  - block: ":rocket: check the results!"
@ -1,65 +0,0 @@
|
|||||||
# Performance benchmarks descriptions
|
|
||||||
|
|
||||||
## Latency tests
|
|
||||||
|
|
||||||
- Input length: 32 tokens.
|
|
||||||
- Output length: 128 tokens.
|
|
||||||
- Batch size: fixed (8).
|
|
||||||
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
|
||||||
- CPU Models: llama-3.1 8B.
|
|
||||||
- Evaluation metrics: end-to-end latency (mean, median, p99).
|
|
||||||
|
|
||||||
{latency_tests_markdown_table}
|
|
||||||
|
|
||||||
## Throughput tests
|
|
||||||
|
|
||||||
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
|
||||||
- Output length: the corresponding output length of these 200 prompts.
|
|
||||||
- Batch size: dynamically determined by vllm to achieve maximum throughput.
|
|
||||||
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
|
||||||
- CPU Models: llama-3.1 8B.
|
|
||||||
- Evaluation metrics: throughput.
|
|
||||||
|
|
||||||
{throughput_tests_markdown_table}
|
|
||||||
|
|
||||||
## Serving tests
|
|
||||||
|
|
||||||
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
|
||||||
- Output length: the corresponding output length of these 200 prompts.
|
|
||||||
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
|
|
||||||
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
|
|
||||||
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
|
||||||
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
|
|
||||||
- CPU Models: llama-3.1 8B.
|
|
||||||
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
|
|
||||||
- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.
|
|
||||||
|
|
||||||
{serving_tests_markdown_table}
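To make the arrival pattern concrete, below is a minimal, illustrative sketch of how Poisson arrivals at a target QPS can be generated. It is not the benchmark client itself; the `qps` value, request count, and seed are placeholders.

```python
import numpy as np

qps = 4             # target average queries per second (placeholder)
num_requests = 200  # matches the 200 sampled prompts
rng = np.random.default_rng(0)  # fixed seed for reproducibility

# In a Poisson process the inter-arrival times are exponentially
# distributed with mean 1/qps; their cumulative sum gives the time
# at which each request is sent.
inter_arrival = rng.exponential(scale=1.0 / qps, size=num_requests)
send_times = inter_arrival.cumsum()
print(send_times[:5])
```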

## Platform Information

{platform_markdown_table}

## JSON version of the benchmarking tables

This section contains the data of the markdown tables above in JSON format.
You can load the benchmarking tables into pandas dataframes as follows:

```python
import json
import pandas as pd

benchmarking_results_json = """The json string"""
benchmarking_results = json.loads(benchmarking_results_json)
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
```

The JSON string for all benchmarking tables:

```json
{benchmarking_results_in_json_string}
```

You can also check the raw experiment data in the Artifact tab of the Buildkite page.
@ -1,307 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
from importlib import util
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
plotly_found = util.find_spec("plotly.express") is not None
|
|
||||||
|
|
||||||
|
|
||||||
def compare_data_columns(
|
|
||||||
files, name_column, data_column, info_cols, drop_column, debug=False
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Align concatenation by keys derived from info_cols instead of row order.
|
|
||||||
- Pick one canonical key list: subset of info_cols present in ALL files.
|
|
||||||
- For each file: set index to those keys, aggregate duplicates
|
|
||||||
- (mean for metric, first for names).
|
|
||||||
- Concat along axis=1 (indexes align), then reset_index so callers can
|
|
||||||
- group by columns.
|
|
||||||
- If --debug, add a <file_label>_name column per file.
|
|
||||||
"""
|
|
||||||
print("\ncompare_data_column:", data_column)
|
|
||||||
|
|
||||||
frames = []
|
|
||||||
raw_data_cols = []
|
|
||||||
compare_frames = []
|
|
||||||
|
|
||||||
# 1) choose a canonical key list from info_cols that exists in ALL files
|
|
||||||
cols_per_file = []
|
|
||||||
for f in files:
|
|
||||||
try:
|
|
||||||
df_tmp = pd.read_json(f, orient="records")
|
|
||||||
except Exception as err:
|
|
||||||
raise ValueError(f"Failed to read {f}") from err
|
|
||||||
cols_per_file.append(set(df_tmp.columns))
|
|
||||||
|
|
||||||
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
|
|
||||||
if not key_cols:
|
|
||||||
# soft fallback: use any info_cols present in the first file
|
|
||||||
key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
|
|
||||||
if not key_cols:
|
|
||||||
raise ValueError(
|
|
||||||
"No common key columns found from info_cols across the input files."
|
|
||||||
)
|
|
||||||
|
|
||||||
# 2) build a single "meta" block (keys as columns) once, aligned by the key index
|
|
||||||
meta_added = False
|
|
||||||
|
|
||||||
for file in files:
|
|
||||||
df = pd.read_json(file, orient="records")
|
|
||||||
|
|
||||||
# Keep rows that actually have the compared metric (same as original behavior)
|
|
||||||
if drop_column in df.columns:
|
|
||||||
df = df.dropna(subset=[drop_column], ignore_index=True)
|
|
||||||
|
|
||||||
# Stabilize numeric key columns (harmless if missing)
|
|
||||||
for c in (
|
|
||||||
"Input Len",
|
|
||||||
"Output Len",
|
|
||||||
"TP Size",
|
|
||||||
"PP Size",
|
|
||||||
"# of max concurrency.",
|
|
||||||
"qps",
|
|
||||||
):
|
|
||||||
if c in df.columns:
|
|
||||||
df[c] = pd.to_numeric(df[c], errors="coerce")
|
|
||||||
|
|
||||||
# Ensure all key columns exist
|
|
||||||
for c in key_cols:
|
|
||||||
if c not in df.columns:
|
|
||||||
df[c] = pd.NA
|
|
||||||
|
|
||||||
# Set index = key_cols and aggregate duplicates → unique MultiIndex
|
|
||||||
df_idx = df.set_index(key_cols, drop=False)
|
|
||||||
|
|
||||||
# meta (key columns), unique per key
|
|
||||||
meta = df_idx[key_cols]
|
|
||||||
if not meta.index.is_unique:
|
|
||||||
meta = meta.groupby(level=key_cols, dropna=False).first()
|
|
||||||
|
|
||||||
# metric series for this file, aggregated to one row per key
|
|
||||||
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
|
|
||||||
s = df_idx[data_column]
|
|
||||||
if not s.index.is_unique:
|
|
||||||
s = s.groupby(level=key_cols, dropna=False).mean()
|
|
||||||
s.name = file_label # column label like original
|
|
||||||
|
|
||||||
# add meta once (from first file) so keys are the leftmost columns
|
|
||||||
if not meta_added:
|
|
||||||
frames.append(meta)
|
|
||||||
meta_added = True
|
|
||||||
|
|
||||||
# (NEW) debug: aligned test-name column per file
|
|
||||||
if debug and name_column in df_idx.columns:
|
|
||||||
name_s = df_idx[name_column]
|
|
||||||
if not name_s.index.is_unique:
|
|
||||||
name_s = name_s.groupby(level=key_cols, dropna=False).first()
|
|
||||||
name_s.name = f"{file_label}_name"
|
|
||||||
frames.append(name_s)
|
|
||||||
|
|
||||||
frames.append(s)
|
|
||||||
raw_data_cols.append(file_label)
|
|
||||||
compare_frames.append(s)
|
|
||||||
|
|
||||||
# Generalize ratio: for any file N>=2, add ratio (fileN / file1)
|
|
||||||
if len(compare_frames) >= 2:
|
|
||||||
base = compare_frames[0]
|
|
||||||
current = compare_frames[-1]
|
|
||||||
ratio = current / base
|
|
||||||
ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
|
|
||||||
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
|
|
||||||
frames.append(ratio)
|
|
||||||
|
|
||||||
# 4) concat on columns with aligned MultiIndex;
|
|
||||||
# then reset_index to return keys as columns
|
|
||||||
concat_df = pd.concat(frames, axis=1)
|
|
||||||
concat_df = concat_df.reset_index(drop=True).reset_index()
|
|
||||||
if "index" in concat_df.columns:
|
|
||||||
concat_df = concat_df.drop(columns=["index"])
|
|
||||||
|
|
||||||
# Ensure key/info columns appear first (in your info_cols order)
|
|
||||||
front = [c for c in info_cols if c in concat_df.columns]
|
|
||||||
rest = [c for c in concat_df.columns if c not in front]
|
|
||||||
concat_df = concat_df[front + rest]
|
|
||||||
|
|
||||||
print(raw_data_cols)
|
|
||||||
return concat_df, raw_data_cols
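# Illustrative call of compare_data_columns (the file paths are hypothetical; the
# column names are the ones used elsewhere in this script):
#
#   df, raw_cols = compare_data_columns(
#       ["baseline/benchmark_results.json", "candidate/benchmark_results.json"],
#       name_column="Test name",
#       data_column="Output Tput (tok/s)",
#       info_cols=["Model", "Dataset Name", "Input Len", "Output Len",
#                  "TP Size", "PP Size", "# of max concurrency.", "qps"],
#       drop_column="P99",
#   )
#
# The returned frame has the key columns first, one metric column per input file,
# and a trailing "Ratio 1 vs N" column (last file divided by the first file).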
|
|
||||||
|
|
||||||
|
|
||||||
def split_json_by_tp_pp(
|
|
||||||
input_file: str = "benchmark_results.json", output_root: str = "."
|
|
||||||
) -> list[str]:
|
|
||||||
"""
|
|
||||||
Split a benchmark JSON into separate folders by (TP Size, PP Size).
|
|
||||||
|
|
||||||
Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
|
|
||||||
Returns: list of file paths written.
|
|
||||||
"""
|
|
||||||
# Load JSON data into DataFrame
|
|
||||||
with open(input_file, encoding="utf-8") as f:
|
|
||||||
data = json.load(f)
|
|
||||||
|
|
||||||
# If the JSON is a dict with a list under common keys, use that list
|
|
||||||
if isinstance(data, dict):
|
|
||||||
for key in ("results", "serving_results", "benchmarks", "data"):
|
|
||||||
if isinstance(data.get(key), list):
|
|
||||||
data = data[key]
|
|
||||||
break
|
|
||||||
|
|
||||||
df = pd.DataFrame(data)
|
|
||||||
|
|
||||||
# Keep only "serving" tests
|
|
||||||
name_col = next(
|
|
||||||
(c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
|
|
||||||
)
|
|
||||||
if name_col:
|
|
||||||
df = df[
|
|
||||||
df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
|
|
||||||
].copy()
|
|
||||||
|
|
||||||
# Handle alias column names
|
|
||||||
rename_map = {
|
|
||||||
"tp_size": "TP Size",
|
|
||||||
"tensor_parallel_size": "TP Size",
|
|
||||||
"pp_size": "PP Size",
|
|
||||||
"pipeline_parallel_size": "PP Size",
|
|
||||||
}
|
|
||||||
df.rename(
|
|
||||||
columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
|
|
||||||
)
|
|
||||||
|
|
||||||
# Ensure TP/PP columns exist (default to 1 if missing)
|
|
||||||
if "TP Size" not in df.columns:
|
|
||||||
df["TP Size"] = 1
|
|
||||||
if "PP Size" not in df.columns:
|
|
||||||
df["PP Size"] = 1
|
|
||||||
|
|
||||||
# make sure TP/PP are numeric ints with no NaN
|
|
||||||
df["TP Size"] = (
|
|
||||||
pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
|
|
||||||
)
|
|
||||||
df["PP Size"] = (
|
|
||||||
pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Split into separate folders
|
|
||||||
saved_paths: list[str] = []
|
|
||||||
for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
|
|
||||||
folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
|
|
||||||
os.makedirs(folder_name, exist_ok=True)
|
|
||||||
filepath = os.path.join(folder_name, "benchmark_results.json")
|
|
||||||
group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
|
|
||||||
print(f"Saved: {filepath}")
|
|
||||||
saved_paths.append(filepath)
|
|
||||||
|
|
||||||
return saved_paths
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"-f", "--file", action="append", type=str, help="input file name"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--debug", action="store_true", help="show all information for debugging"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--plot",
|
|
||||||
action=argparse.BooleanOptionalAction,
|
|
||||||
default=True,
|
|
||||||
help="plot perf diagrams or not --no-plot --plot",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"-x",
|
|
||||||
"--xaxis",
|
|
||||||
type=str,
|
|
||||||
default="# of max concurrency.",
|
|
||||||
help="column name to use as X Axis in comparison graph",
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
drop_column = "P99"
|
|
||||||
name_column = "Test name"
|
|
||||||
info_cols = [
|
|
||||||
"Model",
|
|
||||||
"Dataset Name",
|
|
||||||
"Input Len",
|
|
||||||
"Output Len",
|
|
||||||
"TP Size",
|
|
||||||
"PP Size",
|
|
||||||
"# of max concurrency.",
|
|
||||||
"qps",
|
|
||||||
]
|
|
||||||
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
|
|
||||||
html_msgs_for_data_cols = [
|
|
||||||
"Compare Output Tokens /n",
|
|
||||||
"Median TTFT /n",
|
|
||||||
"Median TPOT /n",
|
|
||||||
]
|
|
||||||
|
|
||||||
if len(args.file) == 1:
|
|
||||||
files = split_json_by_tp_pp(args.file[0], output_root="splits")
|
|
||||||
info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
|
|
||||||
else:
|
|
||||||
files = args.file
|
|
||||||
print("comparing : " + ", ".join(files))
|
|
||||||
debug = args.debug
|
|
||||||
plot = args.plot
|
|
||||||
# For Plot feature, assign y axis from one of info_cols
|
|
||||||
y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
|
|
||||||
with open("perf_comparison.html", "w") as text_file:
|
|
||||||
for i in range(len(data_cols_to_compare)):
|
|
||||||
output_df, raw_data_cols = compare_data_columns(
|
|
||||||
files,
|
|
||||||
name_column,
|
|
||||||
data_cols_to_compare[i],
|
|
||||||
info_cols,
|
|
||||||
drop_column,
|
|
||||||
debug=debug,
|
|
||||||
)
|
|
||||||
|
|
||||||
# For Plot feature, insert y axis from one of info_cols
|
|
||||||
raw_data_cols.insert(0, info_cols[y_axis_index])
|
|
||||||
|
|
||||||
filtered_info_cols = info_cols[:-2]
|
|
||||||
existing_group_cols = [
|
|
||||||
c for c in filtered_info_cols if c in output_df.columns
|
|
||||||
]
|
|
||||||
if not existing_group_cols:
|
|
||||||
raise ValueError(
|
|
||||||
f"No valid group-by columns "
|
|
||||||
f"Expected subset: {filtered_info_cols}, "
|
|
||||||
f"but DataFrame has: {list(output_df.columns)}"
|
|
||||||
)
|
|
||||||
output_df_sorted = output_df.sort_values(by=existing_group_cols)
|
|
||||||
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
|
|
||||||
for name, group in output_groups:
|
|
||||||
html = group.to_html()
|
|
||||||
text_file.write(html_msgs_for_data_cols[i])
|
|
||||||
text_file.write(html)
|
|
||||||
|
|
||||||
if plot and plotly_found:
|
|
||||||
import plotly.express as px
|
|
||||||
|
|
||||||
df = group[raw_data_cols]
|
|
||||||
df_sorted = df.sort_values(by=info_cols[y_axis_index])
|
|
||||||
# Melt DataFrame for plotting
|
|
||||||
df_melted = df_sorted.melt(
|
|
||||||
id_vars=info_cols[y_axis_index],
|
|
||||||
var_name="Configuration",
|
|
||||||
value_name=data_cols_to_compare[i],
|
|
||||||
)
|
|
||||||
title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
|
|
||||||
# Create Plotly line chart
|
|
||||||
fig = px.line(
|
|
||||||
df_melted,
|
|
||||||
x=info_cols[y_axis_index],
|
|
||||||
y=data_cols_to_compare[i],
|
|
||||||
color="Configuration",
|
|
||||||
title=title,
|
|
||||||
markers=True,
|
|
||||||
)
|
|
||||||
# Export to HTML
|
|
||||||
text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
|
|
@ -1,412 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import shlex
|
|
||||||
from importlib import util
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import psutil
|
|
||||||
import regex as re
|
|
||||||
from tabulate import tabulate
|
|
||||||
|
|
||||||
# latency results and the keys that will be printed into markdown
|
|
||||||
latency_results = []
|
|
||||||
latency_column_mapping = {
|
|
||||||
"test_name": "Test name",
|
|
||||||
"gpu_type": "GPU",
|
|
||||||
"avg_latency": "Mean latency (ms)",
|
|
||||||
# "P10": "P10 (s)",
|
|
||||||
# "P25": "P25 (s)",
|
|
||||||
"P50": "Median latency (ms)",
|
|
||||||
# "P75": "P75 (s)",
|
|
||||||
# "P90": "P90 (s)",
|
|
||||||
"P99": "P99 latency (ms)",
|
|
||||||
}
|
|
||||||
|
|
||||||
# throughput tests and the keys that will be printed into markdown
|
|
||||||
throughput_results = []
|
|
||||||
throughput_results_column_mapping = {
|
|
||||||
"test_name": "Test name",
|
|
||||||
"gpu_type": "GPU",
|
|
||||||
"num_requests": "# of req.",
|
|
||||||
"total_num_tokens": "Total # of tokens",
|
|
||||||
"elapsed_time": "Elapsed time (s)",
|
|
||||||
"requests_per_second": "Tput (req/s)",
|
|
||||||
"tokens_per_second": "Tput (tok/s)",
|
|
||||||
}
|
|
||||||
|
|
||||||
# serving results and the keys that will be printed into markdown
|
|
||||||
serving_results = []
|
|
||||||
serving_column_mapping = {
|
|
||||||
"test_name": "Test name",
|
|
||||||
"model_id": "Model",
|
|
||||||
"dataset_name": "Dataset Name",
|
|
||||||
"input_len": "Input Len",
|
|
||||||
"output_len": "Output Len",
|
|
||||||
"tp_size": "TP Size",
|
|
||||||
"pp_size": "PP Size",
|
|
||||||
"dtype": "dtype",
|
|
||||||
"gpu_type": "GPU",
|
|
||||||
"completed": "# of req.",
|
|
||||||
"qps": "qps",
|
|
||||||
"max_concurrency": "# of max concurrency.",
|
|
||||||
"request_throughput": "Tput (req/s)",
|
|
||||||
"total_token_throughput": "Total Token Tput (tok/s)",
|
|
||||||
"output_throughput": "Output Tput (tok/s)",
|
|
||||||
# "total_input_tokens": "Total input tokens",
|
|
||||||
# "total_output_tokens": "Total output tokens",
|
|
||||||
"mean_ttft_ms": "Mean TTFT (ms)",
|
|
||||||
"median_ttft_ms": "Median TTFT (ms)",
|
|
||||||
"p99_ttft_ms": "P99 TTFT (ms)",
|
|
||||||
"mean_tpot_ms": "Mean TPOT (ms)",
|
|
||||||
"median_tpot_ms": "Median",
|
|
||||||
"p99_tpot_ms": "P99",
|
|
||||||
"mean_itl_ms": "Mean ITL (ms)",
|
|
||||||
"median_itl_ms": "Median ITL (ms)",
|
|
||||||
"p99_itl_ms": "P99 ITL (ms)",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def read_markdown(file):
|
|
||||||
if os.path.exists(file):
|
|
||||||
with open(file) as f:
|
|
||||||
return f.read() + "\n"
|
|
||||||
else:
|
|
||||||
return f"{file} not found.\n"
|
|
||||||
|
|
||||||
|
|
||||||
def results_to_json(latency, throughput, serving):
|
|
||||||
return json.dumps(
|
|
||||||
{
|
|
||||||
"latency": latency.to_dict(),
|
|
||||||
"throughput": throughput.to_dict(),
|
|
||||||
"serving": serving.to_dict(),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_size_with_unit(bytes, suffix="B"):
|
|
||||||
"""
|
|
||||||
Scale bytes to its proper format
|
|
||||||
e.g:
|
|
||||||
1253656 => '1.20MB'
|
|
||||||
1253656678 => '1.17GB'
|
|
||||||
"""
|
|
||||||
factor = 1024
|
|
||||||
for unit in ["", "K", "M", "G", "T", "P"]:
|
|
||||||
if bytes < factor:
|
|
||||||
return f"{bytes:.2f}{unit}{suffix}"
|
|
||||||
bytes /= factor
|
|
||||||
|
|
||||||
|
|
||||||
def _coerce(val: str) -> Any:
|
|
||||||
"""Best-effort type coercion from string to Python types."""
|
|
||||||
low = val.lower()
|
|
||||||
if low == "null":
|
|
||||||
return None
|
|
||||||
if low == "true":
|
|
||||||
return True
|
|
||||||
if low == "false":
|
|
||||||
return False
|
|
||||||
# integers
|
|
||||||
if re.fullmatch(r"[+-]?\d+", val):
|
|
||||||
try:
|
|
||||||
return int(val)
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
# floats (keep 'inf'/'-inf'/'nan' as strings)
|
|
||||||
if re.fullmatch(r"[+-]?\d*\.\d+", val):
|
|
||||||
try:
|
|
||||||
return float(val)
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
return val
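# Illustrative behaviour of _coerce, following the rules above:
#   _coerce("8")    -> 8 (int)
#   _coerce("0.95") -> 0.95 (float)
#   _coerce("true") -> True
#   _coerce("null") -> None
#   _coerce("inf")  -> "inf"  (deliberately kept as a string)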
|
|
||||||
|
|
||||||
|
|
||||||
def parse_client_command(cmd: str) -> dict[str, Any]:
|
|
||||||
"""Parse the client_command shell string into {executable, script, args}."""
|
|
||||||
toks = shlex.split(cmd)
|
|
||||||
if len(toks) < 2:
|
|
||||||
raise ValueError("client_command must include an executable and a script")
|
|
||||||
executable, script = toks[0], toks[1]
|
|
||||||
args: dict[str, Any] = {}
|
|
||||||
|
|
||||||
i = 2
|
|
||||||
while i < len(toks):
|
|
||||||
t = toks[i]
|
|
||||||
if t.startswith("--"):
|
|
||||||
# --key=value or --key (value) or boolean flag
|
|
||||||
if "=" in t:
|
|
||||||
key, val = t.split("=", 1)
|
|
||||||
if key == "--metadata":
|
|
||||||
md = {}
|
|
||||||
if val:
|
|
||||||
if "=" in val:
|
|
||||||
k, v = val.split("=", 1)
|
|
||||||
md[k] = _coerce(v)
|
|
||||||
else:
|
|
||||||
md[val] = True
|
|
||||||
args[key] = md
|
|
||||||
else:
|
|
||||||
args[key] = _coerce(val)
|
|
||||||
i += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
key = t
|
|
||||||
|
|
||||||
# Special: consume metadata k=v pairs until next --flag
|
|
||||||
if key == "--metadata":
|
|
||||||
i += 1
|
|
||||||
md = {}
|
|
||||||
while i < len(toks) and not toks[i].startswith("--"):
|
|
||||||
pair = toks[i]
|
|
||||||
if "=" in pair:
|
|
||||||
k, v = pair.split("=", 1)
|
|
||||||
md[k] = _coerce(v)
|
|
||||||
else:
|
|
||||||
md[pair] = True
|
|
||||||
i += 1
|
|
||||||
args[key] = md
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Standard: check if next token is a value (not a flag)
|
|
||||||
if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
|
|
||||||
args[key] = _coerce(toks[i + 1])
|
|
||||||
i += 2
|
|
||||||
else:
|
|
||||||
# lone flag -> True
|
|
||||||
args[key] = True
|
|
||||||
i += 1
|
|
||||||
else:
|
|
||||||
# unexpected positional; skip
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
return {"executable": executable, "script": script, "args": args}
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"-r",
|
|
||||||
"--result",
|
|
||||||
type=str,
|
|
||||||
default="results",
|
|
||||||
help="Folder name for benchmark output results.",
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
results_folder = Path(args.result)
|
|
||||||
if not results_folder.exists():
|
|
||||||
raise FileNotFoundError(f"results folder does not exist: {results_folder}")
|
|
||||||
# collect results
|
|
||||||
for test_file in results_folder.glob("*.json"):
|
|
||||||
with open(test_file) as f:
|
|
||||||
raw_result = json.loads(f.read())
|
|
||||||
|
|
||||||
if "serving" in str(test_file):
|
|
||||||
# this result is generated via `vllm bench serve` command
|
|
||||||
# attach the benchmarking command to raw_result
|
|
||||||
try:
|
|
||||||
with open(test_file.with_suffix(".commands")) as f:
|
|
||||||
command = json.loads(f.read())
|
|
||||||
except OSError as e:
|
|
||||||
print(e)
|
|
||||||
continue
|
|
||||||
# Parse Server Command Arg
|
|
||||||
out: dict[str, Any] = {
|
|
||||||
"server_command": parse_client_command(command["server_command"])
|
|
||||||
}
|
|
||||||
parse_args = [
|
|
||||||
"--tensor-parallel-size",
|
|
||||||
"--pipeline-parallel-size",
|
|
||||||
"--dtype",
|
|
||||||
]
|
|
||||||
col_mapping = ["tp_size", "pp_size", "dtype"]
|
|
||||||
for index, arg in enumerate(parse_args):
|
|
||||||
if arg in out["server_command"]["args"]:
|
|
||||||
raw_result.update(
|
|
||||||
{col_mapping[index]: out["server_command"]["args"][arg]}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Parse Client Command Arg
|
|
||||||
out: dict[str, Any] = {
|
|
||||||
"client_command": parse_client_command(command["client_command"])
|
|
||||||
}
|
|
||||||
parse_args = [
|
|
||||||
"--dataset-name",
|
|
||||||
"--random-input-len",
|
|
||||||
"--random-output-len",
|
|
||||||
"--request-rate",
|
|
||||||
]
|
|
||||||
col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
|
|
||||||
|
|
||||||
for index, arg in enumerate(parse_args):
|
|
||||||
if arg in out["client_command"]["args"]:
|
|
||||||
raw_result.update(
|
|
||||||
{col_mapping[index]: out["client_command"]["args"][arg]}
|
|
||||||
)
|
|
||||||
# Add Server, Client command
|
|
||||||
raw_result.update(command)
|
|
||||||
|
|
||||||
# update the test name of this result
|
|
||||||
raw_result.update({"test_name": test_file.stem})
|
|
||||||
# add the result to raw_result
|
|
||||||
serving_results.append(raw_result)
|
|
||||||
continue
|
|
||||||
|
|
||||||
elif "latency" in f.name:
|
|
||||||
# this result is generated via `vllm bench latency` command
|
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
|
||||||
try:
|
|
||||||
with open(test_file.with_suffix(".commands")) as f:
|
|
||||||
command = json.loads(f.read())
|
|
||||||
except OSError as e:
|
|
||||||
print(e)
|
|
||||||
continue
|
|
||||||
|
|
||||||
raw_result.update(command)
|
|
||||||
|
|
||||||
# update the test name of this result
|
|
||||||
raw_result.update({"test_name": test_file.stem})
|
|
||||||
|
|
||||||
# get different percentiles
|
|
||||||
for perc in [10, 25, 50, 75, 90, 99]:
|
|
||||||
# Multiply 1000 to convert the time unit from s to ms
|
|
||||||
raw_result.update(
|
|
||||||
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
|
|
||||||
)
|
|
||||||
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
|
|
||||||
|
|
||||||
# add the result to raw_result
|
|
||||||
latency_results.append(raw_result)
|
|
||||||
continue
|
|
||||||
|
|
||||||
elif "throughput" in f.name:
|
|
||||||
# this result is generated via `vllm bench throughput` command
|
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
|
||||||
try:
|
|
||||||
with open(test_file.with_suffix(".commands")) as f:
|
|
||||||
command = json.loads(f.read())
|
|
||||||
except OSError as e:
|
|
||||||
print(e)
|
|
||||||
continue
|
|
||||||
|
|
||||||
raw_result.update(command)
|
|
||||||
|
|
||||||
# update the test name of this result
|
|
||||||
raw_result.update({"test_name": test_file.stem})
|
|
||||||
|
|
||||||
# add the result to raw_result
|
|
||||||
throughput_results.append(raw_result)
|
|
||||||
continue
|
|
||||||
|
|
||||||
print(f"Skipping {test_file}")
|
|
||||||
|
|
||||||
latency_results = pd.DataFrame.from_dict(latency_results)
|
|
||||||
serving_results = pd.DataFrame.from_dict(serving_results)
|
|
||||||
throughput_results = pd.DataFrame.from_dict(throughput_results)
|
|
||||||
|
|
||||||
svmem = psutil.virtual_memory()
|
|
||||||
platform_data = {
|
|
||||||
"Physical cores": [psutil.cpu_count(logical=False)],
|
|
||||||
"Total cores": [psutil.cpu_count(logical=True)],
|
|
||||||
"Total Memory": [get_size_with_unit(svmem.total)],
|
|
||||||
}
|
|
||||||
|
|
||||||
if util.find_spec("numa") is not None:
|
|
||||||
from numa import info
|
|
||||||
|
|
||||||
platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()]
|
|
||||||
|
|
||||||
if util.find_spec("cpuinfo") is not None:
|
|
||||||
from cpuinfo import get_cpu_info
|
|
||||||
|
|
||||||
platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]]
|
|
||||||
|
|
||||||
platform_results = pd.DataFrame.from_dict(
|
|
||||||
platform_data, orient="index", columns=["Platform Info"]
|
|
||||||
)
|
|
||||||
|
|
||||||
raw_results_json = results_to_json(
|
|
||||||
latency_results, throughput_results, serving_results
|
|
||||||
)
|
|
||||||
|
|
||||||
# remapping the key, for visualization purpose
|
|
||||||
if not latency_results.empty:
|
|
||||||
latency_results = latency_results[list(latency_column_mapping.keys())].rename(
|
|
||||||
columns=latency_column_mapping
|
|
||||||
)
|
|
||||||
if not serving_results.empty:
|
|
||||||
valid_columns = [
|
|
||||||
col for col in serving_column_mapping if col in serving_results.columns
|
|
||||||
]
|
|
||||||
serving_results = serving_results[valid_columns].rename(
|
|
||||||
columns=serving_column_mapping
|
|
||||||
)
|
|
||||||
if not throughput_results.empty:
|
|
||||||
throughput_results = throughput_results[
|
|
||||||
list(throughput_results_column_mapping.keys())
|
|
||||||
].rename(columns=throughput_results_column_mapping)
|
|
||||||
|
|
||||||
processed_results_json = results_to_json(
|
|
||||||
latency_results, throughput_results, serving_results
|
|
||||||
)
|
|
||||||
|
|
||||||
for df in [latency_results, serving_results, throughput_results]:
|
|
||||||
if df.empty:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Sort all dataframes by their respective "Test name" columns
|
|
||||||
df.sort_values(by="Test name", inplace=True)
|
|
||||||
|
|
||||||
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
|
|
||||||
# we want to turn it into "8xGPUTYPE"
|
|
||||||
df["GPU"] = df["GPU"].apply(
|
|
||||||
lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# get markdown tables
|
|
||||||
latency_md_table = tabulate(
|
|
||||||
latency_results, headers="keys", tablefmt="pipe", showindex=False
|
|
||||||
)
|
|
||||||
serving_md_table = tabulate(
|
|
||||||
serving_results, headers="keys", tablefmt="pipe", showindex=False
|
|
||||||
)
|
|
||||||
throughput_md_table = tabulate(
|
|
||||||
throughput_results, headers="keys", tablefmt="pipe", showindex=False
|
|
||||||
)
|
|
||||||
platform_md_table = tabulate(
|
|
||||||
platform_results, headers="keys", tablefmt="pipe", showindex=True
|
|
||||||
)
|
|
||||||
|
|
||||||
# document the result
|
|
||||||
md_file = "benchmark_results.md"
|
|
||||||
json_file = "benchmark_results.json"
|
|
||||||
with open(results_folder / md_file, "w") as f:
|
|
||||||
results = read_markdown(
|
|
||||||
"../.buildkite/nightly-benchmarks/"
|
|
||||||
+ "performance-benchmarks-descriptions.md"
|
|
||||||
)
|
|
||||||
results = results.format(
|
|
||||||
latency_tests_markdown_table=latency_md_table,
|
|
||||||
throughput_tests_markdown_table=throughput_md_table,
|
|
||||||
serving_tests_markdown_table=serving_md_table,
|
|
||||||
platform_markdown_table=platform_md_table,
|
|
||||||
benchmarking_results_in_json_string=processed_results_json,
|
|
||||||
)
|
|
||||||
f.write(results)
|
|
||||||
|
|
||||||
# document benchmarking results in json
|
|
||||||
with open(results_folder / json_file, "w") as f:
|
|
||||||
results = (
|
|
||||||
latency_results.to_dict(orient="records")
|
|
||||||
+ throughput_results.to_dict(orient="records")
|
|
||||||
+ serving_results.to_dict(orient="records")
|
|
||||||
)
|
|
||||||
f.write(json.dumps(results))
|
|
@ -1,26 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse

from transformers import AutoTokenizer


def main(model, cachedir):
    # Load the tokenizer and save it to the specified directory
    tokenizer = AutoTokenizer.from_pretrained(model)
    tokenizer.save_pretrained(cachedir)
    print(f"Tokenizer saved to {cachedir}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer"
    )
    parser.add_argument("--model", type=str, required=True, help="Name of the model")
    parser.add_argument(
        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
    )

    args = parser.parse_args()
    main(args.model, args.cachedir)
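# Illustrative invocation (mirroring how the nightly benchmark harness uses this script):
#   python3 download-tokenizer.py --model meta-llama/Llama-2-7b-chat-hf --cachedir /tokenizer_cache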
@ -1,97 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import json
from pathlib import Path

import numpy as np
import pandas as pd
from tabulate import tabulate


def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Parse command line arguments for summary-nightly-results script."
    )
    parser.add_argument(
        "--results-folder",
        type=str,
        required=True,
        help="The folder where the results are stored.",
    )
    parser.add_argument(
        "--description", type=str, required=True, help="Description of the results."
    )

    args = parser.parse_args()
    return args


def get_perf(df, method, model, metric):
    means = []

    for qps in [2, 4, 8, 16, "inf"]:
        target = df["Test name"].str.contains(model)
        target = target & df["Engine"].str.contains(method)
        target = target & df["Test name"].str.contains("qps_" + str(qps))
        filtered_df = df[target]

        if filtered_df.empty:
            means.append(0.0)
        else:
            means.append(filtered_df[metric].values[0])

    return np.array(means)


def get_perf_w_std(df, method, model, metric):
    if metric in ["TTFT", "ITL"]:
        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
        mean = mean.tolist()
        std = get_perf(df, method, model, "Std " + metric + " (ms)")
        if std.mean() == 0:
            std = None
        success = get_perf(df, method, model, "Successful req.")
        if std is not None:
            std = std / np.sqrt(success)
            std = std.tolist()

    else:
        assert metric == "Tput"
        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
            df, method, model, "Output Tput (tok/s)"
        )
        mean = mean.tolist()
        std = None

    return mean, std
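# Note: when a non-zero std is available, get_perf_w_std rescales it by
# 1/sqrt(successful requests), i.e. the value returned next to the mean is the
# standard error of the mean rather than the raw standard deviation.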


def main(args):
    results_folder = Path(args.results_folder)

    results = []

    # collect results
    for test_file in results_folder.glob("*_nightly_results.json"):
        with open(test_file) as f:
            results = results + json.loads(f.read())

    # generate markdown table
    df = pd.DataFrame.from_dict(results)

    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)

    with open(args.description) as f:
        description = f.read()

    description = description.format(nightly_results_benchmarking_table=md_table)

    with open("nightly_results.md", "w") as f:
        f.write(description)


if __name__ == "__main__":
    args = parse_arguments()
    main(args)
@ -1,9 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from lmdeploy.serve.openai.api_client import APIClient

api_client = APIClient("http://localhost:8000")
model_name = api_client.available_models[0]

print(model_name)
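# Why this helper exists: lmdeploy does not necessarily serve a model under its
# standard Hugging Face name, so the nightly benchmark asks the running server for
# the model name it actually exposes before launching the benchmark client.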
@ -1,224 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Currently FP8 benchmark is NOT enabled.
|
|
||||||
|
|
||||||
set -x
|
|
||||||
server_params=$1
|
|
||||||
common_params=$2
|
|
||||||
|
|
||||||
json2args() {
|
|
||||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
|
||||||
# example:
|
|
||||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
|
||||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
|
||||||
local json_string=$1
|
|
||||||
local args=$(
|
|
||||||
echo "$json_string" | jq -r '
|
|
||||||
to_entries |
|
|
||||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
|
||||||
join(" ")
|
|
||||||
'
|
|
||||||
)
|
|
||||||
echo "$args"
|
|
||||||
}
|
|
||||||
|
|
||||||
launch_trt_server() {
|
|
||||||
|
|
||||||
model_path=$(echo "$common_params" | jq -r '.model')
|
|
||||||
model_name="${model_path#*/}"
|
|
||||||
model_type=$(echo "$server_params" | jq -r '.model_type')
|
|
||||||
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
|
|
||||||
model_tp_size=$(echo "$common_params" | jq -r '.tp')
|
|
||||||
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
|
|
||||||
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
|
|
||||||
max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
|
|
||||||
max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
|
|
||||||
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
|
|
||||||
|
|
||||||
# create model caching directory
|
|
||||||
cd ~
|
|
||||||
rm -rf models
|
|
||||||
mkdir -p models
|
|
||||||
cd models
|
|
||||||
models_dir=$(pwd)
|
|
||||||
trt_model_path=${models_dir}/${model_name}-trt-ckpt
|
|
||||||
trt_engine_path=${models_dir}/${model_name}-trt-engine
|
|
||||||
|
|
||||||
# clone tensorrt backend
|
|
||||||
cd /
|
|
||||||
rm -rf tensorrtllm_backend
|
|
||||||
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
|
|
||||||
git lfs install
|
|
||||||
cd tensorrtllm_backend
|
|
||||||
git checkout "$trt_llm_version"
|
|
||||||
git submodule update --init --recursive
|
|
||||||
|
|
||||||
# build trtllm engine
|
|
||||||
cd /tensorrtllm_backend
|
|
||||||
cd "./tensorrt_llm/examples/${model_type}"
|
|
||||||
python3 convert_checkpoint.py \
|
|
||||||
--model_dir "${model_path}" \
|
|
||||||
--dtype "${model_dtype}" \
|
|
||||||
--tp_size "${model_tp_size}" \
|
|
||||||
--output_dir "${trt_model_path}"
|
|
||||||
trtllm-build \
|
|
||||||
--checkpoint_dir "${trt_model_path}" \
|
|
||||||
--use_fused_mlp \
|
|
||||||
--reduce_fusion disable \
|
|
||||||
--workers 8 \
|
|
||||||
--gpt_attention_plugin "${model_dtype}" \
|
|
||||||
--gemm_plugin "${model_dtype}" \
|
|
||||||
--tp_size "${model_tp_size}" \
|
|
||||||
--max_batch_size "${max_batch_size}" \
|
|
||||||
--max_input_len "${max_input_len}" \
|
|
||||||
--max_seq_len "${max_seq_len}" \
|
|
||||||
--max_num_tokens "${max_num_tokens}" \
|
|
||||||
--output_dir "${trt_engine_path}"
|
|
||||||
|
|
||||||
# handle triton protobuf files and launch triton server
|
|
||||||
cd /tensorrtllm_backend
|
|
||||||
mkdir triton_model_repo
|
|
||||||
cp -r all_models/inflight_batcher_llm/* triton_model_repo/
|
|
||||||
cd triton_model_repo
|
|
||||||
rm -rf ./tensorrt_llm/1/*
|
|
||||||
cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
|
|
||||||
python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
|
|
||||||
python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5"
|
|
||||||
python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false"
|
|
||||||
python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size"
|
|
||||||
python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1"
|
|
||||||
cd /tensorrtllm_backend
|
|
||||||
python3 scripts/launch_triton_server.py \
|
|
||||||
--world_size="${model_tp_size}" \
|
|
||||||
--model_repo=/tensorrtllm_backend/triton_model_repo &
|
|
||||||
|
|
||||||
}
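# launch_trt_server (above) proceeds in three stages: (1) convert the Hugging Face
# checkpoint into a TensorRT-LLM checkpoint, (2) build the engine with trtllm-build,
# and (3) fill in the Triton config templates and start tritonserver in the background.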
|
|
||||||
|
|
||||||
launch_tgi_server() {
|
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
|
||||||
server_args=$(json2args "$server_params")
|
|
||||||
|
|
||||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
|
||||||
echo "Key 'fp8' exists in common params."
|
|
||||||
server_command="/tgi-entrypoint.sh \
|
|
||||||
--model-id $model \
|
|
||||||
--num-shard $tp \
|
|
||||||
--port $port \
|
|
||||||
--quantize fp8 \
|
|
||||||
$server_args"
|
|
||||||
else
|
|
||||||
echo "Key 'fp8' does not exist in common params."
|
|
||||||
server_command="/tgi-entrypoint.sh \
|
|
||||||
--model-id $model \
|
|
||||||
--num-shard $tp \
|
|
||||||
--port $port \
|
|
||||||
$server_args"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Server command: $server_command"
|
|
||||||
eval "$server_command" &
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
launch_lmdeploy_server() {
|
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
|
||||||
server_args=$(json2args "$server_params")
|
|
||||||
|
|
||||||
server_command="lmdeploy serve api_server $model \
|
|
||||||
--tp $tp \
|
|
||||||
--server-port $port \
|
|
||||||
$server_args"
|
|
||||||
|
|
||||||
# run the server
|
|
||||||
echo "Server command: $server_command"
|
|
||||||
bash -c "$server_command" &
|
|
||||||
}
|
|
||||||
|
|
||||||
launch_sglang_server() {
|
|
||||||
|
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
|
||||||
server_args=$(json2args "$server_params")
|
|
||||||
|
|
||||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
|
||||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
|
||||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
|
||||||
server_command="python3 \
|
|
||||||
-m sglang.launch_server \
|
|
||||||
--tp $tp \
|
|
||||||
--model-path $model \
|
|
||||||
--port $port \
|
|
||||||
$server_args"
|
|
||||||
else
|
|
||||||
echo "Key 'fp8' does not exist in common params."
|
|
||||||
server_command="python3 \
|
|
||||||
-m sglang.launch_server \
|
|
||||||
--tp $tp \
|
|
||||||
--model-path $model \
|
|
||||||
--port $port \
|
|
||||||
$server_args"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# run the server
|
|
||||||
echo "Server command: $server_command"
|
|
||||||
eval "$server_command" &
|
|
||||||
}
|
|
||||||
|
|
||||||
launch_vllm_server() {
|
|
||||||
|
|
||||||
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
|
||||||
|
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
|
||||||
server_args=$(json2args "$server_params")
|
|
||||||
|
|
||||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
|
||||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
|
||||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
|
||||||
server_command="vllm serve $model \
|
|
||||||
-tp $tp \
|
|
||||||
--port $port \
|
|
||||||
$server_args"
|
|
||||||
else
|
|
||||||
echo "Key 'fp8' does not exist in common params."
|
|
||||||
server_command="vllm serve $model \
|
|
||||||
-tp $tp \
|
|
||||||
--port $port \
|
|
||||||
$server_args"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# run the server
|
|
||||||
echo "Server command: $server_command"
|
|
||||||
eval "$server_command" &
|
|
||||||
}
|
|
||||||
|
|
||||||
main() {
|
|
||||||
|
|
||||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then
|
|
||||||
launch_trt_server
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then
|
|
||||||
launch_tgi_server
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
|
|
||||||
launch_lmdeploy_server
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then
|
|
||||||
launch_sglang_server
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then
|
|
||||||
launch_vllm_server
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
main
|
|
@ -1,78 +0,0 @@
#!/bin/bash

set -ex
set -o pipefail


main() {

    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
    (which jq) || (apt-get update && apt-get -y install jq)
    (which zip) || (apt-get install -y zip)

    if [ ! -f /workspace/buildkite-agent ]; then
        echo "buildkite-agent binary not found. Skip plotting the results."
        exit 0
    fi

    # initial annotation
    #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"

    # download results
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    mkdir -p results/
    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
    ls
    ls results/

    # upload benchmark results
    zip -r results.zip results/
    /workspace/buildkite-agent artifact upload "results.zip"

    # upload benchmarking scripts
    cd "$VLLM_SOURCE_CODE_LOC/"
    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"

    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
    # upload benchmarking pipeline
    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"

    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md


    # The figures should be generated by a separate process outside the CI/CD pipeline

    # # generate figures
    # python3 -m pip install tabulate pandas matplotlib

    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
    #   --description $description \
    #   --results-folder results/

    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
    #   --description $description \
    #   --results-folder results/ \
    #   --dataset sharegpt

    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
    #   --description $description \
    #   --results-folder results/ \
    #   --dataset sonnet_2048_128

    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
    #   --description $description \
    #   --results-folder results/ \
    #   --dataset sonnet_128_2048

    # # upload results and figures
    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}

main "$@"
@ -1,464 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -o pipefail
|
|
||||||
set -x
|
|
||||||
|
|
||||||
check_gpus() {
|
|
||||||
# check the number of GPUs and GPU type.
|
|
||||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
|
||||||
if [[ $gpu_count -gt 0 ]]; then
|
|
||||||
echo "GPU found."
|
|
||||||
else
|
|
||||||
echo "Need at least 1 GPU to run benchmarking."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
|
|
||||||
echo "GPU type is $gpu_type"
|
|
||||||
}
|
|
||||||
|
|
||||||
check_hf_token() {
|
|
||||||
# check if HF_TOKEN is available and valid
|
|
||||||
if [[ -z "$HF_TOKEN" ]]; then
|
|
||||||
echo "Error: HF_TOKEN is not set."
|
|
||||||
exit 1
|
|
||||||
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
|
|
||||||
echo "Error: HF_TOKEN does not start with 'hf_'."
|
|
||||||
exit 1
|
|
||||||
else
|
|
||||||
echo "HF_TOKEN is set and valid."
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
upload_to_buildkite() {
|
|
||||||
# upload the benchmarking results to buildkite
|
|
||||||
|
|
||||||
# if the agent binary is not found, skip uploading the results, exit 0
|
|
||||||
if [ ! -f /workspace/buildkite-agent ]; then
|
|
||||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
|
||||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
get_current_llm_serving_engine() {
|
|
||||||
|
|
||||||
if which lmdeploy >/dev/null; then
|
|
||||||
echo "Container: lmdeploy"
|
|
||||||
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
|
|
||||||
return
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -e /tgi-entrypoint.sh ]; then
|
|
||||||
echo "Container: tgi"
|
|
||||||
export CURRENT_LLM_SERVING_ENGINE=tgi
|
|
||||||
return
|
|
||||||
fi
|
|
||||||
|
|
||||||
if which trtllm-build >/dev/null; then
|
|
||||||
echo "Container: tensorrt-llm"
|
|
||||||
export CURRENT_LLM_SERVING_ENGINE=trt
|
|
||||||
return
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -e /sgl-workspace ]; then
|
|
||||||
echo "Container: sglang"
|
|
||||||
export CURRENT_LLM_SERVING_ENGINE=sglang
|
|
||||||
return
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -e /vllm-workspace ]; then
|
|
||||||
echo "Container: vllm"
|
|
||||||
# move to a completely irrelevant directory, to avoid import vllm from current folder
|
|
||||||
export CURRENT_LLM_SERVING_ENGINE=vllm
|
|
||||||
|
|
||||||
return
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
json2args() {
|
|
||||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
|
||||||
# example:
|
|
||||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
|
||||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
|
||||||
local json_string=$1
|
|
||||||
local args=$(
|
|
||||||
echo "$json_string" | jq -r '
|
|
||||||
to_entries |
|
|
||||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
|
||||||
join(" ")
|
|
||||||
'
|
|
||||||
)
|
|
||||||
echo "$args"
|
|
||||||
}
|
|
||||||
|
|
||||||
kill_gpu_processes() {
|
|
||||||
pkill -f '[p]ython'
|
|
||||||
pkill -f '[p]ython3'
|
|
||||||
pkill -f '[t]ritonserver'
|
|
||||||
pkill -f '[p]t_main_thread'
|
|
||||||
pkill -f '[t]ext-generation'
|
|
||||||
pkill -f '[l]mdeploy'
|
|
||||||
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
|
||||||
pkill -f '[V]LLM'
|
|
||||||
|
|
||||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
}
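# kill_gpu_processes (above) returns only once nvidia-smi reports under 1000 MiB of
# used memory on the first GPU, so the next engine starts from a clean GPU.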
|
|
||||||
|
|
||||||
wait_for_server() {
|
|
||||||
# wait for vllm server to start
|
|
||||||
# return 1 if vllm server crashes
|
|
||||||
timeout 1200 bash -c '
|
|
||||||
until curl -s localhost:8000/v1/completions > /dev/null; do
|
|
||||||
sleep 1
|
|
||||||
done' && return 0 || return 1
|
|
||||||
}
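# wait_for_server (above) polls the OpenAI-compatible /v1/completions endpoint on
# localhost:8000 for up to 1200 seconds and returns non-zero if the server never comes up.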
|
|
||||||
|
|
||||||
ensure_installed() {
|
|
||||||
# Ensure that the given command is installed by apt-get
|
|
||||||
local cmd=$1
|
|
||||||
if ! which "$cmd" >/dev/null; then
|
|
||||||
apt-get update && apt-get install -y "$cmd"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
run_serving_tests() {
|
|
||||||
# run serving tests using `vllm bench serve` command
|
|
||||||
# $1: a json file specifying serving test cases
|
|
||||||
|
|
||||||
local serving_test_file
|
|
||||||
serving_test_file=$1
|
|
||||||
|
|
||||||
# Iterate over serving tests
|
|
||||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
|
||||||
# get the test name, and append the GPU type back to it.
|
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
|
||||||
|
|
||||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
|
||||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
|
||||||
echo "Skip test case $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
# prepend the current serving engine to the test name
|
|
||||||
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
|
|
||||||
|
|
||||||
# get common parameters
|
|
||||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
|
||||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
|
||||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
|
||||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
|
||||||
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
|
|
||||||
|
|
||||||
# get client and server arguments
|
|
||||||
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
|
|
||||||
client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
|
|
||||||
client_args=$(json2args "$client_params")
|
|
||||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
|
||||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
|
||||||
echo "Running over qps list $qps_list"
|
|
||||||
|
|
||||||
# check if there is enough GPU to run the test
|
|
||||||
if [[ $gpu_count -lt $tp ]]; then
|
|
||||||
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $reuse_server == "true" ]]; then
|
|
||||||
echo "Reuse previous server for test case $test_name"
|
|
||||||
else
|
|
||||||
kill_gpu_processes
|
|
||||||
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
|
|
||||||
"$server_params" "$common_params"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if wait_for_server; then
|
|
||||||
echo ""
|
|
||||||
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
|
|
||||||
else
|
|
||||||
echo ""
|
|
||||||
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
|
|
||||||
# prepare tokenizer
|
|
||||||
# this is required for lmdeploy.
|
|
||||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
|
||||||
rm -rf /tokenizer_cache
|
|
||||||
mkdir /tokenizer_cache
|
|
||||||
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
|
||||||
--model "$model" \
|
|
||||||
--cachedir /tokenizer_cache
|
|
||||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
|
||||||
|
|
||||||
|
|
||||||
# change model name for lmdeploy (it will not follow standard hf name)
|
|
||||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
|
|
||||||
model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
|
|
||||||
fi
|
|
||||||
|
|
||||||
# iterate over different QPS
|
|
||||||
for qps in $qps_list; do
|
|
||||||
# remove the surrounding single quote from qps
|
|
||||||
if [[ "$qps" == *"inf"* ]]; then
|
|
||||||
echo "qps was $qps"
|
|
||||||
qps="inf"
|
|
||||||
echo "now qps is $qps"
|
|
||||||
fi

      new_test_name=$test_name"_qps_"$qps

      backend=$CURRENT_LLM_SERVING_ENGINE

      if [[ $backend = "trt" ]]; then
        backend="tensorrt-llm"
      fi

      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi

      if [[ "$dataset_name" = "sharegpt" ]]; then

        client_command="vllm bench serve \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
          --dataset-name $dataset_name \
          --dataset-path $dataset_path \
          --num-prompts $num_prompts \
          --port $port \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --ignore-eos \
          $client_args"

      elif [[ "$dataset_name" = "sonnet" ]]; then

        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')

        client_command="vllm bench serve \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
          --dataset-name $dataset_name \
          --dataset-path $dataset_path \
          --num-prompts $num_prompts \
          --sonnet-input-len $sonnet_input_len \
          --sonnet-output-len $sonnet_output_len \
          --sonnet-prefix-len $sonnet_prefix_len \
          --port $port \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --ignore-eos \
          $client_args"

      else

        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
        exit 1

      fi

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      server_command="None"

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

    done

  done

  kill_gpu_processes
}

run_genai_perf_tests() {
  # run genai-perf tests

  # $1: a json file specifying genai-perf test cases
  local genai_perf_test_file
  genai_perf_test_file=$1

  # Iterate over genai-perf tests
  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi

    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      break
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps=$num_prompts
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps
      backend=$CURRENT_LLM_SERVING_ENGINE

      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi
      #TODO: add output dir.
      client_command="genai-perf profile \
        -m $model \
        --service-kind openai \
        --backend "$backend" \
        --endpoint-type chat \
        --streaming \
        --url localhost:$port \
        --request-rate $qps \
        --num-prompts $num_prompts \
      "

      echo "Client command: $client_command"

      eval "$client_command"

      #TODO: process/record outputs
    done
  done

  kill_gpu_processes

}

prepare_dataset() {

  # download sharegpt dataset
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  echo "" > sonnet_4x.txt
  for _ in {1..4}
  do
    cat sonnet.txt >> sonnet_4x.txt
  done

}
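
# After prepare_dataset, the benchmarks/ directory is expected to contain
# ShareGPT_V3_unfiltered_cleaned_split.json plus sonnet_4x.txt (sonnet.txt
# concatenated four times, long enough to build 2048-token sonnet prompts).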

main() {

  # check if the environment variable is successfully injected from yaml

  check_gpus
  check_hf_token
  get_current_llm_serving_engine

  pip install -U transformers

  pip install -r requirements/dev.txt
  which genai-perf

  # check storage
  df -h

  ensure_installed wget
  ensure_installed curl
  ensure_installed jq
  # genai-perf dependency
  ensure_installed libb64-0d

  prepare_dataset

  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"

  # run the test
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

  # run genai-perf tests
  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
  mv artifacts/ $RESULTS_FOLDER/

  # upload benchmark results to buildkite
  python3 -m pip install tabulate pandas
  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite

}

main "$@"
@ -1,486 +0,0 @@
#!/bin/bash

# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/

# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see other benchmarking results even when mixtral crashes.
set -x
set -o pipefail

check_gpus() {
  if command -v nvidia-smi; then
    # check the number of GPUs and GPU type.
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  elif command -v amd-smi; then
    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
  fi

  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  if command -v nvidia-smi; then
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
  fi
  echo "GPU type is $gpu_type"
}

check_cpus() {
  # check the number of CPUs and NUMA Node and GPU type.
  declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
  if [[ $numa_count -gt 0 ]]; then
    echo "NUMA found."
    echo $numa_count
  else
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
  fi
  declare -g gpu_type="cpu"
  echo "GPU type is $gpu_type"
}

check_hf_token() {
  # check if HF_TOKEN is available and valid
  if [[ -z "$HF_TOKEN" ]]; then
    echo "Error: HF_TOKEN is not set."
    exit 1
  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
    echo "Error: HF_TOKEN does not start with 'hf_'."
    exit 1
  else
    echo "HF_TOKEN is set and valid."
  fi
}

ensure_sharegpt_downloaded() {
  local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
  if [ ! -f "$FILE" ]; then
    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
  else
    echo "$FILE already exists."
  fi
}

json2args() {
  # transforms the JSON string to command line args, and '_' is replaced to '-'
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}

json2envs() {
  # transforms the JSON string to environment variables.
  # example:
  # input: { "VLLM_CPU_KVCACHE_SPACE": 5 }
  # output: VLLM_CPU_KVCACHE_SPACE=5
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map((.key ) + "=" + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}
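
# Worked example for the two helpers above (hypothetical input values):
#   json2args '{"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "disable_log_stats": ""}'
#     -> --model meta-llama/Llama-3.1-8B-Instruct --tensor-parallel-size 2 --disable-log-stats
#   json2envs '{"VLLM_CPU_KVCACHE_SPACE": 40}'
#     -> VLLM_CPU_KVCACHE_SPACE=40
# run_serving_tests below joins these into "$server_envs vllm serve $server_args".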

wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
  timeout 1200 bash -c '
    until curl -X POST localhost:8000/v1/completions; do
      sleep 1
    done' && return 0 || return 1
}
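
# The probe above simply retries a POST to /v1/completions once per second; curl keeps
# failing while the port is closed, and `timeout 1200` caps the wait at 20 minutes,
# after which wait_for_server returns 1.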

kill_processes_launched_by_current_bash() {
  # Kill all python processes launched from current bash script
  current_shell_pid=$$
  processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
  if [ -n "$processes" ]; then
    echo "Killing the following processes matching '$1':"
    echo "$processes"
    echo "$processes" | xargs kill -9
  else
    echo "No processes found matching '$1'."
  fi
}

kill_gpu_processes() {

  ps -aux
  lsof -t -i:8000 | xargs -r kill -9
  pgrep python3 | xargs -r kill -9
  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
  pgrep VLLM | xargs -r kill -9

  # wait until GPU memory usage smaller than 1GB
  if command -v nvidia-smi; then
    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
      sleep 1
    done
  elif command -v amd-smi; then
    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
      sleep 1
    done
  fi

  # remove vllm config file
  rm -rf ~/.config/vllm

}

upload_to_buildkite() {
  # upload the benchmarking results to buildkite

  # if the agent binary is not found, skip uploading the results, exit 0
  # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
  if command -v buildkite-agent >/dev/null 2>&1; then
    BUILDKITE_AGENT_COMMAND="buildkite-agent"
  elif [ -f /workspace/buildkite-agent ]; then
    BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
  else
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi

  # Use the determined command to annotate and upload artifacts
  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}

run_latency_tests() {
  # run latency tests using `vllm bench latency` command
  # $1: a json file specifying latency test cases

  local latency_test_file
  latency_test_file=$1

  # Iterate over latency tests
  jq -c '.[]' "$latency_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^latency_ ]]; then
      echo "In latency-test.json, test_name must start with \"latency_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get arguments
    latency_params=$(echo "$params" | jq -r '.parameters')
    latency_args=$(json2args "$latency_params")
    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    latency_envs=$(json2envs "$latency_environment_variables")

    # check if there is enough GPU to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
    if [ "$ON_CPU" == "1" ]; then
      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    latency_command=" $latency_envs vllm bench latency \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $latency_args"
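
    # For instance, with the latency_llama8B_tp1 entry from latency-tests.json this
    # expands to roughly:
    #   vllm bench latency --output-json results/latency_llama8B_tp1.json \
    #     --model meta-llama/Meta-Llama-3.1-8B-Instruct --tensor-parallel-size 1 \
    #     --load-format dummy --num-iters-warmup 5 --num-iters 15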

    echo "Running test case $test_name"
    echo "Latency command: $latency_command"

    # record the benchmarking command and GPU type
    jq_output=$(jq -n \
      --arg latency "$latency_command" \
      --arg gpu "$gpu_type" \
      '{
        latency_command: $latency,
        gpu_type: $gpu
      }')
    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$latency_command"

    kill_gpu_processes

  done
}

run_throughput_tests() {
  # run throughput tests using `vllm bench throughput`
  # $1: a json file specifying throughput test cases

  local throughput_test_file
  throughput_test_file=$1

  # Iterate over throughput tests
  jq -c '.[]' "$throughput_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^throughput_ ]]; then
      echo "In throughput-test.json, test_name must start with \"throughput_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get arguments
    throughput_params=$(echo "$params" | jq -r '.parameters')
    throughput_args=$(json2args "$throughput_params")
    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    throughput_envs=$(json2envs "$throughput_environment_variables")

    # check if there is enough GPU to run the test
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
    if [ "$ON_CPU" == "1" ]; then
      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    throughput_command=" $throughput_envs vllm bench throughput \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $throughput_args"

    echo "Running test case $test_name"
    echo "Throughput command: $throughput_command"
    # record the benchmarking command and GPU type
    jq_output=$(jq -n \
      --arg command "$throughput_command" \
      --arg gpu "$gpu_type" \
      '{
        throughput_command: $command,
        gpu_type: $gpu
      }')
    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$throughput_command"

    kill_gpu_processes

  done
}

run_serving_tests() {
  # run serving tests using `vllm bench serve` command
  # $1: a json file specifying serving test cases

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^serving_ ]]; then
      echo "In serving-test.json, test_name must start with \"serving_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.server_parameters')
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')
    server_args=$(json2args "$server_params")
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"
    max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
    if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
      num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
      max_concurrency_list="[$num_prompts]"
    fi
    max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
    echo "Running over max concurrency list $max_concurrency_list"
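
    # e.g. (hypothetical values) qps_list ["inf"] with max_concurrency_list [12, 16]
    # produces two client runs per test case below, saved as
    # <test_name>_qps_inf_concurrency_12.json and <test_name>_qps_inf_concurrency_16.json.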

    # check if there are enough resources to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
    if [ "$ON_CPU" == "1" ]; then
      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    # check if server model and client model are aligned
    server_model=$(echo "$server_params" | jq -r '.model')
    client_model=$(echo "$client_params" | jq -r '.model')
    if [[ $server_model != "$client_model" ]]; then
      echo "Server model and client model must be the same. Skip testcase $test_name."
      continue
    fi

    server_command="$server_envs vllm serve \
      $server_args"

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    # support remote vllm server
    client_remote_args=""
    if [[ -z "${REMOTE_HOST}" ]]; then
      bash -c "$server_command" &
      server_pid=$!
      # wait until the server is alive
      if wait_for_server; then
        echo ""
        echo "vLLM server is up and running."
      else
        echo ""
        echo "vLLM failed to start within the timeout period."
      fi
    else
      server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
      if [[ ${REMOTE_PORT} ]]; then
        client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT "
      else
        client_remote_args=" --host=$REMOTE_HOST "
      fi
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      # iterate over different max_concurrency
      for max_concurrency in $max_concurrency_list; do
        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
        echo " new test name $new_test_name"
        # pass the tensor parallel size to the client so that it can be displayed
        # on the benchmark dashboard
        client_command="vllm bench serve \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
          --metadata "tensor_parallel_size=$tp" \
          $client_args $client_remote_args "

        echo "Running test case $test_name with qps $qps"
        echo "Client command: $client_command"

        bash -c "$client_command"

        # record the benchmarking commands
        jq_output=$(jq -n \
          --arg server "$server_command" \
          --arg client "$client_command" \
          --arg gpu "$gpu_type" \
          '{
            server_command: $server,
            client_command: $client,
            gpu_type: $gpu
          }')
        echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

      done
    done

    # clean up
    kill -9 $server_pid
    kill_gpu_processes
  done
}

main() {
  local ARCH
  ARCH=''
  if [ "$ON_CPU" == "1" ]; then
    check_cpus
    ARCH='-cpu'
  else
    check_gpus
  fi
  check_hf_token

  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)
  (which lsof) || (apt-get update && apt-get install -y lsof)

  # get the current IP address, required by `vllm bench serve` command
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn off the reporting of the status of each request, to clean up the terminal output
  export VLLM_LOGGING_LEVEL="WARNING"

  # prepare for benchmarking
  cd benchmarks || exit 1
  ensure_sharegpt_downloaded
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"

  # postprocess benchmarking results
  pip install tabulate pandas
  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py

  upload_to_buildkite
}

main "$@"
@ -1,82 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import datetime
import json
import os
from pathlib import Path

import pandas as pd
from tabulate import tabulate

results_folder = Path("results/")

# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "completed": "Successful req.",
    "request_throughput": "Tput (req/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "std_ttft_ms": "Std TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "std_itl_ms": "Std ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "std_tpot_ms": "Std TPOT (ms)",
    "median_tpot_ms": "Median TPOT (ms)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    "total_input_tokens": "Total input tokens",
    "total_output_tokens": "Total output tokens",
    "engine": "Engine",
}
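
# For example (hypothetical numbers), a results/vllm_llama8B_tp1_sharegpt_qps_4.json file
# containing {"completed": 500, "request_throughput": 12.3, ...} plus its .commands file
# contributes one row of the table, with columns renamed according to the mapping above.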

if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())

        # attach the benchmarking command to raw_result
        with open(test_file.with_suffix(".commands")) as f:
            command = json.loads(f.read())
        raw_result.update(command)

        # update the test name of this result
        raw_result.update({"test_name": test_file.stem})

        # add the result to serving_results
        serving_results.append(raw_result)
        continue

    serving_results = pd.DataFrame.from_dict(serving_results)

    if not serving_results.empty:
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
            columns=serving_column_mapping
        )

    serving_md_table_with_headers = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    # remove the first line of header
    serving_md_table_lines = serving_md_table_with_headers.split("\n")
    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])

    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")

    # document benchmarking results in markdown
    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
        # document results with header,
        # for those who want to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
        f.write("\n")

    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
        results = serving_results.to_dict(orient="records")
        f.write(json.dumps(results))
@ -1,23 +0,0 @@
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
else
  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
fi

TIMEOUT_SECONDS=10
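
# Poll the registry manifest for the image tagged with this commit; with up to 1000
# retries and a 5-second sleep the loop below waits a little over 80 minutes before
# giving up.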

retries=0
while [ $retries -lt 1000 ]; do
  if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
    exit 0
  fi

  echo "Waiting for image to be available..."

  retries=$((retries + 1))
  sleep 5
done

exit 1
@ -1,21 +0,0 @@
[
    {
        "test_name": "llama8B_tp1_genai_perf",
        "qps_list": [4,8,16,32],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "tp": 1,
            "port": 8000,
            "num_prompts": 500,
            "reuse_server": false
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "gpu_memory_utilization": 0.9,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "genai_perf_input_parameters": {
        }
    }
]
@ -1,30 +0,0 @@
[
    {
        "test_name": "latency_llama8B_tp1",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    },
    {
        "test_name": "latency_llama8B_tp4",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    }
]
@ -1,32 +0,0 @@
[
    {
        "test_name": "latency_llama8B_tp1",
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    },
    {
        "test_name": "latency_llama70B_tp4",
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num-iters-warmup": 5,
            "num-iters": 15
        }
    },
    {
        "test_name": "latency_mixtral8x7B_tp2",
        "parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
            "load_format": "dummy",
            "num-iters-warmup": 5,
            "num-iters": 15
        }
    }
]
@ -1,311 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "llama8B_tp1_sharegpt",
|
|
||||||
"qps_list": [4,8,16,32,"inf"],
|
|
||||||
"common_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
|
||||||
"tp": 1,
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 500,
|
|
||||||
"port": 8000,
|
|
||||||
"reuse_server": false
|
|
||||||
},
|
|
||||||
"lmdeploy_server_parameters": {
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"lmdeploy_client_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_server_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_client_parameters": {
|
|
||||||
"endpoint": "/generate_stream"
|
|
||||||
},
|
|
||||||
"trt_server_parameters": {
|
|
||||||
"model_type": "llama",
|
|
||||||
"model_dtype": "bfloat16",
|
|
||||||
"max_batch_size": 2048,
|
|
||||||
"max_input_len": 4096,
|
|
||||||
"max_seq_len": 6144,
|
|
||||||
"max_num_tokens": 16384,
|
|
||||||
"trt_llm_version": "v0.11.0"
|
|
||||||
},
|
|
||||||
"trt_client_parameters": {
|
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
|
||||||
},
|
|
||||||
"vllm_server_parameters": {
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"gpu_memory_utilization": 0.9,
|
|
||||||
"max_num_seqs": 512,
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"vllm_client_parameters": {
|
|
||||||
},
|
|
||||||
"sglang_server_parameters": {
|
|
||||||
"disable_radix_cache": "",
|
|
||||||
"enable_torch_compile": "",
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"sglang_client_parameters": {
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "llama8B_tp1_sonnet_512_16",
|
|
||||||
"qps_list": [4,8,16,32,"inf"],
|
|
||||||
"common_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
|
||||||
"tp": 1,
|
|
||||||
"dataset_name": "sonnet",
|
|
||||||
"dataset_path": "./sonnet_4x.txt",
|
|
||||||
"num_prompts": 500,
|
|
||||||
"port": 8000,
|
|
||||||
"sonnet_input_len": 512,
|
|
||||||
"sonnet_output_len": 16,
|
|
||||||
"sonnet_prefix_len": 50,
|
|
||||||
"reuse_server": true
|
|
||||||
},
|
|
||||||
"lmdeploy_server_parameters": {
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"lmdeploy_client_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_server_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_client_parameters": {
|
|
||||||
"endpoint": "/generate_stream"
|
|
||||||
},
|
|
||||||
"trt_server_parameters": {
|
|
||||||
"model_type": "llama",
|
|
||||||
"model_dtype": "bfloat16",
|
|
||||||
"max_batch_size": 2048,
|
|
||||||
"max_input_len": 4096,
|
|
||||||
"max_seq_len": 6144,
|
|
||||||
"max_num_tokens": 16384,
|
|
||||||
"trt_llm_version": "v0.11.0"
|
|
||||||
},
|
|
||||||
"trt_client_parameters": {
|
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
|
||||||
},
|
|
||||||
"vllm_server_parameters": {
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"gpu_memory_utilization": 0.9,
|
|
||||||
"max_num_seqs": 512,
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"vllm_client_parameters": {
|
|
||||||
},
|
|
||||||
"sglang_server_parameters": {
|
|
||||||
"disable_radix_cache": "",
|
|
||||||
"enable_torch_compile": "",
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"sglang_client_parameters": {
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "llama8B_tp1_sonnet_512_256",
|
|
||||||
"qps_list": [4,8,16,32,"inf"],
|
|
||||||
"common_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
|
||||||
"tp": 1,
|
|
||||||
"dataset_name": "sonnet",
|
|
||||||
"dataset_path": "./sonnet_4x.txt",
|
|
||||||
"num_prompts": 500,
|
|
||||||
"port": 8000,
|
|
||||||
"sonnet_input_len": 512,
|
|
||||||
"sonnet_output_len": 256,
|
|
||||||
"sonnet_prefix_len": 50,
|
|
||||||
"reuse_server": true
|
|
||||||
},
|
|
||||||
"lmdeploy_server_parameters": {
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"lmdeploy_client_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_server_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_client_parameters": {
|
|
||||||
"endpoint": "/generate_stream"
|
|
||||||
},
|
|
||||||
"trt_server_parameters": {
|
|
||||||
"model_type": "llama",
|
|
||||||
"model_dtype": "bfloat16",
|
|
||||||
"max_batch_size": 2048,
|
|
||||||
"max_input_len": 4096,
|
|
||||||
"max_seq_len": 6144,
|
|
||||||
"max_num_tokens": 16384,
|
|
||||||
"trt_llm_version": "v0.11.0"
|
|
||||||
},
|
|
||||||
"trt_client_parameters": {
|
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
|
||||||
},
|
|
||||||
"vllm_server_parameters": {
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"gpu_memory_utilization": 0.9,
|
|
||||||
"max_num_seqs": 512,
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"vllm_client_parameters": {
|
|
||||||
},
|
|
||||||
"sglang_server_parameters": {
|
|
||||||
"disable_radix_cache": "",
|
|
||||||
"enable_torch_compile": "",
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"sglang_client_parameters": {
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "llama70B_tp4_sharegpt",
|
|
||||||
"qps_list": [4,8,16,32,"inf"],
|
|
||||||
"common_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
|
||||||
"tp": 4,
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 500,
|
|
||||||
"port": 8000,
|
|
||||||
"reuse_server": false
|
|
||||||
},
|
|
||||||
"lmdeploy_server_parameters": {
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"lmdeploy_client_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_server_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_client_parameters": {
|
|
||||||
"endpoint": "/generate_stream"
|
|
||||||
},
|
|
||||||
"trt_server_parameters": {
|
|
||||||
"model_type": "llama",
|
|
||||||
"model_dtype": "bfloat16",
|
|
||||||
"max_batch_size": 2048,
|
|
||||||
"max_input_len": 4096,
|
|
||||||
"max_seq_len": 6144,
|
|
||||||
"max_num_tokens": 16384,
|
|
||||||
"trt_llm_version": "v0.11.0"
|
|
||||||
},
|
|
||||||
"trt_client_parameters": {
|
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
|
||||||
},
|
|
||||||
"vllm_server_parameters": {
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"gpu_memory_utilization": 0.9,
|
|
||||||
"max_num_seqs": 512,
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"vllm_client_parameters": {
|
|
||||||
},
|
|
||||||
"sglang_server_parameters": {
|
|
||||||
"disable_radix_cache": "",
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"sglang_client_parameters": {
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "llama70B_tp4_sonnet_512_16",
|
|
||||||
"qps_list": [4,8,16,32,"inf"],
|
|
||||||
"common_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
|
||||||
"tp": 4,
|
|
||||||
"dataset_name": "sonnet",
|
|
||||||
"dataset_path": "./sonnet_4x.txt",
|
|
||||||
"num_prompts": 500,
|
|
||||||
"port": 8000,
|
|
||||||
"sonnet_input_len": 512,
|
|
||||||
"sonnet_output_len": 16,
|
|
||||||
"sonnet_prefix_len": 50,
|
|
||||||
"reuse_server": true
|
|
||||||
},
|
|
||||||
"lmdeploy_server_parameters": {
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"lmdeploy_client_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_server_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_client_parameters": {
|
|
||||||
"endpoint": "/generate_stream"
|
|
||||||
},
|
|
||||||
"trt_server_parameters": {
|
|
||||||
"model_type": "llama",
|
|
||||||
"model_dtype": "bfloat16",
|
|
||||||
"max_batch_size": 2048,
|
|
||||||
"max_input_len": 4096,
|
|
||||||
"max_seq_len": 6144,
|
|
||||||
"max_num_tokens": 16384,
|
|
||||||
"trt_llm_version": "v0.11.0"
|
|
||||||
},
|
|
||||||
"trt_client_parameters": {
|
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
|
||||||
},
|
|
||||||
"vllm_server_parameters": {
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"gpu_memory_utilization": 0.9,
|
|
||||||
"max_num_seqs": 512,
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"vllm_client_parameters": {
|
|
||||||
},
|
|
||||||
"sglang_server_parameters": {
|
|
||||||
"disable_radix_cache": "",
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"sglang_client_parameters": {
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "llama70B_tp4_sonnet_512_256",
|
|
||||||
"qps_list": [4,8,16,32,"inf"],
|
|
||||||
"common_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
|
||||||
"tp": 4,
|
|
||||||
"dataset_name": "sonnet",
|
|
||||||
"dataset_path": "./sonnet_4x.txt",
|
|
||||||
"num_prompts": 500,
|
|
||||||
"port": 8000,
|
|
||||||
"sonnet_input_len": 512,
|
|
||||||
"sonnet_output_len": 256,
|
|
||||||
"sonnet_prefix_len": 50,
|
|
||||||
"reuse_server": true
|
|
||||||
},
|
|
||||||
"lmdeploy_server_parameters": {
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"lmdeploy_client_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_server_parameters": {
|
|
||||||
},
|
|
||||||
"tgi_client_parameters": {
|
|
||||||
"endpoint": "/generate_stream"
|
|
||||||
},
|
|
||||||
"trt_server_parameters": {
|
|
||||||
"model_type": "llama",
|
|
||||||
"model_dtype": "bfloat16",
|
|
||||||
"max_batch_size": 2048,
|
|
||||||
"max_input_len": 4096,
|
|
||||||
"max_seq_len": 6144,
|
|
||||||
"max_num_tokens": 16384,
|
|
||||||
"trt_llm_version": "v0.11.0"
|
|
||||||
},
|
|
||||||
"trt_client_parameters": {
|
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
|
||||||
},
|
|
||||||
"vllm_server_parameters": {
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"gpu_memory_utilization": 0.9,
|
|
||||||
"max_num_seqs": 512,
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"vllm_client_parameters": {
|
|
||||||
},
|
|
||||||
"sglang_server_parameters": {
|
|
||||||
"disable_radix_cache": "",
|
|
||||||
"dtype": "bfloat16"
|
|
||||||
},
|
|
||||||
"sglang_client_parameters": {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
@ -1,610 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp4_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp4_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp4_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
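A note on the flag-style values in these configs: keys with an empty-string value (such as "enforce_eager", "trust_remote_code", or "ignore-eos") stand for bare boolean CLI switches, while non-empty values become flag arguments. The following is a minimal Python sketch of how a harness might expand one of these parameter blocks into command-line arguments; the helper name is hypothetical and the actual consuming script is not part of this diff:

def params_to_cli_args(params: dict) -> list[str]:
    # Hypothetical helper: expands {"enforce_eager": ""} -> ["--enforce-eager"]
    # and {"max_num_seqs": 256} -> ["--max-num-seqs", "256"].
    args = []
    for key, value in params.items():
        flag = "--" + key.replace("_", "-")
        if value == "":
            args.append(flag)  # bare boolean switch
        else:
            args.extend([flag, str(value)])
    return args

For example, the int8 tp1 server block above would yield arguments such as ["--tensor-parallel-size", "1", "--dtype", "bfloat16", "--enforce-eager", ...].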
@ -1,820 +0,0 @@
[
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_pp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"pipeline_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_pp3_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_pp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"pipeline_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_pp3_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_pp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"pipeline_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_pp3_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_pp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"pipeline_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_pp3_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_pp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"pipeline_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_pp3_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_pp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"pipeline_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_pp3_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
@ -1,168 +0,0 @@
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 128,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "enforce_eager": "",
      "max_num_batched_tokens": 2048,
      "max_num_seqs": 256,
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama8B_tp2_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 2,
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 128,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "enforce_eager": "",
      "max_num_batched_tokens": 2048,
      "max_num_seqs": 256,
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama8B_tp4_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 128,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "enforce_eager": "",
      "max_num_batched_tokens": 2048,
      "max_num_seqs": 256,
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama8B_tp4_random_1024_128",
    "qps_list": [1, 4, 16, "inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 128,
      "trust_remote_code": "",
      "enable_chunked_prefill": "",
      "disable_log_stats": "",
      "enforce_eager": "",
      "max_num_batched_tokens": 2048,
      "max_num_seqs": 256,
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "random",
      "random-input-len": 1024,
      "random-output-len": 128,
      "ignore-eos": "",
      "num_prompts": 100
    }
  },
  {
    "test_name": "serving_llama8B_pp6_random_1024_128",
    "qps_list": [1, 4, 16, "inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "pipeline_parallel_size": 6,
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 128,
      "trust_remote_code": "",
      "enable_chunked_prefill": "",
      "disable_log_stats": "",
      "enforce_eager": "",
      "max_num_batched_tokens": 2048,
      "max_num_seqs": 256,
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "random",
      "random-input-len": 1024,
      "random-output-len": 128,
      "ignore-eos": "",
      "num_prompts": 100
    }
  }
]
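The "qps_list" and "max_concurrency_list" fields describe a sweep of client-side load settings. One natural reading is that each (request rate, concurrency cap) pair corresponds to a separate benchmark run, with "inf" meaning requests are issued as fast as the concurrency cap allows; the exact pairing is decided by the harness script, which is not part of this diff. A small hedged sketch of that cross product:

import itertools

qps_list = [1, 4, 16, "inf"]
max_concurrency_list = [12, 16, 24, 32, 64, 128, 200]

# Assumed pairing: one run per combination; the real harness may pair these differently.
for qps, concurrency in itertools.product(qps_list, max_concurrency_list):
    print(f"run: qps={qps} max_concurrency={concurrency}")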
@ -1,77 +0,0 @@
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama70B_tp4_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_mixtral8x7B_tp2_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
    "qps_list": [2],
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "swap_space": 16,
      "speculative_config": {
        "model": "turboderp/Qwama-0.5B-Instruct",
        "num_speculative_tokens": 4,
        "draft_tensor_parallel_size": 1
      }
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  }
]
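Unlike the other server_parameters in this file, "speculative_config" is a nested object rather than a scalar, so a harness that flattens these configs into CLI flags has to special-case it. One common approach, sketched below under the assumption that the nested value is forwarded as a single JSON-encoded argument (the exact flag name and consuming script are not shown in this diff):

import json

speculative_config = {
    "model": "turboderp/Qwama-0.5B-Instruct",
    "num_speculative_tokens": 4,
    "draft_tensor_parallel_size": 1,
}

# Assumed handling: nested dicts become one JSON string argument,
# e.g. --speculative-config '{"model": ...}', while scalars map to plain flags.
arg = ["--speculative-config", json.dumps(speculative_config)]
print(arg)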
@ -1,32 +0,0 @@
[
  {
    "test_name": "throughput_llama8B_tp1",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  },
  {
    "test_name": "throughput_llama8B_tp4",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  }
]
@ -1,35 +0,0 @@
[
  {
    "test_name": "throughput_llama8B_tp1",
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  },
  {
    "test_name": "throughput_llama70B_tp4",
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  },
  {
    "test_name": "throughput_mixtral8x7B_tp2",
    "parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  }
]
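The throughput entries above carry a flat "parameters" object whose keys map one-to-one onto benchmark CLI flags. A hedged sketch of building such a command, assuming the flags are derived directly from the JSON keys and that a throughput benchmark script akin to benchmarks/benchmark_throughput.py is the consumer (the script itself is not part of this diff):

import shlex

parameters = {
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "tensor_parallel_size": 1,
    "load_format": "dummy",
    "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
    "num_prompts": 200,
    "backend": "vllm",
}

# Each key becomes "--key-with-dashes value"; flag names are assumed, not verified here.
cmd = ["python3", "benchmarks/benchmark_throughput.py"]
for key, value in parameters.items():
    cmd += ["--" + key.replace("_", "-"), str(value)]

print(shlex.join(cmd))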
@ -1,171 +0,0 @@
steps:
  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
  - label: "Build arm64 wheel - CUDA 12.9"
    depends_on: ~
    id: build-wheel-arm64-cuda-12-9
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

  - label: "Build wheel - CUDA 12.8"
    depends_on: ~
    id: build-wheel-cuda-12-8
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

  - label: "Build wheel - CUDA 12.6"
    depends_on: ~
    id: build-wheel-cuda-12-6
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

  # x86 + CUDA builds
  - label: "Build wheel - CUDA 12.9"
    depends_on: ~
    id: build-wheel-cuda-12-9
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

  - label: "Build release image (x86)"
    depends_on: ~
    id: build-release-image-x86
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
      # re-tag to default image tag and push, just in case arm64 build fails
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
  - label: "Build release image (arm64)"
    depends_on: ~
    id: build-release-image-arm64
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

  # Add job to create multi-arch manifest
  - label: "Create multi-arch manifest"
    depends_on:
      - build-release-image-x86
      - build-release-image-arm64
    id: create-multi-arch-manifest
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  - label: "Annotate release workflow"
    depends_on:
      - create-multi-arch-manifest
      - build-wheel-cuda-12-8
    id: annotate-release-workflow
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "bash .buildkite/scripts/annotate-release.sh"

  - label: "Build and publish TPU release image"
    depends_on: ~
    if: build.env("NIGHTLY") == "1"
    agents:
      queue: tpu_queue_postmerge
    commands:
      - "yes | docker system prune -a"
      - "git fetch --all"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
    plugins:
      - docker-login#v3.0.0:
          username: vllmbot
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"

  - input: "Provide Release version here"
    id: input-release-version
    fields:
      - text: "What is the release version?"
        key: release-version

  - block: "Build CPU release image"
    key: block-cpu-release-image-build
    depends_on: ~

  - label: "Build and publish CPU release image"
    depends_on: block-cpu-release-image-build
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"

  - label: "Build and publish nightly multi-arch image to DockerHub"
    depends_on:
      - create-multi-arch-manifest
    if: build.env("NIGHTLY") == "1"
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
|
|
||||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
|
|
||||||
- "docker push vllm/vllm-openai:nightly-x86_64"
|
|
||||||
- "docker push vllm/vllm-openai:nightly-aarch64"
|
|
||||||
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
|
||||||
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
|
||||||
- "docker manifest push vllm/vllm-openai:nightly"
|
|
||||||
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
|
|
||||||
# Clean up old nightly builds (keep only last 14)
|
|
||||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
|
||||||
plugins:
|
|
||||||
- docker-login#v3.0.0:
|
|
||||||
username: vllmbot
|
|
||||||
password-env: DOCKERHUB_TOKEN
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
DOCKERHUB_USERNAME: "vllmbot"
|
|
73
.buildkite/run-amd-test.sh
Normal file
73
.buildkite/run-amd-test.sh
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
# This script runs test inside the corresponding ROCm docker container.
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Print ROCm version
|
||||||
|
echo "--- ROCm info"
|
||||||
|
rocminfo
|
||||||
|
|
||||||
|
# cleanup older docker images
|
||||||
|
cleanup_docker() {
|
||||||
|
# Get Docker's root directory
|
||||||
|
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||||
|
if [ -z "$docker_root" ]; then
|
||||||
|
echo "Failed to determine Docker root directory."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Docker root directory: $docker_root"
|
||||||
|
# Check disk usage of the filesystem where Docker's root directory is located
|
||||||
|
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||||
|
# Define the threshold
|
||||||
|
threshold=70
|
||||||
|
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||||
|
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||||
|
# Remove dangling images (those that are not tagged and not used by any container)
|
||||||
|
docker image prune -f
|
||||||
|
# Remove unused volumes
|
||||||
|
docker volume prune -f
|
||||||
|
echo "Docker images and volumes cleanup completed."
|
||||||
|
else
|
||||||
|
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Call the cleanup docker function
|
||||||
|
cleanup_docker
|
||||||
|
|
||||||
|
echo "--- Resetting GPUs"
|
||||||
|
|
||||||
|
echo "reset" > /opt/amdgpu/etc/gpu_state
|
||||||
|
|
||||||
|
while true; do
|
||||||
|
sleep 3
|
||||||
|
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||||
|
echo "GPUs state is \"clean\""
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "--- Building container"
|
||||||
|
sha=$(git rev-parse --short HEAD)
|
||||||
|
image_name=rocm_${sha}
|
||||||
|
container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
|
||||||
|
docker build \
|
||||||
|
-t ${image_name} \
|
||||||
|
-f Dockerfile.rocm \
|
||||||
|
--progress plain \
|
||||||
|
.
|
||||||
|
|
||||||
|
remove_docker_container() {
|
||||||
|
docker rm -f ${container_name} || docker image rm -f ${image_name} || true
|
||||||
|
}
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
|
||||||
|
echo "--- Running container"
|
||||||
|
|
||||||
|
docker run \
|
||||||
|
--device /dev/kfd --device /dev/dri \
|
||||||
|
--network host \
|
||||||
|
--rm \
|
||||||
|
-e HF_TOKEN \
|
||||||
|
--name ${container_name} \
|
||||||
|
${image_name} \
|
||||||
|
/bin/bash -c "${@}"
|
||||||
|
|
@ -1,30 +1,28 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script is run by buildkite to run the benchmarks and upload the results to buildkite
|
# This script is run by buildkite to run the benchmarks and upload the results to buildkite
|
||||||
|
|
||||||
set -ex
|
set -ex
|
||||||
set -o pipefail
|
set -o pipefail
|
||||||
|
|
||||||
# cd 2 levels into the working directory
|
# cd into parent directory of this file
|
||||||
cd "$(dirname "${BASH_SOURCE[0]}")/../.."
|
cd "$(dirname "${BASH_SOURCE[0]}")/.."
|
||||||
|
|
||||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||||
|
|
||||||
# run python-based benchmarks and upload the result to buildkite
|
# run python-based benchmarks and upload the result to buildkite
|
||||||
vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
|
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
|
||||||
bench_latency_exit_code=$?
|
bench_latency_exit_code=$?
|
||||||
|
|
||||||
vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
|
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
|
||||||
bench_throughput_exit_code=$?
|
bench_throughput_exit_code=$?
|
||||||
|
|
||||||
# run server-based benchmarks and upload the result to buildkite
|
# run server-based benchmarks and upload the result to buildkite
|
||||||
vllm serve meta-llama/Llama-2-7b-chat-hf &
|
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
|
||||||
server_pid=$!
|
server_pid=$!
|
||||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
|
|
||||||
# wait for server to start, timeout after 600 seconds
|
# wait for server to start, timeout after 600 seconds
|
||||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||||
vllm bench serve \
|
python3 benchmarks/benchmark_serving.py \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--dataset-name sharegpt \
|
--dataset-name sharegpt \
|
||||||
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
|
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||||
@ -52,16 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
|
|||||||
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
|
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
|
||||||
echo "" >> benchmark_results.md
|
echo "" >> benchmark_results.md
|
||||||
echo '```' >> benchmark_results.md
|
echo '```' >> benchmark_results.md
|
||||||
tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
|
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
|
||||||
echo '```' >> benchmark_results.md
|
echo '```' >> benchmark_results.md
|
||||||
|
|
||||||
# if the agent binary is not found, skip uploading the results, exit 0
|
# if the agent binary is not found, skip uploading the results, exit 0
|
||||||
if [ ! -f /usr/bin/buildkite-agent ]; then
|
if [ ! -f /workspace/buildkite-agent ]; then
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# upload the results to buildkite
|
# upload the results to buildkite
|
||||||
buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
|
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
|
||||||
|
|
||||||
# exit with the exit code of the benchmarks
|
# exit with the exit code of the benchmarks
|
||||||
if [ $bench_latency_exit_code -ne 0 ]; then
|
if [ $bench_latency_exit_code -ne 0 ]; then
|
||||||
@ -77,4 +75,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
rm ShareGPT_V3_unfiltered_cleaned_split.json
|
rm ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
buildkite-agent artifact upload "*.json"
|
/workspace/buildkite-agent artifact upload "*.json"
|
24
.buildkite/run-cpu-test.sh
Normal file
24
.buildkite/run-cpu-test.sh
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
# This script build the CPU docker image and run the offline inference inside the container.
|
||||||
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
docker build -t cpu-test -f Dockerfile.cpu .
|
||||||
|
|
||||||
|
# Setup cleanup
|
||||||
|
remove_docker_container() { docker rm -f cpu-test || true; }
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Run the image
|
||||||
|
docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
|
||||||
|
|
||||||
|
# offline inference
|
||||||
|
docker exec cpu-test bash -c "python3 examples/offline_inference.py"
|
||||||
|
|
||||||
|
# Run basic model test
|
||||||
|
docker exec cpu-test bash -c "cd tests;
|
||||||
|
pip install pytest Pillow protobuf
|
||||||
|
bash ../.buildkite/download-images.sh
|
||||||
|
cd ../
|
||||||
|
pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
|
51
.buildkite/run-neuron-test.sh
Normal file
51
.buildkite/run-neuron-test.sh
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
# This script build the Neuron docker image and run the API server inside the container.
|
||||||
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
|
||||||
|
|
||||||
|
# prune old image and containers to save disk space, and only once a day
|
||||||
|
# by using a timestamp file in tmp.
|
||||||
|
if [ -f /tmp/neuron-docker-build-timestamp ]; then
|
||||||
|
last_build=$(cat /tmp/neuron-docker-build-timestamp)
|
||||||
|
current_time=$(date +%s)
|
||||||
|
if [ $((current_time - last_build)) -gt 86400 ]; then
|
||||||
|
docker system prune -f
|
||||||
|
echo $current_time > /tmp/neuron-docker-build-timestamp
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo $(date +%s) > /tmp/neuron-docker-build-timestamp
|
||||||
|
fi
|
||||||
|
|
||||||
|
docker build -t neuron -f Dockerfile.neuron .
|
||||||
|
|
||||||
|
# Setup cleanup
|
||||||
|
remove_docker_container() { docker rm -f neuron || true; }
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Run the image
|
||||||
|
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
|
||||||
|
--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
|
||||||
|
|
||||||
|
# Wait for the server to start
|
||||||
|
wait_for_server_to_start() {
|
||||||
|
timeout=300
|
||||||
|
counter=0
|
||||||
|
|
||||||
|
while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
|
||||||
|
sleep 1
|
||||||
|
counter=$((counter + 1))
|
||||||
|
if [ $counter -ge $timeout ]; then
|
||||||
|
echo "Timeout after $timeout seconds"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
}
|
||||||
|
wait_for_server_to_start
|
||||||
|
|
||||||
|
# Test a simple prompt
|
||||||
|
curl -X POST -H "Content-Type: application/json" \
|
||||||
|
localhost:8000/generate \
|
||||||
|
-d '{"prompt": "San Francisco is a"}'
|
@ -1,46 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Get release version and strip leading 'v' if present
|
|
||||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
|
|
||||||
|
|
||||||
if [ -z "$RELEASE_VERSION" ]; then
|
|
||||||
echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
|
||||||
To download the wheel:
|
|
||||||
\`\`\`
|
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
|
||||||
|
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
|
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
|
||||||
\`\`\`
|
|
||||||
|
|
||||||
To download and upload the image:
|
|
||||||
|
|
||||||
\`\`\`
|
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
|
||||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
|
||||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
|
||||||
docker push vllm/vllm-openai:latest-x86_64
|
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
|
|
||||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
|
|
||||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
|
||||||
docker push vllm/vllm-openai:latest-aarch64
|
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
|
||||||
|
|
||||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
|
|
||||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
|
|
||||||
docker manifest push vllm/vllm-openai:latest
|
|
||||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
|
|
||||||
\`\`\`
|
|
||||||
EOF
|
|
@ -1,17 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Usage: ./ci_clean_log.sh ci.log
|
|
||||||
# This script strips timestamps and color codes from CI log files.
|
|
||||||
|
|
||||||
# Check if argument is given
|
|
||||||
if [ $# -lt 1 ]; then
|
|
||||||
echo "Usage: $0 ci.log"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
INPUT_FILE="$1"
|
|
||||||
|
|
||||||
# Strip timestamps
|
|
||||||
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
|
|
||||||
|
|
||||||
# Strip colorization
|
|
||||||
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
|
|
@ -1,120 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
|
|
||||||
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
|
|
||||||
|
|
||||||
# DockerHub API endpoint for vllm/vllm-openai repository
|
|
||||||
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
|
|
||||||
|
|
||||||
# Get DockerHub credentials from environment
|
|
||||||
if [ -z "$DOCKERHUB_TOKEN" ]; then
|
|
||||||
echo "Error: DOCKERHUB_TOKEN environment variable is not set"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -z "$DOCKERHUB_USERNAME" ]; then
|
|
||||||
echo "Error: DOCKERHUB_USERNAME environment variable is not set"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Get DockerHub bearer token
|
|
||||||
echo "Getting DockerHub bearer token..."
|
|
||||||
set +x
|
|
||||||
BEARER_TOKEN=$(curl -s -X POST \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
|
|
||||||
"https://hub.docker.com/v2/users/login" | jq -r '.token')
|
|
||||||
set -x
|
|
||||||
|
|
||||||
if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
|
|
||||||
echo "Error: Failed to get DockerHub bearer token"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Function to get all tags from DockerHub
|
|
||||||
get_all_tags() {
|
|
||||||
local page=1
|
|
||||||
local all_tags=""
|
|
||||||
|
|
||||||
while true; do
|
|
||||||
set +x
|
|
||||||
local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
|
|
||||||
"$REPO_API_URL?page=$page&page_size=100")
|
|
||||||
set -x
|
|
||||||
|
|
||||||
# Get both last_updated timestamp and tag name, separated by |
|
|
||||||
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
|
|
||||||
|
|
||||||
if [ -z "$tags" ]; then
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
|
|
||||||
all_tags="$all_tags$tags"$'\n'
|
|
||||||
page=$((page + 1))
|
|
||||||
done
|
|
||||||
|
|
||||||
# Sort by timestamp (newest first) and extract just the tag names
|
|
||||||
echo "$all_tags" | sort -r | cut -d'|' -f2
|
|
||||||
}
|
|
||||||
|
|
||||||
delete_tag() {
|
|
||||||
local tag_name="$1"
|
|
||||||
echo "Deleting tag: $tag_name"
|
|
||||||
|
|
||||||
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
|
|
||||||
set +x
|
|
||||||
local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
|
|
||||||
set -x
|
|
||||||
|
|
||||||
if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
|
|
||||||
echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
|
|
||||||
else
|
|
||||||
echo "Successfully deleted tag: $tag_name"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
|
|
||||||
echo "Fetching all tags from DockerHub..."
|
|
||||||
all_tags=$(get_all_tags)
|
|
||||||
|
|
||||||
if [ -z "$all_tags" ]; then
|
|
||||||
echo "No tags found to clean up"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Count total tags
|
|
||||||
total_tags=$(echo "$all_tags" | wc -l)
|
|
||||||
echo "Found $total_tags tags"
|
|
||||||
|
|
||||||
# Keep only the last 14 builds (including the current one)
|
|
||||||
tags_to_keep=14
|
|
||||||
tags_to_delete=$((total_tags - tags_to_keep))
|
|
||||||
|
|
||||||
if [ $tags_to_delete -le 0 ]; then
|
|
||||||
echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"
|
|
||||||
|
|
||||||
# Get tags to delete (skip the first $tags_to_keep tags)
|
|
||||||
tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))
|
|
||||||
|
|
||||||
if [ -z "$tags_to_delete_list" ]; then
|
|
||||||
echo "No tags to delete"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Delete old tags
|
|
||||||
echo "Deleting old tags..."
|
|
||||||
while IFS= read -r tag; do
|
|
||||||
if [ -n "$tag" ]; then
|
|
||||||
delete_tag "$tag"
|
|
||||||
# Add a small delay to avoid rate limiting
|
|
||||||
sleep 1
|
|
||||||
fi
|
|
||||||
done <<< "$tags_to_delete_list"
|
|
||||||
|
|
||||||
echo "Cleanup completed successfully"
|
|
@ -1,231 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script runs test inside the corresponding ROCm docker container.
|
|
||||||
set -o pipefail
|
|
||||||
|
|
||||||
# Export Python path
|
|
||||||
export PYTHONPATH=".."
|
|
||||||
|
|
||||||
# Print ROCm version
|
|
||||||
echo "--- Confirming Clean Initial State"
|
|
||||||
while true; do
|
|
||||||
sleep 3
|
|
||||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
|
||||||
echo "GPUs state is \"clean\""
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "--- ROCm info"
|
|
||||||
rocminfo
|
|
||||||
|
|
||||||
# cleanup older docker images
|
|
||||||
cleanup_docker() {
|
|
||||||
# Get Docker's root directory
|
|
||||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
|
||||||
if [ -z "$docker_root" ]; then
|
|
||||||
echo "Failed to determine Docker root directory."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Docker root directory: $docker_root"
|
|
||||||
# Check disk usage of the filesystem where Docker's root directory is located
|
|
||||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
|
||||||
# Define the threshold
|
|
||||||
threshold=70
|
|
||||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
|
||||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
|
||||||
# Remove dangling images (those that are not tagged and not used by any container)
|
|
||||||
docker image prune -f
|
|
||||||
# Remove unused volumes / force the system prune for old images as well.
|
|
||||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
|
||||||
echo "Docker images and volumes cleanup completed."
|
|
||||||
else
|
|
||||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Call the cleanup docker function
|
|
||||||
cleanup_docker
|
|
||||||
|
|
||||||
echo "--- Resetting GPUs"
|
|
||||||
|
|
||||||
echo "reset" > /opt/amdgpu/etc/gpu_state
|
|
||||||
|
|
||||||
while true; do
|
|
||||||
sleep 3
|
|
||||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
|
||||||
echo "GPUs state is \"clean\""
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "--- Pulling container"
|
|
||||||
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
|
||||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
|
||||||
docker pull "${image_name}"
|
|
||||||
|
|
||||||
remove_docker_container() {
|
|
||||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
|
|
||||||
echo "--- Running container"
|
|
||||||
|
|
||||||
HF_CACHE="$(realpath ~)/huggingface"
|
|
||||||
mkdir -p "${HF_CACHE}"
|
|
||||||
HF_MOUNT="/root/.cache/huggingface"
|
|
||||||
|
|
||||||
commands=$@
|
|
||||||
echo "Commands:$commands"
|
|
||||||
|
|
||||||
if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
|
|
||||||
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
|
||||||
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
|
||||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *"pytest -v -s lora"* ]]; then
|
|
||||||
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
|
|
||||||
fi
|
|
||||||
|
|
||||||
#ignore certain kernels tests
|
|
||||||
if [[ $commands == *" kernels/core"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/core/test_fused_quant_layernorm.py \
|
|
||||||
--ignore=kernels/core/test_permute_cols.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/attention"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/attention/test_attention_selector.py \
|
|
||||||
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
|
||||||
--ignore=kernels/attention/test_flash_attn.py \
|
|
||||||
--ignore=kernels/attention/test_flashinfer.py \
|
|
||||||
--ignore=kernels/attention/test_prefix_prefill.py \
|
|
||||||
--ignore=kernels/attention/test_cascade_flash_attn.py \
|
|
||||||
--ignore=kernels/attention/test_mha_attn.py \
|
|
||||||
--ignore=kernels/attention/test_lightning_attn.py \
|
|
||||||
--ignore=kernels/attention/test_attention.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/quantization"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/quantization/test_int8_quant.py \
|
|
||||||
--ignore=kernels/quantization/test_machete_mm.py \
|
|
||||||
--ignore=kernels/quantization/test_block_fp8.py \
|
|
||||||
--ignore=kernels/quantization/test_block_int8.py \
|
|
||||||
--ignore=kernels/quantization/test_marlin_gemm.py \
|
|
||||||
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
|
|
||||||
--ignore=kernels/quantization/test_int8_kernel.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/mamba"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/mamba/test_mamba_mixer2.py \
|
|
||||||
--ignore=kernels/mamba/test_causal_conv1d.py \
|
|
||||||
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/moe"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/moe/test_moe.py \
|
|
||||||
--ignore=kernels/moe/test_cutlass_moe.py \
|
|
||||||
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
#ignore certain Entrypoints/openai tests
|
|
||||||
if [[ $commands == *" entrypoints/openai "* ]]; then
|
|
||||||
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
|
|
||||||
--ignore=entrypoints/openai/test_audio.py \
|
|
||||||
--ignore=entrypoints/openai/test_shutdown.py \
|
|
||||||
--ignore=entrypoints/openai/test_completion.py \
|
|
||||||
--ignore=entrypoints/openai/test_sleep.py \
|
|
||||||
--ignore=entrypoints/openai/test_models.py \
|
|
||||||
--ignore=entrypoints/openai/test_lora_adapters.py \
|
|
||||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
|
||||||
--ignore=entrypoints/openai/test_root_path.py \
|
|
||||||
--ignore=entrypoints/openai/test_tokenization.py \
|
|
||||||
--ignore=entrypoints/openai/test_prompt_validation.py "}
|
|
||||||
fi
|
|
||||||
|
|
||||||
#ignore certain Entrypoints/llm tests
|
|
||||||
if [[ $commands == *" entrypoints/llm "* ]]; then
|
|
||||||
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
|
|
||||||
--ignore=entrypoints/llm/test_chat.py \
|
|
||||||
--ignore=entrypoints/llm/test_accuracy.py \
|
|
||||||
--ignore=entrypoints/llm/test_init.py \
|
|
||||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
|
||||||
fi
|
|
||||||
|
|
||||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
|
||||||
# --ignore=entrypoints/openai/test_embedding.py \
|
|
||||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
|
||||||
# --ignore=entrypoints/openai/test_accuracy.py \
|
|
||||||
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
|
|
||||||
|
|
||||||
|
|
||||||
PARALLEL_JOB_COUNT=8
|
|
||||||
MYPYTHONPATH=".."
|
|
||||||
|
|
||||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
|
||||||
if [[ $commands == *"--shard-id="* ]]; then
|
|
||||||
# assign job count as the number of shards used
|
|
||||||
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
|
||||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
|
||||||
# assign shard-id for each shard
|
|
||||||
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
|
|
||||||
echo "Shard ${GPU} commands:$commands_gpu"
|
|
||||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
|
||||||
docker run \
|
|
||||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
|
||||||
--network=host \
|
|
||||||
--shm-size=16gb \
|
|
||||||
--rm \
|
|
||||||
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
|
||||||
-e HF_TOKEN \
|
|
||||||
-e AWS_ACCESS_KEY_ID \
|
|
||||||
-e AWS_SECRET_ACCESS_KEY \
|
|
||||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
|
||||||
-e "HF_HOME=${HF_MOUNT}" \
|
|
||||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
|
||||||
--name "${container_name}_${GPU}" \
|
|
||||||
"${image_name}" \
|
|
||||||
/bin/bash -c "${commands_gpu}" \
|
|
||||||
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
|
|
||||||
PIDS+=($!)
|
|
||||||
done
|
|
||||||
#wait for all processes to finish and collect exit codes
|
|
||||||
for pid in "${PIDS[@]}"; do
|
|
||||||
wait "${pid}"
|
|
||||||
STATUS+=($?)
|
|
||||||
done
|
|
||||||
for st in "${STATUS[@]}"; do
|
|
||||||
if [[ ${st} -ne 0 ]]; then
|
|
||||||
echo "One of the processes failed with $st"
|
|
||||||
exit "${st}"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
else
|
|
||||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
|
||||||
docker run \
|
|
||||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
|
||||||
--network=host \
|
|
||||||
--shm-size=16gb \
|
|
||||||
--rm \
|
|
||||||
-e HIP_VISIBLE_DEVICES=0 \
|
|
||||||
-e HF_TOKEN \
|
|
||||||
-e AWS_ACCESS_KEY_ID \
|
|
||||||
-e AWS_SECRET_ACCESS_KEY \
|
|
||||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
|
||||||
-e "HF_HOME=${HF_MOUNT}" \
|
|
||||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
|
||||||
--name "${container_name}" \
|
|
||||||
"${image_name}" \
|
|
||||||
/bin/bash -c "${commands}"
|
|
||||||
fi
|
|
@ -1,49 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() {
|
|
||||||
if [[ -n "$container_id" ]]; then
|
|
||||||
podman stop --all -t0
|
|
||||||
podman rm -f "$container_id" || true
|
|
||||||
fi
|
|
||||||
podman system prune -f
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
|
|
||||||
|
|
||||||
# Run the image
|
|
||||||
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
|
|
||||||
|
|
||||||
function cpu_tests() {
|
|
||||||
|
|
||||||
# offline inference
|
|
||||||
podman exec -it "$container_id" bash -c "
|
|
||||||
set -e
|
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
|
||||||
|
|
||||||
# Run basic model test
|
|
||||||
podman exec -it "$container_id" bash -c "
|
|
||||||
set -e
|
|
||||||
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
|
||||||
pip install sentence-transformers datamodel_code_generator
|
|
||||||
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
|
||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
|
|
||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
|
||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
|
||||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
|
||||||
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
|
|
||||||
}
|
|
||||||
|
|
||||||
# All of CPU tests are expected to be finished less than 40 mins.
|
|
||||||
|
|
||||||
export container_id
|
|
||||||
export -f cpu_tests
|
|
||||||
timeout 40m bash -c cpu_tests
|
|
||||||
|
|
@ -1,13 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
docker build -t cpu-test -f docker/Dockerfile.s390x .
|
|
@ -1,119 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# allow to bind to different cores
|
|
||||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
|
||||||
# used for TP/PP E2E test
|
|
||||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
|
|
||||||
NUMA_NODE=${NUMA_NODE:-1}
|
|
||||||
|
|
||||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() {
|
|
||||||
set -e;
|
|
||||||
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
|
|
||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
|
||||||
|
|
||||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
|
||||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
|
||||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
|
||||||
|
|
||||||
function cpu_tests() {
|
|
||||||
set -e
|
|
||||||
export NUMA_NODE=$2
|
|
||||||
|
|
||||||
# list packages
|
|
||||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
|
||||||
set -e
|
|
||||||
pip list"
|
|
||||||
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pip list"
|
|
||||||
|
|
||||||
# offline inference
|
|
||||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
|
||||||
set -e
|
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
|
||||||
|
|
||||||
# Run kernel tests
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -v -s tests/kernels/test_onednn.py"
|
|
||||||
|
|
||||||
# Run basic model test
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
# Note: disable until supports V1
|
|
||||||
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
|
||||||
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
|
||||||
|
|
||||||
pytest -x -v -s tests/models/language/generation -m cpu_model
|
|
||||||
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
|
|
||||||
|
|
||||||
pytest -x -v -s tests/models/language/pooling -m cpu_model
|
|
||||||
pytest -x -v -s tests/models/multimodal/generation \
|
|
||||||
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
|
||||||
-m cpu_model"
|
|
||||||
|
|
||||||
# Run compressed-tensor test
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -s -v \
|
|
||||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
|
|
||||||
|
|
||||||
# Note: disable it until supports V1
|
|
||||||
# Run AWQ test
|
|
||||||
# docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
# set -e
|
|
||||||
# VLLM_USE_V1=0 pytest -x -s -v \
|
|
||||||
# tests/quantization/test_ipex_quant.py"
|
|
||||||
|
|
||||||
# Run multi-lora tests
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -s -v \
|
|
||||||
tests/lora/test_qwen2vl.py"
|
|
||||||
|
|
||||||
# online serving: tp+pp
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
|
||||||
set -e
|
|
||||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
|
||||||
server_pid=$!
|
|
||||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
|
||||||
vllm bench serve \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset-name random \
|
|
||||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
|
||||||
--num-prompts 20 \
|
|
||||||
--endpoint /v1/completions
|
|
||||||
kill -s SIGTERM $server_pid &'
|
|
||||||
|
|
||||||
# online serving: tp+dp
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
|
||||||
set -e
|
|
||||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
|
|
||||||
server_pid=$!
|
|
||||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
|
||||||
vllm bench serve \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset-name random \
|
|
||||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
|
||||||
--num-prompts 20 \
|
|
||||||
--endpoint /v1/completions
|
|
||||||
kill -s SIGTERM $server_pid &'
|
|
||||||
}
|
|
||||||
|
|
||||||
# All of CPU tests are expected to be finished less than 40 mins.
|
|
||||||
export -f cpu_tests
|
|
||||||
timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
|
@ -1,29 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script build the GH200 docker image and run the offline inference inside the container.
|
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
|
|
||||||
python3 use_existing_torch.py
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
DOCKER_BUILDKIT=1 docker build . \
|
|
||||||
--file docker/Dockerfile \
|
|
||||||
--target vllm-openai \
|
|
||||||
--platform "linux/arm64" \
|
|
||||||
-t gh200-test \
|
|
||||||
--build-arg max_jobs=66 \
|
|
||||||
--build-arg nvcc_threads=2 \
|
|
||||||
--build-arg RUN_WHEEL_CHECK=false \
|
|
||||||
--build-arg torch_cuda_arch_list="9.0+PTX"
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() { docker rm -f gh200-test || true; }
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# Run the image and test offline inference
|
|
||||||
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
|
||||||
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
|
|
||||||
'
|
|
@ -1,56 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
|
||||||
set -exuo pipefail
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
|
|
||||||
FROM gaudi-base-image:latest
|
|
||||||
|
|
||||||
COPY ./ /workspace/vllm
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
|
||||||
|
|
||||||
ENV no_proxy=localhost,127.0.0.1
|
|
||||||
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
|
|
||||||
|
|
||||||
RUN VLLM_TARGET_DEVICE=empty pip install .
|
|
||||||
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
|
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
|
||||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
|
||||||
|
|
||||||
WORKDIR /workspace/
|
|
||||||
|
|
||||||
RUN git clone https://github.com/vllm-project/vllm-gaudi.git
|
|
||||||
|
|
||||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
|
||||||
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
# certain versions of HPU software stack have a bug that can
|
|
||||||
# override the exit code of the script, so we need to use
|
|
||||||
# separate remove_docker_containers and remove_docker_containers_and_exit
|
|
||||||
# functions, while other platforms only need one remove_docker_container
|
|
||||||
# function.
|
|
||||||
EXITCODE=1
|
|
||||||
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
|
|
||||||
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
|
|
||||||
remove_docker_containers
|
|
||||||
|
|
||||||
echo "Running HPU plugin v1 test"
|
|
||||||
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
|
|
||||||
-e HABANA_VISIBLE_DEVICES=all \
|
|
||||||
hpu-plugin-v1-test-env \
|
|
||||||
/bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
|
|
||||||
|
|
||||||
EXITCODE=$?
|
|
||||||
if [ $EXITCODE -eq 0 ]; then
|
|
||||||
echo "Test with basic model passed"
|
|
||||||
else
|
|
||||||
echo "Test with basic model FAILED with exit code: $EXITCODE" >&2
|
|
||||||
fi
|
|
||||||
|
|
||||||
# The trap will handle the container removal and final exit.
|
|
@ -1,191 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script build the Ascend NPU docker image and run the offline inference inside the container.
|
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Base ubuntu image with basic ascend development libraries and python installed
|
|
||||||
VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
|
|
||||||
CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
|
|
||||||
TEST_RUN_CONFIG_FILE="vllm_test.cfg"
|
|
||||||
VLLM_ASCEND_TMP_DIR=
|
|
||||||
# Get the test run configuration file from the vllm-ascend repository
|
|
||||||
fetch_vllm_test_cfg() {
|
|
||||||
VLLM_ASCEND_TMP_DIR=$(mktemp -d)
|
|
||||||
# Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
|
|
||||||
cleanup() {
|
|
||||||
rm -rf "${VLLM_ASCEND_TMP_DIR}"
|
|
||||||
}
|
|
||||||
trap cleanup EXIT
|
|
||||||
|
|
||||||
GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
|
|
||||||
if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
|
|
||||||
echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# If the file already exists locally, just overwrite it
|
|
||||||
cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
|
|
||||||
echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
|
|
||||||
|
|
||||||
# Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
|
|
||||||
# when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
|
|
||||||
rm -rf "${VLLM_ASCEND_TMP_DIR}"
|
|
||||||
trap - EXIT
|
|
||||||
}
|
|
||||||
|
|
||||||
# Downloads test run configuration file from a remote URL.
|
|
||||||
# Loads the configuration into the current script environment.
|
|
||||||
get_config() {
|
|
||||||
if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
|
|
||||||
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
source "${TEST_RUN_CONFIG_FILE}"
|
|
||||||
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
# get test running configuration.
|
|
||||||
fetch_vllm_test_cfg
|
|
||||||
get_config
|
|
||||||
# Check if the function call was successful. If not, exit the script.
|
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
|
|
||||||
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
|
||||||
|
|
||||||
# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
|
|
||||||
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
|
|
||||||
echo "agent_idx: ${agent_idx}"
|
|
||||||
builder_name="cachebuilder${agent_idx}"
|
|
||||||
builder_cache_dir="/mnt/docker-cache${agent_idx}"
|
|
||||||
mkdir -p ${builder_cache_dir}
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
|
|
||||||
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
|
|
||||||
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
|
|
||||||
--cache-to type=local,dest=${builder_cache_dir},mode=max \
|
|
||||||
--progress=plain --load -t ${image_name} -f - .
|
|
||||||
FROM ${BASE_IMAGE_NAME}
|
|
||||||
|
|
||||||
# Define environments
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
|
||||||
|
|
||||||
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
|
|
||||||
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
|
|
||||||
apt-get update -y && \
|
|
||||||
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
|
|
||||||
rm -rf /var/cache/apt/* && \
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# Install for pytest to make the docker build cache layer always valid
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install pytest>=6.0 modelscope
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
|
||||||
|
|
||||||
# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
|
|
||||||
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -r requirements/common.txt
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Install vLLM
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
|
||||||
python3 -m pip uninstall -y triton
|
|
||||||
|
|
||||||
# Install vllm-ascend
|
|
||||||
WORKDIR /workspace
|
|
||||||
ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
|
|
||||||
ARG VLLM_ASCEND_TAG=main
|
|
||||||
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
|
|
||||||
git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
|
|
||||||
|
|
||||||
# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -r /workspace/vllm-ascend/requirements.txt
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
|
||||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
|
||||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
|
||||||
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
|
||||||
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
|
|
||||||
|
|
||||||
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
ENV VLLM_USE_MODELSCOPE=True
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm-ascend
|
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
|
||||||
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() {
|
|
||||||
docker rm -f "${container_name}" || true;
|
|
||||||
docker image rm -f "${image_name}" || true;
|
|
||||||
docker system prune -f || true;
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
|
|
||||||
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
|
|
||||||
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
|
|
||||||
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
|
|
||||||
# returns --device /dev/davinci0 --device /dev/davinci1
|
|
||||||
parse_and_gen_devices() {
|
|
||||||
local input="$1"
|
|
||||||
local index cards_num
|
|
||||||
if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
|
|
||||||
index="${BASH_REMATCH[1]}"
|
|
||||||
cards_num="${BASH_REMATCH[2]}"
|
|
||||||
else
|
|
||||||
echo "parse error" >&2
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
local devices=""
|
|
||||||
local i=0
|
|
||||||
while (( i < cards_num )); do
|
|
||||||
local dev_idx=$(((index - 1)*cards_num + i ))
|
|
||||||
devices="$devices --device /dev/davinci${dev_idx}"
|
|
||||||
((i++))
|
|
||||||
done
|
|
||||||
|
|
||||||
# trim leading space
|
|
||||||
devices="${devices#"${devices%%[![:space:]]*}"}"
|
|
||||||
# Output devices: assigned to the caller variable
|
|
||||||
printf '%s' "$devices"
|
|
||||||
}
|
|
||||||
|
|
||||||
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
|
|
||||||
|
|
||||||
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
|
|
||||||
# This test checks whether the OOT platform interface is functioning properly in conjunction with
|
|
||||||
# the hardware plugin vllm-ascend.
|
|
||||||
model_cache_dir=/mnt/modelscope${agent_idx}
|
|
||||||
mkdir -p ${model_cache_dir}
|
|
||||||
docker run \
|
|
||||||
${devices} \
|
|
||||||
--device /dev/davinci_manager \
|
|
||||||
--device /dev/devmm_svm \
|
|
||||||
--device /dev/hisi_hdc \
|
|
||||||
-v /usr/local/dcmi:/usr/local/dcmi \
|
|
||||||
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
|
||||||
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
|
||||||
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
|
||||||
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
|
||||||
-v ${model_cache_dir}:/root/.cache/modelscope \
|
|
||||||
--entrypoint="" \
|
|
||||||
--name "${container_name}" \
|
|
||||||
"${image_name}" \
|
|
||||||
bash -c '
|
|
||||||
set -e
|
|
||||||
pytest -v -s tests/e2e/vllm_interface/
|
|
||||||
'
|
|
@ -1,166 +0,0 @@
#!/bin/bash

set -xu

remove_docker_container() {
  docker rm -f tpu-test || true;
}

trap remove_docker_container EXIT

# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .

# Set up cleanup.
cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"
  # Check disk usage of the filesystem where Docker's root directory is located
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}
cleanup_docker

# For HF_TOKEN.
source /etc/environment

docker run --privileged --net host --shm-size=16G -it \
  -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
  vllm-tpu /bin/bash -c '
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.

echo "--- Starting script inside Docker container ---"

# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"

# Install dependencies
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
  && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
  && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
  && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"

export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=

echo "--- Hardware Information ---"
# tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0

# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
run_test() {
  local test_num=$1
  local test_name=$2
  local test_command=$3
  local log_file="$RESULTS_DIR/test_${test_num}.log"
  local actual_exit_code

  echo "--- TEST_$test_num: Running $test_name ---"

  # Execute the test command.
  eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
  actual_exit_code=$?

  echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
  echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log

  if [ "$actual_exit_code" -ne 0 ]; then
    echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
    echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
    if [ -f "$log_file" ]; then
      cat "$log_file" >&2
    else
      echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
    fi
    echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
    return "$actual_exit_code" # Return the failure code
  else
    echo "TEST_$test_num ($test_name) PASSED."
    return 0 # Return success
  fi
}

# Helper function to call run_test and update the overall script exit code
run_and_track_test() {
  local test_num_arg="$1"
  local test_name_arg="$2"
  local test_command_arg="$3"

  # Run the test
  run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
  local test_specific_exit_code=$?

  # If the test failed, set the overall script exit code to 1
  if [ "$test_specific_exit_code" -ne 0 ]; then
    # No need for extra echo here, run_test already logged the failure.
    overall_script_exit_code=1
  fi
}

# --- Actual Test Execution ---
run_and_track_test 1 "test_struct_output_generate.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 2 "test_moe_pallas.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 3 "test_lora.py" \
  "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 4 "test_tpu_qkv_linear.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 5 "test_spmd_model_weight_loading.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
run_and_track_test 7 "test_tpu_int8.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py"

# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
  echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
  echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.

# Capture the exit code of the docker run command
DOCKER_RUN_EXIT_CODE=$?

# The trap will run for cleanup.
# Exit the main script with the Docker run command's exit code.
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
  echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
  exit "$DOCKER_RUN_EXIT_CODE"
else
  echo "Docker run command completed successfully."
  exit 0
fi
# TODO: This test fails because it uses RANDOM_SEED sampling
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
@ -1,174 +0,0 @@
#!/bin/bash

set -xu

remove_docker_container() {
  docker rm -f tpu-test || true;
}

trap remove_docker_container EXIT

# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .

# Set up cleanup.
cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"
  # Check disk usage of the filesystem where Docker's root directory is located
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}
cleanup_docker

# For HF_TOKEN.
source /etc/environment

docker run --privileged --net host --shm-size=16G -it \
  -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
  vllm-tpu /bin/bash -c '
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.

echo "--- Starting script inside Docker container ---"

# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"

# Install dependencies
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
  && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
  && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
  && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"

export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=

echo "--- Hardware Information ---"
# tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0

# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
run_test() {
  local test_num=$1
  local test_name=$2
  local test_command=$3
  local log_file="$RESULTS_DIR/test_${test_num}.log"
  local actual_exit_code

  echo "--- TEST_$test_num: Running $test_name ---"

  # Execute the test command.
  eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
  actual_exit_code=$?

  echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
  echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log

  if [ "$actual_exit_code" -ne 0 ]; then
    echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
    echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
    if [ -f "$log_file" ]; then
      cat "$log_file" >&2
    else
      echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
    fi
    echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
    return "$actual_exit_code" # Return the failure code
  else
    echo "TEST_$test_num ($test_name) PASSED."
    return 0 # Return success
  fi
}

# Helper function to call run_test and update the overall script exit code
run_and_track_test() {
  local test_num_arg="$1"
  local test_name_arg="$2"
  local test_command_arg="$3"

  # Run the test
  run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
  local test_specific_exit_code=$?

  # If the test failed, set the overall script exit code to 1
  if [ "$test_specific_exit_code" -ne 0 ]; then
    # No need for extra echo here, run_test already logged the failure.
    overall_script_exit_code=1
  fi
}

# --- Actual Test Execution ---
run_and_track_test 0 "test_perf.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
run_and_track_test 1 "test_compilation.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
run_and_track_test 2 "test_basic.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
  "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
run_and_track_test 4 "test_quantization_accuracy.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
run_and_track_test 5 "examples/offline_inference/tpu.py" \
  "python3 /workspace/vllm/examples/offline_inference/tpu.py"
run_and_track_test 6 "test_tpu_model_runner.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
run_and_track_test 7 "test_sampler.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
run_and_track_test 8 "test_topk_topp_sampler.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
run_and_track_test 9 "test_multimodal.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
run_and_track_test 10 "test_pallas.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"

# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
  echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
  echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.

# Capture the exit code of the docker run command
DOCKER_RUN_EXIT_CODE=$?

# The trap will run for cleanup.
# Exit the main script with the Docker run command's exit code.
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
  echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
  exit "$DOCKER_RUN_EXIT_CODE"
else
  echo "Docker run command completed successfully."
  exit 0
fi
# TODO: This test fails because it uses RANDOM_SEED sampling
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
@ -1,49 +0,0 @@
#!/bin/bash

# This script builds the XPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

# Try building the docker image
docker build -t ${image_name} -f docker/Dockerfile.xpu .

# Setup cleanup
remove_docker_container() {
  docker rm -f "${container_name}" || true;
  docker image rm -f "${image_name}" || true;
  docker system prune -f || true;
}
trap remove_docker_container EXIT

# Run the image and test offline inference/tensor parallel
docker run \
  --device /dev/dri \
  -v /dev/dri/by-path:/dev/dri/by-path \
  --entrypoint="" \
  -e "HF_TOKEN=${HF_TOKEN}" \
  -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \
  --name "${container_name}" \
  "${image_name}" \
  bash -c '
  set -e
  echo $ZE_AFFINITY_MASK
  pip install tblib==3.1.0
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
  VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
  cd tests
  pytest -v -s v1/core
  pytest -v -s v1/engine
  pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
  pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
  pytest -v -s v1/structured_output
  pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
  pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
  pytest -v -s v1/test_metrics
  pytest -v -s v1/test_serial_utils.py
'
@ -1,18 +0,0 @@
#!/bin/bash

# Usage: ./rerun_test.sh path/to/test.py::test_name

# Check if argument is given
if [ $# -lt 1 ]; then
  echo "Usage: $0 path/to/test.py::test_name"
  echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
  exit 1
fi

TEST=$1
COUNT=1

while pytest -sv "$TEST"; do
  COUNT=$((COUNT + 1))
  echo "RUN NUMBER ${COUNT}"
done
@ -1,108 +0,0 @@
#!/bin/bash

set -euox pipefail

if [[ $# -lt 4 ]]; then
  echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
  exit 1
fi

WORKING_DIR=$1
NUM_NODES=$2
NUM_GPUS=$3
DOCKER_IMAGE=$4

shift 4
COMMANDS=("$@")
if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then
  echo "The number of commands must be equal to the number of nodes."
  echo "Number of nodes: $NUM_NODES"
  echo "Number of commands: ${#COMMANDS[@]}"
  exit 1
fi

echo "List of commands"
for command in "${COMMANDS[@]}"; do
  echo "$command"
done

start_network() {
  docker network create --subnet=192.168.10.0/24 docker-net
}

start_nodes() {
  for node in $(seq 0 $(($NUM_NODES-1))); do
    GPU_DEVICES='"device='
    for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
      DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
      GPU_DEVICES+=$(($DEVICE_NUM))
      if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
        GPU_DEVICES+=','
      fi
    done
    GPU_DEVICES+='"'

    # start the container in detached mode
    # things to note:
    # 1. --shm-size=10.24gb is required. don't use --ipc=host
    # 2. pass HF_TOKEN to the container
    # 3. map the huggingface cache directory to the container
    # 4. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
    #    starting from 192.168.10.11)
    docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
      -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
      --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
      /bin/bash -c "tail -f /dev/null"

    # organize containers into a ray cluster
    if [ "$node" -eq 0 ]; then
      # start the ray head node
      docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block"
      # wait for the head node to be ready
      sleep 10
    else
      # start the ray worker nodes, and connect them to the head node
      docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
    fi
  done

  # wait for the cluster to be ready
  sleep 10

  # print the cluster status
  docker exec node0 /bin/bash -c "ray status"
}

run_nodes() {
  # important: iterate in reverse order to start the head node last
  # we start the worker nodes first, in detached mode, and then start the head node
  # in the foreground, so that the output of the head node is visible in the buildkite logs
  for node in $(seq $(($NUM_NODES - 1)) -1 0); do
    GPU_DEVICES='"device='
    for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
      DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
      GPU_DEVICES+=$(($DEVICE_NUM))
      if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
        GPU_DEVICES+=','
      fi
    done
    GPU_DEVICES+='"'
    echo "Running node$node with GPU devices: $GPU_DEVICES"
    if [ "$node" -ne 0 ]; then
      docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
    else
      docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
    fi
  done
}
cleanup() {
  for node in $(seq 0 $(($NUM_NODES-1))); do
    docker stop "node$node"
  done
  docker network rm docker-net
}
trap cleanup EXIT
start_network
start_nodes
run_nodes
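# Illustrative note (not part of the original script): with NUM_GPUS=2, the inner loop
# above builds a per-node device string such as '"device=0,1"' for node 0 and
# '"device=2,3"' for node 1, which is passed to `docker run --gpus` so each container
# only sees its own slice of the host GPUs.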
@ -1,59 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"

echo "Setting up Prime-RL integration test environment..."

# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
  echo "Removing existing Prime-RL directory..."
  rm -rf "${PRIME_RL_DIR}"
fi

# Install UV if not available
if ! command -v uv &> /dev/null; then
  echo "Installing UV package manager..."
  curl -LsSf https://astral.sh/uv/install.sh | sh
  source $HOME/.local/bin/env
fi

# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"

echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
ln -s /usr/bin/python3 /usr/local/bin/python

# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml

# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras

# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"

echo "Prime-RL integration test environment setup complete!"

echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu

echo "Prime-RL integration tests completed!"
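# Illustrative note (not part of the original script): the sed call above deletes any
# pyproject.toml line containing "vllm==", e.g. a hypothetical dependency entry like
#   "vllm==0.9.0",
# so that `uv sync` resolves against the nightly vLLM already installed in the image
# rather than a pinned release.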
@ -1,24 +0,0 @@
#!/bin/bash

set -euo pipefail

docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
  echo "Failed to determine Docker root directory."
  exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
  # Remove dangling images (those that are not tagged and not used by any container)
  docker image prune -f
  # Remove unused volumes / force the system prune for old images as well.
  docker volume prune -f && docker system prune --force --filter "until=24h" --all
  echo "Docker images and volumes cleanup completed."
else
  echo "Disk usage is below $threshold%. No cleanup needed."
fi
@ -1,14 +0,0 @@
# Environment config
TEST_NAME=llama8b
CONTAINER_NAME=tpu-test

# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct
MAX_NUM_SEQS=256
MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=8.0
INPUT_LEN=1800
OUTPUT_LEN=128
@ -1,90 +0,0 @@
#!/bin/bash

if [ ! -f "$1" ]; then
  echo "Error: The env file '$1' does not exist."
  exit 1 # Exit the script with a non-zero status to indicate an error
fi

ENV_FILE=$1

# For testing on local vm, use `set -a` to export all variables
source /etc/environment
source $ENV_FILE

remove_docker_container() {
  docker rm -f $CONTAINER_NAME || true;
}

trap remove_docker_container EXIT

# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

LOG_ROOT=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $LOG_ROOT"

if [ -z "$HF_TOKEN" ]; then
  echo "Error: HF_TOKEN is not set or is empty."
  exit 1
fi

# Make sure mounted disk or dir exists
if [ ! -d "$DOWNLOAD_DIR" ]; then
  echo "Error: Folder $DOWNLOAD_DIR does not exist. This is usually a mounted drive. If no mounted drive, just create a folder."
  exit 1
fi

echo "Run model $MODEL"
echo

echo "starting docker...$CONTAINER_NAME"
echo
docker run \
  -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
  --env-file $ENV_FILE \
  -e HF_TOKEN="$HF_TOKEN" \
  -e TARGET_COMMIT=$BUILDKITE_COMMIT \
  -e MODEL=$MODEL \
  -e WORKSPACE=/workspace \
  --name $CONTAINER_NAME \
  -d \
  --privileged \
  --network host \
  -v /dev/shm:/dev/shm \
  vllm/vllm-tpu-bm tail -f /dev/null

echo "run script..."
echo
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"

echo "copy result back..."
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"

throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"

if [ "$BUILDKITE" = "true" ]; then
  echo "Running inside Buildkite"
  buildkite-agent artifact upload "$VLLM_LOG"
  buildkite-agent artifact upload "$BM_LOG"
else
  echo "Not running inside Buildkite"
fi

#
# compare the throughput with EXPECTED_THROUGHPUT
# and assert meeting the expectation
#
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  echo "Failed to get the throughput"
  exit 1
fi

if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
  exit 1
fi
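# Illustrative note (not part of the original script): `bc -l` prints 1 when the
# comparison holds and 0 otherwise, so for example
#   echo "7.5 < 8.0" | bc -l   ->  1   (throughput below target, the check above fails the run)
#   echo "8.4 < 8.0" | bc -l   ->  0   (throughput meets the target, the run passes)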
@ -1,14 +0,0 @@
# Environment config
TEST_NAME=llama8bw8a8
CONTAINER_NAME=tpu-test

# vllm config
MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
MAX_NUM_SEQS=128
MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=8.7
INPUT_LEN=1800
OUTPUT_LEN=128
@ -1,93 +0,0 @@
#!/bin/bash

set -euo pipefail

VLLM_LOG="$WORKSPACE/vllm_log.txt"
BM_LOG="$WORKSPACE/bm_log.txt"

if [ -n "$TARGET_COMMIT" ]; then
  head_hash=$(git rev-parse HEAD)
  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
    exit 1
  fi
fi

echo "model: $MODEL"
echo

#
# create a log folder
#
mkdir "$WORKSPACE/log"

# TODO: Move to image building.
pip install pandas
pip install datasets

#
# create sonnet_4x
#
echo "Create sonnet_4x.txt"
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done

#
# start vllm service in backend
#
echo "launching vllm..."
echo "logging to $VLLM_LOG"
echo

vllm serve $MODEL \
  --seed 42 \
  --max-num-seqs $MAX_NUM_SEQS \
  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
  --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
  --no-enable-prefix-caching \
  --download_dir $DOWNLOAD_DIR \
  --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &

echo "wait for 20 minutes.."
echo
# sleep 1200
# wait up to 20 minutes (120 polls, 10 seconds apart)...
for i in {1..120}; do
  # TODO: detect other types of errors.
  if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
    echo "Detected RuntimeError, exiting."
    exit 1
  elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
    echo "Application started"
    break
  else
    echo "wait for 10 seconds..."
    sleep 10
  fi
done

#
# run test
#
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
vllm bench serve \
  --backend vllm \
  --model $MODEL \
  --dataset-name sonnet \
  --dataset-path benchmarks/sonnet_4x.txt \
  --sonnet-input-len $INPUT_LEN \
  --sonnet-output-len $OUTPUT_LEN \
  --ignore-eos > "$BM_LOG"

echo "completed..."
echo

throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput: $throughput"
echo
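# Illustrative note (not part of the original script): the grep/sed pipeline above keeps
# only digits and dots, so a benchmark log line of the form
#   Request throughput (req/s): 8.34
# is reduced to "8.34" before the caller compares it against EXPECTED_THROUGHPUT.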
@ -1,91 +0,0 @@
#!/usr/bin/env bash

set -ex

# Assume wheels are in artifacts/dist/*.whl
wheel_files=(artifacts/dist/*.whl)

# Check that exactly one wheel is found
if [[ ${#wheel_files[@]} -ne 1 ]]; then
  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
  exit 1
fi

# Get the single wheel file
wheel="${wheel_files[0]}"

# Detect architecture and rename 'linux' to appropriate manylinux version
arch=$(uname -m)
if [[ $arch == "x86_64" ]]; then
  manylinux_version="manylinux1"
elif [[ $arch == "aarch64" ]]; then
  manylinux_version="manylinux2014"
else
  echo "Warning: Unknown architecture $arch, using manylinux1 as default"
  manylinux_version="manylinux1"
fi

# Rename 'linux' to the appropriate manylinux version in the wheel filename
new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"

# Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version: $version"

normal_wheel="$wheel" # Save the original wheel filename

# If the version contains "dev", rename it to v1.0.0.dev for consistency
if [[ $version == *dev* ]]; then
  suffix="${version##*.}"
  if [[ $suffix == cu* ]]; then
    new_version="1.0.0.dev+${suffix}"
  else
    new_version="1.0.0.dev"
  fi
  new_wheel="${wheel/$version/$new_version}"
  # use cp to keep both files in the artifacts directory
  cp -- "$wheel" "$new_wheel"
  wheel="$new_wheel"
  version="$new_version"
fi

# Upload the wheel to S3
python3 .buildkite/generate_index.py --wheel "$normal_wheel"

# generate index for this commit
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"

if [[ $normal_wheel == *"cu126"* ]]; then
  # if $normal_wheel matches cu126, do not upload the index.html
  echo "Skipping index files for cu126 wheels"
elif [[ $normal_wheel == *"cu128"* ]]; then
  # if $normal_wheel matches cu128, do not upload the index.html
  echo "Skipping index files for cu128 wheels"
else
  # only upload index.html for cu129 wheels (default wheels) as it
  # is available on both x86 and arm64
  aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
  aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi

# generate index for nightly
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"

if [[ $normal_wheel == *"cu126"* ]]; then
  # if $normal_wheel matches cu126, do not upload the index.html
  echo "Skipping index files for cu126 wheels"
elif [[ $normal_wheel == *"cu128"* ]]; then
  # if $normal_wheel matches cu128, do not upload the index.html
  echo "Skipping index files for cu128 wheels"
else
  # only upload index.html for cu129 wheels (default wheels) as it
  # is available on both x86 and arm64
  aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi

aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
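# Illustrative note (not part of the original script): for a hypothetical input wheel
#   artifacts/dist/vllm-0.9.0.dev10-cp38-abi3-linux_x86_64.whl
# built on x86_64, the platform tag is first rewritten to manylinux1, and because the
# version contains "dev", a copy is also made under the fixed version "1.0.0.dev" so
# the nightly index always points at a stable filename; both copies are then uploaded.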
File diff suppressed because it is too large
59
.buildkite/test-template-aws.j2
Normal file
@ -0,0 +1,59 @@
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
  - label: ":docker: build image"
    agents:
      queue: cpu_queue
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
      - "docker push {{ docker_image }}"
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
        - exit_status: -10  # Agent was lost
          limit: 5
  - wait

  {% for step in steps %}
  - label: "{{ step.label }}"
    agents:
      {% if step.no_gpu %}
      queue: cpu_queue
      {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
      queue: gpu_4_queue
      {% else %}
      queue: gpu_1_queue
      {% endif %}
    soft_fail: true
    {% if step.parallelism %}
    parallelism: {{ step.parallelism }}
    {% endif %}
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
        - exit_status: -10  # Agent was lost
          limit: 5
    plugins:
      - docker#v5.2.0:
          image: {{ docker_image }}
          always-pull: true
          propagate-environment: true
          {% if not step.no_gpu %}
          gpus: all
          {% endif %}
          command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"]
          environment:
            - VLLM_USAGE_SOURCE=ci-test
            - HF_TOKEN
            {% if step.label == "Speculative decoding tests" %}
            - VLLM_ATTENTION_BACKEND=XFORMERS
            {% endif %}
          volumes:
            - /dev/shm:/dev/shm
  {% endfor %}
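{# Illustrative note (not part of the original template): a hypothetical step with
   commands: ["pip install -e .", "pytest -v -s basic_correctness"] and no working_dir
   would render the docker plugin command as
   ["bash", "-c", "cd /vllm-workspace/tests && pip install -e . && pytest -v -s basic_correctness"],
   since step.command is unset and the commands list is joined with ' && '. #}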
95
.buildkite/test-template.j2
Normal file
@ -0,0 +1,95 @@
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
  - label: ":docker: build image"
    commands:
      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
      - "docker push {{ docker_image }}"
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
        - exit_status: -10  # Agent was lost
          limit: 5
  - wait

  - group: "AMD Tests"
    depends_on: ~
    steps:
    {% for step in steps %}
    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
      - label: "AMD: {{ step.label }}"
        agents:
          queue: amd
        command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
        env:
          DOCKER_BUILDKIT: "1"
    {% endif %}
    {% endfor %}

  - label: "Neuron Test"
    depends_on: ~
    agents:
      queue: neuron
    command: bash .buildkite/run-neuron-test.sh
    soft_fail: true

  - label: "Intel Test"
    depends_on: ~
    agents:
      queue: intel
    command: bash .buildkite/run-cpu-test.sh

  {% for step in steps %}
  - label: "{{ step.label }}"
    agents:
      queue: kubernetes
    soft_fail: {{ step.soft_fail or false }}
    {% if step.parallelism %}
    parallelism: {{ step.parallelism }}
    {% endif %}
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
        - exit_status: -10  # Agent was lost
          limit: 5
    plugins:
      - kubernetes:
          podSpec:
            {% if step.num_gpus %}
            priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
            {% endif %}
            volumes:
              - name: dshm
                emptyDir:
                  medium: Memory
            containers:
              - image: "{{ docker_image }}"
                command: ["bash"]
                args:
                  - '-c'
                  - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
                {% if not step.no_gpu %}
                resources:
                  requests:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                  limits:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                {% endif %}
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                volumeMounts:
                  - mountPath: /dev/shm
                    name: dshm
  {% endfor %}
32
.coveragerc
@ -1,32 +0,0 @@
[run]
source = vllm
omit =
    */tests/*
    */test_*
    */__pycache__/*
    */build/*
    */dist/*
    */vllm.egg-info/*
    */third_party/*
    */examples/*
    */benchmarks/*
    */docs/*

[report]
exclude_lines =
    pragma: no cover
    def __repr__
    if self.debug:
    if settings.DEBUG
    raise AssertionError
    raise NotImplementedError
    if 0:
    if __name__ == .__main__.:
    class .*\bProtocol\):
    @(abc\.)?abstractmethod

[html]
directory = htmlcov

[xml]
output = coverage.xml
@ -1,33 +1 @@
/.venv
/build
dist
vllm/*.so

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

.mypy_cache

# Distribution / packaging
.Python
/build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
/dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
@ -1,6 +0,0 @@
# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
have_fun: false # Just review the code
code_review:
  comment_severity_threshold: HIGH # Reduce quantity of comments
  pull_request_opened:
    summary: false # Don't summarize the PR in a separate comment
24
.github/.bc-linter.yml
vendored
@ -1,24 +0,0 @@
# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
version: 1
paths:
  # We temporarily disable globally, and will only enable with `annotations.include`
  # include:
  #   - "vllm/v1/attetion/*.py"
  #   - "vllm/v1/core/*.py"
  exclude:
    - "**/*.py"

scan:
  functions: true    # check free functions and methods
  classes: true      # check classes/dataclasses
  public_only: true  # ignore names starting with "_" at any level

annotations:
  include:  # decorators that force-include a symbol
    - name: "bc_linter_include"    # matched by simple name or dotted suffix
      propagate_to_members: false  # for classes, include methods/inner classes
  exclude:  # decorators that force-exclude a symbol
    - name: "bc_linter_skip"       # matched by simple name or dotted suffix
      propagate_to_members: true   # for classes, exclude methods/inner classes

excluded_violations: []  # e.g. ["ParameterRenamed", "FieldTypeChanged"]
123
.github/CODEOWNERS
vendored
@ -1,123 +0,0 @@
# See https://help.github.com/articles/about-codeowners/
# for more info about CODEOWNERS file

# These lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson

# Any change to the VllmConfig can have a large user-facing impact,
# so spam a lot of people
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backends/flashinfer.py @mgoin
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/offloading @ApostaC

# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
/tests/evals @mgoin
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
/tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC

# Transformers backend
/vllm/model_executor/models/transformers.py @hmellor
/tests/models/test_transformers.py @hmellor

# Docs
/docs/mkdocs @hmellor
/docs/**/*.yml @hmellor
/requirements/docs.txt @hmellor
.readthedocs.yaml @hmellor
mkdocs.yaml @hmellor

# Linting
.markdownlint.yaml @hmellor
.pre-commit-config.yaml @hmellor
/tools/pre_commit @hmellor

# CPU
/vllm/v1/worker/cpu* @bigPYJ1151
/csrc/cpu @bigPYJ1151
/vllm/platforms/cpu.py @bigPYJ1151
/cmake/cpu_extension.cmake @bigPYJ1151
/docker/Dockerfile.cpu @bigPYJ1151

# Intel GPU
/vllm/v1/worker/xpu* @jikunshang
/vllm/platforms/xpu.py @jikunshang
/docker/Dockerfile.xpu @jikunshang

# Qwen-specific files
/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
/vllm/model_executor/models/qwen* @sighingnow

# MTP-specific files
/vllm/model_executor/models/deepseek_mtp.py @luccafong

# Mistral-specific files
/vllm/model_executor/models/mistral*.py @patrickvonplaten
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten

# Kernels
/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
/vllm/attention/ops/triton_unified_attention.py @tdoublep

# ROCm related: specify owner with write access to notify AMD folks for careful code review
/docker/Dockerfile.rocm* @gshtras
/vllm/v1/attention/backends/rocm*.py @gshtras
/vllm/v1/attention/backends/mla/rocm*.py @gshtras
/vllm/attention/ops/rocm*.py @gshtras
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras

# TPU
/vllm/v1/worker/tpu* @NickLucche
/vllm/platforms/tpu.py @NickLucche
/vllm/v1/sample/tpu @NickLucche
/vllm/tests/v1/tpu @NickLucche

# KVConnector installation files
/requirements/kv_connectors.txt @NickLucche
2
.github/FUNDING.yml
vendored
@ -1,2 +0,0 @@
github: [vllm-project]
open_collective: vllm
7
.github/ISSUE_TEMPLATE/100-documentation.yml
vendored
@ -20,10 +20,3 @@ body:
    attributes:
      value: >
        Thanks for contributing 🎉!
  - type: checkboxes
    id: askllm
    attributes:
      label: Before submitting a new issue...
      options:
        - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
          required: true
9
.github/ISSUE_TEMPLATE/200-installation.yml
vendored
@ -14,7 +14,7 @@ body:
      description: |
        Please run the following and paste the output below.
        ```sh
        wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
        wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
        # For security purposes, please feel free to check the contents of collect_env.py before running it.
        python collect_env.py
        ```
@ -38,10 +38,3 @@ body:
    attributes:
      value: >
        Thanks for contributing 🎉!
  - type: checkboxes
    id: askllm
    attributes:
      label: Before submitting a new issue...
      options:
        - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
          required: true
9
.github/ISSUE_TEMPLATE/300-usage.yml
vendored
@ -14,7 +14,7 @@ body:
      description: |
        Please run the following and paste the output below.
        ```sh
        wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
        wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
        # For security purposes, please feel free to check the contents of collect_env.py before running it.
        python collect_env.py
        ```
@ -36,10 +36,3 @@ body:
    attributes:
      value: >
        Thanks for contributing 🎉!
  - type: checkboxes
    id: askllm
    attributes:
      label: Before submitting a new issue...
      options:
        - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
          required: true
Some files were not shown because too many files have changed in this diff.