initial

Signed-off-by: Roger Wang <ywang@roblox.com>
[Misc] fix typo: add missing space in lora adapter error message (#12564 )
2025-10-20 23:03:52 +08:00 · 2025-01-31 00:55:49 +00:00 · 2025-01-30 15:39:22 +00:00 · 2025-01-30 00:05:42 -08:00 · 2025-01-29 18:31:01 -08:00 · 2025-01-30 00:47:30 +00:00
1665 changed files with 275722 additions and 41311 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@ -1,36 +1,46 @@
 import os
+import sys
 import zipfile

-MAX_SIZE_MB = 200
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
+# Note that we have 400 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/3792 .
+# Please also sync the value with the one in Dockerfile.
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))


 def print_top_10_largest_files(zip_file):
+    """Print the top 10 largest files in the given zip file."""
    with zipfile.ZipFile(zip_file, 'r') as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:
-            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
+            print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.")


 def check_wheel_size(directory):
+    """Check the size of .whl files in the given directory."""
    for root, _, files in os.walk(directory):
-        for f in files:
-            if f.endswith(".whl"):
-                wheel_path = os.path.join(root, f)
-                wheel_size = os.path.getsize(wheel_path)
-                wheel_size_mb = wheel_size / (1024 * 1024)
-                if wheel_size_mb > MAX_SIZE_MB:
-                    print(
-                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
-                        f"compare to the allowed size ({MAX_SIZE_MB} MB).")
+        for file_name in files:
+            if file_name.endswith(".whl"):
+                wheel_path = os.path.join(root, file_name)
+                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
+                if wheel_size_mb > VLLM_MAX_SIZE_MB:
+                    print(f"Not allowed: Wheel {wheel_path} is larger "
+                          f"({wheel_size_mb:.2f} MB) than the limit "
+                          f"({VLLM_MAX_SIZE_MB} MB).")
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
                    print(f"Wheel {wheel_path} is within the allowed size "
-                          f"({wheel_size_mb} MB).")
+                          f"({wheel_size_mb:.2f} MB).")
    return 0


 if __name__ == "__main__":
-    import sys
-    sys.exit(check_wheel_size(sys.argv[1]))
+    if len(sys.argv) < 2:
+        print("Usage: python check-wheel-size.py <directory>")
+        sys.exit(1)
+
+    directory = sys.argv[1]
+    sys.exit(check_wheel_size(directory))
--- a/.buildkite/download-images.sh
+++ b/.buildkite/download-images.sh
@ -1,18 +0,0 @@
-#!/bin/bash
-
-set -ex
-set -o pipefail
-
-(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-
-# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
-mkdir -p images
-cd images
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
-
-cd -
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@ -0,0 +1,24 @@
+import argparse
+import os
+
+template = """<!DOCTYPE html>
+<html>
+    <body>
+    <h1>Links for vLLM</h1/>
+        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
+    </body>
+</html>
+"""
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--wheel", help="The wheel path.", required=True)
+args = parser.parse_args()
+
+filename = os.path.basename(args.wheel)
+
+with open("index.html", "w") as f:
+    print(f"Generated index.html for {args.wheel}")
+    # cloudfront requires escaping the '+' character
+    f.write(
+        template.format(wheel=filename,
+                        wheel_html_escaped=filename.replace("+", "%2B")))
--- a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
+++ b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
@ -0,0 +1,12 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
+model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.671
+  - name: "exact_match,flexible-extract"
+    value: 0.664
+limit: 1000
+num_fewshot: 5
+trust_remote_code: True
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
+model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.905
+  - name: "exact_match,flexible-extract"
+    value: 0.905
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
+model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.892
+  - name: "exact_match,flexible-extract"
+    value: 0.892
+limit: 250
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
+model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.752
+  - name: "exact_match,flexible-extract"
+    value: 0.754
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
+model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.753
+  - name: "exact_match,flexible-extract"
+    value: 0.753
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
+model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.755
+  - name: "exact_match,flexible-extract"
+    value: 0.755
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
+model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.753
+  - name: "exact_match,flexible-extract"
+    value: 0.753
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
+model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.764
+  - name: "exact_match,flexible-extract"
+    value: 0.764
+limit: 250
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
+model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.728
+  - name: "exact_match,flexible-extract"
+    value: 0.728
+limit: 250
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
+model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.758
+  - name: "exact_match,flexible-extract"
+    value: 0.759
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
+model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.756
+  - name: "exact_match,flexible-extract"
+    value: 0.752
+limit: 250
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.419
+  - name: "exact_match,flexible-extract"
+    value: 0.416
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
+model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.356
+  - name: "exact_match,flexible-extract"
+    value: 0.358
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
+model_name: "mgoin/Minitron-4B-Base-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.233
+  - name: "exact_match,flexible-extract"
+    value: 0.236
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
@ -0,0 +1,11 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
+model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.86
+  - name: "exact_match,flexible-extract"
+    value: 0.86
+limit: 250
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
@ -0,0 +1,11 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
+model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.624
+  - name: "exact_match,flexible-extract"
+    value: 0.624
+limit: 250
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
+model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.616
+  - name: "exact_match,flexible-extract"
+    value: 0.632
+limit: 250
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
+model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.578
+  - name: "exact_match,flexible-extract"
+    value: 0.585
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
+model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.593
+  - name: "exact_match,flexible-extract"
+    value: 0.588
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
+model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.595
+  - name: "exact_match,flexible-extract"
+    value: 0.582
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
@ -0,0 +1,11 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
+model_name: "Qwen/Qwen2-57B-A14B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.792
+  - name: "exact_match,flexible-extract"
+    value: 0.824
+limit: 250
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@ -0,0 +1,5 @@
+Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
+Meta-Llama-3-70B-Instruct.yaml
+Mixtral-8x7B-Instruct-v0.1.yaml
+Qwen2-57B-A14-Instruct.yaml
+DeepSeek-V2-Lite-Chat.yaml
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@ -0,0 +1,10 @@
+Meta-Llama-3-8B-Instruct.yaml
+Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
+Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+Minitron-4B-Base-FP8.yaml
+Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
+Qwen2-1.5B-Instruct-FP8W8.yaml
+Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@ -0,0 +1,46 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on GSM for transformers.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install lm-eval==0.4.4
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on GSM8k using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-test-accuracy workflow"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -b    - batch size to run the evaluation at"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo
+}
+
+while getopts "m:b:l:f:" OPT; do
+  case ${OPT} in
+    m ) 
+        MODEL="$OPTARG"
+        ;;
+    b ) 
+        BATCH_SIZE="$OPTARG"
+        ;;
+    l ) 
+        LIMIT="$OPTARG"
+        ;;
+    f ) 
+        FEWSHOT="$OPTARG"
+        ;;
+    \? ) 
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model hf \
+  --model_args "pretrained=$MODEL,parallelize=True" \
+  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+  --batch_size "$BATCH_SIZE"
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@ -0,0 +1,51 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on GSM for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install lm-eval==0.4.4
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on GSM8k using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-test-accuracy workflow"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -b    - batch size to run the evaluation at"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+  case ${OPT} in
+    m ) 
+        MODEL="$OPTARG"
+        ;;
+    b ) 
+        BATCH_SIZE="$OPTARG"
+        ;;
+    l ) 
+        LIMIT="$OPTARG"
+        ;;
+    f ) 
+        FEWSHOT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? ) 
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
+  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+  --batch_size "$BATCH_SIZE"
--- a/.buildkite/lm-eval-harness/run-tests.sh
+++ b/.buildkite/lm-eval-harness/run-tests.sh
@ -0,0 +1,59 @@
+#!/bin/bash
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on GSM8k using vllm and compares to "
+    echo "precomputed baseline (measured by HF transformers.)"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
+    echo "  -t    - tensor parallel size"
+    echo
+}
+
+SUCCESS=0
+
+while getopts "c:t:" OPT; do
+  case ${OPT} in
+    c ) 
+        CONFIG="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+# Parse list of configs.
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
+
+for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
+do
+    LOCAL_SUCCESS=0
+    
+    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
+
+    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
+    export LM_EVAL_TP_SIZE=$TP_SIZE
+    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
+
+    if [[ $LOCAL_SUCCESS == 0 ]]; then
+        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
+    else
+        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
+    fi
+
+    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
+
+done
+
+if [ "${SUCCESS}" -eq "0" ]; then
+    exit 0
+else
+    exit 1
+fi
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -0,0 +1,63 @@
+"""
+LM eval harness on model to compare vs HF baseline computed offline.
+Configs are found in configs/$MODEL.yaml
+
+* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
+* export LM_EVAL_TP_SIZE=4 
+* pytest -s test_lm_eval_correctness.py
+"""
+
+import os
+from pathlib import Path
+
+import lm_eval
+import numpy
+import yaml
+
+RTOL = 0.05
+TEST_DATA_FILE = os.environ.get(
+    "LM_EVAL_TEST_DATA_FILE",
+    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
+
+TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
+
+
+def launch_lm_eval(eval_config):
+    trust_remote_code = eval_config.get('trust_remote_code', False)
+
+    model_args = f"pretrained={eval_config['model_name']}," \
+                 f"tensor_parallel_size={TP_SIZE}," \
+                 f"add_bos_token=true," \
+                 f"trust_remote_code={trust_remote_code}"
+
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=model_args,
+        tasks=[task["name"] for task in eval_config["tasks"]],
+        num_fewshot=eval_config["num_fewshot"],
+        limit=eval_config["limit"],
+        batch_size="auto")
+
+    return results
+
+
+def test_lm_eval_correctness():
+    eval_config = yaml.safe_load(
+        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
+
+    # Launch eval requests.
+    results = launch_lm_eval(eval_config)
+
+    # Confirm scores match ground truth.
+    success = True
+    for task in eval_config["tasks"]:
+        for metric in task["metrics"]:
+            ground_truth = metric["value"]
+            measured_value = results["results"][task["name"]][metric["name"]]
+            print(f'{task["name"]} | {metric["name"]}: '
+                  f'ground_truth={ground_truth} | measured={measured_value}')
+            success = success and numpy.isclose(
+                ground_truth, measured_value, rtol=RTOL)
+
+    # Assert at the end, print all scores even on failure for debugging.
+    assert success
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@ -0,0 +1,153 @@
+# vLLM benchmark suite
+
+
+## Introduction
+
+This directory contains two sets of benchmark for vllm.
+- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
+- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
+
+
+See  [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+
+
+## Performance benchmark quick overview
+
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.
+
+**Benchmarking Duration**: about 1hr.
+
+**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
+
+
+## Nightly benchmark quick overview
+
+**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. 
+
+**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
+
+**Benchmarking Duration**: about 3.5hrs.
+
+
+
+## Trigger the benchmark
+
+Performance benchmark will be triggered when:
+- A PR being merged into vllm.
+- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
+
+Nightly benchmark will be triggered when:
+- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
+
+
+
+
+## Performance benchmark details
+
+
+See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
+
+
+#### Latency test
+
+Here is an example of one test inside `latency-tests.json`:
+
+```json
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    },
+]
+```
+
+In this example:
+-  The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
+-  The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+
+Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
+
+WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
+
+
+#### Throughput test
+The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
+
+The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
+
+#### Serving test
+We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
+
+```
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+]
+```
+
+Inside this example:
+- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
+- The `server-parameters` includes the command line arguments for vLLM server.
+- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
+- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
+
+The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
+
+WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
+
+#### Visualizing the results
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
+You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
+If you do not see the table, please wait till the benchmark finish running.
+The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
+The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
+
+
+
+## Nightly test details
+
+See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
+
+
+#### Workflow
+
+- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. 
+- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
+- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
+- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
+
+#### Nightly tests
+
+In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
+
+#### Docker containers
+
+The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
+
+WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
+
+WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
+
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@ -0,0 +1,92 @@
+steps:
+  - label: "Wait for container to be ready"
+    key: wait-for-container-image
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          containers:
+          - image: badouralix/curl-jq
+            command:
+            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+
+  - label: "A100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: A100
+    depends_on: wait-for-container-image
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+            command:
+            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+
+  - label: "H200"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H200
+    depends_on: wait-for-container-image
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: 4,5,6,7
+        volumes:
+          - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
+
+  #- block: "Run H100 Benchmark"
+    #key: block-h100
+    #depends_on: ~
+
+  - label: "H100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H100
+    depends_on: wait-for-container-image
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
+        volumes:
+          - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
--- a/.buildkite/nightly-benchmarks/nightly-annotation.md
+++ b/.buildkite/nightly-benchmarks/nightly-annotation.md
@ -0,0 +1,28 @@
+
+## Description
+
+This file contains the downloading link for benchmarking results.
+
+- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
+- [benchmarking results](artifact://results.zip)
+- [benchmarking code](artifact://nightly-benchmarks.zip)
+
+Please download the visualization scripts in the post
+
+
+## Results reproduction
+
+- Find the docker we use in `benchmarking pipeline`
+- Deploy the docker, and inside the docker:
+  - Download `nightly-benchmarks.zip`. 
+  - In the same folder, run the following code
+```
+export HF_TOKEN=<your HF token>
+apt update
+apt install -y git
+unzip nightly-benchmarks.zip
+VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+```
+
+And the results will be inside `./benchmarks/results`.
+
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@ -0,0 +1,39 @@
+
+# Nightly benchmark
+
+This benchmark aims to:
+- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
+- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
+
+Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
+
+Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+
+
+## Setup
+
+- Docker images:
+  - vLLM: `vllm/vllm-openai:v0.6.2`
+  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
+  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
+  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
+    - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
+  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
+- Hardware
+  - 8x Nvidia A100 GPUs
+- Workload:
+  - Dataset
+    - ShareGPT dataset
+    - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
+    - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
+    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
+  - Models: llama-3 8B, llama-3 70B.
+    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
+  - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
+    - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
+  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
+
+# Known issues
+
+- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
+- TGI does not support `ignore-eos` flag.
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@ -0,0 +1,196 @@
+common_pod_spec: &common_pod_spec
+  priorityClassName: perf-benchmark
+  nodeSelector:
+    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  volumes:
+    - name: devshm
+      emptyDir:
+        medium: Memory
+    - name: hf-cache
+      hostPath:
+        path: /root/.cache/huggingface
+        type: Directory
+
+common_container_settings: &common_container_settings
+  command:
+    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+  resources:
+    limits:
+      nvidia.com/gpu: 8
+  volumeMounts:
+    - name: devshm
+      mountPath: /dev/shm
+    - name: hf-cache
+      mountPath: /root/.cache/huggingface
+  env:
+    - name: VLLM_USAGE_SOURCE
+      value: ci-test
+    - name: HF_HOME
+      value: /root/.cache/huggingface
+    - name: VLLM_SOURCE_CODE_LOC
+      value: /workspace/build/buildkite/vllm/performance-benchmark
+    - name: HF_TOKEN
+      valueFrom:
+        secretKeyRef:
+          name: hf-token-secret
+          key: token
+
+steps:
+  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
+
+
+
+  - label: "A100 vllm step 10"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: vllm/vllm-openai:v0.6.2
+                <<: *common_container_settings
+
+
+
+  - label: "A100 sglang benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: lmsysorg/sglang:v0.3.2-cu121
+                <<: *common_container_settings
+
+  - label: "A100 lmdeploy benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: openmmlab/lmdeploy:v0.6.1-cu12
+                <<: *common_container_settings
+
+
+
+
+  - label: "A100 trt llama-8B"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+                <<: *common_container_settings
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                  - name: TEST_SELECTOR
+                    value: "llama8B"
+
+
+  - label: "A100 trt llama-70B"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+                <<: *common_container_settings
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                  - name: TEST_SELECTOR
+                    value: "llama70B"
+
+
+  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image 
+  # - label: "A100 trt benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           <<: *common_pod_spec
+  #           containers:
+  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+  #               <<: *common_container_settings
+
+
+  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
+  # - label: "A100 tgi benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           <<: *common_pod_spec
+  #           containers:
+  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
+  #               <<: *common_container_settings
+        
+  - wait
+
+  - label: "Collect the results"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+            - image: vllm/vllm-openai:v0.5.0.post1
+              command:
+              - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+              resources:
+                limits:
+                  nvidia.com/gpu: 8
+              volumeMounts:
+              - name: devshm
+                mountPath: /dev/shm
+              env:
+              - name: VLLM_USAGE_SOURCE
+                value: ci-test
+              - name: VLLM_SOURCE_CODE_LOC
+                value: /workspace/build/buildkite/vllm/performance-benchmark
+              - name: HF_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-token-secret
+                    key: token
+
+  - block: ":rocket: check the results!"
--- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
@ -0,0 +1,62 @@
+
+## Latency tests
+
+- Input length: 32 tokens.
+- Output length: 128 tokens.
+- Batch size: fixed (8).
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- Evaluation metrics: end-to-end latency (mean, median, p99).
+
+
+{latency_tests_markdown_table}
+
+
+## Throughput tests
+
+- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm to achieve maximum throughput.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- Evaluation metrics: throughput.
+
+
+{throughput_tests_markdown_table}
+
+
+## Serving tests
+
+- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
+- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- We also added a speculative decoding test for llama-3 70B, under QPS 2
+- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
+
+
+{serving_tests_markdown_table}
+
+
+## json version of the benchmarking tables
+
+This section contains the data of the markdown tables above in JSON format. 
+You can load the benchmarking tables into pandas dataframes as follows:
+
+```python
+import json
+import pandas as pd
+
+benchmarking_results_json = """The json string"""
+benchmarking_results = json.loads(benchmarking_results_json)
+latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
+throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
+serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
+```
+
+The json string for all benchmarking tables:
+```json
+{benchmarking_results_in_json_string}
+```
+
+You can also check the raw experiment data in the Artifact tab of the Buildkite page.
+
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -0,0 +1,204 @@
+import json
+import os
+from pathlib import Path
+
+import pandas as pd
+from tabulate import tabulate
+
+results_folder = Path("results/")
+
+# latency results and the keys that will be printed into markdown
+latency_results = []
+latency_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    "avg_latency": "Mean latency (ms)",
+    # "P10": "P10 (s)",
+    # "P25": "P25 (s)",
+    "P50": "Median latency (ms)",
+    # "P75": "P75 (s)",
+    # "P90": "P90 (s)",
+    "P99": "P99 latency (ms)",
+}
+
+# throughput tests and the keys that will be printed into markdown
+throughput_results = []
+throughput_results_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    # "num_requests": "# of req.",
+    # "total_num_tokens": "Total # of tokens",
+    # "elapsed_time": "Elapsed time (s)",
+    "requests_per_second": "Tput (req/s)",
+    # "tokens_per_second": "Tput (tok/s)",
+}
+
+# serving results and the keys that will be printed into markdown
+serving_results = []
+serving_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    # "completed": "# of req.",
+    "request_throughput": "Tput (req/s)",
+    # "input_throughput": "Input Tput (tok/s)",
+    # "output_throughput": "Output Tput (tok/s)",
+    "mean_ttft_ms": "Mean TTFT (ms)",
+    "median_ttft_ms": "Median TTFT (ms)",
+    "p99_ttft_ms": "P99 TTFT (ms)",
+    # "mean_tpot_ms": "Mean TPOT (ms)",
+    # "median_tpot_ms": "Median",
+    # "p99_tpot_ms": "P99",
+    "mean_itl_ms": "Mean ITL (ms)",
+    "median_itl_ms": "Median ITL (ms)",
+    "p99_itl_ms": "P99 ITL (ms)",
+}
+
+
+def read_markdown(file):
+    if os.path.exists(file):
+        with open(file) as f:
+            return f.read() + "\n"
+    else:
+        return f"{file} not found.\n"
+
+
+def results_to_json(latency, throughput, serving):
+    return json.dumps({
+        'latency': latency.to_dict(),
+        'throughput': throughput.to_dict(),
+        'serving': serving.to_dict()
+    })
+
+
+if __name__ == "__main__":
+
+    # collect results
+    for test_file in results_folder.glob("*.json"):
+
+        with open(test_file) as f:
+            raw_result = json.loads(f.read())
+
+        if "serving" in str(test_file):
+            # this result is generated via `benchmark_serving.py`
+
+            # attach the benchmarking command to raw_result
+            with open(test_file.with_suffix(".commands")) as f:
+                command = json.loads(f.read())
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # add the result to raw_result
+            serving_results.append(raw_result)
+            continue
+
+        elif "latency" in f.name:
+            # this result is generated via `benchmark_latency.py`
+
+            # attach the benchmarking command to raw_result
+            with open(test_file.with_suffix(".commands")) as f:
+                command = json.loads(f.read())
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # get different percentiles
+            for perc in [10, 25, 50, 75, 90, 99]:
+                # Multiply 1000 to convert the time unit from s to ms
+                raw_result.update(
+                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
+            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
+
+            # add the result to raw_result
+            latency_results.append(raw_result)
+            continue
+
+        elif "throughput" in f.name:
+            # this result is generated via `benchmark_throughput.py`
+
+            # attach the benchmarking command to raw_result
+            with open(test_file.with_suffix(".commands")) as f:
+                command = json.loads(f.read())
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # add the result to raw_result
+            throughput_results.append(raw_result)
+            continue
+
+        print(f"Skipping {test_file}")
+
+    latency_results = pd.DataFrame.from_dict(latency_results)
+    serving_results = pd.DataFrame.from_dict(serving_results)
+    throughput_results = pd.DataFrame.from_dict(throughput_results)
+
+    raw_results_json = results_to_json(latency_results, throughput_results,
+                                       serving_results)
+
+    # remapping the key, for visualization purpose
+    if not latency_results.empty:
+        latency_results = latency_results[list(
+            latency_column_mapping.keys())].rename(
+                columns=latency_column_mapping)
+    if not serving_results.empty:
+        serving_results = serving_results[list(
+            serving_column_mapping.keys())].rename(
+                columns=serving_column_mapping)
+    if not throughput_results.empty:
+        throughput_results = throughput_results[list(
+            throughput_results_column_mapping.keys())].rename(
+                columns=throughput_results_column_mapping)
+
+    processed_results_json = results_to_json(latency_results,
+                                             throughput_results,
+                                             serving_results)
+
+    for df in [latency_results, serving_results, throughput_results]:
+        if df.empty:
+            continue
+
+        # Sort all dataframes by their respective "Test name" columns
+        df.sort_values(by="Test name", inplace=True)
+
+        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
+        # we want to turn it into "8xGPUTYPE"
+        df["GPU"] = df["GPU"].apply(
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
+
+    # get markdown tables
+    latency_md_table = tabulate(latency_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
+    serving_md_table = tabulate(serving_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
+    throughput_md_table = tabulate(throughput_results,
+                                   headers='keys',
+                                   tablefmt='pipe',
+                                   showindex=False)
+
+    # document the result
+    with open(results_folder / "benchmark_results.md", "w") as f:
+
+        results = read_markdown("../.buildkite/nightly-benchmarks/" +
+                                "performance-benchmarks-descriptions.md")
+        results = results.format(
+            latency_tests_markdown_table=latency_md_table,
+            throughput_tests_markdown_table=throughput_md_table,
+            serving_tests_markdown_table=serving_md_table,
+            benchmarking_results_in_json_string=processed_results_json)
+        f.write(results)
+
+    # document benchmarking results in json
+    with open(results_folder / "benchmark_results.json", "w") as f:
+
+        results = latency_results.to_dict(
+            orient='records') + throughput_results.to_dict(
+                orient='records') + serving_results.to_dict(orient='records')
+        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@ -0,0 +1,26 @@
+import argparse
+
+from transformers import AutoTokenizer
+
+
+def main(model, cachedir):
+    # Load the tokenizer and save it to the specified directory
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    tokenizer.save_pretrained(cachedir)
+    print(f"Tokenizer saved to {cachedir}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Download and save Hugging Face tokenizer")
+    parser.add_argument("--model",
+                        type=str,
+                        required=True,
+                        help="Name of the model")
+    parser.add_argument("--cachedir",
+                        type=str,
+                        required=True,
+                        help="Directory to save the tokenizer")
+
+    args = parser.parse_args()
+    main(args.model, args.cachedir)
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@ -0,0 +1,95 @@
+import argparse
+import json
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from tabulate import tabulate
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description=
+        'Parse command line arguments for summary-nightly-results script.')
+    parser.add_argument('--results-folder',
+                        type=str,
+                        required=True,
+                        help='The folder where the results are stored.')
+    parser.add_argument('--description',
+                        type=str,
+                        required=True,
+                        help='Description of the results.')
+
+    args = parser.parse_args()
+    return args
+
+
+def get_perf(df, method, model, metric):
+
+    means = []
+
+    for qps in [2, 4, 8, 16, "inf"]:
+        target = df['Test name'].str.contains(model)
+        target = target & df['Engine'].str.contains(method)
+        target = target & df['Test name'].str.contains("qps_" + str(qps))
+        filtered_df = df[target]
+
+        if filtered_df.empty:
+            means.append(0.)
+        else:
+            means.append(filtered_df[metric].values[0])
+
+    return np.array(means)
+
+
+def get_perf_w_std(df, method, model, metric):
+
+    if metric in ["TTFT", "ITL"]:
+        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
+        mean = mean.tolist()
+        std = get_perf(df, method, model, "Std " + metric + " (ms)")
+        if std.mean() == 0:
+            std = None
+        success = get_perf(df, method, model, "Successful req.")
+        if std is not None:
+            std = std / np.sqrt(success)
+            std = std.tolist()
+
+    else:
+        assert metric == "Tput"
+        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
+            df, method, model, "Output Tput (tok/s)")
+        mean = mean.tolist()
+        std = None
+
+    return mean, std
+
+
+def main(args):
+    results_folder = Path(args.results_folder)
+
+    results = []
+
+    # collect results
+    for test_file in results_folder.glob("*_nightly_results.json"):
+        with open(test_file) as f:
+            results = results + json.loads(f.read())
+
+    # generate markdown table
+    df = pd.DataFrame.from_dict(results)
+
+    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
+
+    with open(args.description) as f:
+        description = f.read()
+
+    description = description.format(
+        nightly_results_benchmarking_table=md_table)
+
+    with open("nightly_results.md", "w") as f:
+        f.write(description)
+
+
+if __name__ == '__main__':
+    args = parse_arguments()
+    main(args)
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@ -0,0 +1,6 @@
+from lmdeploy.serve.openai.api_client import APIClient
+
+api_client = APIClient("http://localhost:8000")
+model_name = api_client.available_models[0]
+
+print(model_name)
--- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
@ -0,0 +1,228 @@
+#!/bin/bash
+
+# Currently FP8 benchmark is NOT enabled.
+
+set -x
+server_params=$1
+common_params=$2
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+launch_trt_server() {
+
+  model_path=$(echo "$common_params" | jq -r '.model')
+  model_name="${model_path#*/}"
+  model_type=$(echo "$server_params" | jq -r '.model_type')
+  model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
+  model_tp_size=$(echo "$common_params" | jq -r '.tp')
+  max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
+  max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
+  max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
+  max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
+  trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
+
+  # create model caching directory
+  cd ~
+  rm -rf models
+  mkdir -p models
+  cd models
+  models_dir=$(pwd)
+  trt_model_path=${models_dir}/${model_name}-trt-ckpt
+  trt_engine_path=${models_dir}/${model_name}-trt-engine
+
+  # clone tensorrt backend
+  cd /
+  rm -rf tensorrtllm_backend
+  git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
+  git lfs install
+  cd tensorrtllm_backend
+  git checkout "$trt_llm_version"
+  git submodule update --init --recursive
+
+  # build trtllm engine
+  cd /tensorrtllm_backend
+  cd "./tensorrt_llm/examples/${model_type}"
+  python3 convert_checkpoint.py \
+    --model_dir "${model_path}" \
+    --dtype "${model_dtype}" \
+    --tp_size "${model_tp_size}" \
+    --output_dir "${trt_model_path}"
+  trtllm-build \
+    --checkpoint_dir "${trt_model_path}" \
+    --use_fused_mlp \
+    --reduce_fusion disable \
+    --workers 8 \
+    --gpt_attention_plugin "${model_dtype}" \
+    --gemm_plugin "${model_dtype}" \
+    --tp_size "${model_tp_size}" \
+    --max_batch_size "${max_batch_size}" \
+    --max_input_len "${max_input_len}" \
+    --max_seq_len "${max_seq_len}" \
+    --max_num_tokens "${max_num_tokens}" \
+    --output_dir "${trt_engine_path}"
+
+  # handle triton protobuf files and launch triton server
+  cd /tensorrtllm_backend
+  mkdir triton_model_repo
+  cp -r all_models/inflight_batcher_llm/* triton_model_repo/
+  cd triton_model_repo
+  rm -rf ./tensorrt_llm/1/*
+  cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
+  python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
+  python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5"
+  python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false"
+  python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size"
+  python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1"
+  cd /tensorrtllm_backend
+  python3 scripts/launch_triton_server.py \
+    --world_size="${model_tp_size}" \
+    --model_repo=/tensorrtllm_backend/triton_model_repo &
+
+}
+
+launch_tgi_server() {
+  model=$(echo "$common_params" | jq -r '.model')
+  tp=$(echo "$common_params" | jq -r '.tp')
+  port=$(echo "$common_params" | jq -r '.port')
+  server_args=$(json2args "$server_params")
+
+  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
+    echo "Key 'fp8' exists in common params."
+    server_command="/tgi-entrypoint.sh \
+                --model-id $model \
+                --num-shard $tp \
+                --port $port \
+                --quantize fp8 \
+                $server_args"
+  else
+    echo "Key 'fp8' does not exist in common params."
+    server_command="/tgi-entrypoint.sh \
+                --model-id $model \
+                --num-shard $tp \
+                --port $port \
+                $server_args"
+  fi
+
+  echo "Server command: $server_command"
+  eval "$server_command" &
+
+}
+
+launch_lmdeploy_server() {
+  model=$(echo "$common_params" | jq -r '.model')
+  tp=$(echo "$common_params" | jq -r '.tp')
+  port=$(echo "$common_params" | jq -r '.port')
+  server_args=$(json2args "$server_params")
+
+  server_command="lmdeploy serve api_server $model \
+    --tp $tp \
+    --server-port $port \
+    $server_args"
+
+  # run the server
+  echo "Server command: $server_command"
+  bash -c "$server_command" &
+}
+
+launch_sglang_server() {
+
+  model=$(echo "$common_params" | jq -r '.model')
+  tp=$(echo "$common_params" | jq -r '.tp')
+  port=$(echo "$common_params" | jq -r '.port')
+  server_args=$(json2args "$server_params")
+
+  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
+    echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
+    model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
+    server_command="python3 \
+        -m sglang.launch_server \
+        --tp $tp \
+        --model-path $model \
+        --port $port \
+        $server_args"
+  else
+    echo "Key 'fp8' does not exist in common params."
+    server_command="python3 \
+        -m sglang.launch_server \
+        --tp $tp \
+        --model-path $model \
+        --port $port \
+        $server_args"
+  fi
+
+  # run the server
+  echo "Server command: $server_command"
+  eval "$server_command" &
+}
+
+launch_vllm_server() {
+
+  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+
+  model=$(echo "$common_params" | jq -r '.model')
+  tp=$(echo "$common_params" | jq -r '.tp')
+  port=$(echo "$common_params" | jq -r '.port')
+  server_args=$(json2args "$server_params")
+
+  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
+    echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
+    model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
+    server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
+        -tp $tp \
+        --model $model \
+        --port $port \
+        $server_args"
+  else
+    echo "Key 'fp8' does not exist in common params."
+    server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
+        -tp $tp \
+        --model $model \
+        --port $port \
+        $server_args"
+  fi
+
+  # run the server
+  echo "Server command: $server_command"
+  eval "$server_command" &
+}
+
+main() {
+
+  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then
+    launch_trt_server
+  fi
+
+  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then
+    launch_tgi_server
+  fi
+
+  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
+    launch_lmdeploy_server
+  fi
+
+  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then
+    launch_sglang_server
+  fi
+
+  if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then
+    launch_vllm_server
+  fi
+}
+
+main
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@ -0,0 +1,78 @@
+#!/bin/bash
+
+set -ex
+set -o pipefail
+
+
+main() {
+
+    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+    (which jq) || (apt-get update && apt-get -y install jq)
+    (which zip) || (apt-get install -y zip)
+
+    if [ ! -f /workspace/buildkite-agent ]; then
+        echo "buildkite-agent binary not found. Skip plotting the results."
+        exit 0
+    fi
+
+    # initial annotation
+    #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
+
+    # download results
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+    mkdir -p results/
+    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
+    ls
+    ls results/
+
+    # upload benchmark results
+    zip -r results.zip results/
+    /workspace/buildkite-agent artifact upload "results.zip"
+
+    # upload benchmarking scripts
+    cd "$VLLM_SOURCE_CODE_LOC/"
+    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
+    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
+
+    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
+    # upload benchmarking pipeline
+    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
+
+    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
+    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
+    
+
+
+    # The figures should be generated by a separate process outside the CI/CD pipeline
+
+    # # generate figures
+    # python3 -m pip install tabulate pandas matplotlib
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
+    #     --description $description \
+    #     --results-folder results/ 
+
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sharegpt
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sonnet_2048_128
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sonnet_128_2048
+    
+    # # upload results and figures
+    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@ -0,0 +1,462 @@
+#!/bin/bash
+
+set -o pipefail
+set -x
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
+  echo "GPU type is $gpu_type"
+}
+
+check_hf_token() {
+  # check if HF_TOKEN is available and valid
+  if [[ -z "$HF_TOKEN" ]]; then
+    echo "Error: HF_TOKEN is not set."
+    exit 1
+  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
+    echo "Error: HF_TOKEN does not start with 'hf_'."
+    exit 1
+  else
+    echo "HF_TOKEN is set and valid."
+  fi
+}
+
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+
+get_current_llm_serving_engine() {
+
+  if which lmdeploy >/dev/null; then
+    echo "Container: lmdeploy"
+    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
+    return
+  fi
+
+  if [ -e /tgi-entrypoint.sh ]; then
+    echo "Container: tgi"
+    export CURRENT_LLM_SERVING_ENGINE=tgi
+    return
+  fi
+
+  if which trtllm-build >/dev/null; then
+    echo "Container: tensorrt-llm"
+    export CURRENT_LLM_SERVING_ENGINE=trt
+    return
+  fi
+
+  if [ -e /sgl-workspace ]; then
+    echo "Container: sglang"
+    export CURRENT_LLM_SERVING_ENGINE=sglang
+    return
+  fi
+
+  if [ -e /vllm-workspace ]; then
+    echo "Container: vllm"
+    # move to a completely irrelevant directory, to avoid import vllm from current folder
+    export CURRENT_LLM_SERVING_ENGINE=vllm
+    
+    return
+  fi
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+kill_gpu_processes() {
+  pkill -f python
+  pkill -f python3
+  pkill -f tritonserver
+  pkill -f pt_main_thread
+  pkill -f text-generation
+  pkill -f lmdeploy
+
+  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+    sleep 1
+  done
+}
+
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  timeout 1200 bash -c '
+    until curl -s localhost:8000/v1/completions > /dev/null; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+ensure_installed() {
+  # Ensure that the given command is installed by apt-get
+  local cmd=$1
+  if ! which "$cmd" >/dev/null; then
+    apt-get update && apt-get install -y "$cmd"
+  fi
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # prepend the current serving engine to the test name
+    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if [[ $reuse_server == "true" ]]; then
+      echo "Reuse previous server for test case $test_name"
+    else
+      kill_gpu_processes
+      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+        "$server_params" "$common_params"
+    fi
+
+    if wait_for_server; then
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+    else
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+      break
+    fi
+
+    # prepare tokenizer
+    # this is required for lmdeploy.
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+    rm -rf /tokenizer_cache
+    mkdir /tokenizer_cache
+    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+      --model "$model" \
+      --cachedir /tokenizer_cache
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+
+
+    # change model name for lmdeploy (it will not follow standard hf name)
+    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
+      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      backend=$CURRENT_LLM_SERVING_ENGINE
+
+      if [[ $backend = "trt" ]]; then
+        backend="tensorrt-llm"
+      fi
+
+      if [[ "$backend" == *"vllm"* ]]; then
+        backend="vllm"
+      fi
+
+      if [[ "$dataset_name" = "sharegpt" ]]; then
+
+        client_command="python3 benchmark_serving.py \
+          --backend $backend \
+          --tokenizer /tokenizer_cache \
+          --model $model \
+          --dataset-name $dataset_name \
+          --dataset-path $dataset_path \
+          --num-prompts $num_prompts \
+          --port $port \
+          --save-result \
+          --result-dir $RESULTS_FOLDER \
+          --result-filename ${new_test_name}.json \
+          --request-rate $qps \
+          --ignore-eos \
+          $client_args"
+
+      elif [[ "$dataset_name" = "sonnet" ]]; then
+
+        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
+        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
+        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
+
+        client_command="python3 benchmark_serving.py \
+          --backend $backend \
+          --tokenizer /tokenizer_cache \
+          --model $model \
+          --dataset-name $dataset_name \
+          --dataset-path $dataset_path \
+          --num-prompts $num_prompts \
+          --sonnet-input-len $sonnet_input_len \
+          --sonnet-output-len $sonnet_output_len \
+          --sonnet-prefix-len $sonnet_prefix_len \
+          --port $port \
+          --save-result \
+          --result-dir $RESULTS_FOLDER \
+          --result-filename ${new_test_name}.json \
+          --request-rate $qps \
+          --ignore-eos \
+          $client_args"
+
+      else
+  
+        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
+        exit 1
+
+      fi
+
+        
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      server_command="None"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+  done
+
+  kill_gpu_processes
+}
+
+run_genai_perf_tests() {
+  # run genai-perf tests 
+
+  # $1: a json file specifying genai-perf test cases
+  local genai_perf_test_file
+  genai_perf_test_file=$1
+
+  # Iterate over genai-perf tests
+  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')    
+    
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+    
+    # prepend the current serving engine to the test name
+    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if [[ $reuse_server == "true" ]]; then
+      echo "Reuse previous server for test case $test_name"
+    else
+      kill_gpu_processes
+      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+        "$server_params" "$common_params"
+    fi
+
+    if wait_for_server; then
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+    else
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+      break
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps=$num_prompts
+        echo "now qps is $qps"
+      fi
+    
+      new_test_name=$test_name"_qps_"$qps
+      backend=$CURRENT_LLM_SERVING_ENGINE
+      
+      if [[ "$backend" == *"vllm"* ]]; then
+        backend="vllm"
+      fi
+      #TODO: add output dir.
+      client_command="genai-perf profile \
+        -m $model \
+        --service-kind openai \
+        --backend vllm \
+        --endpoint-type chat \
+        --streaming \
+        --url localhost:$port \
+        --request-rate $qps \
+        --num-prompts $num_prompts \
+      "
+
+    echo "Client command: $client_command"
+
+    eval "$client_command"
+
+    #TODO: process/record outputs
+    done
+  done
+
+  kill_gpu_processes
+
+}
+
+prepare_dataset() {
+
+  # download sharegpt dataset
+  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
+  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+  echo "" > sonnet_4x.txt
+  for _ in {1..4}
+  do
+    cat sonnet.txt >> sonnet_4x.txt
+  done
+  
+}
+
+main() {
+
+  # check if the environment variable is successfully injected from yaml
+
+  check_gpus
+  check_hf_token
+  get_current_llm_serving_engine
+
+  pip install -U transformers
+
+  pip install -r requirements-dev.txt
+  which genai-perf
+
+  # check storage
+  df -h
+
+  ensure_installed wget
+  ensure_installed curl
+  ensure_installed jq
+  # genai-perf dependency
+  ensure_installed libb64-0d
+
+  prepare_dataset
+
+  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
+
+  # run the test
+  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
+
+  # run genai-perf tests
+  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
+  mv artifacts/ $RESULTS_FOLDER/
+
+  # upload benchmark results to buildkite
+  python3 -m pip install tabulate pandas
+  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
+  upload_to_buildkite
+
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -0,0 +1,377 @@
+#!/bin/bash
+
+# This script should be run inside the CI process
+# This script assumes that we are already inside the vllm/ directory
+# Benchmarking results will be available inside vllm/benchmarks/results/
+
+# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
+# and we still want to see other benchmarking results even when mixtral crashes.
+set -x
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+check_hf_token() {
+  # check if HF_TOKEN is available and valid
+  if [[ -z "$HF_TOKEN" ]]; then
+    echo "Error: HF_TOKEN is not set."
+    exit 1
+  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
+    echo "Error: HF_TOKEN does not start with 'hf_'."
+    exit 1
+  else
+    echo "HF_TOKEN is set and valid."
+  fi
+}
+
+ensure_sharegpt_downloaded() {
+  local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
+  if [ ! -f "$FILE" ]; then
+    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
+  else
+    echo "$FILE already exists."
+  fi
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  timeout 1200 bash -c '
+    until curl -X POST localhost:8000/v1/completions; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+kill_processes_launched_by_current_bash() {
+  # Kill all python processes launched from current bash script
+  current_shell_pid=$$
+  processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
+  if [ -n "$processes" ]; then
+    echo "Killing the following processes matching '$1':"
+    echo "$processes"
+    echo "$processes" | xargs kill -9
+  else
+    echo "No processes found matching '$1'."
+  fi
+}
+
+kill_gpu_processes() {
+
+  ps -aux
+  lsof -t -i:8000 | xargs -r kill -9
+  pgrep python3 | xargs -r kill -9
+
+
+  # wait until GPU memory usage smaller than 1GB
+  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+    sleep 1
+  done
+
+  # remove vllm config file
+  rm -rf ~/.config/vllm
+
+}
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
+  if command -v buildkite-agent >/dev/null 2>&1; then
+    BUILDKITE_AGENT_COMMAND="buildkite-agent"
+  elif [ -f /workspace/buildkite-agent ]; then
+    BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
+  else
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+
+  # Use the determined command to annotate and upload artifacts
+  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
+  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
+}
+
+run_latency_tests() {
+  # run latency tests using `benchmark_latency.py`
+  # $1: a json file specifying latency test cases
+
+  local latency_test_file
+  latency_test_file=$1
+
+  # Iterate over latency tests
+  jq -c '.[]' "$latency_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^latency_ ]]; then
+      echo "In latency-test.json, test_name must start with \"latency_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get arguments
+    latency_params=$(echo "$params" | jq -r '.parameters')
+    latency_args=$(json2args "$latency_params")
+
+    # check if there is enough GPU to run the test
+    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    latency_command="python3 benchmark_latency.py \
+      --output-json $RESULTS_FOLDER/${test_name}.json \
+      $latency_args"
+
+    echo "Running test case $test_name"
+    echo "Latency command: $latency_command"
+
+    # recoding benchmarking command ang GPU command
+    jq_output=$(jq -n \
+      --arg latency "$latency_command" \
+      --arg gpu "$gpu_type" \
+      '{
+        latency_command: $latency,
+        gpu_type: $gpu
+      }')
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
+
+    # run the benchmark
+    eval "$latency_command"
+
+    kill_gpu_processes
+
+  done
+}
+
+run_throughput_tests() {
+  # run throughput tests using `benchmark_throughput.py`
+  # $1: a json file specifying throughput test cases
+
+  local throughput_test_file
+  throughput_test_file=$1
+
+  # Iterate over throughput tests
+  jq -c '.[]' "$throughput_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^throughput_ ]]; then
+      echo "In throughput-test.json, test_name must start with \"throughput_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get arguments
+    throughput_params=$(echo "$params" | jq -r '.parameters')
+    throughput_args=$(json2args "$throughput_params")
+
+    # check if there is enough GPU to run the test
+    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    throughput_command="python3 benchmark_throughput.py \
+      --output-json $RESULTS_FOLDER/${test_name}.json \
+      $throughput_args"
+
+    echo "Running test case $test_name"
+    echo "Throughput command: $throughput_command"
+    # recoding benchmarking command ang GPU command
+    jq_output=$(jq -n \
+      --arg command "$throughput_command" \
+      --arg gpu "$gpu_type" \
+      '{
+        throughput_command: $command,
+        gpu_type: $gpu
+      }')
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
+
+    # run the benchmark
+    eval "$throughput_command"
+
+    kill_gpu_processes
+
+  done
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^serving_ ]]; then
+      echo "In serving-test.json, test_name must start with \"serving_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.server_parameters')
+    client_params=$(echo "$params" | jq -r '.client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    # check if server model and client model is aligned
+    server_model=$(echo "$server_params" | jq -r '.model')
+    client_model=$(echo "$client_params" | jq -r '.model')
+    if [[ $server_model != "$client_model" ]]; then
+      echo "Server model and client model must be the same. Skip testcase $test_name."
+      continue
+    fi
+
+    server_command="python3 \
+      -m vllm.entrypoints.openai.api_server \
+      $server_args"
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    bash -c "$server_command" &
+    server_pid=$!
+
+    # wait until the server is alive
+    if wait_for_server; then
+      echo ""
+      echo "vllm server is up and running."
+    else
+      echo ""
+      echo "vllm failed to start within the timeout period."
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      bash -c "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill -9 $server_pid
+    kill_gpu_processes
+  done
+}
+
+main() {
+  check_gpus
+  check_hf_token
+
+  # dependencies
+  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+  (which jq) || (apt-get update && apt-get -y install jq)
+  (which lsof) || (apt-get update && apt-get install -y lsof)
+
+  # get the current IP address, required by benchmark_serving.py
+  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+  # turn of the reporting of the status of each request, to clean up the terminal output
+  export VLLM_LOG_LEVEL="WARNING"
+
+  # prepare for benchmarking
+  cd benchmarks || exit 1
+  ensure_sharegpt_downloaded
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  # benchmarking
+  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
+  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
+  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
+
+  # postprocess benchmarking results
+  pip install tabulate pandas
+  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
+
+  upload_to_buildkite
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@ -0,0 +1,83 @@
+import datetime
+import json
+import os
+from pathlib import Path
+
+import pandas as pd
+from tabulate import tabulate
+
+results_folder = Path("results/")
+
+# serving results and the keys that will be printed into markdown
+serving_results = []
+serving_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    "completed": "Successful req.",
+    "request_throughput": "Tput (req/s)",
+    "mean_ttft_ms": "Mean TTFT (ms)",
+    "std_ttft_ms": "Std TTFT (ms)",
+    "median_ttft_ms": "Median TTFT (ms)",
+    "mean_itl_ms": "Mean ITL (ms)",
+    "std_itl_ms": "Std ITL (ms)",
+    "median_itl_ms": "Median ITL (ms)",
+    "mean_tpot_ms": "Mean TPOT (ms)",
+    "std_tpot_ms": "Std TPOT (ms)",
+    "median_tpot_ms": "Median TPOT (ms)",
+    "total_token_throughput": "Total Token Tput (tok/s)",
+    "output_throughput": "Output Tput (tok/s)",
+    "total_input_tokens": "Total input tokens",
+    "total_output_tokens": "Total output tokens",
+    "engine": "Engine",
+}
+
+if __name__ == "__main__":
+
+    # collect results
+    for test_file in results_folder.glob("*.json"):
+
+        with open(test_file) as f:
+            raw_result = json.loads(f.read())
+
+        # attach the benchmarking command to raw_result
+        with open(test_file.with_suffix(".commands")) as f:
+            command = json.loads(f.read())
+        raw_result.update(command)
+
+        # update the test name of this result
+        raw_result.update({"test_name": test_file.stem})
+
+        # add the result to raw_result
+        serving_results.append(raw_result)
+        continue
+
+    serving_results = pd.DataFrame.from_dict(serving_results)
+
+    if not serving_results.empty:
+        serving_results = serving_results[list(
+            serving_column_mapping.keys())].rename(
+                columns=serving_column_mapping)
+
+    serving_md_table_with_headers = tabulate(serving_results,
+                                             headers='keys',
+                                             tablefmt='pipe',
+                                             showindex=False)
+    # remove the first line of header
+    serving_md_table_lines = serving_md_table_with_headers.split('\n')
+    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
+
+    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
+
+    # document benchmarking results in markdown
+    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
+        # document results with header.
+        # for those who wants to reproduce our benchmark.
+        f.write(serving_md_table_with_headers)
+        f.write('\n')
+
+    # document benchmarking results in json
+    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
+
+        results = serving_results.to_dict(orient='records')
+        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@ -0,0 +1,19 @@
+#!/bin/sh
+TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
+URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
+
+TIMEOUT_SECONDS=10
+
+retries=0
+while [ $retries -lt 1000 ]; do
+    if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
+        exit 0
+    fi
+
+    echo "Waiting for image to be available..."
+
+    retries=$((retries + 1))
+    sleep 5
+done
+
+exit 1
--- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
@ -0,0 +1,23 @@
+[
+    {
+        "test_name": "llama8B_tp1_genai_perf",
+        "qps_list": [4,8,16,32],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "tp": 1,
+            "port": 8000,
+            "num_prompts": 500,
+            "reuse_server": false
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "genai_perf_input_parameters": {
+        }
+    }
+]
--- a/.buildkite/nightly-benchmarks/tests/latency-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json
@ -0,0 +1,32 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    },
+    {
+        "test_name": "latency_llama70B_tp4",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15
+        }
+    },
+    {
+        "test_name": "latency_mixtral8x7B_tp2",
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15
+        }
+    }
+]
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@ -0,0 +1,323 @@
+[
+    {
+        "test_name": "llama8B_tp1_sharegpt",
+        "qps_list": [4,8,16,32,"inf"],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "tp": 1,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000,
+            "reuse_server": false
+        },
+        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
+            "max_input_len": 4096,
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        }, 
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "enable_torch_compile": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama8B_tp1_sonnet_512_16",
+        "qps_list": [4,8,16,32,"inf"],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "tp": 1,
+            "dataset_name": "sonnet",
+            "dataset_path": "./sonnet_4x.txt",
+            "num_prompts": 500,
+            "port": 8000,
+            "sonnet_input_len": 512,
+            "sonnet_output_len": 16,
+            "sonnet_prefix_len": 50,
+            "reuse_server": true
+        },
+        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
+            "max_input_len": 4096,
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        }, 
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "enable_torch_compile": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama8B_tp1_sonnet_512_256",
+        "qps_list": [4,8,16,32,"inf"],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "tp": 1,
+            "dataset_name": "sonnet",
+            "dataset_path": "./sonnet_4x.txt",
+            "num_prompts": 500,
+            "port": 8000,
+            "sonnet_input_len": 512,
+            "sonnet_output_len": 256,
+            "sonnet_prefix_len": 50,
+            "reuse_server": true
+        },
+        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
+            "max_input_len": 4096,
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        }, 
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "enable_torch_compile": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama70B_tp4_sharegpt",
+        "qps_list": [4,8,16,32,"inf"],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tp": 4,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000,
+            "reuse_server": false
+        },
+        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
+            "max_input_len": 4096,
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        }, 
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama70B_tp4_sonnet_512_16",
+        "qps_list": [4,8,16,32,"inf"],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tp": 4,
+            "dataset_name": "sonnet",
+            "dataset_path": "./sonnet_4x.txt",
+            "num_prompts": 500,
+            "port": 8000,
+            "sonnet_input_len": 512,
+            "sonnet_output_len": 16,
+            "sonnet_prefix_len": 50,
+            "reuse_server": true
+        },
+        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
+            "max_input_len": 4096,
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        }, 
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama70B_tp4_sonnet_512_256",
+        "qps_list": [4,8,16,32,"inf"],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tp": 4,
+            "dataset_name": "sonnet",
+            "dataset_path": "./sonnet_4x.txt",
+            "num_prompts": 500,
+            "port": 8000,
+            "sonnet_input_len": 512,
+            "sonnet_output_len": 256,
+            "sonnet_prefix_len": 50,
+            "reuse_server": true
+        },
+        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
+            "max_input_len": 4096,
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        }, 
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
+        }
+    }
+]
--- a/.buildkite/nightly-benchmarks/tests/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@ -0,0 +1,80 @@
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama70B_tp4_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
+        "qps_list": [2],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "disable_log_requests": "", 
+            "tensor_parallel_size": 4,
+            "swap_space": 16, 
+            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
+            "num_speculative_tokens": 4,
+            "speculative_draft_tensor_parallel_size": 1,
+            "use_v2_block_manager": ""
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200 
+        }
+    }
+]
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json
@ -0,0 +1,35 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    },
+    {
+        "test_name": "throughput_llama70B_tp4",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    },
+    {
+        "test_name": "throughput_mixtral8x7B_tp2",
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    }
+]
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -0,0 +1,72 @@
+steps:
+  - label: "Build wheel - CUDA 12.1"
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
+  # However, this block can be uncommented to save some compute hours.
+  # - block: "Build CUDA 11.8 wheel"
+  #   key: block-build-cu118-wheel
+
+  - label: "Build wheel - CUDA 11.8"
+    # depends_on: block-build-cu118-wheel
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  - block: "Build release image"
+    depends_on: ~
+    key: block-release-image-build
+
+  - label: "Build release image"
+    depends_on: block-release-image-build
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
+  - label: "Build and publish TPU release image"
+    depends_on: ~
+    if: build.env("NIGHTLY") == "1"
+    agents:
+      queue: tpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
+      - "docker push vllm/vllm-tpu:nightly"
+      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllm
+          password-env: DOCKERHUB_TOKEN
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  - block: "Build CPU release image"
+    key: block-cpu-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish CPU release image"
+    depends_on: block-cpu-release-image-build
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
+    env:
+      DOCKER_BUILDKIT: "1"
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@ -1,7 +1,18 @@
+#!/bin/bash
+
 # This script runs test inside the corresponding ROCm docker container.
-set -ex
+set -o pipefail

 # Print ROCm version
+echo "--- Confirming Clean Initial State"
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
 echo "--- ROCm info"
 rocminfo

@ -22,8 +33,8 @@ cleanup_docker() {
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
-    # Remove unused volumes
-    docker volume prune -f
+    # Remove unused volumes / force the system prune for old images as well.
+    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
@ -45,29 +56,101 @@ while true; do
        fi
 done

-echo "--- Building container"
-sha=$(git rev-parse --short HEAD)
-image_name=rocm_${sha}
-container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
-docker build \
-        -t ${image_name} \
-        -f Dockerfile.rocm \
-        --progress plain \
-        .
+echo "--- Pulling container" 
+image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
+container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+docker pull "${image_name}"

 remove_docker_container() {
-   docker rm -f ${container_name} || docker image rm -f ${image_name} || true
+   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
 }
 trap remove_docker_container EXIT

 echo "--- Running container"

-docker run \
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+
+commands=$@
+echo "Commands:$commands"
+#ignore certain kernels tests
+if [[ $commands == *" kernels "* ]]; then
+  commands="${commands} \
+  --ignore=kernels/test_attention.py \
+  --ignore=kernels/test_attention_selector.py \
+  --ignore=kernels/test_blocksparse_attention.py \
+  --ignore=kernels/test_causal_conv1d.py \
+  --ignore=kernels/test_cutlass.py \
+  --ignore=kernels/test_encoder_decoder_attn.py \
+  --ignore=kernels/test_flash_attn.py \
+  --ignore=kernels/test_flashinfer.py \
+  --ignore=kernels/test_int8_quant.py \
+  --ignore=kernels/test_machete_gemm.py \
+  --ignore=kernels/test_mamba_ssm.py \
+  --ignore=kernels/test_marlin_gemm.py \
+  --ignore=kernels/test_moe.py \
+  --ignore=kernels/test_prefix_prefill.py \
+  --ignore=kernels/test_rand.py \
+  --ignore=kernels/test_sampler.py"
+fi
+
+#ignore certain Entrypoints tests
+if [[ $commands == *" entrypoints/openai "* ]]; then
+  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
+  --ignore=entrypoints/openai/test_accuracy.py \
+  --ignore=entrypoints/openai/test_audio.py \
+  --ignore=entrypoints/openai/test_encoder_decoder.py \
+  --ignore=entrypoints/openai/test_embedding.py \
+  --ignore=entrypoints/openai/test_oot_registration.py "}
+fi
+
+PARALLEL_JOB_COUNT=8
+# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
+if [[ $commands == *"--shard-id="* ]]; then
+  # assign job count as the number of shards used   
+  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
+  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
+    # assign shard-id for each shard
+    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+    echo "Shard ${GPU} commands:$commands_gpu"
+    docker run \
        --device /dev/kfd --device /dev/dri \
        --network host \
+        --shm-size=16gb \
        --rm \
+        -e HIP_VISIBLE_DEVICES="${GPU}" \
        -e HF_TOKEN \
-        --name ${container_name} \
-        ${image_name} \
-        /bin/bash -c "${@}"
-
+        -v "${HF_CACHE}:${HF_MOUNT}" \
+        -e "HF_HOME=${HF_MOUNT}" \
+        --name "${container_name}_${GPU}" \
+        "${image_name}" \
+        /bin/bash -c "${commands_gpu}" \
+        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
+    PIDS+=($!)
+  done
+  #wait for all processes to finish and collect exit codes
+  for pid in "${PIDS[@]}"; do
+    wait "${pid}"
+    STATUS+=($?)
+  done
+  for st in "${STATUS[@]}"; do
+    if [[ ${st} -ne 0 ]]; then
+      echo "One of the processes failed with $st"
+      exit "${st}"
+    fi
+  done
+else
+  docker run \
+          --device /dev/kfd --device /dev/dri \
+          --network host \
+          --shm-size=16gb \
+          --rm \
+          -e HIP_VISIBLE_DEVICES=0 \
+          -e HF_TOKEN \
+          -v "${HF_CACHE}:${HF_MOUNT}" \
+          -e "HF_HOME=${HF_MOUNT}" \
+          --name "${container_name}" \
+          "${image_name}" \
+          /bin/bash -c "${commands}"
+fi
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@ -1,3 +1,5 @@
+#!/bin/bash
+
 # This script is run by buildkite to run the benchmarks and upload the results to buildkite

 set -ex
@ -50,16 +52,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
 sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
 echo "" >> benchmark_results.md
 echo '```' >> benchmark_results.md
-tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
+tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
 echo '```' >> benchmark_results.md

 # if the agent binary is not found, skip uploading the results, exit 0
-if [ ! -f /workspace/buildkite-agent ]; then
+if [ ! -f /usr/bin/buildkite-agent ]; then
    exit 0
 fi

 # upload the results to buildkite
-/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
+buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md

 # exit with the exit code of the benchmarks
 if [ $bench_latency_exit_code -ne 0 ]; then
@ -75,4 +77,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
 fi

 rm ShareGPT_V3_unfiltered_cleaned_split.json
-/workspace/buildkite-agent artifact upload "*.json"
+buildkite-agent artifact upload "*.json"
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@ -0,0 +1,14 @@
+#!/bin/bash
+
+# This script build the CPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Setup cleanup
+remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Try building the docker image
+docker build -t cpu-test -f Dockerfile.ppc64le .
+
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@ -1,14 +1,88 @@
+#!/bin/bash
+
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex

+# allow to bind to different cores
+CORE_RANGE=${CORE_RANGE:-48-95}
+NUMA_NODE=${NUMA_NODE:-1}
+
 # Try building the docker image
-docker build -t cpu-test -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .

 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test || true; }
+remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
 trap remove_docker_container EXIT
 remove_docker_container

-# Run the image and launch offline inference
-docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py
+# Run the image, setting --shm-size=4g for tensor parallel.
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+
+function cpu_tests() {
+  set -e
+  export NUMA_NODE=$2
+
+  # offline inference
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
+    set -e
+    python3 examples/offline_inference/basic.py"
+
+  # Run basic model test
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    pip install -r vllm/requirements-test.txt
+    pytest -v -s tests/models/decoder_only/language -m cpu_model
+    pytest -v -s tests/models/embedding/language -m cpu_model
+    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
+    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
+
+  # Run compressed-tensor test
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v \
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
+
+  # Run AWQ test
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v \
+    tests/quantization/test_ipex_quant.py"
+
+  # Run chunked-prefill and prefix-cache test
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v -k cpu_model \
+    tests/basic_correctness/test_chunked_prefill.py"  
+
+  # online serving
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    export VLLM_CPU_KVCACHE_SPACE=10 
+    export VLLM_CPU_OMP_THREADS_BIND=$1
+    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
+    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+    python3 benchmarks/benchmark_serving.py \
+      --backend vllm \
+      --dataset-name random \
+      --model facebook/opt-125m \
+      --num-prompts 20 \
+      --endpoint /v1/completions \
+      --tokenizer facebook/opt-125m"
+
+  # Run multi-lora tests
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v \
+    tests/lora/test_qwen2vl.py"
+}
+
+# All of CPU tests are expected to be finished less than 40 mins.
+export -f cpu_tests
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@ -0,0 +1,28 @@
+#!/bin/bash
+
+# This script build the GH200 docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
+python3 use_existing_torch.py
+
+# Try building the docker image
+DOCKER_BUILDKIT=1 docker build . \
+  --target vllm-openai \
+  --platform "linux/arm64" \
+  -t gh200-test \
+  --build-arg max_jobs=66 \
+  --build-arg nvcc_threads=2 \
+  --build-arg torch_cuda_arch_list="9.0+PTX" \
+  --build-arg vllm_fa_cmake_gpu_arches="90-real"
+
+# Setup cleanup
+remove_docker_container() { docker rm -f gh200-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and test offline inference
+docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+    python3 examples/offline_inference/basic.py
+'
--- a/.buildkite/run-hpu-test.sh
+++ b/.buildkite/run-hpu-test.sh
@ -0,0 +1,24 @@
+#!/bin/bash
+
+# This script build the CPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t hpu-test-env -f Dockerfile.hpu .
+
+# Setup cleanup
+# certain versions of HPU software stack have a bug that can
+# override the exit code of the script, so we need to use
+# separate remove_docker_container and remove_docker_container_and_exit
+# functions, while other platforms only need one remove_docker_container
+# function.
+EXITCODE=1
+remove_docker_container() { docker rm -f hpu-test || true; }
+remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
+trap remove_docker_container_and_exit EXIT
+remove_docker_container
+
+# Run the image and launch offline inference
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
+EXITCODE=$?
--- a/.buildkite/run-multi-node-test.sh
+++ b/.buildkite/run-multi-node-test.sh
@ -0,0 +1,108 @@
+#!/bin/bash
+
+set -euox pipefail
+
+if [[ $# -lt 4 ]]; then
+    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
+    exit 1
+fi
+
+WORKING_DIR=$1
+NUM_NODES=$2
+NUM_GPUS=$3
+DOCKER_IMAGE=$4
+
+shift 4
+COMMANDS=("$@")
+if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then
+    echo "The number of commands must be equal to the number of nodes."
+    echo "Number of nodes: $NUM_NODES"
+    echo "Number of commands: ${#COMMANDS[@]}"
+    exit 1
+fi
+
+echo "List of commands"
+for command in "${COMMANDS[@]}"; do
+    echo "$command"
+done
+
+start_network() {
+    docker network create --subnet=192.168.10.0/24 docker-net
+}
+
+start_nodes() {
+    for node in $(seq 0 $(($NUM_NODES-1))); do
+        GPU_DEVICES='"device='
+        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
+            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
+            GPU_DEVICES+=$(($DEVICE_NUM))
+            if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
+                GPU_DEVICES+=','
+            fi
+        done
+        GPU_DEVICES+='"'
+
+        # start the container in detached mode
+        # things to note:
+        # 1. --shm-size=10.24gb is required. don't use --ipc=host
+        # 2. pass HF_TOKEN to the container
+        # 3. map the huggingface cache directory to the container
+        # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
+        #    starting from 192.168.10.11)
+        docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
+            -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
+            --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
+            /bin/bash -c "tail -f /dev/null"
+
+        # organize containers into a ray cluster
+        if [ "$node" -eq 0 ]; then
+            # start the ray head node
+            docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block"
+            # wait for the head node to be ready
+            sleep 10
+        else
+            # start the ray worker nodes, and connect them to the head node
+            docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
+        fi
+    done
+
+    # wait for the cluster to be ready
+    sleep 10
+
+    # print the cluster status
+    docker exec node0 /bin/bash -c "ray status"
+}
+
+run_nodes() {
+    # important: iterate in reverse order to start the head node last
+    # we start the worker nodes first, in detached mode, and then start the head node
+    # in the foreground, so that the output of the head node is visible in the buildkite logs
+    for node in $(seq $(($NUM_NODES - 1)) -1 0); do
+        GPU_DEVICES='"device='
+        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
+            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
+            GPU_DEVICES+=$(($DEVICE_NUM))
+            if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
+                GPU_DEVICES+=','
+            fi
+        done
+        GPU_DEVICES+='"'
+        echo "Running node$node with GPU devices: $GPU_DEVICES"
+        if [ "$node" -ne 0 ]; then
+            docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
+        else
+            docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
+        fi
+    done
+}
+cleanup() {
+    for node in $(seq 0 $(($NUM_NODES-1))); do
+        docker stop "node$node"
+    done
+    docker network rm docker-net
+}
+trap cleanup EXIT
+start_network
+start_nodes
+run_nodes
+
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@ -1,6 +1,20 @@
+#!/bin/bash
+
 # This script build the Neuron docker image and run the API server inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -e
+set -v
+
+image_name="neuron/vllm-ci"
+container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+
+NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
+mkdir -p "${NEURON_COMPILE_CACHE_URL}"
+NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"

 # Try building the docker image
 aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
@ -11,41 +25,33 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
    last_build=$(cat /tmp/neuron-docker-build-timestamp)
    current_time=$(date +%s)
    if [ $((current_time - last_build)) -gt 86400 ]; then
-        docker system prune -f
-        echo $current_time > /tmp/neuron-docker-build-timestamp
+        # Remove dangling images (those that are not tagged and not used by any container)
+        docker image prune -f
+        # Remove unused volumes / force the system prune for old images as well.
+        docker volume prune -f && docker system prune -f
+        # Remove huggingface model artifacts and compiler cache
+        rm -rf "${HF_MOUNT:?}/*"
+        rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
+        echo "$current_time" > /tmp/neuron-docker-build-timestamp
    fi
 else
-    echo $(date +%s) > /tmp/neuron-docker-build-timestamp
+    date "+%s" > /tmp/neuron-docker-build-timestamp
 fi

-docker build -t neuron -f Dockerfile.neuron .
+docker build -t "${image_name}" -f Dockerfile.neuron .

 # Setup cleanup
-remove_docker_container() { docker rm -f neuron || true; }
+remove_docker_container() {
+    docker image rm -f "${image_name}" || true;
+}
 trap remove_docker_container EXIT
-remove_docker_container

 # Run the image
-docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
-       --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
-
-# Wait for the server to start
-wait_for_server_to_start() {
-    timeout=300
-    counter=0
-
-    while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
-        sleep 1
-        counter=$((counter + 1))
-        if [ $counter -ge $timeout ]; then
-            echo "Timeout after $timeout seconds"
-            break
-        fi
-    done
-}
-wait_for_server_to_start
-
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
-    localhost:8000/generate \
-    -d '{"prompt": "San Francisco is a"}'
+docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
+       -v "${HF_CACHE}:${HF_MOUNT}" \
+       -e "HF_HOME=${HF_MOUNT}" \
+       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
+       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
+       --name "${container_name}" \
+       ${image_name} \
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@ -0,0 +1,16 @@
+#!/bin/bash
+
+# This script build the OpenVINO docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t openvino-test -f Dockerfile.openvino .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f openvino-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and launch offline inference
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@ -0,0 +1,26 @@
+#!/bin/bash
+
+set -e
+
+# Build the docker image.
+docker build -f Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+remove_docker_container() { docker rm -f tpu-test || true; }
+trap remove_docker_container EXIT
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# For HF_TOKEN.
+source /etc/environment
+# Run a simple end-to-end example.
+docker run --privileged --net host --shm-size=16G -it \
+    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
+    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+    && python3 -m pip install pytest \
+    && python3 -m pip install lm_eval[api]==0.4.4 \
+    && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
+    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
+    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@ -0,0 +1,19 @@
+#!/bin/bash
+
+# This script build the CPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t xpu-test -f Dockerfile.xpu .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f xpu-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and test offline inference/tensor parallel
+docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
+    python3 examples/offline_inference/basic.py
+    python3 examples/offline_inference/cli.py -tp 2
+'
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -1,166 +1,615 @@
 # In this file, you can add more tests to run either by adding a new step or
 # adding a new command to an existing step. See different options here for examples.
-# This script will be feed into Jinja template in `test-template.j2` to generate
-# the final pipeline yaml file.
+
+# This script will be feed into Jinja template in `test-template-aws.j2` at
+# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 
+# to generate the final pipeline yaml file.
+
+# Documentation
+# label(str): the name of the test. emoji allowed.
+# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
+# fast_check_only(bool): run this test on fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
+# command(str): the single command to run for tests. incompatible with commands.
+# commands(list): the list of commands to run for test. incompatbile with command.
+# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
+# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
+# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host, 
+#     in this case, commands must be specified. the first command runs on first host, the second
+#     command runs on the second host.
+# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
+
+# When adding a test
+# - If the test belong to an existing group, add it there
+# - If the test is short, add to any existing step
+# - If the test takes more than 10min, then it is okay to create a new step. 
+#   Note that all steps execute in parallel. 

 steps:
- label: Regression Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s test_regression.py
-  working_dir: "/vllm-workspace/tests" # optional
+##### fast check tests  #####

- label: AsyncEngine Test
-  #mirror_hardwares: [amd]
-  command: pytest -v -s async_engine
-
- label: Basic Correctness Test
-  mirror_hardwares: [amd]
-  commands:
-  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
-  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-
- label: Core Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s core
-
- label: Distributed Comm Ops Test
-  #mirror_hardwares: [amd]
-  command: pytest -v -s distributed/test_comm_ops.py
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-
- label: Distributed Tests
-  mirror_hardwares: [amd]
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  commands:
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist.py 
-
- label: Distributed Tests (Multiple Groups)
-  #mirror_hardwares: [amd]
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
-  commands:
-  - pytest -v -s distributed/test_pynccl.py
-
- label: Engine Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
-
- label: Entrypoints Test
-  mirror_hardwares: [amd]
-
-  commands:
-  - pytest -v -s test_inputs.py
-  - pytest -v -s entrypoints -m llm
-  - pytest -v -s entrypoints -m openai
-
- label: Examples Test
-  working_dir: "/vllm-workspace/examples"
-  mirror_hardwares: [amd]
-  commands:
-    # install aws cli for llava_example.py
-    # install tensorizer for tensorize_vllm_model.py
-    - pip install awscli tensorizer
-    - python3 offline_inference.py
-    - python3 offline_inference_with_prefix.py
-    - python3 llm_engine_example.py
-    - python3 llava_example.py
-    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-
- label: Kernels Test %N
-  #mirror_hardwares: [amd]
-  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
-
- label: Models Test
-  #mirror_hardwares: [amd]
-  commands:
-    - bash ../.buildkite/download-images.sh
-    - pytest -v -s models --ignore=models/test_llava.py
-
- label: Llava Test
-  mirror_hardwares: [amd]
-  commands:
-    - bash ../.buildkite/download-images.sh
-    - pytest -v -s models/test_llava.py
-
- label: Prefix Caching Test
-  mirror_hardwares: [amd]
-  commands:
-    - pytest -v -s prefix_caching
-
- label: Samplers Test
-  #mirror_hardwares: [amd]
-  command: pytest -v -s samplers
-
- label: LogitsProcessor Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s test_logits_processor.py
-
- label: Utils Test
-  command: pytest -v -s test_utils.py
-
- label: Worker Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s worker
-
- label: Speculative decoding tests
-  #mirror_hardwares: [amd]
-  command: pytest -v -s spec_decode
-
- label: LoRA Test %N
-  #mirror_hardwares: [amd]
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
-  parallelism: 4
-
- label: LoRA Long Context (Distributed)
-  #mirror_hardwares: [amd]
-  num_gpus: 4
-  # This test runs llama 13B, so it is required to run on 4 GPUs.
-  commands:
-    # Temporarily run this way because we cannot clean up GPU mem usage
-    # for multi GPU tests.
-    # TODO(sang): Fix it.
-    - pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
-    - pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
-    - pytest -v -s lora/test_long_context.py::test_self_consistency
-    - pytest -v -s lora/test_long_context.py::test_quality
-    - pytest -v -s lora/test_long_context.py::test_max_len
-
- label: Tensorizer Test
-  #mirror_hardwares: [amd]
-  command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
-
- label: Metrics Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s metrics
-
- label: Quantization Test
-  #mirror_hardwares: [amd]
-  command: pytest -v -s quantization
-
- label: Benchmarks
-  working_dir: "/vllm-workspace/.buildkite"
-  mirror_hardwares: [amd]
-  commands:
-  - pip install aiohttp
-  - bash run-benchmarks.sh
-
- label: Documentation Build
+- label: Documentation Build # 2min
  working_dir: "/vllm-workspace/test_docs/docs"
+  fast_check: true
  no_gpu: True
  commands:
  - pip install -r requirements-docs.txt
  - SPHINXOPTS=\"-W\" make html
+  # Check API reference (if it fails, you may have missing mock imports)
+  - grep \"sig sig-object py\" build/html/api/inference_params.html
+
+- label: Async Engine, Inputs, Utils, Worker Test # 24min
+  fast_check: true
+  source_file_dependencies:
+  - vllm/
+  - tests/mq_llm_engine
+  - tests/async_engine
+  - tests/test_inputs
+  - tests/multimodal
+  - tests/test_utils
+  - tests/worker
+  - tests/standalone_tests/lazy_torch_compile.py
+  commands:
+  - python3 standalone_tests/lazy_torch_compile.py
+  - pytest -v -s mq_llm_engine # MQLLMEngine
+  - pytest -v -s async_engine # AsyncLLMEngine
+  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
+  - pytest -v -s test_inputs.py
+  - pytest -v -s multimodal
+  - pytest -v -s test_utils.py # Utils
+  - pytest -v -s worker # Worker
+
+- label: Python-only Installation Test
+  source_file_dependencies:
+  - tests/standalone_tests/python_only_compile.sh
+  - setup.py
+  commands:
+  - bash standalone_tests/python_only_compile.sh
+
+- label: Basic Correctness Test # 30min
+  #mirror_hardwares: [amd]
+  fast_check: true
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_basic_correctness
+  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_preemption
+  - tests/basic_correctness/test_cumem.py
+  commands:
+  - pytest -v -s basic_correctness/test_cumem.py
+  - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
+  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+
+- label: Chunked Prefill Test
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_chunked_prefill
+  commands:
+  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+
+- label: Core Test # 10min
+  mirror_hardwares: [amd]
+  fast_check: true
+  source_file_dependencies:
+  - vllm/core
+  - vllm/distributed
+  - tests/core
+  commands:
+  - pytest -v -s core
+
+- label: Entrypoints Test # 40min
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  commands:
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
+  - pytest -v -s entrypoints/test_chat_utils.py
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Distributed Tests (4 GPUs) # 10min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  fast_check: true
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/core/
+  - tests/distributed
+  - tests/spec_decode/e2e/test_integration_dist_tp4
+  - tests/compile
+  - examples/offline_inference/rlhf.py
+  commands:
+  - pytest -v -s distributed/test_utils.py
+  - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
+  # TODO: create a dedicated test section for multi-GPU example tests
+  # when we have multiple distributed example tests
+  - python3 ../examples/offline_inference/rlhf.py
+
+- label: Metrics, Tracing Test # 10min
+  num_gpus: 2 
+  fast_check: true
+  source_file_dependencies:
+  - vllm/
+  - tests/metrics
+  - tests/tracing
+  commands:
+  - pytest -v -s metrics 
+  - "pip install \
+      'opentelemetry-sdk>=1.26.0,<1.27.0' \
+      'opentelemetry-api>=1.26.0,<1.27.0' \
+      'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
+      'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
+  - pytest -v -s tracing
+
+##### fast check tests  #####
+#####  1 GPU test  #####
+
+- label: Regression Test # 5min
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/test_regression
+  commands:
+  - pip install modelscope
+  - pytest -v -s test_regression.py
+  working_dir: "/vllm-workspace/tests" # optional
+
+- label: Engine Test # 10min
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/engine
+  - tests/tokenization
+  commands:
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
+  # OOM in the CI unless we run this separately
+  - pytest -v -s tokenization
+
+- label: V1 Test
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    # split the test to avoid interference
+    - VLLM_USE_V1=1 pytest -v -s v1/core
+    - VLLM_USE_V1=1 pytest -v -s v1/engine
+    - VLLM_USE_V1=1 pytest -v -s v1/sample
+    - VLLM_USE_V1=1 pytest -v -s v1/worker
+    - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
+    - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
+    # TODO: accuracy does not match, whether setting
+    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+    - VLLM_USE_V1=1 pytest -v -s v1/e2e
+
+- label: Examples Test # 25min
+  working_dir: "/vllm-workspace/examples"
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/entrypoints
+  - examples/
+  commands:
+    - pip install tensorizer # for tensorizer test
+    - python3 offline_inference/basic.py
+    - python3 offline_inference/cpu_offload.py
+    - python3 offline_inference/chat.py
+    - python3 offline_inference/prefix_caching.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 offline_inference/vision_language.py
+    - python3 offline_inference/vision_language_multi_image.py
+    - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 offline_inference/encoder_decoder.py
+    - python3 offline_inference/classification.py
+    - python3 offline_inference/embedding.py
+    - python3 offline_inference/scoring.py
+    - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+
+- label: Prefix Caching Test # 9min
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/prefix_caching
+  commands:
+    - pytest -v -s prefix_caching
+
+- label: Samplers Test # 36min
+  source_file_dependencies:
+  - vllm/model_executor/layers
+  - vllm/sampling_metadata.py
+  - tests/samplers
+  - tests/conftest.py
+  commands:
+    - pytest -v -s samplers
+    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
+
+- label: LogitsProcessor Test # 5min
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/model_executor/layers
+  - vllm/model_executor/guided_decoding
+  - tests/test_logits_processor
+  - tests/model_executor/test_guided_processors
+  commands: 
+    - pytest -v -s test_logits_processor.py
+    - pytest -v -s model_executor/test_guided_processors.py
+
+- label: Speculative decoding tests # 40min
+  source_file_dependencies:
+  - vllm/spec_decode
+  - tests/spec_decode
+  - vllm/model_executor/models/eagle.py
+  commands:
+    - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
+    - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
+    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
+
+- label: LoRA Test %N # 15min each
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
+  parallelism: 4
+
+- label: "PyTorch Fullgraph Smoke Test" # 9min
+  fast_check: true
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_basic_correctness.py
+  # these tests need to be separated, cannot combine
+  - pytest -v -s compile/piecewise/test_simple.py
+  - pytest -v -s compile/piecewise/test_toy_llama.py
+
+- label: "PyTorch Fullgraph Test" # 18min
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_full_graph.py
+
+- label: Kernels Test %N # 1h each
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - csrc/
+  - vllm/attention
+  - tests/kernels
+  commands:
+    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 4
+
+- label: Tensorizer Test # 11min
+  mirror_hardwares: [amd]
+  soft_fail: true
+  source_file_dependencies:
+  - vllm/model_executor/model_loader
+  - tests/tensorizer_loader
+  commands:
+    - apt-get update && apt-get install -y curl libsodium23
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s tensorizer_loader
+
+- label: Benchmarks # 9min
+  working_dir: "/vllm-workspace/.buildkite"
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - benchmarks/
+  commands:
+  - bash run-benchmarks.sh
+
+- label: Quantization Test # 33min
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  - tests/quantization
+  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+
+- label: LM Eval Small Models # 53min
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-small.txt -t 1
+
+- label: Encoder Decoder tests # 5min
+  source_file_dependencies:
+  - vllm/
+  - tests/encoder_decoder
+  commands:
+    - pytest -v -s encoder_decoder
+
+- label: OpenAI-Compatible Tool Use # 20 min
+  fast_check: false
+  mirror_hardwares: [ amd ]
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  commands:
+    - pytest -v -s tool_use
+
+#####  models test  #####
+
+- label: Basic Models Test # 24min
+  source_file_dependencies:
+  - vllm/
+  - tests/models
+  commands:
+    - pytest -v -s models/test_registry.py
+    - pytest -v -s models/test_initialization.py
+
+- label: Language Models Test (Standard) # 32min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/language
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
+  commands:
+    - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
+    - pytest -v -s models/embedding/language -m core_model
+
+- label: Language Models Test (Extended) # 1h10min
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/language
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
+  commands:
+    - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
+    - pytest -v -s models/embedding/language -m 'not core_model'
+
+- label: Multi-Modal Models Test (Standard) # 40min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  - tests/models/embedding/vision_language
+  - tests/models/encoder_decoder/audio_language
+  - tests/models/encoder_decoder/vision_language
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal
+    - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+    - pytest -v -s models/embedding/vision_language -m core_model
+    - pytest -v -s models/encoder_decoder/audio_language -m core_model
+    - pytest -v -s models/encoder_decoder/language -m core_model
+    - pytest -v -s models/encoder_decoder/vision_language -m core_model
+
+- label: Multi-Modal Models Test (Extended) 1 # 48m
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  - tests/models/embedding/vision_language
+  - tests/models/encoder_decoder/vision_language
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
+    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
+    # HACK - run phi3v tests separately to sidestep this transformers bug
+    # https://github.com/huggingface/transformers/issues/34307
+    - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+    - pytest -v -s models/embedding/vision_language -m 'not core_model'
+    - pytest -v -s models/encoder_decoder/language -m 'not core_model'
+    - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
+
+- label: Multi-Modal Models Test (Extended) 2 # 38m
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/vision_language
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
+
+# This test is used only in PR development phase to test individual models and should never run on main
+- label: Custom Models Test
+  optional: true
+  commands:
+    - echo 'Testing custom models...'
+    # PR authors can temporarily add commands below to test individual models
+    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
+
+#####  1 GPU test  #####
+#####  multi gpus test  #####
+
+- label: Distributed Comm Ops Test # 7min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/distributed
+  - tests/distributed
+  commands:
+  - pytest -v -s distributed/test_comm_ops.py
+  - pytest -v -s distributed/test_shm_broadcast.py
+
+- label: 2 Node Tests (4 GPUs in total) # 16min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  commands:
+  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
+    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
+  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+
+- label: Distributed Tests (2 GPUs) # 40min
+  #mirror_hardwares: [amd]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  - vllm/compilation
+  - vllm/worker/worker_base.py
+  - vllm/worker/worker.py
+  - vllm/worker/model_runner.py
+  - entrypoints/llm/test_collective_rpc.py
+  commands:
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
+  - pytest -v -s ./compile/test_basic_correctness.py
+  - pytest -v -s ./compile/test_wrapper.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
+  # this test fails consistently.
+  # TODO: investigate and fix
+  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+
+- label: Plugin Tests (2 GPUs) # 40min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  fast_check: true
+  source_file_dependencies:
+  - vllm/plugins/
+  - tests/plugins/
+  commands:
+  # begin platform plugin tests, all the code in-between runs on dummy platform
+  - pip install -e ./plugins/vllm_add_dummy_platform
+  - pytest -v -s plugins_tests/test_platform_plugins.py
+  - pip uninstall vllm_add_dummy_platform -y
+  # end platform plugin tests
+  # other tests continue here:
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s distributed/test_distributed_oot.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+
+- label: Multi-step Tests (4 GPUs) # 36min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/model_executor/layers/sampler.py
+  - vllm/sequence.py
+  - vllm/worker/worker_base.py
+  - vllm/worker/worker.py
+  - vllm/worker/multi_step_worker.py
+  - vllm/worker/model_runner_base.py
+  - vllm/worker/model_runner.py
+  - vllm/worker/multi_step_model_runner.py
+  - vllm/engine
+  - tests/multi_step
+  commands:
+  # this test is quite flaky
+  # TODO: investigate and fix.
+  # - pytest -v -s multi_step/test_correctness_async_llm.py
+  - pytest -v -s multi_step/test_correctness_llm.py
+
+- label: Pipeline Parallelism Test # 45min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  commands:
+  - pytest -v -s distributed/test_pp_cudagraph.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
+
+- label: LoRA TP Test (Distributed)
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  commands:
+    # FIXIT: find out which code initialize cuda before running the test
+    # before the fix, we need to use spawn to test it
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    # This test runs llama 13B, so it is required to run on 4 GPUs.
+    - pytest -v -s -x lora/test_long_context.py
+    # There is some Tensor Parallelism related processing logic in LoRA that 
+    # requires multi-GPU testing for validation.
+    - pytest -v -s -x lora/test_chatglm3_tp.py
+    - pytest -v -s -x lora/test_llama_tp.py
+    - pytest -v -s -x lora/test_minicpmv_tp.py
+
+
+- label: Weight Loading Multiple GPU Test  # 33min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU Test - Large Models # optional
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  gpu: a100
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt 
+
+
+##### multi gpus test #####
+##### A100 test #####
+
+- label: Distributed Tests (A100) # optional
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/
+  commands: 
+  # NOTE: don't test llama model here, it seems hf implementation is buggy
+  # see https://github.com/vllm-project/vllm/pull/5689 for details
+  - pytest -v -s distributed/test_custom_all_reduce.py
+  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - pytest -v -s -x lora/test_mixtral.py
+
+- label: LM Eval Large Models # optional
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-large.txt -t 4
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@ -1,93 +0,0 @@
-{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
-{% set default_num_gpu = 1 %}
-{% set default_working_dir = "/vllm-workspace/tests" %}
-
-steps:
-  - label: ":docker: build image"
-    commands: 
-      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
-      - "docker push {{ docker_image }}"
-    env:
-      DOCKER_BUILDKIT: "1"
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 5
-        - exit_status: -10  # Agent was lost
-          limit: 5
-  - wait
-
-  - group: "AMD Tests"
-    depends_on: ~
-    steps:
-    {% for step in steps %}
-    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
-      - label: "AMD: {{ step.label }}"
-        agents:
-          queue: amd
-        command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" ; ")) | safe }}"
-        env:
-          DOCKER_BUILDKIT: "1"
-    {% endif %}
-    {% endfor %}
-
-  - label: "Neuron Test"
-    depends_on: ~
-    agents:
-      queue: neuron
-    command: bash .buildkite/run-neuron-test.sh
-    soft_fail: true
-
-  - label: "Intel Test"
-    depends_on: ~
-    command: bash .buildkite/run-cpu-test.sh
-
-  {% for step in steps %}
-  - label: "{{ step.label }}"
-    agents:
-      queue: kubernetes
-    soft_fail: {{ step.soft_fail or false }}
-    {% if step.parallelism %}
-    parallelism: {{ step.parallelism }}
-    {% endif %}
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 5
-        - exit_status: -10  # Agent was lost
-          limit: 5
-    plugins:
-      - kubernetes:
-          podSpec:
-            {% if step.num_gpus %}
-            priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
-            {% endif %}
-            volumes:
-              - name: dshm
-                emptyDir:
-                  medium: Memory
-            containers:
-              - image: "{{ docker_image }}"
-                command: ["bash"]
-                args:
-                - '-c'
-                - "'cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}'"
-                {% if not step.no_gpu %}
-                resources:
-                  requests:
-                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
-                  limits:
-                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
-                {% endif %}
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-                volumeMounts:
-                  - mountPath: /dev/shm
-                    name: dshm
-  {% endfor %}
--- a/.buildkite/upload-wheels.sh
+++ b/.buildkite/upload-wheels.sh
@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+
+set -ex
+
+# Assume wheels are in artifacts/dist/*.whl
+wheel_files=(artifacts/dist/*.whl)
+
+# Check that exactly one wheel is found
+if [[ ${#wheel_files[@]} -ne 1 ]]; then
+  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
+  exit 1
+fi
+
+# Get the single wheel file
+wheel="${wheel_files[0]}"
+
+# Rename 'linux' to 'manylinux1' in the wheel filename
+new_wheel="${wheel/linux/manylinux1}"
+mv -- "$wheel" "$new_wheel"
+wheel="$new_wheel"
+
+# Extract the version from the wheel
+version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
+echo "Version: $version"
+
+normal_wheel="$wheel" # Save the original wheel filename
+
+# If the version contains "dev", rename it to v1.0.0.dev for consistency
+if [[ $version == *dev* ]]; then
+    suffix="${version##*.}"
+    if [[ $suffix == cu* ]]; then
+        new_version="1.0.0.dev+${suffix}"
+    else
+        new_version="1.0.0.dev"
+    fi
+    new_wheel="${wheel/$version/$new_version}"
+    # use cp to keep both files in the artifacts directory
+    cp -- "$wheel" "$new_wheel"
+    wheel="$new_wheel"
+    version="$new_version"
+fi
+
+# Upload the wheel to S3
+python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+
+# generate index for this commit
+aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+else
+    # only upload index.html for cu12 wheels (default wheels)
+    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
+    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+fi
+
+# generate index for nightly
+aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+else
+    # only upload index.html for cu12 wheels (default wheels)
+    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+fi
+
+aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
--- a/.dockerignore
+++ b/.dockerignore
@ -1 +1,33 @@
+/.venv
+/build
+dist
 vllm/*.so
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+.mypy_cache
+
+# Distribution / packaging
+.Python
+/build/
+cmake-build-*/
+CMakeUserPresets.json
+develop-eggs/
+/dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -0,0 +1,36 @@
+# See https://help.github.com/articles/about-codeowners/
+# for more info about CODEOWNERS file
+
+# This lists cover the "core" components of vLLM that require careful review
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
+/vllm/model_executor/guided_decoding @mgoin
+/vllm/multimodal @DarkLight1337 @ywang96
+CMakeLists.txt @tlrmchlsmth
+
+# vLLM V1
+/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
+
+# Test ownership
+/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
+/tests/test_inputs.py @DarkLight1337 @ywang96
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
+/tests/models @DarkLight1337 @ywang96
+/tests/multimodal @DarkLight1337 @ywang96
+/tests/prefix_caching @comaniac @KuntaiDu
+/tests/spec_decode @njhill @LiuXiaoxuanPKU
+/tests/kernels @tlrmchlsmth @WoosukKwon
+/tests/quantization @mgoin @robertgshaw2-redhat
+/.buildkite/lm-eval-harness @mgoin @simon-mo
+/tests/distributed/test_multi_node_assignment.py @youkaichao
+/tests/distributed/test_pipeline_parallel.py @youkaichao
+/tests/distributed/test_same_node.py @youkaichao
+/tests/multi_step @alexm-redhat @comaniac
+/tests/weight_loading @mgoin @youkaichao
+/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@ -0,0 +1,2 @@
+github: [vllm-project]
+open_collective: vllm
--- a/.github/ISSUE_TEMPLATE/100-documentation.yml
+++ b/.github/ISSUE_TEMPLATE/100-documentation.yml
@ -20,3 +20,10 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
+- type: checkboxes
+  id: askllm
+  attributes:
+    label: Before submitting a new issue...
+    options:
+      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
+        required: true
--- a/.github/ISSUE_TEMPLATE/200-installation.yml
+++ b/.github/ISSUE_TEMPLATE/200-installation.yml
@ -38,3 +38,10 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
+- type: checkboxes
+  id: askllm
+  attributes:
+    label: Before submitting a new issue...
+    options:
+      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
+        required: true
--- a/.github/ISSUE_TEMPLATE/300-usage.yml
+++ b/.github/ISSUE_TEMPLATE/300-usage.yml
@ -36,3 +36,10 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
+- type: checkboxes
+  id: askllm
+  attributes:
+    label: Before submitting a new issue...
+    options:
+      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
+        required: true
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@ -20,11 +20,25 @@ body:
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
+      <details>
+      <summary>The output of `python collect_env.py`</summary>
+
      ```text
-      The output of `python collect_env.py`
+      Your output of `python collect_env.py` here
      ```
+      
+      </details>
  validations:
    required: true
+- type: textarea
+  attributes:
+    label: Model Input Dumps
+    description: |
+      If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
+    placeholder: |
+      Upload the dumped input file.
+  validations:
+    required: false
 - type: textarea
  attributes:
    label: 🐛 Describe the bug
@ -84,3 +98,10 @@ body:
      - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.

      Thanks for contributing 🎉!
+- type: checkboxes
+  id: askllm
+  attributes:
+    label: Before submitting a new issue...
+    options:
+      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
+        required: true
--- a/.github/ISSUE_TEMPLATE/500-feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/500-feature-request.yml
@ -29,3 +29,10 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
+- type: checkboxes
+  id: askllm
+  attributes:
+    label: Before submitting a new issue...
+    options:
+      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
+        required: true
--- a/.github/ISSUE_TEMPLATE/600-new-model.yml
+++ b/.github/ISSUE_TEMPLATE/600-new-model.yml
@ -9,7 +9,7 @@ body:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

-      #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
+      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
 - type: textarea
  attributes:
    label: The model to consider.
@ -31,3 +31,10 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
+- type: checkboxes
+  id: askllm
+  attributes:
+    label: Before submitting a new issue...
+    options:
+      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
+        required: true
--- a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml
+++ b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml
@ -50,3 +50,10 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
+- type: checkboxes
+  id: askllm
+  attributes:
+    label: Before submitting a new issue...
+    options:
+      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
+        required: true
--- a/.github/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@ -47,3 +47,10 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
+- type: checkboxes
+  id: askllm
+  attributes:
+    label: Before submitting a new issue...
+    options:
+      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
+        required: true
--- a/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
+++ b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
@ -19,3 +19,10 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
+- type: checkboxes
+  id: askllm
+  attributes:
+    label: Before submitting a new issue...
+    options:
+      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
+        required: true
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -2,63 +2,4 @@ FILL IN THE PR DESCRIPTION HERE

 FIX #xxxx (*link existing issues this PR will resolve*)

-**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**
-
---
-
-<details>
-<!-- inside this <details> section, markdown rendering does not work, so we use raw html here. -->
-<summary><b> PR Checklist (Click to Expand) </b></summary>
-
-<p>Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.</p>
-
-<h3>PR Title and Classification</h3>
-<p>Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:</p>
-<ul>
-    <li><code>[Bugfix]</code> for bug fixes.</li>
-    <li><code>[CI/Build]</code> for build or continuous integration improvements.</li>
-    <li><code>[Doc]</code> for documentation fixes and improvements.</li>
-    <li><code>[Model]</code> for adding a new model or improving an existing model. Model name should appear in the title.</li>
-    <li><code>[Frontend]</code> For changes on the vLLM frontend (e.g., OpenAI API server, <code>LLM</code> class, etc.) </li>
-    <li><code>[Kernel]</code> for changes affecting CUDA kernels or other compute kernels.</li>
-    <li><code>[Core]</code> for changes in the core vLLM logic (e.g., <code>LLMEngine</code>, <code>AsyncLLMEngine</code>, <code>Scheduler</code>, etc.)</li>
-    <li><code>[Hardware][Vendor]</code> for hardware-specific changes. Vendor name should appear in the prefix (e.g., <code>[Hardware][AMD]</code>).</li>
-    <li><code>[Misc]</code> for PRs that do not fit the above categories. Please use this sparingly.</li>
-</ul>
-<p><strong>Note:</strong> If the PR spans more than one category, please include all relevant prefixes.</p>
-
-<h3>Code Quality</h3>
-
-<p>The PR need to meet the following code quality standards:</p>
-
-<ul>
-    <li>We adhere to <a href="https://google.github.io/styleguide/pyguide.html">Google Python style guide</a> and <a href="https://google.github.io/styleguide/cppguide.html">Google C++ style guide</a>.</li>
-    <li>Pass all linter checks. Please use <a href="https://github.com/vllm-project/vllm/blob/main/format.sh"><code>format.sh</code></a> to format your code.</li>
-    <li>The code need to be well-documented to ensure future contributors can easily understand the code.</li>
-    <li>Include sufficient tests to ensure the project to stay correct and robust. This includes both unit tests and integration tests.</li>
-    <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li>
-</ul>
-
-<h3>Notes for Large Changes</h3>
-<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
-
-<h3>What to Expect for the Reviews</h3>
-
-<p>The goal of the vLLM team is to be a <i>transparent reviewing machine</i>. We would like to make the review process transparent and efficient and make sure no contributor feel confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process: </p>
-
-<ul>
-    <li> After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.</li>
-    <li> After the PR is assigned, the reviewer will provide status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.</li>
-    <li> After the review, the reviewer will put an <code> action-required</code> label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.</li>
-    <li> Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.
- </li>
-</ul>
-
-<h3>Thank You</h3>
-
-<p> Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone! </p>
-
-
-</details>
-
-
+**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html **
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -0,0 +1,31 @@
+version: 2
+updates:
+  # Maintain dependencies for GitHub Actions
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    labels: ["dependencies"]
+    open-pull-requests-limit: 5
+    reviewers: ["khluu", "simon-mo"]
+    allow:
+      - dependency-type: "all"
+    ignore:
+      - dependency-name: "*"
+        update-types: ["version-update:semver-patch"]
+      - dependency-name: "torch"
+      - dependency-name: "torchvision"
+      - dependency-name: "xformers"
+      - dependency-name: "lm-format-enforcer"
+      - dependency-name: "gguf"
+      - dependency-name: "compressed-tensors"
+      - dependency-name: "ray[adag]"
+      - dependency-name: "lm-eval"
+    groups:
+      minor-update:
+        applies-to: version-updates
+        update-types: ["minor"]
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -0,0 +1,60 @@
+pull_request_rules:
+- name: label-documentation
+  description: Automatically apply documentation label
+  conditions:
+    - or:
+      - files~=^[^/]+\.md$
+      - files~=^docs/
+  actions:
+    label:
+      add:
+        - documentation
+
+- name: label-ci-build
+  description: Automatically apply ci/build label
+  conditions:
+    - or:
+      - files~=^\.github/
+      - files~=\.buildkite/
+      - files~=^cmake/
+      - files=CMakeLists.txt
+      - files~=^Dockerfile
+      - files~=^requirements.*\.txt
+      - files=setup.py
+  actions:
+    label:
+      add:
+        - ci/build
+
+- name: label-frontend
+  description: Automatically apply frontend label
+  conditions:
+    - files~=^vllm/entrypoints/
+  actions:
+    label:
+      add:
+        - frontend
+
+- name: ping author on conflicts and add 'needs-rebase' label
+  conditions:
+      - conflict
+      - -closed
+  actions:
+    label:
+      add:
+        - needs-rebase
+    comment:
+      message: |
+       This pull request has merge conflicts that must be resolved before it can be
+       merged. Please rebase the PR, @{{author}}.
+
+       https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
+
+- name: remove 'needs-rebase' label when conflict is resolved
+  conditions:
+      - -conflict
+      - -closed
+  actions:
+    label:
+      remove:
+        - needs-rebase
--- a/.github/scripts/cleanup_pr_body.sh
+++ b/.github/scripts/cleanup_pr_body.sh
@ -0,0 +1,50 @@
+#!/bin/bash
+
+set -eu
+
+# ensure 1 argument is passed
+if [ "$#" -ne 1 ]; then
+    echo "Usage: $0 <pr_number>"
+    exit 1
+fi
+
+PR_NUMBER=$1
+OLD=/tmp/orig_pr_body.txt
+NEW=/tmp/new_pr_body.txt
+
+gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
+cp "${OLD}" "${NEW}"
+
+# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
+sed -i '/FIX #xxxx.*$/d' "${NEW}"
+
+# Remove "FILL IN THE PR DESCRIPTION HERE"
+sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
+
+# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
+sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
+
+# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
+python3 - <<EOF
+import re
+
+with open("${NEW}", "r") as file:
+    content = file.read()
+
+pattern = re.compile(r'(---\n\n)?<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>', re.DOTALL)
+content = re.sub(pattern, '', content)
+
+with open("${NEW}", "w") as file:
+    file.write(content)
+EOF
+
+# Run this only if ${NEW} is different than ${OLD}
+if ! cmp -s "${OLD}" "${NEW}"; then
+    gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
+    echo
+    echo "Updated PR body:"
+    echo
+    cat "${NEW}"
+else
+    echo "No changes needed"
+fi
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@ -0,0 +1,21 @@
+name: Add label on auto-merge enabled
+on:
+    pull_request_target:
+        types:
+            - auto_merge_enabled
+jobs:
+    add-label-on-auto-merge:
+        runs-on: ubuntu-latest
+        steps:
+            -   name: Add label
+                uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+                with:
+                    script: |
+                        github.rest.issues.addLabels({
+                            owner: context.repo.owner,
+                            repo: context.repo.repo,
+                            issue_number: context.issue.number,
+                            labels: ['ready']
+                        })
+                env:
+                    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/clang-format.yml
+++ b/.github/workflows/clang-format.yml
@ -1,42 +0,0 @@
-name: clang-format
-
-on:
-  # Trigger the workflow on push or pull request,
-  # but only for the main branch
-  push:
-    branches:
-      - main
-  pull_request:
-    branches:
-      - main
-
-jobs:
-  clang-format:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.11"]
-    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install clang-format==18.1.5
-    - name: Running clang-format
-      run: |
-        EXCLUDES=(
-            'csrc/moe/topk_softmax_kernels.cu'
-            'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
-            'csrc/punica/bgmv/bgmv_config.h'
-            'csrc/punica/bgmv/bgmv_impl.cuh'
-            'csrc/punica/bgmv/vec_dtypes.cuh'
-            'csrc/punica/punica_ops.cu'
-            'csrc/punica/type_convert.h'
-        )
-        find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
-            | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
-            | xargs clang-format --dry-run --Werror
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@ -0,0 +1,26 @@
+name: Cleanup PR Body
+
+on:
+  pull_request_target:
+    types: [opened, reopened, edited]
+
+permissions:
+  pull-requests: write
+
+jobs:
+  update-description:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Set up Python
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: '3.12'
+
+      - name: Update PR description
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@ -0,0 +1,82 @@
+name: Lint and Deploy Charts
+
+on: pull_request
+
+jobs:
+  lint-and-deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+
+      - name: Set up Helm
+        uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
+        with:
+          version: v3.14.4
+
+       #Python is required because ct lint runs Yamale and yamllint which require Python.
+      - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: '3.13'
+
+      - name: Set up chart-testing
+        uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
+        with:
+          version: v3.10.1
+
+      - name: Run chart-testing (lint)
+        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
+
+      - name: Setup minio
+        run: |
+          docker network create vllm-net
+          docker run -d -p 9000:9000 --name minio --net vllm-net \
+                     -e "MINIO_ACCESS_KEY=minioadmin" \
+                     -e "MINIO_SECRET_KEY=minioadmin" \
+                     -v /tmp/data:/data \
+                     -v /tmp/config:/root/.minio \
+                     minio/minio server /data
+          export AWS_ACCESS_KEY_ID=minioadmin
+          export AWS_SECRET_ACCESS_KEY=minioadmin
+          export AWS_EC2_METADATA_DISABLED=true
+          mkdir opt-125m
+          cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
+          aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
+          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
+
+      - name: Create kind cluster
+        uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
+
+      - name: Build the Docker image vllm cpu
+        run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
+
+      - name: Configuration of docker images, network and namespace for the kind cluster
+        run: |
+          docker pull amazon/aws-cli:2.6.4
+          kind load docker-image  amazon/aws-cli:2.6.4 --name chart-testing
+          kind load docker-image vllm-cpu-env:latest --name chart-testing
+          docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
+          kubectl create ns ns-vllm
+
+      - name: Run chart-testing (install)
+        run: |
+          export AWS_ACCESS_KEY_ID=minioadmin
+          export AWS_SECRET_ACCESS_KEY=minioadmin
+          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
+          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+    
+      - name: curl test
+        run: |
+          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
+          sleep 10
+          CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
+                  --header "Content-Type: application/json" \
+                  --data '{
+                          "model": "opt-125m",
+                          "prompt": "San Francisco is a",
+                          "max_tokens": 7,
+                          "temperature": 0
+                  }'):$CODE"
+          echo "$CODE"
--- a/.github/workflows/matchers/actionlint.json
+++ b/.github/workflows/matchers/actionlint.json
@ -0,0 +1,17 @@
+{
+  "problemMatcher": [
+    {
+      "owner": "actionlint",
+      "pattern": [
+        {
+          "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
+          "file": 1,
+          "line": 2,
+          "column": 3,
+          "message": 4,
+          "code": 5
+        }
+      ]
+    }
+  ]
+}
--- a/.github/workflows/matchers/mypy.json
+++ b/.github/workflows/matchers/mypy.json
@ -0,0 +1,16 @@
+{
+  "problemMatcher": [
+    {
+      "owner": "mypy",
+      "pattern": [
+        {
+          "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$",
+          "file": 1,
+          "line": 2,
+          "severity": 3,
+          "message": 4
+        }
+      ]
+    }
+  ]
+}
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@ -1,50 +0,0 @@
-name: mypy
-
-on:
-  # Trigger the workflow on push or pull request,
-  # but only for the main branch
-  push:
-    branches:
-      - main
-  pull_request:
-    branches:
-      - main
-
-jobs:
-  ruff:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
-    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install mypy==1.9.0
-        pip install types-setuptools
-        pip install types-PyYAML
-        pip install types-requests
-        pip install types-setuptools
-    - name: Mypy
-      run: |
-        mypy vllm/attention --config-file pyproject.toml
-        mypy vllm/core --config-file pyproject.toml
-        mypy vllm/distributed --config-file pyproject.toml
-        mypy vllm/entrypoints --config-file pyproject.toml
-        mypy vllm/executor --config-file pyproject.toml
-        mypy vllm/usage --config-file pyproject.toml
-        mypy vllm/*.py --config-file pyproject.toml
-        mypy vllm/transformers_utils --config-file pyproject.toml
-        mypy vllm/engine  --config-file pyproject.toml
-        mypy vllm/worker --config-file pyproject.toml
-        mypy vllm/spec_decode --config-file pyproject.toml
-        mypy vllm/model_executor  --config-file pyproject.toml
-        mypy vllm/lora --config-file pyproject.toml
-        mypy vllm/logging --config-file pyproject.toml
-        mypy vllm/model_executor --config-file pyproject.toml
-
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -0,0 +1,19 @@
+name: pre-commit
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+      with:
+        python-version: "3.12"
+    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+    - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
+      with:
+        extra_args: --all-files --hook-stage manual
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@ -21,16 +21,16 @@ jobs:
      upload_url: ${{ steps.create_release.outputs.upload_url }}
    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Extract branch info
        shell: bash
        run: |
-          echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
+          echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"

      - name: Create Release
        id: create_release
-        uses: "actions/github-script@v6"
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        env:
          RELEASE_TAG: ${{ env.release_tag }}
        with:
@ -39,67 +39,68 @@ jobs:
            const script = require('.github/workflows/scripts/create_release.js')
            await script(github, context, core)

-  wheel:
-    name: Build Wheel
-    runs-on: ${{ matrix.os }}
-    needs: release
+  # NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow. 
+  # wheel:
+  #   name: Build Wheel
+  #   runs-on: ${{ matrix.os }}
+  #   needs: release

-    strategy:
-      fail-fast: false
-      matrix:
-          os: ['ubuntu-20.04']
-          python-version: ['3.8', '3.9', '3.10', '3.11']
-          pytorch-version: ['2.3.0']  # Must be the most recent version that meets requirements-cuda.txt.
-          cuda-version: ['11.8', '12.1']
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #         os: ['ubuntu-20.04']
+  #         python-version: ['3.9', '3.10', '3.11', '3.12']
+  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
+  #         cuda-version: ['11.8', '12.1']

-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
+  #   steps:
+  #     - name: Checkout
+  #       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

-      - name: Setup ccache
-        uses: hendrikmuhs/ccache-action@v1.2
-        with:
-          create-symlink: true
-          key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
+  #     - name: Setup ccache
+  #       uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
+  #       with:
+  #         create-symlink: true
+  #         key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}

-      - name: Set up Linux Env
-        if: ${{ runner.os == 'Linux' }}
-        run: |
-          bash -x .github/workflows/scripts/env.sh
+  #     - name: Set up Linux Env
+  #       if: ${{ runner.os == 'Linux' }}
+  #       run: |
+  #         bash -x .github/workflows/scripts/env.sh

-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-            python-version: ${{ matrix.python-version }}
+  #     - name: Set up Python
+  #       uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+  #       with:
+  #           python-version: ${{ matrix.python-version }}

-      - name: Install CUDA ${{ matrix.cuda-version }}
-        run: |
-          bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
+  #     - name: Install CUDA ${{ matrix.cuda-version }}
+  #       run: |
+  #         bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}

-      - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
-        run: |
-          bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
+  #     - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
+  #       run: |
+  #         bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}

-      - name: Build wheel
-        shell: bash
-        env:
-          CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
-        run: |
-          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
-          wheel_name=$(ls dist/*whl | xargs -n 1 basename)
-          asset_name=${wheel_name//"linux"/"manylinux1"}
-          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
-          echo "asset_name=${asset_name}" >> $GITHUB_ENV
+  #     - name: Build wheel
+  #       shell: bash
+  #       env:
+  #         CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
+  #       run: |
+  #         bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
+  #         wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
+  #         asset_name=${wheel_name//"linux"/"manylinux1"}
+  #         echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
+  #         echo "asset_name=${asset_name}" >> "$GITHUB_ENV"

-      - name: Upload Release Asset
-        uses: actions/upload-release-asset@v1
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          upload_url: ${{ needs.release.outputs.upload_url }}
-          asset_path: ./dist/${{ env.wheel_name }}
-          asset_name: ${{ env.asset_name }}
-          asset_content_type: application/*
+  #     - name: Upload Release Asset
+  #       uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
+  #       env:
+  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  #       with:
+  #         upload_url: ${{ needs.release.outputs.upload_url }}
+  #         asset_path: ./dist/${{ env.wheel_name }}
+  #         asset_name: ${{ env.asset_name }}
+  #         asset_content_type: application/*

      # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
      # - name: Publish package
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@ -0,0 +1,21 @@
+name: PR Reminder Comment Bot
+on:
+  pull_request_target:
+    types: [opened]
+
+jobs:
+  pr_reminder:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Remind to run full CI on PR
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+        with:
+          script: |
+            github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
+            })
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@ -1,37 +0,0 @@
-name: ruff
-
-on:
-  # Trigger the workflow on push or pull request,
-  # but only for the main branch
-  push:
-    branches:
-      - main
-  pull_request:
-    branches:
-      - main
-
-jobs:
-  ruff:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
-    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
-    - name: Analysing the code with ruff
-      run: |
-        ruff .
-    - name: Spelling check with codespell
-      run: |
-        codespell --toml pyproject.toml
-    - name: Run isort
-      run: |
-        isort . --check-only
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@ -1,4 +1,5 @@
 #!/bin/bash
+set -eux

 python_executable=python$1
 cuda_home=/usr/local/cuda-$2
@ -8,14 +9,15 @@ PATH=${cuda_home}/bin:$PATH
 LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH

 # Install requirements
-$python_executable -m pip install wheel packaging
-$python_executable -m pip install -r requirements-cuda.txt
+$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt

 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
-# Make sure punica is built for the release (for LoRA)
-export VLLM_INSTALL_PUNICA_KERNELS=1
 # Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
+
+bash tools/check_repo.sh
+
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
--- a/.github/workflows/scripts/cuda-install.sh
+++ b/.github/workflows/scripts/cuda-install.sh
@ -1,16 +1,16 @@
 #!/bin/bash

 # Replace '.' with '-' ex: 11.8 -> 11-8
-cuda_version=$(echo $1 | tr "." "-")
+cuda_version=$(echo "$1" | tr "." "-")
 # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
-OS=$(echo $2 | tr -d ".\-")
+OS=$(echo "$2" | tr -d ".\-")

 # Installs CUDA
-wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
+wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb"
 sudo dpkg -i cuda-keyring_1.1-1_all.deb
 rm cuda-keyring_1.1-1_all.deb
 sudo apt -qq update
-sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
+sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}"
 sudo apt clean

 # Test nvcc
--- a/.github/workflows/scripts/pytorch-install.sh
+++ b/.github/workflows/scripts/pytorch-install.sh
@ -6,7 +6,7 @@ cuda_version=$3

 # Install torch
 $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
-$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
+$python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}"

 # Print version information
 $python_executable --version
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -0,0 +1,52 @@
+name: 'Close inactive issues and PRs'
+
+on:
+  schedule:
+    # Daily at 1:30 AM UTC
+    - cron: '30 1 * * *'
+
+jobs:
+  close-issues-and-pull-requests:
+    permissions:
+      issues: write
+      pull-requests: write
+      actions: write
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
+        with:
+          # Increasing this value ensures that changes to this workflow
+          # propagate to all issues and PRs in days rather than months
+          operations-per-run: 1000
+
+          exempt-draft-pr: true
+          exempt-issue-labels: 'keep-open'
+          exempt-pr-labels: 'keep-open'
+
+          labels-to-add-when-unstale: 'unstale'
+          labels-to-remove-when-stale: 'unstale'
+
+          days-before-issue-stale: 90
+          days-before-issue-close: 30
+          stale-issue-label: 'stale'
+          stale-issue-message: >
+            This issue has been automatically marked as stale because it has not
+            had any activity within 90 days. It will be automatically closed if no
+            further activity occurs within 30 days. Leave a comment if
+            you feel this issue should remain open. Thank you!
+          close-issue-message: >
+            This issue has been automatically closed due to inactivity. Please
+            feel free to reopen if you feel it is still relevant. Thank you!
+
+          days-before-pr-stale: 90
+          days-before-pr-close: 30
+          stale-pr-label: 'stale'
+          stale-pr-message: >
+            This pull request has been automatically marked as stale because it
+            has not had any activity within 90 days. It will be automatically
+            closed if no further activity occurs within 30 days. Leave a comment
+            if you feel this pull request should remain open. Thank you!
+          close-pr-message: >
+            This pull request has been automatically closed due to inactivity.
+            Please feel free to reopen if you intend to continue working on it.
+            Thank you!
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@ -1,31 +0,0 @@
-name: yapf
-
-on:
-  # Trigger the workflow on push or pull request,
-  # but only for the main branch
-  push:
-    branches:
-      - main
-  pull_request:
-    branches:
-      - main
-jobs:
-  yapf:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
-    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install yapf==0.32.0
-        pip install toml==0.10.2
-    - name: Running yapf
-      run: |
-        yapf --diff --recursive .
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,9 @@
+# version file generated by setuptools-scm
+/vllm/_version.py
+
+# vllm-flash-attn built from source
+vllm/vllm_flash_attn/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@ -9,6 +15,8 @@ __pycache__/
 # Distribution / packaging
 .Python
 build/
+cmake-build-*/
+CMakeUserPresets.json
 develop-eggs/
 dist/
 downloads/
@ -25,6 +33,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+/.deps/

 # PyInstaller
 #  Usually these files are written by a python script from a template
@ -70,8 +79,7 @@ instance/

 # Sphinx documentation
 docs/_build/
-docs/source/getting_started/examples/*.rst
-!**/*.template.rst
+docs/source/getting_started/examples/

 # PyBuilder
 .pybuilder/
@ -84,6 +92,9 @@ target/
 profile_default/
 ipython_config.py

+# generated files
+**/generated/**
+
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
@ -186,4 +197,8 @@ _build/
 hip_compat.h

 # Benchmark dataset
-*.json
+benchmarks/*.json
+
+# Linting
+actionlint
+shellcheck*/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,93 @@
+default_stages:
+  - pre-commit # Run locally
+  - manual # Run in CI
+repos:
+- repo: https://github.com/google/yapf
+  rev: v0.43.0
+  hooks:
+  - id: yapf
+    args: [--in-place, --verbose]
+    additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.9.3
+  hooks:
+  - id: ruff
+    args: [--output-format, github]
+- repo: https://github.com/codespell-project/codespell
+  rev: v2.4.0
+  hooks:
+  - id: codespell
+    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
+- repo: https://github.com/PyCQA/isort
+  rev: 5.13.2
+  hooks:
+  - id: isort
+- repo: https://github.com/pre-commit/mirrors-clang-format
+  rev: v19.1.7
+  hooks:
+  - id: clang-format
+    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
+    types_or: [c++, cuda]
+    args: [--style=file, --verbose]
+- repo: https://github.com/jackdewinter/pymarkdown
+  rev: v0.9.27
+  hooks:
+  - id: pymarkdown
+    files: docs/.*
+- repo: https://github.com/rhysd/actionlint
+  rev: v1.7.7
+  hooks:
+  - id: actionlint
+- repo: local
+  hooks:
+  - id: mypy-local
+    name: Run mypy for local Python installation
+    entry: tools/mypy.sh 0 "local"
+    language: python
+    types: [python]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
+    stages: [pre-commit] # Don't run in CI
+  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.9
+    entry: tools/mypy.sh 1 "3.9"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
+  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.10
+    entry: tools/mypy.sh 1 "3.10"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
+  - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.11
+    entry: tools/mypy.sh 1 "3.11"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
+  - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.12
+    entry: tools/mypy.sh 1 "3.12"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
+  - id: shellcheck
+    name: Lint shell scripts
+    entry: tools/shellcheck.sh
+    language: script
+    types: [shell]
+  - id: png-lint
+    name: Lint PNG exports from excalidraw
+    entry: tools/png-lint.sh
+    language: script
+    types: [png]
+  - id: suggestion
+    name: Suggestion
+    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
+    language: system
+    verbose: true
+    pass_filenames: false
--- a/Show More
+++ b/Show More