add compile

2025-10-20 23:03:52 +08:00 · 2024-07-26 19:29:36 -07:00
1752 changed files with 49894 additions and 249480 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@ -1,48 +1,36 @@
-# SPDX-License-Identifier: Apache-2.0
-
 import os
-import sys
 import zipfile

-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/3792 .
-# Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
+MAX_SIZE_MB = 250


 def print_top_10_largest_files(zip_file):
-    """Print the top 10 largest files in the given zip file."""
    with zipfile.ZipFile(zip_file, 'r') as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:
-            print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.")
+            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")


 def check_wheel_size(directory):
-    """Check the size of .whl files in the given directory."""
    for root, _, files in os.walk(directory):
-        for file_name in files:
-            if file_name.endswith(".whl"):
-                wheel_path = os.path.join(root, file_name)
-                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
-                if wheel_size_mb > VLLM_MAX_SIZE_MB:
-                    print(f"Not allowed: Wheel {wheel_path} is larger "
-                          f"({wheel_size_mb:.2f} MB) than the limit "
-                          f"({VLLM_MAX_SIZE_MB} MB).")
+        for f in files:
+            if f.endswith(".whl"):
+                wheel_path = os.path.join(root, f)
+                wheel_size = os.path.getsize(wheel_path)
+                wheel_size_mb = wheel_size / (1024 * 1024)
+                if wheel_size_mb > MAX_SIZE_MB:
+                    print(
+                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
+                        f"compare to the allowed size ({MAX_SIZE_MB} MB).")
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
                    print(f"Wheel {wheel_path} is within the allowed size "
-                          f"({wheel_size_mb:.2f} MB).")
+                          f"({wheel_size_mb} MB).")
    return 0


 if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print("Usage: python check-wheel-size.py <directory>")
-        sys.exit(1)
-
-    directory = sys.argv[1]
-    sys.exit(check_wheel_size(directory))
+    import sys
+    sys.exit(check_wheel_size(sys.argv[1]))
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@ -1,26 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import argparse
-import os
-
-template = """<!DOCTYPE html>
-<html>
-    <body>
-    <h1>Links for vLLM</h1/>
-        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
-    </body>
-</html>
-"""
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--wheel", help="The wheel path.", required=True)
-args = parser.parse_args()
-
-filename = os.path.basename(args.wheel)
-
-with open("index.html", "w") as f:
-    print(f"Generated index.html for {args.wheel}")
-    # cloudfront requires escaping the '+' character
-    f.write(
-        template.format(wheel=filename,
-                        wheel_html_escaped=filename.replace("+", "%2B")))
--- a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
+++ b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
@ -9,4 +9,3 @@ tasks:
    value: 0.664
 limit: 1000
 num_fewshot: 5
-trust_remote_code: True
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
-model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.764
-  - name: "exact_match,flexible-extract"
-    value: 0.764
-limit: 250
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
-model_name: "HandH1998/QQQ-Llama-3-8b-g128"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.419
-  - name: "exact_match,flexible-extract"
-    value: 0.416
-limit: 1000
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
-model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.356
-  - name: "exact_match,flexible-extract"
-    value: 0.358
-limit: 1000
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
@ -1,11 +1,11 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
-model_name: "mgoin/Minitron-4B-Base-FP8"
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1
+model_name: "nvidia/Minitron-4B-Base"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
-    value: 0.233
+    value: 0.252
  - name: "exact_match,flexible-extract"
-    value: 0.236
+    value: 0.252
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
+++ b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
@ -1,11 +0,0 @@
-# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
-model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.6353
-  - name: "exact_match,flexible-extract"
-    value: 0.637
-limit: null
-num_fewshot: null 
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@ -1,10 +1,9 @@
 Meta-Llama-3-8B-Instruct.yaml
+Meta-Llama-3-8B-Instruct-FP8.yaml
 Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
-Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
-Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
+Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
-Minitron-4B-Base-FP8.yaml
+Minitron-4B-Base.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-FP8W8.yaml
-Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install lm-eval==0.4.4
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10

 usage() {
    echo``
@ -41,6 +41,6 @@ while getopts "m:b:l:f:" OPT; do
 done

 lm_eval --model hf \
-  --model_args "pretrained=$MODEL,parallelize=True" \
-  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
-  --batch_size "$BATCH_SIZE"
+  --model_args pretrained=$MODEL,parallelize=True \
+  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
+  --batch_size $BATCH_SIZE
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install lm-eval==0.4.4
+#   pip install lm-eval==0.4.3

 usage() {
    echo``
@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done

 lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
-  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
-  --batch_size "$BATCH_SIZE"
+  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
+  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
+  --batch_size $BATCH_SIZE
--- a/.buildkite/lm-eval-harness/run-tests.sh
+++ b/.buildkite/lm-eval-harness/run-tests.sh
@ -30,7 +30,7 @@ while getopts "c:t:" OPT; do
 done

 # Parse list of configs.
-IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG

 for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
 do
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -1,4 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
@ -15,7 +14,7 @@ import lm_eval
 import numpy
 import yaml

-RTOL = 0.05
+RTOL = 0.02
 TEST_DATA_FILE = os.environ.get(
    "LM_EVAL_TEST_DATA_FILE",
    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
@ -24,12 +23,9 @@ TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


 def launch_lm_eval(eval_config):
-    trust_remote_code = eval_config.get('trust_remote_code', False)
-
    model_args = f"pretrained={eval_config['model_name']}," \
                 f"tensor_parallel_size={TP_SIZE}," \
-                 f"add_bos_token=true," \
-                 f"trust_remote_code={trust_remote_code}"
+                 f"add_bos_token=true"

    results = lm_eval.simple_evaluate(
        model="vllm",
@ -50,15 +46,10 @@ def test_lm_eval_correctness():
    results = launch_lm_eval(eval_config)

    # Confirm scores match ground truth.
-    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured_value}')
-            success = success and numpy.isclose(
-                ground_truth, measured_value, rtol=RTOL)
-
-    # Assert at the end, print all scores even on failure for debugging.
-    assert success
+            assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@ -34,18 +34,17 @@ See  [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan

 Performance benchmark will be triggered when:
 - A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
+- Every commit for those PRs with `perf-benchmarks` label.

 Nightly benchmark will be triggered when:
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
+- Every commit for those PRs with `nightly-benchmarks` label.




 ## Performance benchmark details

-
-See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
+See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.


 #### Latency test
@ -69,7 +68,7 @@ Here is an example of one test inside `latency-tests.json`:

 In this example:
 -  The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-  The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+-  The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-benchmarks-suite.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`

 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.

--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@ -1,6 +1,5 @@
 steps:
  - label: "Wait for container to be ready"
-    key: wait-for-container-image
    agents:
      queue: A100
    plugins:
@ -9,21 +8,20 @@ steps:
          containers:
          - image: badouralix/curl-jq
            command:
-            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
-
+            - sh
+            - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+  - wait
  - label: "A100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: A100
-    depends_on: wait-for-container-image
    plugins:
    - kubernetes:
        podSpec:
          priorityClassName: perf-benchmark
          containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
            command:
-            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+            - bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
            resources:
              limits:
                nvidia.com/gpu: 8
@ -44,49 +42,20 @@ steps:
          - name: devshm
            emptyDir:
              medium: Memory
-
-  - label: "H200"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H200
-    depends_on: wait-for-container-image
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: 4,5,6,7
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  #- block: "Run H100 Benchmark"
-    #key: block-h100
-    #depends_on: ~
-
  - label: "H100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H100
-    depends_on: wait-for-container-image
    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+    - docker#v5.11.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
        command:
        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
        mount-buildkite-agent: true
        propagate-environment: true
        ipc: host
-        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
+        gpus: all
        environment:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN
+
--- a/.buildkite/nightly-benchmarks/nightly-annotation.md
+++ b/.buildkite/nightly-benchmarks/nightly-annotation.md
@ -1,28 +0,0 @@
-
-## Description
-
-This file contains the downloading link for benchmarking results.
-
- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
- [benchmarking results](artifact://results.zip)
- [benchmarking code](artifact://nightly-benchmarks.zip)
-
-Please download the visualization scripts in the post
-
-
-## Results reproduction
-
- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
-  - Download `nightly-benchmarks.zip`. 
-  - In the same folder, run the following code
-```
-export HF_TOKEN=<your HF token>
-apt update
-apt install -y git
-unzip nightly-benchmarks.zip
-VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-```
-
-And the results will be inside `./benchmarks/results`.
-
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@ -1,39 +1,45 @@

 # Nightly benchmark

-This benchmark aims to:
- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
-
-Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
-
-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+The main goal of this benchmarking is two-fold:
+- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload.
+- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().


-## Setup
+## Docker images

- Docker images:
-  - vLLM: `vllm/vllm-openai:v0.6.2`
-  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
-  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
-  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
-    - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
-  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
-  - 8x Nvidia A100 GPUs
- Workload:
-  - Dataset
-    - ShareGPT dataset
-    - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
-    - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
-    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
-  - Models: llama-3 8B, llama-3 70B.
-    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
-  - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
-    - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
-  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
+We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
+- vllm/vllm-openai:v0.5.0.post1
+- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+- openmmlab/lmdeploy:v0.5.0
+- ghcr.io/huggingface/text-generation-inference:2.1

-# Known issues
+<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->

- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
- TGI does not support `ignore-eos` flag.
+
+## Hardware
+
+One AWS node with 8x NVIDIA A100 GPUs.
+
+
+## Workload description
+
+We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
+
+- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed).
+- Output length: the corresponding output length of these 500 prompts.
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
+- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
+
+<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
+
+## Plots
+
+In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
+
+<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
+
+## Results
+
+{nightly_results_benchmarking_table}
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@ -13,7 +13,7 @@ common_pod_spec: &common_pod_spec

 common_container_settings: &common_container_settings
  command:
-    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+    - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
  resources:
    limits:
      nvidia.com/gpu: 8
@ -37,10 +37,7 @@ common_container_settings: &common_container_settings

 steps:
  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
-
-
-
-  - label: "A100 vllm step 10"
+  - label: "A100 trt benchmark"
    priority: 100
    agents:
      queue: A100
@ -49,21 +46,7 @@ steps:
          podSpec:
            <<: *common_pod_spec
            containers:
-              - image: vllm/vllm-openai:v0.6.2
-                <<: *common_container_settings
-
-
-
-  - label: "A100 sglang benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: lmsysorg/sglang:v0.3.2-cu121
+              - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
                <<: *common_container_settings

  - label: "A100 lmdeploy benchmark"
@ -75,13 +58,11 @@ steps:
          podSpec:
            <<: *common_pod_spec
            containers:
-              - image: openmmlab/lmdeploy:v0.6.1-cu12
+              - image: openmmlab/lmdeploy:v0.5.0
                <<: *common_container_settings
+  

-
-
-
-  - label: "A100 trt llama-8B"
+  - label: "A100 vllm benchmark"
    priority: 100
    agents:
      queue: A100
@ -90,25 +71,10 @@ steps:
          podSpec:
            <<: *common_pod_spec
            containers:
-              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+              - image: vllm/vllm-openai:latest 
                <<: *common_container_settings
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-                  - name: VLLM_SOURCE_CODE_LOC
-                    value: /workspace/build/buildkite/vllm/performance-benchmark
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-                  - name: TEST_SELECTOR
-                    value: "llama8B"

-
-  - label: "A100 trt llama-70B"
+  - label: "A100 tgi benchmark"
    priority: 100
    agents:
      queue: A100
@ -117,54 +83,12 @@ steps:
          podSpec:
            <<: *common_pod_spec
            containers:
-              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+              - image: ghcr.io/huggingface/text-generation-inference:2.1 
                <<: *common_container_settings
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-                  - name: VLLM_SOURCE_CODE_LOC
-                    value: /workspace/build/buildkite/vllm/performance-benchmark
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-                  - name: TEST_SELECTOR
-                    value: "llama70B"
-
-
-  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image 
-  # - label: "A100 trt benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #     - kubernetes:
-  #         podSpec:
-  #           <<: *common_pod_spec
-  #           containers:
-  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-  #               <<: *common_container_settings
-
-
-  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
-  # - label: "A100 tgi benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #     - kubernetes:
-  #         podSpec:
-  #           <<: *common_pod_spec
-  #           containers:
-  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
-  #               <<: *common_container_settings
        
  - wait

-  - label: "Collect the results"
+  - label: "Plot"
    priority: 100
    agents:
      queue: A100
@ -193,4 +117,4 @@ steps:
                    name: hf-token-secret
                    key: token

-  - block: ":rocket: check the results!"
+  - wait
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -6,7 +6,6 @@

 # Do not set -e, as the mixtral 8x22B model tends to crash occasionally
 # and we still want to see other benchmarking results even when mixtral crashes.
-set -x
 set -o pipefail

 check_gpus() {
@ -18,7 +17,7 @@ check_gpus() {
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
-  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
  echo "GPU type is $gpu_type"
 }

@ -35,15 +34,6 @@ check_hf_token() {
  fi
 }

-ensure_sharegpt_downloaded() {
-  local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
-  if [ ! -f "$FILE" ]; then
-    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
-  else
-    echo "$FILE already exists."
-  fi
-}
-
 json2args() {
  # transforms the JSON string to command line args, and '_' is replaced to '-'
  # example:
@ -69,34 +59,40 @@ wait_for_server() {
    done' && return 0 || return 1
 }

-kill_processes_launched_by_current_bash() {
-  # Kill all python processes launched from current bash script
-  current_shell_pid=$$
-  processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
-  if [ -n "$processes" ]; then
-    echo "Killing the following processes matching '$1':"
-    echo "$processes"
-    echo "$processes" | xargs kill -9
-  else
-    echo "No processes found matching '$1'."
-  fi
-}
-
 kill_gpu_processes() {
+  # kill all processes on GPU.
+  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
+  if [ -z "$pids" ]; then
+      echo "No GPU processes found."
+  else
+      for pid in $pids; do
+          kill -9 "$pid"
+          echo "Killed process with PID: $pid"
+      done

-  ps -aux
-  lsof -t -i:8000 | xargs -r kill -9
-  pgrep python3 | xargs -r kill -9
+      echo "All GPU processes have been killed."
+  fi

+  # Sometimes kill with pid doesn't work properly, we can also kill all process running python or python3
+  # since we are in container anyway
+  pkill -9 -f python
+  pkill -9 -f python3

-  # wait until GPU memory usage smaller than 1GB
-  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+  # waiting for GPU processes to be fully killed
+  # loop while nvidia-smi returns any processes
+  while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
    sleep 1
+    echo "Waiting for GPU processes to be killed"
  done

  # remove vllm config file
  rm -rf ~/.config/vllm

+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
 }

 upload_to_buildkite() {
@ -114,7 +110,7 @@ upload_to_buildkite() {
  fi

  # Use the determined command to annotate and upload artifacts
-  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
+  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }

@ -147,7 +143,7 @@ run_latency_tests() {
    # check if there is enough GPU to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
      continue
    fi

@ -166,7 +162,7 @@ run_latency_tests() {
        latency_command: $latency,
        gpu_type: $gpu
      }')
-    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
+    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$latency_command"
@ -176,6 +172,7 @@ run_latency_tests() {
  done
 }

+
 run_throughput_tests() {
  # run throughput tests using `benchmark_throughput.py`
  # $1: a json file specifying throughput test cases
@ -203,9 +200,9 @@ run_throughput_tests() {
    throughput_args=$(json2args "$throughput_params")

    # check if there is enough GPU to run the test
-    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
+    tp=$(echo $throughput_params | jq -r '.tensor_parallel_size')
    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
      continue
    fi

@ -223,7 +220,7 @@ run_throughput_tests() {
        throughput_command: $command,
        gpu_type: $gpu
      }')
-    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
+    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$throughput_command"
@ -255,6 +252,7 @@ run_serving_tests() {
      continue
    fi

+
    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.server_parameters')
    client_params=$(echo "$params" | jq -r '.client_parameters')
@ -267,7 +265,7 @@ run_serving_tests() {
    # check if there is enough GPU to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
      continue
    fi

@ -275,7 +273,7 @@ run_serving_tests() {
    server_model=$(echo "$server_params" | jq -r '.model')
    client_model=$(echo "$client_params" | jq -r '.model')
    if [[ $server_model != "$client_model" ]]; then
-      echo "Server model and client model must be the same. Skip testcase $test_name."
+      echo "Server model and client model must be the same. Skip testcase $testname."
      continue
    fi

@ -286,11 +284,12 @@ run_serving_tests() {
    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
-    bash -c "$server_command" &
+    eval "$server_command" &
    server_pid=$!

    # wait until the server is alive
-    if wait_for_server; then
+    wait_for_server
+    if [ $? -eq 0 ]; then
      echo ""
      echo "vllm server is up and running."
    else
@ -319,7 +318,7 @@ run_serving_tests() {
      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

-      bash -c "$client_command"
+      eval "$client_command"

      # record the benchmarking commands
      jq_output=$(jq -n \
@ -331,7 +330,7 @@ run_serving_tests() {
          client_command: $client,
          gpu_type: $gpu
        }')
-      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+      echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"

    done

@ -348,7 +347,6 @@ main() {
  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)
-  (which lsof) || (apt-get update && apt-get install -y lsof)

  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
@ -357,7 +355,7 @@ main() {

  # prepare for benchmarking
  cd benchmarks || exit 1
-  ensure_sharegpt_downloaded
+  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
@ -367,6 +365,7 @@ main() {
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json

+
  # postprocess benchmarking results
  pip install tabulate pandas
  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
--- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@ -0,0 +1,76 @@
+#!/bin/bash
+
+set -o pipefail
+set -x
+
+check_gpus() {
+    # check the number of GPUs and GPU type.
+    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+    if [[ $gpu_count -gt 0 ]]; then
+        echo "GPU found."
+    else
+        echo "Need at least 1 GPU to run benchmarking."
+        exit 1
+    fi
+    declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+    echo "GPU type is $gpu_type"
+}
+
+check_hf_token() {
+    # check if HF_TOKEN is available and valid
+    if [[ -z "$HF_TOKEN" ]]; then
+        echo "Error: HF_TOKEN is not set."
+        exit 1
+    elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
+        echo "Error: HF_TOKEN does not start with 'hf_'."
+        exit 1
+    else
+        echo "HF_TOKEN is set and valid."
+    fi
+}
+
+main() {
+
+    check_gpus
+    check_hf_token
+
+    df -h
+
+    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+    (which jq) || (apt-get update && apt-get -y install jq)
+
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
+    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+    
+
+    # run lmdeploy
+    if which lmdeploy >/dev/null; then
+        echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
+        bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+        exit 0
+    fi
+
+    # run tgi
+    if [ -e /tgi-entrypoint.sh ]; then
+        echo "tgi is available, redirect to run-tgi-nightly.sh"
+        bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+        exit 0
+    fi
+
+    # run trt
+    if which trtllm-build >/dev/null; then
+        echo "trtllm is available, redirect to run-trt-nightly.sh"
+        bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+        exit 0
+    fi
+
+    # run vllm
+    if [ -e /vllm-workspace ]; then
+        echo "vllm is available, redirect to run-vllm-nightly.sh"
+        bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+        exit 0
+    fi
+
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -1,5 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
-
 import json
 import os
 from pathlib import Path
@ -58,7 +56,7 @@ serving_column_mapping = {

 def read_markdown(file):
    if os.path.exists(file):
-        with open(file) as f:
+        with open(file, "r") as f:
            return f.read() + "\n"
    else:
        return f"{file} not found.\n"
@ -77,14 +75,14 @@ if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):

-        with open(test_file) as f:
+        with open(test_file, "r") as f:
            raw_result = json.loads(f.read())

        if "serving" in str(test_file):
            # this result is generated via `benchmark_serving.py`

            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands")) as f:
+            with open(test_file.with_suffix(".commands"), "r") as f:
                command = json.loads(f.read())
            raw_result.update(command)

@ -99,7 +97,7 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_latency.py`

            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands")) as f:
+            with open(test_file.with_suffix(".commands"), "r") as f:
                command = json.loads(f.read())
            raw_result.update(command)

@ -121,7 +119,7 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_throughput.py`

            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands")) as f:
+            with open(test_file.with_suffix(".commands"), "r") as f:
                command = json.loads(f.read())
            raw_result.update(command)

@ -159,18 +157,6 @@ if __name__ == "__main__":
                                             throughput_results,
                                             serving_results)

-    for df in [latency_results, serving_results, throughput_results]:
-        if df.empty:
-            continue
-
-        # Sort all dataframes by their respective "Test name" columns
-        df.sort_values(by="Test name", inplace=True)
-
-        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
-        # we want to turn it into "8xGPUTYPE"
-        df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
-
    # get markdown tables
    latency_md_table = tabulate(latency_results,
                                headers='keys',
@ -188,8 +174,8 @@ if __name__ == "__main__":
    # document the result
    with open(results_folder / "benchmark_results.md", "w") as f:

-        results = read_markdown("../.buildkite/nightly-benchmarks/" +
-                                "performance-benchmarks-descriptions.md")
+        results = read_markdown(
+            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@ -1,5 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
-
 import argparse

 from transformers import AutoTokenizer
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@ -1,97 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import argparse
-import json
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-from tabulate import tabulate
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(
-        description=
-        'Parse command line arguments for summary-nightly-results script.')
-    parser.add_argument('--results-folder',
-                        type=str,
-                        required=True,
-                        help='The folder where the results are stored.')
-    parser.add_argument('--description',
-                        type=str,
-                        required=True,
-                        help='Description of the results.')
-
-    args = parser.parse_args()
-    return args
-
-
-def get_perf(df, method, model, metric):
-
-    means = []
-
-    for qps in [2, 4, 8, 16, "inf"]:
-        target = df['Test name'].str.contains(model)
-        target = target & df['Engine'].str.contains(method)
-        target = target & df['Test name'].str.contains("qps_" + str(qps))
-        filtered_df = df[target]
-
-        if filtered_df.empty:
-            means.append(0.)
-        else:
-            means.append(filtered_df[metric].values[0])
-
-    return np.array(means)
-
-
-def get_perf_w_std(df, method, model, metric):
-
-    if metric in ["TTFT", "ITL"]:
-        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
-        mean = mean.tolist()
-        std = get_perf(df, method, model, "Std " + metric + " (ms)")
-        if std.mean() == 0:
-            std = None
-        success = get_perf(df, method, model, "Successful req.")
-        if std is not None:
-            std = std / np.sqrt(success)
-            std = std.tolist()
-
-    else:
-        assert metric == "Tput"
-        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
-            df, method, model, "Output Tput (tok/s)")
-        mean = mean.tolist()
-        std = None
-
-    return mean, std
-
-
-def main(args):
-    results_folder = Path(args.results_folder)
-
-    results = []
-
-    # collect results
-    for test_file in results_folder.glob("*_nightly_results.json"):
-        with open(test_file) as f:
-            results = results + json.loads(f.read())
-
-    # generate markdown table
-    df = pd.DataFrame.from_dict(results)
-
-    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
-
-    with open(args.description) as f:
-        description = f.read()
-
-    description = description.format(
-        nightly_results_benchmarking_table=md_table)
-
-    with open("nightly_results.md", "w") as f:
-        f.write(description)
-
-
-if __name__ == '__main__':
-    args = parse_arguments()
-    main(args)
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@ -1,5 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
-
 from lmdeploy.serve.openai.api_client import APIClient

 api_client = APIClient("http://localhost:8000")
--- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
@ -1,228 +0,0 @@
-#!/bin/bash
-
-# Currently FP8 benchmark is NOT enabled.
-
-set -x
-server_params=$1
-common_params=$2
-
-json2args() {
-  # transforms the JSON string to command line args, and '_' is replaced to '-'
-  # example:
-  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
-  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
-  local json_string=$1
-  local args=$(
-    echo "$json_string" | jq -r '
-      to_entries |
-      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
-      join(" ")
-    '
-  )
-  echo "$args"
-}
-
-launch_trt_server() {
-
-  model_path=$(echo "$common_params" | jq -r '.model')
-  model_name="${model_path#*/}"
-  model_type=$(echo "$server_params" | jq -r '.model_type')
-  model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
-  model_tp_size=$(echo "$common_params" | jq -r '.tp')
-  max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
-  max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
-  max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
-  max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
-  trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
-
-  # create model caching directory
-  cd ~
-  rm -rf models
-  mkdir -p models
-  cd models
-  models_dir=$(pwd)
-  trt_model_path=${models_dir}/${model_name}-trt-ckpt
-  trt_engine_path=${models_dir}/${model_name}-trt-engine
-
-  # clone tensorrt backend
-  cd /
-  rm -rf tensorrtllm_backend
-  git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
-  git lfs install
-  cd tensorrtllm_backend
-  git checkout "$trt_llm_version"
-  git submodule update --init --recursive
-
-  # build trtllm engine
-  cd /tensorrtllm_backend
-  cd "./tensorrt_llm/examples/${model_type}"
-  python3 convert_checkpoint.py \
-    --model_dir "${model_path}" \
-    --dtype "${model_dtype}" \
-    --tp_size "${model_tp_size}" \
-    --output_dir "${trt_model_path}"
-  trtllm-build \
-    --checkpoint_dir "${trt_model_path}" \
-    --use_fused_mlp \
-    --reduce_fusion disable \
-    --workers 8 \
-    --gpt_attention_plugin "${model_dtype}" \
-    --gemm_plugin "${model_dtype}" \
-    --tp_size "${model_tp_size}" \
-    --max_batch_size "${max_batch_size}" \
-    --max_input_len "${max_input_len}" \
-    --max_seq_len "${max_seq_len}" \
-    --max_num_tokens "${max_num_tokens}" \
-    --output_dir "${trt_engine_path}"
-
-  # handle triton protobuf files and launch triton server
-  cd /tensorrtllm_backend
-  mkdir triton_model_repo
-  cp -r all_models/inflight_batcher_llm/* triton_model_repo/
-  cd triton_model_repo
-  rm -rf ./tensorrt_llm/1/*
-  cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
-  python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
-  python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5"
-  python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false"
-  python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size"
-  python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1"
-  cd /tensorrtllm_backend
-  python3 scripts/launch_triton_server.py \
-    --world_size="${model_tp_size}" \
-    --model_repo=/tensorrtllm_backend/triton_model_repo &
-
-}
-
-launch_tgi_server() {
-  model=$(echo "$common_params" | jq -r '.model')
-  tp=$(echo "$common_params" | jq -r '.tp')
-  port=$(echo "$common_params" | jq -r '.port')
-  server_args=$(json2args "$server_params")
-
-  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
-    echo "Key 'fp8' exists in common params."
-    server_command="/tgi-entrypoint.sh \
-                --model-id $model \
-                --num-shard $tp \
-                --port $port \
-                --quantize fp8 \
-                $server_args"
-  else
-    echo "Key 'fp8' does not exist in common params."
-    server_command="/tgi-entrypoint.sh \
-                --model-id $model \
-                --num-shard $tp \
-                --port $port \
-                $server_args"
-  fi
-
-  echo "Server command: $server_command"
-  eval "$server_command" &
-
-}
-
-launch_lmdeploy_server() {
-  model=$(echo "$common_params" | jq -r '.model')
-  tp=$(echo "$common_params" | jq -r '.tp')
-  port=$(echo "$common_params" | jq -r '.port')
-  server_args=$(json2args "$server_params")
-
-  server_command="lmdeploy serve api_server $model \
-    --tp $tp \
-    --server-port $port \
-    $server_args"
-
-  # run the server
-  echo "Server command: $server_command"
-  bash -c "$server_command" &
-}
-
-launch_sglang_server() {
-
-  model=$(echo "$common_params" | jq -r '.model')
-  tp=$(echo "$common_params" | jq -r '.tp')
-  port=$(echo "$common_params" | jq -r '.port')
-  server_args=$(json2args "$server_params")
-
-  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
-    echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
-    model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="python3 \
-        -m sglang.launch_server \
-        --tp $tp \
-        --model-path $model \
-        --port $port \
-        $server_args"
-  else
-    echo "Key 'fp8' does not exist in common params."
-    server_command="python3 \
-        -m sglang.launch_server \
-        --tp $tp \
-        --model-path $model \
-        --port $port \
-        $server_args"
-  fi
-
-  # run the server
-  echo "Server command: $server_command"
-  eval "$server_command" &
-}
-
-launch_vllm_server() {
-
-  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
-
-  model=$(echo "$common_params" | jq -r '.model')
-  tp=$(echo "$common_params" | jq -r '.tp')
-  port=$(echo "$common_params" | jq -r '.port')
-  server_args=$(json2args "$server_params")
-
-  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
-    echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
-    model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
-        -tp $tp \
-        --model $model \
-        --port $port \
-        $server_args"
-  else
-    echo "Key 'fp8' does not exist in common params."
-    server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
-        -tp $tp \
-        --model $model \
-        --port $port \
-        $server_args"
-  fi
-
-  # run the server
-  echo "Server command: $server_command"
-  eval "$server_command" &
-}
-
-main() {
-
-  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then
-    launch_trt_server
-  fi
-
-  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then
-    launch_tgi_server
-  fi
-
-  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
-    launch_lmdeploy_server
-  fi
-
-  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then
-    launch_sglang_server
-  fi
-
-  if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then
-    launch_vllm_server
-  fi
-}
-
-main
--- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
@ -0,0 +1,102 @@
+#!/bin/bash
+
+
+server_params=$1
+common_params=$2
+
+
+
+model_path=$(echo "$common_params" | jq -r '.model')
+model_name="${model_path#*/}"
+model_type=$(echo "$server_params" | jq -r '.model_type')
+model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
+model_tp_size=$(echo "$common_params" | jq -r '.tp')
+max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
+max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
+max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
+trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
+
+cd ~
+rm -rf models
+mkdir -p models
+cd models
+models_dir=$(pwd)
+trt_model_path=${models_dir}/${model_name}-trt-ckpt
+trt_engine_path=${models_dir}/${model_name}-trt-engine
+
+cd ~
+rm -rf tensorrt-demo
+git clone https://github.com/neuralmagic/tensorrt-demo.git
+cd tensorrt-demo
+tensorrt_demo_dir=$(pwd)
+
+# make sure the parameter inside tensorrt_demo is consistent to envvar
+sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
+sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
+
+
+cd /
+rm -rf tensorrtllm_backend
+git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
+git lfs install
+cd tensorrtllm_backend
+git checkout $trt_llm_version
+tensorrtllm_backend_dir=$(pwd)
+git submodule update --init --recursive
+cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
+
+cd /tensorrtllm_backend
+cd ./tensorrt_llm/examples/${model_type}
+
+
+if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
+
+    echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
+    echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
+    python ../quantization/quantize.py \
+        --model_dir ${model_path} \
+        --dtype ${model_dtype} \
+        --tp_size ${model_tp_size} \
+        --output_dir ${trt_model_path} \
+        --qformat fp8 \
+        --kv_cache_dtype fp8 \
+        --calib_size 2
+
+else
+
+    echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
+    python3 convert_checkpoint.py \
+        --model_dir ${model_path} \
+        --dtype ${model_dtype} \
+        --tp_size ${model_tp_size} \
+        --output_dir ${trt_model_path}
+
+fi
+
+
+
+trtllm-build \
+--checkpoint_dir=${trt_model_path} \
+--gpt_attention_plugin=${model_dtype} \
+--gemm_plugin=${model_dtype} \
+--remove_input_padding=enable \
+--paged_kv_cache=enable \
+--tp_size=${model_tp_size} \
+--max_batch_size=${max_batch_size} \
+--max_input_len=${max_input_len} \
+--max_output_len=${max_output_len} \
+--max_num_tokens=${max_output_len} \
+--opt_num_tokens=${max_output_len} \
+--output_dir=${trt_engine_path} 
+
+cd /tensorrtllm_backend/triton_model_repo
+rm -rf ./tensorrt_llm/1/*
+cp -r ${trt_engine_path}/* ./tensorrt_llm/1
+cd /tensorrtllm_backend
+python3 scripts/launch_triton_server.py \
+--world_size=${model_tp_size} \
+--model_repo=/tensorrtllm_backend/triton_model_repo &
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@ -8,7 +8,6 @@ main() {

    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
    (which jq) || (apt-get update && apt-get -y install jq)
-    (which zip) || (apt-get install -y zip)

    if [ ! -f /workspace/buildkite-agent ]; then
        echo "buildkite-agent binary not found. Skip plotting the results."
@ -16,63 +15,26 @@ main() {
    fi

    # initial annotation
-    #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
+    description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"

    # download results
-    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
    mkdir -p results/
    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
    ls
    ls results/

-    # upload benchmark results
-    zip -r results.zip results/
-    /workspace/buildkite-agent artifact upload "results.zip"
-
-    # upload benchmarking scripts
-    cd "$VLLM_SOURCE_CODE_LOC/"
-    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
-    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
-
-    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-    # upload benchmarking pipeline
-    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
-
-    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
+    # generate figures
+    python3 -m pip install tabulate pandas matplotlib
+    python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+        --description $description \
+        --results-folder results/
    
-
-
-    # The figures should be generated by a separate process outside the CI/CD pipeline
-
-    # # generate figures
-    # python3 -m pip install tabulate pandas matplotlib
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
-    #     --description $description \
-    #     --results-folder results/ 
-
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-    #     --description $description \
-    #     --results-folder results/ \
-    #     --dataset sharegpt
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-    #     --description $description \
-    #     --results-folder results/ \
-    #     --dataset sonnet_2048_128
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-    #     --description $description \
-    #     --results-folder results/ \
-    #     --dataset sonnet_128_2048
-    
-    # # upload results and figures
-    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
-    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
-    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
-    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
+    # upload results and figures
+    /workspace/buildkite-agent artifact upload "nightly_results.png"
+    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
 }

-main "$@"
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@ -0,0 +1,135 @@
+import argparse
+import json
+import math
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import pandas as pd
+from tabulate import tabulate
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description=
+        'Parse command line arguments for summary-nightly-results script.')
+    parser.add_argument('--results-folder',
+                        type=str,
+                        required=True,
+                        help='The folder where the results are stored.')
+    parser.add_argument('--description',
+                        type=str,
+                        required=True,
+                        help='Description of the results.')
+
+    args = parser.parse_args()
+    return args
+
+
+def main(args):
+    bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
+    results_folder = Path(args.results_folder)
+
+    results = []
+
+    # collect results
+    for test_file in results_folder.glob("*_nightly_results.json"):
+        with open(test_file, "r") as f:
+            results = results + json.loads(f.read())
+
+    # generate markdown table
+    df = pd.DataFrame.from_dict(results)
+
+    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
+
+    with open(args.description, "r") as f:
+        description = f.read()
+
+    description = description.format(
+        nightly_results_benchmarking_table=md_table)
+
+    with open("nightly_results.md", "w") as f:
+        f.write(description)
+
+    plt.rcParams.update({'font.size': 20})
+
+    # plot results
+    fig, axes = plt.subplots(3, 3, figsize=(16, 14))
+    fig.subplots_adjust(hspace=1)
+    methods = ["vllm", "trt", "lmdeploy", "tgi"]
+    for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
+        for j, metric in enumerate(["TTFT", "ITL"]):
+            means, stds = [], []
+            for method in methods:
+                target = df['Test name'].str.contains(model)
+                target = target & df['Engine'].str.contains(method)
+                filtered_df = df[target]
+
+                if filtered_df.empty:
+                    means.append(0.)
+                    stds.append(0.)
+                else:
+                    means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
+                    std = filtered_df[f"Std {metric} (ms)"].values[0]
+                    success = filtered_df["Successful req."].values[0]
+                    stds.append(std / math.sqrt(success))
+
+            print(model, metric)
+            print(means, stds)
+
+            ax = axes[i, j + 1]
+
+            bars = ax.bar(
+                ["vllm", "trt", "lmdeploy", "tgi"],
+                means,
+                yerr=stds,
+                capsize=10,
+            )
+            for idx, bar in enumerate(bars):
+                bar.set_color(bar_colors[idx])
+            ax.set_ylim(bottom=0)
+
+            ax.set_ylabel(f"{metric} (ms)")
+            ax.set_title(f"{model} {metric}")
+            ax.grid(axis='y')
+
+        metric = "Tput"
+        j = 0
+        if True:
+            tputs = []
+            for method in methods:
+                target = df['Test name'].str.contains(model)
+                target = target & df['Engine'].str.contains(method)
+                filtered_df = df[target]
+
+                if filtered_df.empty:
+                    tputs.append(0.)
+                else:
+                    input_tput = filtered_df["Input Tput (tok/s)"].values[0]
+                    output_tput = filtered_df["Output Tput (tok/s)"].values[0]
+                    tputs.append(input_tput + output_tput)
+
+            print(model, metric)
+            print(tputs)
+
+            ax = axes[i, j]
+
+            bars = ax.bar(
+                ["vllm", "trt", "lmdeploy", "tgi"],
+                tputs,
+            )
+            for idx, bar in enumerate(bars):
+                bar.set_color(bar_colors[idx])
+
+            ax.set_ylim(bottom=0)
+
+            ax.set_ylabel("Tput (token/s)")
+            ax.set_title(f"{model} {metric}")
+            ax.grid(axis='y')
+
+    fig.tight_layout()
+    fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
+
+
+if __name__ == '__main__':
+    args = parse_arguments()
+    main(args)
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@ -0,0 +1,218 @@
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+  pkill lmdeploy || true
+  # waiting for GPU processes to be fully killed
+  sleep 10
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  timeout 1200 bash -c '
+    until curl -s localhost:8000/v1/completions > /dev/null; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+    
+    # append lmdeploy to the test name
+    test_name=lmdeploy_$test_name
+    
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+    
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
+    client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    # prepare tokenizer
+    rm -rf /tokenizer_cache
+    mkdir /tokenizer_cache
+    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+      --model "$model" \
+      --cachedir /tokenizer_cache
+
+    server_command="lmdeploy serve api_server $model \
+      --tp $tp \
+      --server-port $port \
+      $server_args"
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    bash -c "$server_command" &
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "lmdeploy server is up and running."
+    else
+      echo ""
+      echo "lmdeploy failed to start within the timeout period."
+      break
+    fi
+
+    # get model name
+    model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend lmdeploy \
+        --tokenizer /tokenizer_cache \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        --model \"$model_name\" \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "lmdeploy" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
+  done
+}
+
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+
+main() {
+
+  check_gpus
+  # enter vllm directory
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
+
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  python -m pip install transformers==4.41.2
+
+  export CURRENT_LLM_SERVING_ENGINE=lmdeploy
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+  python -m pip install tabulate pandas
+  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
+
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@ -1,462 +0,0 @@
-#!/bin/bash
-
-set -o pipefail
-set -x
-
-check_gpus() {
-  # check the number of GPUs and GPU type.
-  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  if [[ $gpu_count -gt 0 ]]; then
-    echo "GPU found."
-  else
-    echo "Need at least 1 GPU to run benchmarking."
-    exit 1
-  fi
-  declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
-  echo "GPU type is $gpu_type"
-}
-
-check_hf_token() {
-  # check if HF_TOKEN is available and valid
-  if [[ -z "$HF_TOKEN" ]]; then
-    echo "Error: HF_TOKEN is not set."
-    exit 1
-  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
-    echo "Error: HF_TOKEN does not start with 'hf_'."
-    exit 1
-  else
-    echo "HF_TOKEN is set and valid."
-  fi
-}
-
-
-upload_to_buildkite() {
-  # upload the benchmarking results to buildkite
-
-  # if the agent binary is not found, skip uploading the results, exit 0
-  if [ ! -f /workspace/buildkite-agent ]; then
-    echo "buildkite-agent binary not found. Skip uploading the results."
-    return 0
-  fi
-  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
-  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
-}
-
-
-get_current_llm_serving_engine() {
-
-  if which lmdeploy >/dev/null; then
-    echo "Container: lmdeploy"
-    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
-    return
-  fi
-
-  if [ -e /tgi-entrypoint.sh ]; then
-    echo "Container: tgi"
-    export CURRENT_LLM_SERVING_ENGINE=tgi
-    return
-  fi
-
-  if which trtllm-build >/dev/null; then
-    echo "Container: tensorrt-llm"
-    export CURRENT_LLM_SERVING_ENGINE=trt
-    return
-  fi
-
-  if [ -e /sgl-workspace ]; then
-    echo "Container: sglang"
-    export CURRENT_LLM_SERVING_ENGINE=sglang
-    return
-  fi
-
-  if [ -e /vllm-workspace ]; then
-    echo "Container: vllm"
-    # move to a completely irrelevant directory, to avoid import vllm from current folder
-    export CURRENT_LLM_SERVING_ENGINE=vllm
-    
-    return
-  fi
-}
-
-json2args() {
-  # transforms the JSON string to command line args, and '_' is replaced to '-'
-  # example:
-  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
-  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
-  local json_string=$1
-  local args=$(
-    echo "$json_string" | jq -r '
-      to_entries |
-      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
-      join(" ")
-    '
-  )
-  echo "$args"
-}
-
-kill_gpu_processes() {
-  pkill -f python
-  pkill -f python3
-  pkill -f tritonserver
-  pkill -f pt_main_thread
-  pkill -f text-generation
-  pkill -f lmdeploy
-
-  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-    sleep 1
-  done
-}
-
-wait_for_server() {
-  # wait for vllm server to start
-  # return 1 if vllm server crashes
-  timeout 1200 bash -c '
-    until curl -s localhost:8000/v1/completions > /dev/null; do
-      sleep 1
-    done' && return 0 || return 1
-}
-
-ensure_installed() {
-  # Ensure that the given command is installed by apt-get
-  local cmd=$1
-  if ! which "$cmd" >/dev/null; then
-    apt-get update && apt-get install -y "$cmd"
-  fi
-}
-
-run_serving_tests() {
-  # run serving tests using `benchmark_serving.py`
-  # $1: a json file specifying serving test cases
-
-  local serving_test_file
-  serving_test_file=$1
-
-  # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # prepend the current serving engine to the test name
-    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
-
-    # get common parameters
-    common_params=$(echo "$params" | jq -r '.common_parameters')
-    model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tp')
-    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
-    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
-    port=$(echo "$common_params" | jq -r '.port')
-    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
-    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
-
-    # get client and server arguments
-    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
-    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
-    client_args=$(json2args "$client_params")
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # check if there is enough GPU to run the test
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      continue
-    fi
-
-    if [[ $reuse_server == "true" ]]; then
-      echo "Reuse previous server for test case $test_name"
-    else
-      kill_gpu_processes
-      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
-        "$server_params" "$common_params"
-    fi
-
-    if wait_for_server; then
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
-    else
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
-      break
-    fi
-
-    # prepare tokenizer
-    # this is required for lmdeploy.
-    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-    rm -rf /tokenizer_cache
-    mkdir /tokenizer_cache
-    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
-      --model "$model" \
-      --cachedir /tokenizer_cache
-    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-
-
-    # change model name for lmdeploy (it will not follow standard hf name)
-    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
-      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
-    fi
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps="inf"
-        echo "now qps is $qps"
-      fi
-
-      new_test_name=$test_name"_qps_"$qps
-
-      backend=$CURRENT_LLM_SERVING_ENGINE
-
-      if [[ $backend = "trt" ]]; then
-        backend="tensorrt-llm"
-      fi
-
-      if [[ "$backend" == *"vllm"* ]]; then
-        backend="vllm"
-      fi
-
-      if [[ "$dataset_name" = "sharegpt" ]]; then
-
-        client_command="python3 benchmark_serving.py \
-          --backend $backend \
-          --tokenizer /tokenizer_cache \
-          --model $model \
-          --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
-          --num-prompts $num_prompts \
-          --port $port \
-          --save-result \
-          --result-dir $RESULTS_FOLDER \
-          --result-filename ${new_test_name}.json \
-          --request-rate $qps \
-          --ignore-eos \
-          $client_args"
-
-      elif [[ "$dataset_name" = "sonnet" ]]; then
-
-        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
-        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
-        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
-
-        client_command="python3 benchmark_serving.py \
-          --backend $backend \
-          --tokenizer /tokenizer_cache \
-          --model $model \
-          --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
-          --num-prompts $num_prompts \
-          --sonnet-input-len $sonnet_input_len \
-          --sonnet-output-len $sonnet_output_len \
-          --sonnet-prefix-len $sonnet_prefix_len \
-          --port $port \
-          --save-result \
-          --result-dir $RESULTS_FOLDER \
-          --result-filename ${new_test_name}.json \
-          --request-rate $qps \
-          --ignore-eos \
-          $client_args"
-
-      else
-  
-        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
-        exit 1
-
-      fi
-
-        
-
-      echo "Running test case $test_name with qps $qps"
-      echo "Client command: $client_command"
-
-      eval "$client_command"
-
-      server_command="None"
-
-      # record the benchmarking commands
-      jq_output=$(jq -n \
-        --arg server "$server_command" \
-        --arg client "$client_command" \
-        --arg gpu "$gpu_type" \
-        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
-        '{
-          server_command: $server,
-          client_command: $client,
-          gpu_type: $gpu,
-          engine: $engine
-        }')
-      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
-
-    done
-
-  done
-
-  kill_gpu_processes
-}
-
-run_genai_perf_tests() {
-  # run genai-perf tests 
-
-  # $1: a json file specifying genai-perf test cases
-  local genai_perf_test_file
-  genai_perf_test_file=$1
-
-  # Iterate over genai-perf tests
-  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')    
-    
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-    
-    # prepend the current serving engine to the test name
-    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
-
-    # get common parameters
-    common_params=$(echo "$params" | jq -r '.common_parameters')
-    model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tp')
-    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
-    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
-    port=$(echo "$common_params" | jq -r '.port')
-    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
-    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
-
-    # get client and server arguments
-    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # check if there is enough GPU to run the test
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      continue
-    fi
-
-    if [[ $reuse_server == "true" ]]; then
-      echo "Reuse previous server for test case $test_name"
-    else
-      kill_gpu_processes
-      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
-        "$server_params" "$common_params"
-    fi
-
-    if wait_for_server; then
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
-    else
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
-      break
-    fi
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps=$num_prompts
-        echo "now qps is $qps"
-      fi
-    
-      new_test_name=$test_name"_qps_"$qps
-      backend=$CURRENT_LLM_SERVING_ENGINE
-      
-      if [[ "$backend" == *"vllm"* ]]; then
-        backend="vllm"
-      fi
-      #TODO: add output dir.
-      client_command="genai-perf profile \
-        -m $model \
-        --service-kind openai \
-        --backend vllm \
-        --endpoint-type chat \
-        --streaming \
-        --url localhost:$port \
-        --request-rate $qps \
-        --num-prompts $num_prompts \
-      "
-
-    echo "Client command: $client_command"
-
-    eval "$client_command"
-
-    #TODO: process/record outputs
-    done
-  done
-
-  kill_gpu_processes
-
-}
-
-prepare_dataset() {
-
-  # download sharegpt dataset
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  echo "" > sonnet_4x.txt
-  for _ in {1..4}
-  do
-    cat sonnet.txt >> sonnet_4x.txt
-  done
-  
-}
-
-main() {
-
-  # check if the environment variable is successfully injected from yaml
-
-  check_gpus
-  check_hf_token
-  get_current_llm_serving_engine
-
-  pip install -U transformers
-
-  pip install -r requirements-dev.txt
-  which genai-perf
-
-  # check storage
-  df -h
-
-  ensure_installed wget
-  ensure_installed curl
-  ensure_installed jq
-  # genai-perf dependency
-  ensure_installed libb64-0d
-
-  prepare_dataset
-
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  declare -g RESULTS_FOLDER=results/
-  mkdir -p $RESULTS_FOLDER
-  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-
-  # run the test
-  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
-
-  # run genai-perf tests
-  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
-  mv artifacts/ $RESULTS_FOLDER/
-
-  # upload benchmark results to buildkite
-  python3 -m pip install tabulate pandas
-  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
-  upload_to_buildkite
-
-}
-
-main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@ -0,0 +1,216 @@
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+  pkill text-generation || true
+  # waiting for GPU processes to be fully killed
+  sleep 10
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  timeout 1200 bash -c '
+    until curl -s localhost:8000/generate_stream > /dev/null; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # append tgi to the test name
+    test_name=tgi_$test_name
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
+    client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
+      echo "Key 'fp8' exists in common params."
+      server_command="/tgi-entrypoint.sh \
+        --model-id $model \
+        --num-shard $tp \
+        --port $port \
+        --quantize fp8 \
+        $server_args"
+    else
+      echo "Key 'fp8' does not exist in common params."
+      server_command="/tgi-entrypoint.sh \
+        --model-id $model \
+        --num-shard $tp \
+        --port $port \
+        $server_args"
+    fi
+
+
+    
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    eval "$server_command" &
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "tgi server is up and running."
+    else
+      echo ""
+      echo "tgi failed to start within the timeout period."
+      break
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend tgi \
+        --model $model \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "tgi" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
+  done
+}
+
+
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+main() {
+
+  check_gpus
+  # enter vllm directory
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  export CURRENT_LLM_SERVING_ENGINE=tgi
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+  python -m pip install tabulate pandas
+  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
+
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@ -0,0 +1,214 @@
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+  pkill tritonserver || true
+  # waiting for GPU processes to be fully killed
+  sleep 20
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  timeout 1200 bash -c '
+    until curl -s localhost:8000/generate_stream > /dev/null; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # append trt to the test name
+    test_name=trt_$test_name
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.trt_server_parameters')
+    client_params=$(echo "$params" | jq -r '.trt_client_parameters')
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+
+
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
+
+
+    echo "Running test case $test_name"
+    bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "trt server is up and running."
+    else
+      echo ""
+      echo "trt failed to start within the timeout period."
+      break
+    fi
+
+    # prepare tokenizer
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
+    rm -rf /tokenizer_cache
+    mkdir /tokenizer_cache
+    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+      --model "$model" \
+      --cachedir /tokenizer_cache
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
+    
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend tensorrt-llm \
+        --tokenizer /tokenizer_cache \
+        --model $model \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      server_command=""
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "trt" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
+  done
+}
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+
+main() {
+
+  check_gpus
+
+
+  # enter vllm directory
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
+
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  # update transformers package, to make sure mixtral tokenizer is available
+  python -m pip install transformers -U
+
+  export CURRENT_LLM_SERVING_ENGINE=trt
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+  python -m pip install tabulate pandas
+  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
+
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@ -0,0 +1,221 @@
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+  # kill all processes on GPU.
+  pkill pt_main_thread
+  sleep 10
+
+  # remove vllm config file
+  rm -rf ~/.config/vllm
+
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  timeout 1200 bash -c '
+    until curl -s localhost:8000/v1/completions > /dev/null; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # append vllm to the test name
+    test_name=vllm_$test_name
+
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
+    client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
+      echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
+      model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
+      server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
+        -tp $tp \
+        --model $model \
+        --port $port \
+        $server_args"
+    else
+      echo "Key 'fp8' does not exist in common params."
+      server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
+        -tp $tp \
+        --model $model \
+        --port $port \
+        $server_args"
+    fi
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    eval "$server_command" &
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "vllm server is up and running."
+    else
+      echo ""
+      echo "vllm failed to start within the timeout period."
+      break
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend vllm \
+        --model $model \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "vllm" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
+  done
+}
+
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+main() {
+
+  check_gpus
+  # enter vllm directory
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  export CURRENT_LLM_SERVING_ENGINE=vllm
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+
+  python3 -m pip install tabulate pandas
+  python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
+
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@ -1,5 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
-
 import datetime
 import json
 import os
@ -19,17 +17,10 @@ serving_column_mapping = {
    "request_throughput": "Tput (req/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "std_ttft_ms": "Std TTFT (ms)",
-    "median_ttft_ms": "Median TTFT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "std_itl_ms": "Std ITL (ms)",
-    "median_itl_ms": "Median ITL (ms)",
-    "mean_tpot_ms": "Mean TPOT (ms)",
-    "std_tpot_ms": "Std TPOT (ms)",
-    "median_tpot_ms": "Median TPOT (ms)",
-    "total_token_throughput": "Total Token Tput (tok/s)",
+    "input_throughput": "Input Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
-    "total_input_tokens": "Total input tokens",
-    "total_output_tokens": "Total output tokens",
    "engine": "Engine",
 }

@ -38,11 +29,11 @@ if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):

-        with open(test_file) as f:
+        with open(test_file, "r") as f:
            raw_result = json.loads(f.read())

        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands")) as f:
+        with open(test_file.with_suffix(".commands"), "r") as f:
            command = json.loads(f.read())
        raw_result.update(command)

--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@ -1,12 +1,10 @@
 #!/bin/sh
-TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
-URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
-
-TIMEOUT_SECONDS=10
+TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
+URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"

 retries=0
 while [ $retries -lt 1000 ]; do
-    if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
+    if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
        exit 0
    fi

@ -16,4 +14,4 @@ while [ $retries -lt 1000 ]; do
    sleep 5
 done

-exit 1
+exit 1
--- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
@ -1,42 +1,47 @@

 ## Latency tests

+This test suite aims to test vllm's end-to-end latency under a controlled setup.
+
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).

+### Latency benchmarking results

 {latency_tests_markdown_table}

-
 ## Throughput tests

+This test suite aims to test vllm's throughput.
+
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput.

+### Throughput benchmarking results

 {throughput_tests_markdown_table}

-
 ## Serving tests

+This test suite aims to test vllm's real serving metrics.
+
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B, under QPS 2
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).

+### Serving benchmarking results

 {serving_tests_markdown_table}

-
 ## json version of the benchmarking tables

 This section contains the data of the markdown tables above in JSON format. 
--- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
@ -1,23 +0,0 @@
-[
-    {
-        "test_name": "llama8B_tp1_genai_perf",
-        "qps_list": [4,8,16,32],
-        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
-            "tp": 1,
-            "port": 8000,
-            "num_prompts": 500,
-            "reuse_server": false
-        },
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
-            "max_num_seqs": 512,
-            "dtype": "bfloat16"
-        },
-        "genai_perf_input_parameters": {
-        }
-    }
-]
--- a/.buildkite/nightly-benchmarks/tests/latency-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json
@ -2,7 +2,7 @@
    {
        "test_name": "latency_llama8B_tp1",
        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
@ -12,7 +12,7 @@
    {
        "test_name": "latency_llama70B_tp4",
        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num-iters-warmup": 5,
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@ -1,18 +1,16 @@
 [
    {
-        "test_name": "llama8B_tp1_sharegpt",
-        "qps_list": [4,8,16,32,"inf"],
+        "test_name": "llama8B_tp1",
+        "qps_list": [4],
        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-8B",
            "tp": 1,
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 500,
-            "port": 8000,
-            "reuse_server": false
+            "port": 8000
        },
        "lmdeploy_server_parameters": {
-            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
@ -23,158 +21,34 @@
        },
        "trt_server_parameters": {
            "model_type": "llama",
-            "model_dtype": "bfloat16",
-            "max_batch_size": 2048,
+            "model_dtype": "float16",
+            "max_batch_size": 256,
            "max_input_len": 4096,
-            "max_seq_len": 6144,
-            "max_num_tokens": 16384,
-            "trt_llm_version": "v0.11.0"
+            "max_output_len": 4096,
+            "trt_llm_version": "r24.04"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
-        }, 
+        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
-            "max_num_seqs": 512,
-            "dtype": "bfloat16"
+            "disable_log_requests": ""
        },
        "vllm_client_parameters": {
-        },
-        "sglang_server_parameters": {
-            "disable_radix_cache": "",
-            "enable_torch_compile": "",
-            "dtype": "bfloat16"
-        },
-        "sglang_client_parameters": {
        }
    },
    {
-        "test_name": "llama8B_tp1_sonnet_512_16",
-        "qps_list": [4,8,16,32,"inf"],
-        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
-            "tp": 1,
-            "dataset_name": "sonnet",
-            "dataset_path": "./sonnet_4x.txt",
-            "num_prompts": 500,
-            "port": 8000,
-            "sonnet_input_len": 512,
-            "sonnet_output_len": 16,
-            "sonnet_prefix_len": 50,
-            "reuse_server": true
-        },
-        "lmdeploy_server_parameters": {
-            "dtype": "bfloat16"
-        },
-        "lmdeploy_client_parameters": {
-        },
-        "tgi_server_parameters": {
-        },
-        "tgi_client_parameters": {
-            "endpoint": "/generate_stream"
-        },
-        "trt_server_parameters": {
-            "model_type": "llama",
-            "model_dtype": "bfloat16",
-            "max_batch_size": 2048,
-            "max_input_len": 4096,
-            "max_seq_len": 6144,
-            "max_num_tokens": 16384,
-            "trt_llm_version": "v0.11.0"
-        },
-        "trt_client_parameters": {
-            "endpoint": "/v2/models/ensemble/generate_stream"
-        }, 
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
-            "max_num_seqs": 512,
-            "dtype": "bfloat16"
-        },
-        "vllm_client_parameters": {
-        },
-        "sglang_server_parameters": {
-            "disable_radix_cache": "",
-            "enable_torch_compile": "",
-            "dtype": "bfloat16"
-        },
-        "sglang_client_parameters": {
-        }
-    },
-    {
-        "test_name": "llama8B_tp1_sonnet_512_256",
-        "qps_list": [4,8,16,32,"inf"],
-        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
-            "tp": 1,
-            "dataset_name": "sonnet",
-            "dataset_path": "./sonnet_4x.txt",
-            "num_prompts": 500,
-            "port": 8000,
-            "sonnet_input_len": 512,
-            "sonnet_output_len": 256,
-            "sonnet_prefix_len": 50,
-            "reuse_server": true
-        },
-        "lmdeploy_server_parameters": {
-            "dtype": "bfloat16"
-        },
-        "lmdeploy_client_parameters": {
-        },
-        "tgi_server_parameters": {
-        },
-        "tgi_client_parameters": {
-            "endpoint": "/generate_stream"
-        },
-        "trt_server_parameters": {
-            "model_type": "llama",
-            "model_dtype": "bfloat16",
-            "max_batch_size": 2048,
-            "max_input_len": 4096,
-            "max_seq_len": 6144,
-            "max_num_tokens": 16384,
-            "trt_llm_version": "v0.11.0"
-        },
-        "trt_client_parameters": {
-            "endpoint": "/v2/models/ensemble/generate_stream"
-        }, 
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
-            "max_num_seqs": 512,
-            "dtype": "bfloat16"
-        },
-        "vllm_client_parameters": {
-        },
-        "sglang_server_parameters": {
-            "disable_radix_cache": "",
-            "enable_torch_compile": "",
-            "dtype": "bfloat16"
-        },
-        "sglang_client_parameters": {
-        }
-    },
-    {
-        "test_name": "llama70B_tp4_sharegpt",
-        "qps_list": [4,8,16,32,"inf"],
+        "test_name": "llama70B_tp4",
+        "qps_list": [2],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tp": 4,
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 500,
-            "port": 8000,
-            "reuse_server": false
+            "port": 8000
        },
        "lmdeploy_server_parameters": {
-            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
@ -185,50 +59,34 @@
        },
        "trt_server_parameters": {
            "model_type": "llama",
-            "model_dtype": "bfloat16",
-            "max_batch_size": 2048,
+            "model_dtype": "float16",
+            "max_batch_size": 256,
            "max_input_len": 4096,
-            "max_seq_len": 6144,
-            "max_num_tokens": 16384,
-            "trt_llm_version": "v0.11.0"
+            "max_output_len": 4096,
+            "trt_llm_version": "r24.04"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
-        }, 
+        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
-            "max_num_seqs": 512,
-            "dtype": "bfloat16"
+            "disable_log_requests": ""
        },
        "vllm_client_parameters": {
-        },
-        "sglang_server_parameters": {
-            "disable_radix_cache": "",
-            "dtype": "bfloat16"
-        },
-        "sglang_client_parameters": {
        }
    },
    {
-        "test_name": "llama70B_tp4_sonnet_512_16",
-        "qps_list": [4,8,16,32,"inf"],
+        "test_name": "mixtral8x7B_tp2",
+        "qps_list": [2],
        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "tp": 4,
-            "dataset_name": "sonnet",
-            "dataset_path": "./sonnet_4x.txt",
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tp": 2,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 500,
-            "port": 8000,
-            "sonnet_input_len": 512,
-            "sonnet_output_len": 16,
-            "sonnet_prefix_len": 50,
-            "reuse_server": true
+            "port": 8000
        },
        "lmdeploy_server_parameters": {
-            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
@ -239,85 +97,20 @@
        },
        "trt_server_parameters": {
            "model_type": "llama",
-            "model_dtype": "bfloat16",
-            "max_batch_size": 2048,
+            "model_dtype": "float16",
+            "max_batch_size": 256,
            "max_input_len": 4096,
-            "max_seq_len": 6144,
-            "max_num_tokens": 16384,
-            "trt_llm_version": "v0.11.0"
+            "max_output_len": 4096,
+            "trt_llm_version": "r24.04"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
-        }, 
+        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
-            "max_num_seqs": 512,
-            "dtype": "bfloat16"
+            "disable_log_requests": ""
        },
        "vllm_client_parameters": {
-        },
-        "sglang_server_parameters": {
-            "disable_radix_cache": "",
-            "dtype": "bfloat16"
-        },
-        "sglang_client_parameters": {
-        }
-    },
-    {
-        "test_name": "llama70B_tp4_sonnet_512_256",
-        "qps_list": [4,8,16,32,"inf"],
-        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "tp": 4,
-            "dataset_name": "sonnet",
-            "dataset_path": "./sonnet_4x.txt",
-            "num_prompts": 500,
-            "port": 8000,
-            "sonnet_input_len": 512,
-            "sonnet_output_len": 256,
-            "sonnet_prefix_len": 50,
-            "reuse_server": true
-        },
-        "lmdeploy_server_parameters": {
-            "dtype": "bfloat16"
-        },
-        "lmdeploy_client_parameters": {
-        },
-        "tgi_server_parameters": {
-        },
-        "tgi_client_parameters": {
-            "endpoint": "/generate_stream"
-        },
-        "trt_server_parameters": {
-            "model_type": "llama",
-            "model_dtype": "bfloat16",
-            "max_batch_size": 2048,
-            "max_input_len": 4096,
-            "max_seq_len": 6144,
-            "max_num_tokens": 16384,
-            "trt_llm_version": "v0.11.0"
-        },
-        "trt_client_parameters": {
-            "endpoint": "/v2/models/ensemble/generate_stream"
-        }, 
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
-            "max_num_seqs": 512,
-            "dtype": "bfloat16"
-        },
-        "vllm_client_parameters": {
-        },
-        "sglang_server_parameters": {
-            "disable_radix_cache": "",
-            "dtype": "bfloat16"
-        },
-        "sglang_client_parameters": {
        }
    }
 ]
--- a/.buildkite/nightly-benchmarks/tests/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@ -3,7 +3,7 @@
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
@ -11,7 +11,7 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-8B",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@ -22,7 +22,7 @@
        "test_name": "serving_llama70B_tp4_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "disable_log_stats": "",
@ -30,7 +30,7 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@ -55,26 +55,5 @@
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
-    },
-    {
-        "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
-        "qps_list": [2],
-        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "disable_log_requests": "", 
-            "tensor_parallel_size": 4,
-            "swap_space": 16, 
-            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
-            "num_speculative_tokens": 4,
-            "speculative_draft_tensor_parallel_size": 1,
-            "use_v2_block_manager": ""
-        },
-        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200 
-        }
    }
-]
+]
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json
@ -2,7 +2,7 @@
    {
        "test_name": "throughput_llama8B_tp1",
        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@ -13,7 +13,7 @@
    {
        "test_name": "throughput_llama70B_tp4",
        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,77 +1,19 @@
 steps:
-  - label: "Build wheel - CUDA 12.1"
+  - label: "Build wheel - CUDA {{matrix.cuda_version}}"
    agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
-  # However, this block can be uncommented to save some compute hours.
-  # - block: "Build CUDA 11.8 wheel"
-  #   key: block-build-cu118-wheel
-
-  - label: "Build wheel - CUDA 11.8"
-    # depends_on: block-build-cu118-wheel
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  - block: "Build release image"
-    depends_on: ~
-    key: block-release-image-build
-
-  - label: "Build release image"
-    depends_on: block-release-image-build
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
-  - label: "Build and publish TPU release image"
-    depends_on: ~
-    if: build.env("NIGHTLY") == "1"
-    agents:
-      queue: tpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
-      - "docker push vllm/vllm-tpu:nightly"
-      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
-    plugins:
-      - docker-login#v3.0.0:
-          username: vllm
-          password-env: DOCKERHUB_TOKEN
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  - input: "Provide Release version here"
-    fields:
-      - text: "What is the release version?"
-        key: "release-version"
-
-  - block: "Build CPU release image"
-    key: block-cpu-release-image-build
-    depends_on: ~
-
-  - label: "Build and publish CPU release image"
-    depends_on: block-cpu-release-image-build
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+      # rename the files to change linux -> manylinux1
+      - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
    env:
      DOCKER_BUILDKIT: "1"
+    matrix:
+      setup:
+        cuda_version:
+          - "11.8.0"
+          - "12.1.0"
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@ -1,7 +1,5 @@
-#!/bin/bash
-
 # This script runs test inside the corresponding ROCm docker container.
-set -o pipefail
+set -ex

 # Print ROCm version
 echo "--- Confirming Clean Initial State"
@ -33,8 +31,8 @@ cleanup_docker() {
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
-    # Remove unused volumes / force the system prune for old images as well.
-    docker volume prune -f && docker system prune --force --filter "until=72h" --all
+    # Remove unused volumes
+    docker volume prune -f
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
@ -57,100 +55,31 @@ while true; do
 done

 echo "--- Pulling container" 
-image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
+docker login registry-1.docker.io -u alexeivivanovamd -p ${DH_TOKEN}
+image_name="rocmshared/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-docker pull "${image_name}"
+docker pull ${image_name}

 remove_docker_container() {
-   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
+   docker rm -f ${container_name} || docker image rm -f ${image_name} || true
 }
 trap remove_docker_container EXIT

 echo "--- Running container"

 HF_CACHE="$(realpath ~)/huggingface"
-mkdir -p "${HF_CACHE}"
+mkdir -p ${HF_CACHE}
 HF_MOUNT="/root/.cache/huggingface"

-commands=$@
-echo "Commands:$commands"
-#ignore certain kernels tests
-if [[ $commands == *" kernels "* ]]; then
-  commands="${commands} \
-  --ignore=kernels/test_attention.py \
-  --ignore=kernels/test_attention_selector.py \
-  --ignore=kernels/test_blocksparse_attention.py \
-  --ignore=kernels/test_causal_conv1d.py \
-  --ignore=kernels/test_cutlass.py \
-  --ignore=kernels/test_encoder_decoder_attn.py \
-  --ignore=kernels/test_flash_attn.py \
-  --ignore=kernels/test_flashinfer.py \
-  --ignore=kernels/test_int8_quant.py \
-  --ignore=kernels/test_machete_gemm.py \
-  --ignore=kernels/test_mamba_ssm.py \
-  --ignore=kernels/test_marlin_gemm.py \
-  --ignore=kernels/test_moe.py \
-  --ignore=kernels/test_prefix_prefill.py \
-  --ignore=kernels/test_rand.py \
-  --ignore=kernels/test_sampler.py"
-fi
-
-#ignore certain Entrypoints tests
-if [[ $commands == *" entrypoints/openai "* ]]; then
-  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
-  --ignore=entrypoints/openai/test_accuracy.py \
-  --ignore=entrypoints/openai/test_audio.py \
-  --ignore=entrypoints/openai/test_encoder_decoder.py \
-  --ignore=entrypoints/openai/test_embedding.py \
-  --ignore=entrypoints/openai/test_oot_registration.py "}
-fi
-
-PARALLEL_JOB_COUNT=8
-# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
-if [[ $commands == *"--shard-id="* ]]; then
-  # assign job count as the number of shards used   
-  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
-  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
-    # assign shard-id for each shard
-    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
-    echo "Shard ${GPU} commands:$commands_gpu"
-    docker run \
+docker run \
        --device /dev/kfd --device /dev/dri \
        --network host \
        --shm-size=16gb \
        --rm \
-        -e HIP_VISIBLE_DEVICES="${GPU}" \
        -e HF_TOKEN \
-        -v "${HF_CACHE}:${HF_MOUNT}" \
-        -e "HF_HOME=${HF_MOUNT}" \
-        --name "${container_name}_${GPU}" \
-        "${image_name}" \
-        /bin/bash -c "${commands_gpu}" \
-        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
-    PIDS+=($!)
-  done
-  #wait for all processes to finish and collect exit codes
-  for pid in "${PIDS[@]}"; do
-    wait "${pid}"
-    STATUS+=($?)
-  done
-  for st in "${STATUS[@]}"; do
-    if [[ ${st} -ne 0 ]]; then
-      echo "One of the processes failed with $st"
-      exit "${st}"
-    fi
-  done
-else
-  docker run \
-          --device /dev/kfd --device /dev/dri \
-          --network host \
-          --shm-size=16gb \
-          --rm \
-          -e HIP_VISIBLE_DEVICES=0 \
-          -e HF_TOKEN \
-          -v "${HF_CACHE}:${HF_MOUNT}" \
-          -e "HF_HOME=${HF_MOUNT}" \
-          --name "${container_name}" \
-          "${image_name}" \
-          /bin/bash -c "${commands}"
-fi
+        -v ${HF_CACHE}:${HF_MOUNT} \
+        -e HF_HOME=${HF_MOUNT} \
+        --name ${container_name} \
+        ${image_name} \
+        /bin/bash -c "${@}"
+
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@ -1,5 +1,3 @@
-#!/bin/bash
-
 # This script is run by buildkite to run the benchmarks and upload the results to buildkite

 set -ex
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@ -1,14 +0,0 @@
-#!/bin/bash
-
-# This script build the CPU docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Setup cleanup
-remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Try building the docker image
-docker build -t cpu-test -f Dockerfile.ppc64le .
-
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@ -1,88 +1,40 @@
-#!/bin/bash
-
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex

-# allow to bind to different cores
-CORE_RANGE=${CORE_RANGE:-48-95}
-NUMA_NODE=${NUMA_NODE:-1}
-
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
+numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
+numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .

 # Setup cleanup
-remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
+remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
 trap remove_docker_container EXIT
 remove_docker_container

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
+ --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
+ --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2

-function cpu_tests() {
-  set -e
-  export NUMA_NODE=$2
+# offline inference
+docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"

-  # offline inference
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
-    set -e
-    python3 examples/offline_inference/basic.py"
+# Run basic model test
+docker exec cpu-test bash -c "
+  pip install pytest Pillow protobuf
+  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported

-  # Run basic model test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
-    set -e
-    pip install -r vllm/requirements-test.txt
-    pytest -v -s tests/models/decoder_only/language -m cpu_model
-    pytest -v -s tests/models/embedding/language -m cpu_model
-    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
-    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
-    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
-
-  # Run compressed-tensor test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
-
-  # Run AWQ test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -s -v \
-    tests/quantization/test_ipex_quant.py"
-
-  # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -s -v -k cpu_model \
-    tests/basic_correctness/test_chunked_prefill.py"  
-
-  # online serving
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
-    set -e
-    export VLLM_CPU_KVCACHE_SPACE=10 
-    export VLLM_CPU_OMP_THREADS_BIND=$1
-    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
-    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-    python3 benchmarks/benchmark_serving.py \
-      --backend vllm \
-      --dataset-name random \
-      --model facebook/opt-125m \
-      --num-prompts 20 \
-      --endpoint /v1/completions \
-      --tokenizer facebook/opt-125m"
-
-  # Run multi-lora tests
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -s -v \
-    tests/lora/test_qwen2vl.py"
-}
-
-# All of CPU tests are expected to be finished less than 40 mins.
-export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+# online inference
+docker exec cpu-test bash -c "
+  export VLLM_CPU_KVCACHE_SPACE=10 
+  export VLLM_CPU_OMP_THREADS_BIND=48-92 
+  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & 
+  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+  python3 benchmarks/benchmark_serving.py \
+    --backend vllm \
+    --dataset-name random \
+    --model facebook/opt-125m \
+    --num-prompts 20 \
+    --endpoint /v1/completions \
+    --tokenizer facebook/opt-125m"
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@ -1,28 +0,0 @@
-#!/bin/bash
-
-# This script build the GH200 docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
-python3 use_existing_torch.py
-
-# Try building the docker image
-DOCKER_BUILDKIT=1 docker build . \
-  --target vllm-openai \
-  --platform "linux/arm64" \
-  -t gh200-test \
-  --build-arg max_jobs=66 \
-  --build-arg nvcc_threads=2 \
-  --build-arg torch_cuda_arch_list="9.0+PTX" \
-  --build-arg vllm_fa_cmake_gpu_arches="90-real"
-
-# Setup cleanup
-remove_docker_container() { docker rm -f gh200-test || true; }
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Run the image and test offline inference
-docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
-'
--- a/.buildkite/run-hpu-test.sh
+++ b/.buildkite/run-hpu-test.sh
@ -1,24 +0,0 @@
-#!/bin/bash
-
-# This script build the CPU docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Try building the docker image
-docker build -t hpu-test-env -f Dockerfile.hpu .
-
-# Setup cleanup
-# certain versions of HPU software stack have a bug that can
-# override the exit code of the script, so we need to use
-# separate remove_docker_container and remove_docker_container_and_exit
-# functions, while other platforms only need one remove_docker_container
-# function.
-EXITCODE=1
-remove_docker_container() { docker rm -f hpu-test || true; }
-remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
-trap remove_docker_container_and_exit EXIT
-remove_docker_container
-
-# Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
-EXITCODE=$?
--- a/.buildkite/run-multi-node-test.sh
+++ b/.buildkite/run-multi-node-test.sh
@ -14,7 +14,7 @@ DOCKER_IMAGE=$4

 shift 4
 COMMANDS=("$@")
-if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then
+if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
    echo "The number of commands must be equal to the number of nodes."
    echo "Number of nodes: $NUM_NODES"
    echo "Number of commands: ${#COMMANDS[@]}"
@ -23,7 +23,7 @@ fi

 echo "List of commands"
 for command in "${COMMANDS[@]}"; do
-    echo "$command"
+    echo $command
 done

 start_network() {
@ -36,7 +36,7 @@ start_nodes() {
        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
            GPU_DEVICES+=$(($DEVICE_NUM))
-            if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
+            if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
                GPU_DEVICES+=','
            fi
        done
@ -49,20 +49,17 @@ start_nodes() {
        # 3. map the huggingface cache directory to the container
        # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
        #    starting from 192.168.10.11)
-        docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
-            -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
-            --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
-            /bin/bash -c "tail -f /dev/null"
+        docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"

        # organize containers into a ray cluster
-        if [ "$node" -eq 0 ]; then
+        if [ $node -eq 0 ]; then
            # start the ray head node
-            docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block"
+            docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
            # wait for the head node to be ready
            sleep 10
        else
            # start the ray worker nodes, and connect them to the head node
-            docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
+            docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
        fi
    done

@ -82,22 +79,22 @@ run_nodes() {
        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
            GPU_DEVICES+=$(($DEVICE_NUM))
-            if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
+            if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
                GPU_DEVICES+=','
            fi
        done
        GPU_DEVICES+='"'
        echo "Running node$node with GPU devices: $GPU_DEVICES"
-        if [ "$node" -ne 0 ]; then
-            docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
+        if [ $node -ne 0 ]; then
+            docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
        else
-            docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
+            docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
        fi
    done
 }
 cleanup() {
    for node in $(seq 0 $(($NUM_NODES-1))); do
-        docker stop "node$node"
+        docker stop node$node
    done
    docker network rm docker-net
 }
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@ -1,20 +1,6 @@
-#!/bin/bash
-
 # This script build the Neuron docker image and run the API server inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -e
-set -v
-
-image_name="neuron/vllm-ci"
-container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-HF_CACHE="$(realpath ~)/huggingface"
-mkdir -p "${HF_CACHE}"
-HF_MOUNT="/root/.cache/huggingface"
-
-NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
-mkdir -p "${NEURON_COMPILE_CACHE_URL}"
-NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"

 # Try building the docker image
 aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
@ -25,33 +11,41 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
    last_build=$(cat /tmp/neuron-docker-build-timestamp)
    current_time=$(date +%s)
    if [ $((current_time - last_build)) -gt 86400 ]; then
-        # Remove dangling images (those that are not tagged and not used by any container)
-        docker image prune -f
-        # Remove unused volumes / force the system prune for old images as well.
-        docker volume prune -f && docker system prune -f
-        # Remove huggingface model artifacts and compiler cache
-        rm -rf "${HF_MOUNT:?}/*"
-        rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
-        echo "$current_time" > /tmp/neuron-docker-build-timestamp
+        docker system prune -f
+        echo $current_time > /tmp/neuron-docker-build-timestamp
    fi
 else
-    date "+%s" > /tmp/neuron-docker-build-timestamp
+    echo $(date +%s) > /tmp/neuron-docker-build-timestamp
 fi

-docker build -t "${image_name}" -f Dockerfile.neuron .
+docker build -t neuron -f Dockerfile.neuron .

 # Setup cleanup
-remove_docker_container() {
-    docker image rm -f "${image_name}" || true;
-}
+remove_docker_container() { docker rm -f neuron || true; }
 trap remove_docker_container EXIT
+remove_docker_container

 # Run the image
-docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
-       -v "${HF_CACHE}:${HF_MOUNT}" \
-       -e "HF_HOME=${HF_MOUNT}" \
-       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
-       --name "${container_name}" \
-       ${image_name} \
-       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
+docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
+       --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
+
+# Wait for the server to start
+wait_for_server_to_start() {
+    timeout=300
+    counter=0
+
+    while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
+        sleep 1
+        counter=$((counter + 1))
+        if [ $counter -ge $timeout ]; then
+            echo "Timeout after $timeout seconds"
+            break
+        fi
+    done
+}
+wait_for_server_to_start
+
+# Test a simple prompt
+curl -X POST -H "Content-Type: application/json" \
+    localhost:8000/generate \
+    -d '{"prompt": "San Francisco is a"}'
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@ -1,5 +1,3 @@
-#!/bin/bash
-
 # This script build the OpenVINO docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
@ -13,4 +11,4 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@ -1,5 +1,3 @@
-#!/bin/bash
-
 set -e

 # Build the docker image.
@ -14,13 +12,5 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it \
-    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
-    && python3 -m pip install pytest \
-    && python3 -m pip install lm_eval[api]==0.4.4 \
-    && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
-    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
-    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
-    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
-    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
+docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \
+    python3 /workspace/vllm/examples/offline_inference_tpu.py
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@ -1,5 +1,3 @@
-#!/bin/bash
-
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
@ -12,8 +10,5 @@ remove_docker_container() { docker rm -f xpu-test || true; }
 trap remove_docker_container EXIT
 remove_docker_container

-# Run the image and test offline inference/tensor parallel
-docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-    python3 examples/offline_inference/basic.py
-    python3 examples/offline_inference/cli.py -tp 2
-'
+# Run the image and launch offline inference
+docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -5,615 +5,286 @@
 # https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 
 # to generate the final pipeline yaml file.

-# Documentation
-# label(str): the name of the test. emoji allowed.
-# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
-# fast_check_only(bool): run this test on fastcheck pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
-# command(str): the single command to run for tests. incompatible with commands.
-# commands(list): the list of commands to run for test. incompatbile with command.
-# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
-# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
-# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
-# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host, 
-#     in this case, commands must be specified. the first command runs on first host, the second
-#     command runs on the second host.
-# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
-# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
-
-# When adding a test
-# - If the test belong to an existing group, add it there
-# - If the test is short, add to any existing step
-# - If the test takes more than 10min, then it is okay to create a new step. 
-#   Note that all steps execute in parallel. 

 steps:
-##### fast check tests  #####
+- label: Async Engine, Inputs, Utils, Worker Test
+  fast_check: true
+  fast_check_only: true
+  commands:
+  - pytest -v -s async_engine # Async Engine
+  - pytest -v -s test_inputs.py
+  - pytest -v -s multimodal
+  - pytest -v -s test_utils.py # Utils
+  - pytest -v -s worker # Worker

- label: Documentation Build # 2min
+- label: Metrics, Tracing Test
+  fast_check: true
+  fast_check_only: true
+  commands:
+  - pytest -v -s metrics # Metrics
+  - "pip install \
+      opentelemetry-sdk \
+      opentelemetry-api \
+      opentelemetry-exporter-otlp \
+      opentelemetry-semantic-conventions-ai" # Tracing
+  - pytest -v -s tracing
+
+- label: Regression Test
+  mirror_hardwares: [amd]
+  fast_check: true
+  command: pytest -v -s test_regression.py
+  working_dir: "/vllm-workspace/tests" # optional
+
+- label: AsyncEngine Test
+  #mirror_hardwares: [amd]
+  command: pytest -v -s async_engine
+
+- label: Basic Correctness Test
+  mirror_hardwares: [amd]
+  fast_check: true
+  commands:
+  # This flashinfer installation will fail on AMD ROCm, so it is set as optional.
+  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl || true
+  - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
+  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+
+- label: Core Test
+  mirror_hardwares: [amd]
+  fast_check: true
+  commands:
+  - pytest -v -s core
+  - pytest -v -s distributed/test_parallel_state.py
+
+- label: Distributed Comm Ops Test
+  #mirror_hardwares: [amd]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  commands:
+  - pytest -v -s distributed/test_comm_ops.py
+  - pytest -v -s distributed/test_shm_broadcast.py
+
+- label: 2 Node Tests (4 GPUs in total)
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  commands:
+  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
+  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+
+- label: Distributed Tests (2 GPUs)
+  mirror_hardwares: [amd]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  commands:
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
+  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
+
+- label: Distributed Tests (4 GPUs)
+  #mirror_hardwares: [amd]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  fast_check: true
+  commands:
+  - pytest -v -s distributed/test_pynccl.py
+  # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
+  # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
+
+- label: Pipeline Parallelism Test
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  commands:
+  - pytest -v -s distributed/test_pipeline_parallel.py
+
+- label: Engine Test
+  mirror_hardwares: [amd]
+  commands:
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
+  # OOM in the CI unless we run this separately
+  - pytest -v -s tokenization
+
+- label: Entrypoints Test
+  fast_check: true
+  mirror_hardwares: [amd]
+
+  commands:
+  - pytest -v -s entrypoints/llm
+  - pytest -v -s entrypoints/openai
+
+- label: Examples Test
+  working_dir: "/vllm-workspace/examples"
+  mirror_hardwares: [amd]
+  commands:
+    # install aws cli for llava_example.py
+    # install tensorizer for tensorize_vllm_model.py
+    - pip install awscli tensorizer
+    - python3 offline_inference.py
+    - python3 cpu_offload.py
+    - python3 offline_inference_with_prefix.py
+    - python3 llm_engine_example.py
+    - python3 llava_example.py
+    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+
+- label: Inputs Test
+  #mirror_hardwares: [amd]
+  commands:
+    - pytest -v -s test_inputs.py
+    - pytest -v -s multimodal
+
+- label: Kernels Test %N
+  #mirror_hardwares: [amd]
+  commands:
+    - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 4
+
+- label: Models Test
+  #mirror_hardwares: [amd]
+  commands:
+    - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+    - pytest -v -s models -m \"not vlm\"
+
+- label: Vision Language Models Test
+  mirror_hardwares: [amd]
+  commands:
+    - pytest -v -s models -m vlm
+
+- label: Prefix Caching Test
+  mirror_hardwares: [amd]
+  commands:
+    - pytest -v -s prefix_caching
+
+- label: Samplers Test
+  #mirror_hardwares: [amd]
+  command: pytest -v -s samplers
+
+- label: LogitsProcessor Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s test_logits_processor.py
+
+- label: Utils Test
+  commands:
+    - pytest -v -s test_utils.py
+    - pytest -v -s test_embedded_commit.py
+
+- label: Worker Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s worker
+
+- label: Speculative decoding tests
+  #mirror_hardwares: [amd]
+  commands:
+    # See https://github.com/vllm-project/vllm/issues/5152
+    - export VLLM_ATTENTION_BACKEND=XFORMERS
+    - pytest -v -s spec_decode
+
+- label: LoRA Test %N
+  #mirror_hardwares: [amd]
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
+  parallelism: 4
+
+- label: LoRA Long Context (Distributed)
+  #mirror_hardwares: [amd]
+  num_gpus: 4
+  # This test runs llama 13B, so it is required to run on 4 GPUs.
+  commands:
+    # FIXIT: find out which code initialize cuda before running the test
+    # before the fix, we need to use spawn to test it
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s -x lora/test_long_context.py
+
+- label: Tensorizer Test
+  #mirror_hardwares: [amd]
+  soft_fail: true
+  fast_check: true
+  commands:
+    - apt-get install -y curl libsodium23
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s tensorizer_loader
+
+- label: Metrics Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s metrics
+
+- label: Quantization Test
+  #mirror_hardwares: [amd]
+  command: pytest -v -s quantization
+
+- label: Tracing Test
+  commands: 
+    - "pip install \
+        opentelemetry-sdk \
+        opentelemetry-api \
+        opentelemetry-exporter-otlp \
+        opentelemetry-semantic-conventions-ai"
+    - pytest -v -s tracing
+
+- label: Benchmarks
+  working_dir: "/vllm-workspace/.buildkite"
+  mirror_hardwares: [amd]
+  commands:
+  - pip install aiohttp
+  - bash run-benchmarks.sh
+
+- label: LM Eval Small Models
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - pip install lm-eval
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-small.txt -t 1
+
+- label: LM Eval Large Models
+  gpu: a100
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - pip install lm-eval
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-large.txt -t 4
+
+- label: Documentation Build
  working_dir: "/vllm-workspace/test_docs/docs"
  fast_check: true
  no_gpu: True
  commands:
  - pip install -r requirements-docs.txt
  - SPHINXOPTS=\"-W\" make html
-  # Check API reference (if it fails, you may have missing mock imports)
-  - grep \"sig sig-object py\" build/html/api/inference_params.html

- label: Async Engine, Inputs, Utils, Worker Test # 24min
-  fast_check: true
-  source_file_dependencies:
-  - vllm/
-  - tests/mq_llm_engine
-  - tests/async_engine
-  - tests/test_inputs
-  - tests/multimodal
-  - tests/test_utils
-  - tests/worker
-  - tests/standalone_tests/lazy_imports.py
-  commands:
-  - python3 standalone_tests/lazy_imports.py
-  - pytest -v -s mq_llm_engine # MQLLMEngine
-  - pytest -v -s async_engine # AsyncLLMEngine
-  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
-  - pytest -v -s test_inputs.py
-  - pytest -v -s multimodal
-  - pytest -v -s test_utils.py # Utils
-  - pytest -v -s worker # Worker
-
- label: Python-only Installation Test
-  source_file_dependencies:
-  - tests/standalone_tests/python_only_compile.sh
-  - setup.py
-  commands:
-  - bash standalone_tests/python_only_compile.sh
-
- label: Basic Correctness Test # 30min
-  #mirror_hardwares: [amd]
-  fast_check: true
-  source_file_dependencies:
-  - vllm/
-  - tests/basic_correctness/test_basic_correctness
-  - tests/basic_correctness/test_cpu_offload
-  - tests/basic_correctness/test_preemption
-  - tests/basic_correctness/test_cumem.py
-  commands:
-  - pytest -v -s basic_correctness/test_cumem.py
-  - pytest -v -s basic_correctness/test_basic_correctness.py
-  - pytest -v -s basic_correctness/test_cpu_offload.py
-  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-
- label: Chunked Prefill Test
-  source_file_dependencies:
-  - vllm/
-  - tests/basic_correctness/test_chunked_prefill
-  commands:
-  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-
- label: Core Test # 10min
-  mirror_hardwares: [amd]
-  fast_check: true
-  source_file_dependencies:
-  - vllm/core
-  - vllm/distributed
-  - tests/core
-  commands:
-  - pytest -v -s core
-
- label: Entrypoints Test # 40min
-  working_dir: "/vllm-workspace/tests"
-  fast_check: true
-  mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  commands:
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
-  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
-  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
-  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
-  - pytest -v -s entrypoints/test_chat_utils.py
-  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
-
- label: Distributed Tests (4 GPUs) # 10min
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
-  fast_check: true
-  source_file_dependencies:
-  - vllm/distributed/
-  - vllm/core/
-  - tests/distributed
-  - tests/spec_decode/e2e/test_integration_dist_tp4
-  - tests/compile
-  - examples/offline_inference/rlhf.py
-  - examples/offline_inference/ray_placement.py
-  commands:
-  - pytest -v -s distributed/test_utils.py
-  - pytest -v -s compile/test_basic_correctness.py
-  - pytest -v -s distributed/test_pynccl.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
-  # TODO: create a dedicated test section for multi-GPU example tests
-  # when we have multiple distributed example tests
-  - python3 ../examples/offline_inference/rlhf.py
-  - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/ray_placement.py
-
- label: Metrics, Tracing Test # 10min
-  num_gpus: 2 
-  fast_check: true
-  source_file_dependencies:
-  - vllm/
-  - tests/metrics
-  - tests/tracing
-  commands:
-  - pytest -v -s metrics 
-  - "pip install \
-      'opentelemetry-sdk>=1.26.0,<1.27.0' \
-      'opentelemetry-api>=1.26.0,<1.27.0' \
-      'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
-      'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
-  - pytest -v -s tracing
-
-##### fast check tests  #####
-#####  1 GPU test  #####
-
- label: Regression Test # 5min
-  mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  - tests/test_regression
-  commands:
-  - pip install modelscope
-  - pytest -v -s test_regression.py
-  working_dir: "/vllm-workspace/tests" # optional
-
- label: Engine Test # 10min
-  mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  - tests/engine
-  - tests/tokenization
-  commands:
-  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
-  # OOM in the CI unless we run this separately
-  - pytest -v -s tokenization
-
- label: V1 Test
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-    - vllm/
-    - tests/v1
-  commands:
-    # split the test to avoid interference
-    - VLLM_USE_V1=1 pytest -v -s v1/core
-    - VLLM_USE_V1=1 pytest -v -s v1/engine
-    - VLLM_USE_V1=1 pytest -v -s v1/sample
-    - VLLM_USE_V1=1 pytest -v -s v1/worker
-    - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
-    - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
-    # TODO: accuracy does not match, whether setting
-    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-    - VLLM_USE_V1=1 pytest -v -s v1/e2e
-
- label: Examples Test # 25min
-  working_dir: "/vllm-workspace/examples"
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/entrypoints
-  - examples/
-  commands:
-    - pip install tensorizer # for tensorizer test
-    - python3 offline_inference/basic.py
-    - python3 offline_inference/cpu_offload.py
-    - python3 offline_inference/chat.py
-    - python3 offline_inference/prefix_caching.py
-    - python3 offline_inference/llm_engine_example.py
-    - python3 offline_inference/vision_language.py
-    - python3 offline_inference/vision_language_multi_image.py
-    - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference/encoder_decoder.py
-    - python3 offline_inference/classification.py
-    - python3 offline_inference/embedding.py
-    - python3 offline_inference/scoring.py
-    - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
-
- label: Prefix Caching Test # 9min
-  mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  - tests/prefix_caching
-  commands:
-    - pytest -v -s prefix_caching
-
- label: Samplers Test # 36min
-  source_file_dependencies:
-  - vllm/model_executor/layers
-  - vllm/sampling_metadata.py
-  - tests/samplers
-  - tests/conftest.py
-  commands:
-    - pytest -v -s samplers
-    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
-
- label: LogitsProcessor Test # 5min
-  mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/model_executor/layers
-  - vllm/model_executor/guided_decoding
-  - tests/test_logits_processor
-  - tests/model_executor/test_guided_processors
-  commands: 
-    - pytest -v -s test_logits_processor.py
-    - pytest -v -s model_executor/test_guided_processors.py
-
- label: Speculative decoding tests # 40min
-  source_file_dependencies:
-  - vllm/spec_decode
-  - tests/spec_decode
-  - vllm/model_executor/models/eagle.py
-  commands:
-    - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-    - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
-    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
-
- label: LoRA Test %N # 15min each
-  mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/lora
-  - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
-  parallelism: 4
-
- label: "PyTorch Fullgraph Smoke Test" # 9min
-  fast_check: true
-  source_file_dependencies:
-  - vllm/
-  - tests/compile
-  commands:
-  - pytest -v -s compile/test_basic_correctness.py
-  # these tests need to be separated, cannot combine
-  - pytest -v -s compile/piecewise/test_simple.py
-  - pytest -v -s compile/piecewise/test_toy_llama.py
-
- label: "PyTorch Fullgraph Test" # 18min
-  source_file_dependencies:
-  - vllm/
-  - tests/compile
-  commands:
-  - pytest -v -s compile/test_full_graph.py
-
- label: Kernels Test %N # 1h each
-  mirror_hardwares: [amd]
-  source_file_dependencies:
-  - csrc/
-  - vllm/attention
-  - tests/kernels
-  commands:
-    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
-
- label: Tensorizer Test # 11min
-  mirror_hardwares: [amd]
-  soft_fail: true
-  source_file_dependencies:
-  - vllm/model_executor/model_loader
-  - tests/tensorizer_loader
-  commands:
-    - apt-get update && apt-get install -y curl libsodium23
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - pytest -v -s tensorizer_loader
-
- label: Benchmarks # 9min
-  working_dir: "/vllm-workspace/.buildkite"
-  mirror_hardwares: [amd]
-  source_file_dependencies:
-  - benchmarks/
-  commands:
-  - bash run-benchmarks.sh
-
- label: Quantization Test # 33min
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  - tests/quantization
-  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
-
- label: LM Eval Small Models # 53min
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - bash ./run-tests.sh -c configs/models-small.txt -t 1
-
- label: Encoder Decoder tests # 5min
-  source_file_dependencies:
-  - vllm/
-  - tests/encoder_decoder
-  commands:
-    - pytest -v -s encoder_decoder
-
- label: OpenAI-Compatible Tool Use # 20 min
-  fast_check: false
-  mirror_hardwares: [ amd ]
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  commands:
-    - pytest -v -s tool_use
-
-#####  models test  #####
-
- label: Basic Models Test # 24min
-  source_file_dependencies:
-  - vllm/
-  - tests/models
-  commands:
-    - pytest -v -s models/test_transformers.py
-    - pytest -v -s models/test_registry.py
-    - pytest -v -s models/test_initialization.py
-
- label: Language Models Test (Standard) # 32min
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  - tests/models/decoder_only/language
-  - tests/models/embedding/language
-  - tests/models/encoder_decoder/language
-  commands:
-    - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
-    - pytest -v -s models/embedding/language -m core_model
-
- label: Language Models Test (Extended) # 1h10min
-  optional: true
-  source_file_dependencies:
-  - vllm/
-  - tests/models/decoder_only/language
-  - tests/models/embedding/language
-  - tests/models/encoder_decoder/language
-  commands:
-    - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
-    - pytest -v -s models/embedding/language -m 'not core_model'
-
- label: Multi-Modal Models Test (Standard) # 40min
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  - tests/models/decoder_only/audio_language
-  - tests/models/decoder_only/vision_language
-  - tests/models/embedding/vision_language
-  - tests/models/encoder_decoder/audio_language
-  - tests/models/encoder_decoder/vision_language
-  commands:
-    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal
-    - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
-    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
-    - pytest -v -s models/embedding/vision_language -m core_model
-    - pytest -v -s models/encoder_decoder/audio_language -m core_model
-    - pytest -v -s models/encoder_decoder/language -m core_model
-    - pytest -v -s models/encoder_decoder/vision_language -m core_model
-
- label: Multi-Modal Models Test (Extended) 1 # 48m
-  optional: true
-  source_file_dependencies:
-  - vllm/
-  - tests/models/decoder_only/audio_language
-  - tests/models/decoder_only/vision_language
-  - tests/models/embedding/vision_language
-  - tests/models/encoder_decoder/vision_language
-  commands:
-    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
-    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
-    # HACK - run phi3v tests separately to sidestep this transformers bug
-    # https://github.com/huggingface/transformers/issues/34307
-    - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
-    - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
-    - pytest -v -s models/embedding/vision_language -m 'not core_model'
-    - pytest -v -s models/encoder_decoder/language -m 'not core_model'
-    - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
-
- label: Multi-Modal Models Test (Extended) 2 # 38m
-  optional: true
-  source_file_dependencies:
-  - vllm/
-  - tests/models/decoder_only/vision_language
-  commands:
-    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
-
-# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
-  optional: true
-  commands:
-    - echo 'Testing custom models...'
-    # PR authors can temporarily add commands below to test individual models
-    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
-    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
-
-#####  1 GPU test  #####
-#####  multi gpus test  #####
-
- label: Distributed Comm Ops Test # 7min
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  source_file_dependencies:
-  - vllm/distributed
-  - tests/distributed
-  commands:
-  - pytest -v -s distributed/test_comm_ops.py
-  - pytest -v -s distributed/test_shm_broadcast.py
-
- label: 2 Node Tests (4 GPUs in total) # 16min
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  num_nodes: 2
-  source_file_dependencies:
-  - vllm/distributed/
-  - vllm/engine/
-  - vllm/executor/
-  - vllm/model_executor/models/
-  - tests/distributed/
-  commands:
-  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
-    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
-  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
- label: Distributed Tests (2 GPUs) # 40min
-  #mirror_hardwares: [amd]
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  source_file_dependencies:
-  - vllm/distributed/
-  - vllm/engine/
-  - vllm/executor/
-  - vllm/model_executor/models/
-  - tests/distributed/
-  - vllm/compilation
-  - vllm/worker/worker_base.py
-  - vllm/worker/worker.py
-  - vllm/worker/model_runner.py
-  - entrypoints/llm/test_collective_rpc.py
-  commands:
-  - pytest -v -s entrypoints/llm/test_collective_rpc.py
-  - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
-  - pytest -v -s ./compile/test_basic_correctness.py
-  - pytest -v -s ./compile/test_wrapper.py
-  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-  # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
-  # this test fails consistently.
-  # TODO: investigate and fix
-  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
-
- label: Plugin Tests (2 GPUs) # 40min
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  fast_check: true
-  source_file_dependencies:
-  - vllm/plugins/
-  - tests/plugins/
-  commands:
-  # begin platform plugin tests, all the code in-between runs on dummy platform
-  - pip install -e ./plugins/vllm_add_dummy_platform
-  - pytest -v -s plugins_tests/test_platform_plugins.py
-  - pip uninstall vllm_add_dummy_platform -y
-  # end platform plugin tests
-  # other tests continue here:
-  - pip install -e ./plugins/vllm_add_dummy_model
-  - pytest -v -s distributed/test_distributed_oot.py
-  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
-  - pytest -v -s models/test_oot_registration.py # it needs a clean process
-
- label: Multi-step Tests (4 GPUs) # 36min
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
-  source_file_dependencies:
-  - vllm/model_executor/layers/sampler.py
-  - vllm/sequence.py
-  - vllm/worker/worker_base.py
-  - vllm/worker/worker.py
-  - vllm/worker/multi_step_worker.py
-  - vllm/worker/model_runner_base.py
-  - vllm/worker/model_runner.py
-  - vllm/worker/multi_step_model_runner.py
-  - vllm/engine
-  - tests/multi_step
-  commands:
-  # this test is quite flaky
-  # TODO: investigate and fix.
-  # - pytest -v -s multi_step/test_correctness_async_llm.py
-  - pytest -v -s multi_step/test_correctness_llm.py
-
- label: Pipeline Parallelism Test # 45min
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
-  source_file_dependencies:
-  - vllm/distributed/
-  - vllm/engine/
-  - vllm/executor/
-  - vllm/model_executor/models/
-  - tests/distributed/
-  commands:
-  - pytest -v -s distributed/test_pp_cudagraph.py
-  - pytest -v -s distributed/test_pipeline_parallel.py
-
- label: LoRA TP Test (Distributed)
-  num_gpus: 4
-  source_file_dependencies:
-  - vllm/lora
-  - tests/lora
-  commands:
-    # FIXIT: find out which code initialize cuda before running the test
-    # before the fix, we need to use spawn to test it
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    # This test runs llama 13B, so it is required to run on 4 GPUs.
-    - pytest -v -s -x lora/test_long_context.py
-    # There is some Tensor Parallelism related processing logic in LoRA that 
-    # requires multi-GPU testing for validation.
-    - pytest -v -s -x lora/test_chatglm3_tp.py
-    - pytest -v -s -x lora/test_llama_tp.py
-    - pytest -v -s -x lora/test_minicpmv_tp.py
-
-
- label: Weight Loading Multiple GPU Test  # 33min
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  source_file_dependencies:
-  - vllm/
-  - tests/weight_loading
-  commands:
-    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
-
- label: Weight Loading Multiple GPU Test - Large Models # optional
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+- label: Distributed Tests (A100)
  gpu: a100
-  optional: true
-  source_file_dependencies:
-  - vllm/
-  - tests/weight_loading
-  commands:
-    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt 
-
-
-##### multi gpus test #####
-##### A100 test #####
-
- label: Distributed Tests (A100) # optional
-  gpu: a100
-  optional: true
  num_gpus: 4
-  source_file_dependencies:
-  - vllm/
  commands: 
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
-  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
-  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - pytest -v -s -x lora/test_mixtral.py
-
- label: LM Eval Large Models # optional
-  gpu: a100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - bash ./run-tests.sh -c configs/models-large.txt -t 4
--- a/.buildkite/upload-wheels.sh
+++ b/.buildkite/upload-wheels.sh
@ -1,71 +0,0 @@
-#!/usr/bin/env bash
-
-set -ex
-
-# Assume wheels are in artifacts/dist/*.whl
-wheel_files=(artifacts/dist/*.whl)
-
-# Check that exactly one wheel is found
-if [[ ${#wheel_files[@]} -ne 1 ]]; then
-  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
-  exit 1
-fi
-
-# Get the single wheel file
-wheel="${wheel_files[0]}"
-
-# Rename 'linux' to 'manylinux1' in the wheel filename
-new_wheel="${wheel/linux/manylinux1}"
-mv -- "$wheel" "$new_wheel"
-wheel="$new_wheel"
-
-# Extract the version from the wheel
-version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version: $version"
-
-normal_wheel="$wheel" # Save the original wheel filename
-
-# If the version contains "dev", rename it to v1.0.0.dev for consistency
-if [[ $version == *dev* ]]; then
-    suffix="${version##*.}"
-    if [[ $suffix == cu* ]]; then
-        new_version="1.0.0.dev+${suffix}"
-    else
-        new_version="1.0.0.dev"
-    fi
-    new_wheel="${wheel/$version/$new_version}"
-    # use cp to keep both files in the artifacts directory
-    cp -- "$wheel" "$new_wheel"
-    wheel="$new_wheel"
-    version="$new_version"
-fi
-
-# Upload the wheel to S3
-python3 .buildkite/generate_index.py --wheel "$normal_wheel"
-
-# generate index for this commit
-aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-
-if [[ $normal_wheel == *"cu118"* ]]; then
-    # if $normal_wheel matches cu118, do not upload the index.html
-    echo "Skipping index files for cu118 wheels"
-else
-    # only upload index.html for cu12 wheels (default wheels)
-    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
-    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
-fi
-
-# generate index for nightly
-aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
-
-if [[ $normal_wheel == *"cu118"* ]]; then
-    # if $normal_wheel matches cu118, do not upload the index.html
-    echo "Skipping index files for cu118 wheels"
-else
-    # only upload index.html for cu12 wheels (default wheels)
-    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
-fi
-
-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
--- a/.dockerignore
+++ b/.dockerignore
@ -1,33 +1 @@
-/.venv
-/build
-dist
 vllm/*.so
-
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-.mypy_cache
-
-# Distribution / packaging
-.Python
-/build/
-cmake-build-*/
-CMakeUserPresets.json
-develop-eggs/
-/dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -1,36 +0,0 @@
-# See https://help.github.com/articles/about-codeowners/
-# for more info about CODEOWNERS file
-
-# This lists cover the "core" components of vLLM that require careful review
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
-/vllm/model_executor/guided_decoding @mgoin
-/vllm/multimodal @DarkLight1337 @ywang96
-CMakeLists.txt @tlrmchlsmth
-
-# vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-
-# Test ownership
-/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
-/tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
-/tests/models @DarkLight1337 @ywang96
-/tests/multimodal @DarkLight1337 @ywang96
-/tests/prefix_caching @comaniac @KuntaiDu
-/tests/spec_decode @njhill @LiuXiaoxuanPKU
-/tests/kernels @tlrmchlsmth @WoosukKwon
-/tests/quantization @mgoin @robertgshaw2-redhat
-/.buildkite/lm-eval-harness @mgoin @simon-mo
-/tests/distributed/test_multi_node_assignment.py @youkaichao
-/tests/distributed/test_pipeline_parallel.py @youkaichao
-/tests/distributed/test_same_node.py @youkaichao
-/tests/multi_step @alexm-redhat @comaniac
-/tests/weight_loading @mgoin @youkaichao
-/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@ -1,2 +1,2 @@
 github: [vllm-project]
-open_collective: vllm
+open_collective: [vllm]
--- a/.github/ISSUE_TEMPLATE/100-documentation.yml
+++ b/.github/ISSUE_TEMPLATE/100-documentation.yml
@ -20,10 +20,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/200-installation.yml
+++ b/.github/ISSUE_TEMPLATE/200-installation.yml
@ -38,10 +38,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/300-usage.yml
+++ b/.github/ISSUE_TEMPLATE/300-usage.yml
@ -36,10 +36,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@ -20,14 +20,9 @@ body:
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
-      <details>
-      <summary>The output of `python collect_env.py`</summary>
-
      ```text
-      Your output of `python collect_env.py` here
+      The output of `python collect_env.py`
      ```
-      
-      </details>
  validations:
    required: true
 - type: textarea
@ -89,10 +84,3 @@ body:
      - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.

      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/500-feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/500-feature-request.yml
@ -29,10 +29,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/600-new-model.yml
+++ b/.github/ISSUE_TEMPLATE/600-new-model.yml
@ -9,7 +9,7 @@ body:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

-      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
+      #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
 - type: textarea
  attributes:
    label: The model to consider.
@ -31,10 +31,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml
+++ b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml
@ -50,10 +50,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@ -47,10 +47,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
+++ b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
@ -19,10 +19,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -2,4 +2,63 @@ FILL IN THE PR DESCRIPTION HERE

 FIX #xxxx (*link existing issues this PR will resolve*)

-**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html **
+**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**
+
+---
+
+<details>
+<!-- inside this <details> section, markdown rendering does not work, so we use raw html here. -->
+<summary><b> PR Checklist (Click to Expand) </b></summary>
+
+<p>Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.</p>
+
+<h3>PR Title and Classification</h3>
+<p>Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:</p>
+<ul>
+    <li><code>[Bugfix]</code> for bug fixes.</li>
+    <li><code>[CI/Build]</code> for build or continuous integration improvements.</li>
+    <li><code>[Doc]</code> for documentation fixes and improvements.</li>
+    <li><code>[Model]</code> for adding a new model or improving an existing model. Model name should appear in the title.</li>
+    <li><code>[Frontend]</code> For changes on the vLLM frontend (e.g., OpenAI API server, <code>LLM</code> class, etc.) </li>
+    <li><code>[Kernel]</code> for changes affecting CUDA kernels or other compute kernels.</li>
+    <li><code>[Core]</code> for changes in the core vLLM logic (e.g., <code>LLMEngine</code>, <code>AsyncLLMEngine</code>, <code>Scheduler</code>, etc.)</li>
+    <li><code>[Hardware][Vendor]</code> for hardware-specific changes. Vendor name should appear in the prefix (e.g., <code>[Hardware][AMD]</code>).</li>
+    <li><code>[Misc]</code> for PRs that do not fit the above categories. Please use this sparingly.</li>
+</ul>
+<p><strong>Note:</strong> If the PR spans more than one category, please include all relevant prefixes.</p>
+
+<h3>Code Quality</h3>
+
+<p>The PR need to meet the following code quality standards:</p>
+
+<ul>
+    <li>We adhere to <a href="https://google.github.io/styleguide/pyguide.html">Google Python style guide</a> and <a href="https://google.github.io/styleguide/cppguide.html">Google C++ style guide</a>.</li>
+    <li>Pass all linter checks. Please use <a href="https://github.com/vllm-project/vllm/blob/main/format.sh"><code>format.sh</code></a> to format your code.</li>
+    <li>The code need to be well-documented to ensure future contributors can easily understand the code.</li>
+    <li>Include sufficient tests to ensure the project to stay correct and robust. This includes both unit tests and integration tests.</li>
+    <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li>
+</ul>
+
+<h3>Notes for Large Changes</h3>
+<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
+
+<h3>What to Expect for the Reviews</h3>
+
+<p>The goal of the vLLM team is to be a <i>transparent reviewing machine</i>. We would like to make the review process transparent and efficient and make sure no contributor feel confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process: </p>
+
+<ul>
+    <li> After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.</li>
+    <li> After the PR is assigned, the reviewer will provide status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.</li>
+    <li> After the review, the reviewer will put an <code> action-required</code> label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.</li>
+    <li> Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.
+ </li>
+</ul>
+
+<h3>Thank You</h3>
+
+<p> Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone! </p>
+
+
+</details>
+
+
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -1,31 +0,0 @@
-version: 2
-updates:
-  # Maintain dependencies for GitHub Actions
-  - package-ecosystem: "github-actions"
-    directory: "/"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/"
-    schedule:
-      interval: "weekly"
-    labels: ["dependencies"]
-    open-pull-requests-limit: 5
-    reviewers: ["khluu", "simon-mo"]
-    allow:
-      - dependency-type: "all"
-    ignore:
-      - dependency-name: "*"
-        update-types: ["version-update:semver-patch"]
-      - dependency-name: "torch"
-      - dependency-name: "torchvision"
-      - dependency-name: "xformers"
-      - dependency-name: "lm-format-enforcer"
-      - dependency-name: "gguf"
-      - dependency-name: "compressed-tensors"
-      - dependency-name: "ray[adag]"
-      - dependency-name: "lm-eval"
-    groups:
-      minor-update:
-        applies-to: version-updates
-        update-types: ["minor"]
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -1,97 +0,0 @@
-pull_request_rules:
- name: label-documentation
-  description: Automatically apply documentation label
-  conditions:
-    - or:
-      - files~=^[^/]+\.md$
-      - files~=^docs/
-  actions:
-    label:
-      add:
-        - documentation
-
- name: label-ci-build
-  description: Automatically apply ci/build label
-  conditions:
-    - or:
-      - files~=^\.github/
-      - files~=\.buildkite/
-      - files~=^cmake/
-      - files=CMakeLists.txt
-      - files~=^Dockerfile
-      - files~=^requirements.*\.txt
-      - files=setup.py
-  actions:
-    label:
-      add:
-        - ci/build
-
- name: label-frontend
-  description: Automatically apply frontend label
-  conditions:
-    - files~=^vllm/entrypoints/
-  actions:
-    label:
-      add:
-        - frontend
-
- name: label-structured-output
-  description: Automatically apply structured-output label
-  conditions:
-    - or:
-      - files~=^vllm/model_executor/guided_decoding/
-      - files=tests/model_executor/test_guided_processors.py
-      - files=tests/entrypoints/llm/test_guided_generate.py
-      - files=benchmarks/benchmark_serving_guided.py
-      - files=benchmarks/benchmark_guided.py
-  actions:
-    label:
-      add:
-        - structured-output
-
- name: label-speculative-decoding
-  description: Automatically apply speculative-decoding label
-  conditions:
-    - or:
-      - files~=^vllm/spec_decode/
-      - files=vllm/model_executor/layers/spec_decode_base_sampler.py
-      - files~=^tests/spec_decode/
-  actions:
-    label:
-      add:
-        - speculative-decoding
-
- name: label-v1
-  description: Automatically apply v1 label
-  conditions:
-    - or:
-      - files~=^vllm/v1/
-      - files~=^tests/v1/
-  actions:
-    label:
-      add:
-        - v1
-
- name: ping author on conflicts and add 'needs-rebase' label
-  conditions:
-      - conflict
-      - -closed
-  actions:
-    label:
-      add:
-        - needs-rebase
-    comment:
-      message: |
-       This pull request has merge conflicts that must be resolved before it can be
-       merged. Please rebase the PR, @{{author}}.
-
-       https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
-
- name: remove 'needs-rebase' label when conflict is resolved
-  conditions:
-      - -conflict
-      - -closed
-  actions:
-    label:
-      remove:
-        - needs-rebase
--- a/.github/scripts/cleanup_pr_body.sh
+++ b/.github/scripts/cleanup_pr_body.sh
@ -1,50 +0,0 @@
-#!/bin/bash
-
-set -eu
-
-# ensure 1 argument is passed
-if [ "$#" -ne 1 ]; then
-    echo "Usage: $0 <pr_number>"
-    exit 1
-fi
-
-PR_NUMBER=$1
-OLD=/tmp/orig_pr_body.txt
-NEW=/tmp/new_pr_body.txt
-
-gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
-cp "${OLD}" "${NEW}"
-
-# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
-sed -i '/FIX #xxxx.*$/d' "${NEW}"
-
-# Remove "FILL IN THE PR DESCRIPTION HERE"
-sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
-
-# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
-sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
-
-# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
-python3 - <<EOF
-import re
-
-with open("${NEW}", "r") as file:
-    content = file.read()
-
-pattern = re.compile(r'(---\n\n)?<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>', re.DOTALL)
-content = re.sub(pattern, '', content)
-
-with open("${NEW}", "w") as file:
-    file.write(content)
-EOF
-
-# Run this only if ${NEW} is different than ${OLD}
-if ! cmp -s "${OLD}" "${NEW}"; then
-    gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
-    echo
-    echo "Updated PR body:"
-    echo
-    cat "${NEW}"
-else
-    echo "No changes needed"
-fi
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@ -8,7 +8,7 @@ jobs:
        runs-on: ubuntu-latest
        steps:
            -   name: Add label
-                uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+                uses: actions/github-script@v5
                with:
                    script: |
                        github.rest.issues.addLabels({
--- a/.github/workflows/add_label_ready_comment.yml
+++ b/.github/workflows/add_label_ready_comment.yml
@ -0,0 +1,23 @@
+name: Add Ready Label on Ready Comment
+
+on:
+  issue_comment:
+    types: [created]
+
+jobs:
+  add-ready-label:
+    runs-on: ubuntu-latest
+    if: github.event.issue.pull_request && contains(github.event.comment.body, '/ready')
+    steps:
+        -   name: Add label
+            uses: actions/github-script@v5
+            with:
+                script: |
+                    github.rest.issues.addLabels({
+                        owner: context.repo.owner,
+                        repo: context.repo.repo,
+                        issue_number: context.issue.number,
+                        labels: ['ready']
+                    })
+            env:
+                GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/clang-format.yml
+++ b/.github/workflows/clang-format.yml
@ -0,0 +1,42 @@
+name: clang-format
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  clang-format:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.11"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install clang-format==18.1.5
+    - name: Running clang-format
+      run: |
+        EXCLUDES=(
+            'csrc/moe/topk_softmax_kernels.cu'
+            'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
+            'csrc/punica/bgmv/bgmv_config.h'
+            'csrc/punica/bgmv/bgmv_impl.cuh'
+            'csrc/punica/bgmv/vec_dtypes.cuh'
+            'csrc/punica/punica_ops.cu'
+            'csrc/punica/type_convert.h'
+        )
+        find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
+            | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
+            | xargs clang-format --dry-run --Werror
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@ -1,26 +0,0 @@
-name: Cleanup PR Body
-
-on:
-  pull_request_target:
-    types: [opened, reopened, edited]
-
-permissions:
-  pull-requests: write
-
-jobs:
-  update-description:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Set up Python
-        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-        with:
-          python-version: '3.12'
-
-      - name: Update PR description
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@ -1,82 +0,0 @@
-name: Lint and Deploy Charts
-
-on: pull_request
-
-jobs:
-  lint-and-deploy:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-
-      - name: Set up Helm
-        uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
-        with:
-          version: v3.14.4
-
-       #Python is required because ct lint runs Yamale and yamllint which require Python.
-      - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-        with:
-          python-version: '3.13'
-
-      - name: Set up chart-testing
-        uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
-        with:
-          version: v3.10.1
-
-      - name: Run chart-testing (lint)
-        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
-
-      - name: Setup minio
-        run: |
-          docker network create vllm-net
-          docker run -d -p 9000:9000 --name minio --net vllm-net \
-                     -e "MINIO_ACCESS_KEY=minioadmin" \
-                     -e "MINIO_SECRET_KEY=minioadmin" \
-                     -v /tmp/data:/data \
-                     -v /tmp/config:/root/.minio \
-                     minio/minio server /data
-          export AWS_ACCESS_KEY_ID=minioadmin
-          export AWS_SECRET_ACCESS_KEY=minioadmin
-          export AWS_EC2_METADATA_DISABLED=true
-          mkdir opt-125m
-          cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
-          aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
-          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
-
-      - name: Create kind cluster
-        uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
-
-      - name: Build the Docker image vllm cpu
-        run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
-
-      - name: Configuration of docker images, network and namespace for the kind cluster
-        run: |
-          docker pull amazon/aws-cli:2.6.4
-          kind load docker-image  amazon/aws-cli:2.6.4 --name chart-testing
-          kind load docker-image vllm-cpu-env:latest --name chart-testing
-          docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
-          kubectl create ns ns-vllm
-
-      - name: Run chart-testing (install)
-        run: |
-          export AWS_ACCESS_KEY_ID=minioadmin
-          export AWS_SECRET_ACCESS_KEY=minioadmin
-          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
-          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
-    
-      - name: curl test
-        run: |
-          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
-          sleep 10
-          CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
-                  --header "Content-Type: application/json" \
-                  --data '{
-                          "model": "opt-125m",
-                          "prompt": "San Francisco is a",
-                          "max_tokens": 7,
-                          "temperature": 0
-                  }'):$CODE"
-          echo "$CODE"
--- a/.github/workflows/matchers/actionlint.json
+++ b/.github/workflows/matchers/actionlint.json
@ -1,17 +0,0 @@
-{
-  "problemMatcher": [
-    {
-      "owner": "actionlint",
-      "pattern": [
-        {
-          "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
-          "file": 1,
-          "line": 2,
-          "column": 3,
-          "message": 4,
-          "code": 5
-        }
-      ]
-    }
-  ]
-}
--- a/.github/workflows/matchers/mypy.json
+++ b/.github/workflows/matchers/mypy.json
@ -1,16 +0,0 @@
-{
-  "problemMatcher": [
-    {
-      "owner": "mypy",
-      "pattern": [
-        {
-          "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$",
-          "file": 1,
-          "line": 2,
-          "severity": 3,
-          "message": 4
-        }
-      ]
-    }
-  ]
-}
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@ -0,0 +1,53 @@
+name: mypy
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install mypy==1.9.0
+        pip install types-setuptools
+        pip install types-PyYAML
+        pip install types-requests
+        pip install types-setuptools
+    - name: Mypy
+      run: |
+        mypy tests --config-file pyproject.toml
+        mypy vllm/*.py --config-file pyproject.toml
+        mypy vllm/attention --config-file pyproject.toml
+        mypy vllm/core --config-file pyproject.toml
+        mypy vllm/distributed --config-file pyproject.toml
+        mypy vllm/engine  --config-file pyproject.toml
+        mypy vllm/entrypoints --config-file pyproject.toml
+        mypy vllm/executor --config-file pyproject.toml
+        mypy vllm/inputs --config-file pyproject.toml
+        mypy vllm/logging --config-file pyproject.toml
+        mypy vllm/lora --config-file pyproject.toml
+        mypy vllm/model_executor  --config-file pyproject.toml
+        mypy vllm/multimodal --config-file pyproject.toml
+        mypy vllm/platforms --config-file pyproject.toml
+        mypy vllm/spec_decode --config-file pyproject.toml
+        mypy vllm/transformers_utils --config-file pyproject.toml
+        mypy vllm/usage --config-file pyproject.toml
+        mypy vllm/worker --config-file pyproject.toml
+
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -1,19 +0,0 @@
-name: pre-commit
-
-on:
-  pull_request:
-  push:
-    branches: [main]
-
-jobs:
-  pre-commit:
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-      with:
-        python-version: "3.12"
-    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
-    - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
-      with:
-        extra_args: --all-files --hook-stage manual
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@ -21,16 +21,16 @@ jobs:
      upload_url: ${{ steps.create_release.outputs.upload_url }}
    steps:
      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@v3

      - name: Extract branch info
        shell: bash
        run: |
-          echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
+          echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV

      - name: Create Release
        id: create_release
-        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+        uses: "actions/github-script@v6"
        env:
          RELEASE_TAG: ${{ env.release_tag }}
        with:
@ -39,68 +39,67 @@ jobs:
            const script = require('.github/workflows/scripts/create_release.js')
            await script(github, context, core)

-  # NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow. 
-  # wheel:
-  #   name: Build Wheel
-  #   runs-on: ${{ matrix.os }}
-  #   needs: release
+  wheel:
+    name: Build Wheel
+    runs-on: ${{ matrix.os }}
+    needs: release

-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #         os: ['ubuntu-20.04']
-  #         python-version: ['3.9', '3.10', '3.11', '3.12']
-  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
-  #         cuda-version: ['11.8', '12.1']
+    strategy:
+      fail-fast: false
+      matrix:
+          os: ['ubuntu-20.04']
+          python-version: ['3.8', '3.9', '3.10', '3.11']
+          pytorch-version: ['2.3.1']  # Must be the most recent version that meets requirements-cuda.txt.
+          cuda-version: ['11.8', '12.1']

-  #   steps:
-  #     - name: Checkout
-  #       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3

-  #     - name: Setup ccache
-  #       uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
-  #       with:
-  #         create-symlink: true
-  #         key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
+      - name: Setup ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          create-symlink: true
+          key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}

-  #     - name: Set up Linux Env
-  #       if: ${{ runner.os == 'Linux' }}
-  #       run: |
-  #         bash -x .github/workflows/scripts/env.sh
+      - name: Set up Linux Env
+        if: ${{ runner.os == 'Linux' }}
+        run: |
+          bash -x .github/workflows/scripts/env.sh

-  #     - name: Set up Python
-  #       uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-  #       with:
-  #           python-version: ${{ matrix.python-version }}
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+            python-version: ${{ matrix.python-version }}

-  #     - name: Install CUDA ${{ matrix.cuda-version }}
-  #       run: |
-  #         bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
+      - name: Install CUDA ${{ matrix.cuda-version }}
+        run: |
+          bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}

-  #     - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
-  #       run: |
-  #         bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
+      - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
+        run: |
+          bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}

-  #     - name: Build wheel
-  #       shell: bash
-  #       env:
-  #         CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
-  #       run: |
-  #         bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
-  #         wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
-  #         asset_name=${wheel_name//"linux"/"manylinux1"}
-  #         echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
-  #         echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
+      - name: Build wheel
+        shell: bash
+        env:
+          CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
+        run: |
+          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
+          wheel_name=$(ls dist/*whl | xargs -n 1 basename)
+          asset_name=${wheel_name//"linux"/"manylinux1"}
+          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
+          echo "asset_name=${asset_name}" >> $GITHUB_ENV

-  #     - name: Upload Release Asset
-  #       uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
-  #       env:
-  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  #       with:
-  #         upload_url: ${{ needs.release.outputs.upload_url }}
-  #         asset_path: ./dist/${{ env.wheel_name }}
-  #         asset_name: ${{ env.asset_name }}
-  #         asset_content_type: application/*
+      - name: Upload Release Asset
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ needs.release.outputs.upload_url }}
+          asset_path: ./dist/${{ env.wheel_name }}
+          asset_name: ${{ env.asset_name }}
+          asset_content_type: application/*

      # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
      # - name: Publish package
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@ -2,24 +2,20 @@ name: PR Reminder Comment Bot
 on:
  pull_request_target:
    types: [opened]
+
 jobs:
  pr_reminder:
    runs-on: ubuntu-latest
    steps:
      - name: Remind to run full CI on PR
-        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+        uses: actions/github-script@v6
        with:
          script: |
            github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
-              body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
-                '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
-                'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
-                'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
-                'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
-                '🚀'
+              body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which consists a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of default ones by unblocking the steps in your `fast-check` build on Buildkite UI. \n\nOnce the PR is approved and ready to go, please make sure to run full CI as it is required to merge (or just use auto-merge).\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
            })
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@ -0,0 +1,37 @@
+name: ruff
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
+    - name: Analysing the code with ruff
+      run: |
+        ruff .
+    - name: Spelling check with codespell
+      run: |
+        codespell --toml pyproject.toml
+    - name: Run isort
+      run: |
+        isort . --check-only
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@ -1,5 +1,4 @@
 #!/bin/bash
-set -eux

 python_executable=python$1
 cuda_home=/usr/local/cuda-$2
@ -9,15 +8,14 @@ PATH=${cuda_home}/bin:$PATH
 LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH

 # Install requirements
-$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
+$python_executable -m pip install wheel packaging
+$python_executable -m pip install -r requirements-cuda.txt

 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
+# Make sure punica is built for the release (for LoRA)
+export VLLM_INSTALL_PUNICA_KERNELS=1
 # Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
-export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
-
-bash tools/check_repo.sh
-
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
--- a/.github/workflows/scripts/cuda-install.sh
+++ b/.github/workflows/scripts/cuda-install.sh
@ -1,16 +1,16 @@
 #!/bin/bash

 # Replace '.' with '-' ex: 11.8 -> 11-8
-cuda_version=$(echo "$1" | tr "." "-")
+cuda_version=$(echo $1 | tr "." "-")
 # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
-OS=$(echo "$2" | tr -d ".\-")
+OS=$(echo $2 | tr -d ".\-")

 # Installs CUDA
-wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb"
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
 sudo dpkg -i cuda-keyring_1.1-1_all.deb
 rm cuda-keyring_1.1-1_all.deb
 sudo apt -qq update
-sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}"
+sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
 sudo apt clean

 # Test nvcc
--- a/.github/workflows/scripts/pytorch-install.sh
+++ b/.github/workflows/scripts/pytorch-install.sh
@ -6,7 +6,7 @@ cuda_version=$3

 # Install torch
 $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
-$python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}"
+$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}

 # Print version information
 $python_executable --version
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -1,52 +0,0 @@
-name: 'Close inactive issues and PRs'
-
-on:
-  schedule:
-    # Daily at 1:30 AM UTC
-    - cron: '30 1 * * *'
-
-jobs:
-  close-issues-and-pull-requests:
-    permissions:
-      issues: write
-      pull-requests: write
-      actions: write
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
-        with:
-          # Increasing this value ensures that changes to this workflow
-          # propagate to all issues and PRs in days rather than months
-          operations-per-run: 1000
-
-          exempt-draft-pr: true
-          exempt-issue-labels: 'keep-open'
-          exempt-pr-labels: 'keep-open'
-
-          labels-to-add-when-unstale: 'unstale'
-          labels-to-remove-when-stale: 'unstale'
-
-          days-before-issue-stale: 90
-          days-before-issue-close: 30
-          stale-issue-label: 'stale'
-          stale-issue-message: >
-            This issue has been automatically marked as stale because it has not
-            had any activity within 90 days. It will be automatically closed if no
-            further activity occurs within 30 days. Leave a comment if
-            you feel this issue should remain open. Thank you!
-          close-issue-message: >
-            This issue has been automatically closed due to inactivity. Please
-            feel free to reopen if you feel it is still relevant. Thank you!
-
-          days-before-pr-stale: 90
-          days-before-pr-close: 30
-          stale-pr-label: 'stale'
-          stale-pr-message: >
-            This pull request has been automatically marked as stale because it
-            has not had any activity within 90 days. It will be automatically
-            closed if no further activity occurs within 30 days. Leave a comment
-            if you feel this pull request should remain open. Thank you!
-          close-pr-message: >
-            This pull request has been automatically closed due to inactivity.
-            Please feel free to reopen if you intend to continue working on it.
-            Thank you!
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@ -0,0 +1,31 @@
+name: yapf
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+jobs:
+  yapf:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install yapf==0.32.0
+        pip install toml==0.10.2
+    - name: Running yapf
+      run: |
+        yapf --diff --recursive .
--- a/.gitignore
+++ b/.gitignore
@ -1,8 +1,5 @@
-# version file generated by setuptools-scm
-/vllm/_version.py
-
-# vllm-flash-attn built from source
-vllm/vllm_flash_attn/
+# vllm commit id, generated by setup.py
+vllm/commit_id.py

 # Byte-compiled / optimized / DLL files
 __pycache__/
@ -15,8 +12,6 @@ __pycache__/
 # Distribution / packaging
 .Python
 build/
-cmake-build-*/
-CMakeUserPresets.json
 develop-eggs/
 dist/
 downloads/
@ -33,7 +28,6 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
-/.deps/

 # PyInstaller
 #  Usually these files are written by a python script from a template
@ -79,7 +73,8 @@ instance/

 # Sphinx documentation
 docs/_build/
-docs/source/getting_started/examples/
+docs/source/getting_started/examples/*.rst
+!**/*.template.rst

 # PyBuilder
 .pybuilder/
@ -92,9 +87,6 @@ target/
 profile_default/
 ipython_config.py

-# generated files
-**/generated/**
-
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
@ -197,8 +189,4 @@ _build/
 hip_compat.h

 # Benchmark dataset
-benchmarks/*.json
-
-# Linting
-actionlint
-shellcheck*/
+*.json
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,110 +0,0 @@
-default_stages:
-  - pre-commit # Run locally
-  - manual # Run in CI
-repos:
- repo: https://github.com/google/yapf
-  rev: v0.43.0
-  hooks:
-  - id: yapf
-    args: [--in-place, --verbose]
-    additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.9.3
-  hooks:
-  - id: ruff
-    args: [--output-format, github]
- repo: https://github.com/codespell-project/codespell
-  rev: v2.4.0
-  hooks:
-  - id: codespell
-    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
- repo: https://github.com/PyCQA/isort
-  rev: 5.13.2
-  hooks:
-  - id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v19.1.7
-  hooks:
-  - id: clang-format
-    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
-    types_or: [c++, cuda]
-    args: [--style=file, --verbose]
- repo: https://github.com/jackdewinter/pymarkdown
-  rev: v0.9.27
-  hooks:
-  - id: pymarkdown
-    files: docs/.*
- repo: https://github.com/rhysd/actionlint
-  rev: v1.7.7
-  hooks:
-  - id: actionlint
- repo: local
-  hooks:
-  - id: mypy-local
-    name: Run mypy for local Python installation
-    entry: tools/mypy.sh 0 "local"
-    language: python
-    types: [python]
-    additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
-    stages: [pre-commit] # Don't run in CI
-  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
-    name: Run mypy for Python 3.9
-    entry: tools/mypy.sh 1 "3.9"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
-    stages: [manual] # Only run in CI
-  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
-    name: Run mypy for Python 3.10
-    entry: tools/mypy.sh 1 "3.10"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
-    stages: [manual] # Only run in CI
-  - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
-    name: Run mypy for Python 3.11
-    entry: tools/mypy.sh 1 "3.11"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
-    stages: [manual] # Only run in CI
-  - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
-    name: Run mypy for Python 3.12
-    entry: tools/mypy.sh 1 "3.12"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
-    stages: [manual] # Only run in CI
-  - id: shellcheck
-    name: Lint shell scripts
-    entry: tools/shellcheck.sh
-    language: script
-    types: [shell]
-  - id: png-lint
-    name: Lint PNG exports from excalidraw
-    entry: tools/png-lint.sh
-    language: script
-    types: [png]
-  - id: signoff-commit
-    name: Sign-off Commit
-    entry: bash
-    args:
-      - -c
-      - |
-        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
-          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
-        fi
-    language: system
-    verbose: true
-    stages: [commit-msg]
-  - id: check-spdx-header
-    name: Check SPDX headers
-    entry: python tools/check_spdx_header.py
-    language: python
-    types: [python]
-  - id: suggestion
-    name: Suggestion
-    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
-    language: system
-    verbose: true
-    pass_filenames: false
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@ -6,16 +6,16 @@ version: 2
 build:
  os: ubuntu-22.04
  tools:
-    python: "3.12"
+    python: "3.8"

 sphinx:
-  configuration: docs/source/conf.py
-  fail_on_warning: true
+   configuration: docs/source/conf.py

 # If using Sphinx, optionally build your docs in additional formats such as PDF
-formats: []
+formats:
+   - pdf

 # Optionally declare the Python requirements required to build your docs
 python:
-  install:
-    - requirements: docs/requirements-docs.txt
+   install:
+   - requirements: docs/requirements-docs.txt
--- a/.shellcheckrc
+++ b/.shellcheckrc
@ -1,9 +0,0 @@
-# rules currently disabled:
-#
-#   SC1091 (info): Not following: <sourced file> was not specified as input (see shellcheck -x)
-#   SC2004 (style): $/${} is unnecessary on arithmetic variables.
-#   SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects.
-#   SC2155 (warning): Declare and assign separately to avoid masking return values.
-#   SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails.
-#
-disable=SC1091,SC2004,SC2129,SC2155,SC2164
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,16 +1,5 @@
-cmake_minimum_required(VERSION 3.26)
+cmake_minimum_required(VERSION 3.21)

-# When building directly using CMake, make sure you run the install step
-# (it places the .so files in the correct location).
-#
-# Example:
-# mkdir build && cd build
-# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
-# cmake --build . --target install
-#
-# If you want to only build one target, make sure to install it manually:
-# cmake --build . --target _C
-# cmake --install . --component _C
 project(vllm_extensions LANGUAGES CXX)

 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
@ -21,20 +10,17 @@ message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

 include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)

-# Suppress potential warnings about unused manually-specified variables
-set(ignoreMe "${VLLM_PYTHON_PATH}")
-
 #
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
+set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")

 # Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
+set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")

 #
 # Supported/expected torch versions for CUDA/ROCm.
@ -46,8 +32,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.3.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")

 #
 # Try to find python package with an executable that exactly matches
@ -88,7 +74,7 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
    if (VLLM_TARGET_DEVICE STREQUAL "cpu")
        include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
    else()
-        return()
+        message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
    endif()
    return()
 endif()
@ -122,32 +108,14 @@ else()
  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
 endif()

-
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  #
-  # For cuda we want to be able to control which architectures we compile for on
-  # a per-file basis in order to cut down on compile time. So here we extract
-  # the set of architectures we want to compile for and remove the from the
-  # CMAKE_CUDA_FLAGS so that they are not applied globally.
-  #
-  clear_cuda_arches(CUDA_ARCH_FLAGS)
-  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
-  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
-  # Filter the target architectures by the supported supported archs
-  # since for some files we will build for all CUDA_ARCHS.
-  cuda_archs_loose_intersection(CUDA_ARCHS
-    "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
-  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
-else()
-  #
-  # For other GPU targets override the GPU architectures detected by cmake/torch
-  # and filter them by the supported versions for the current language.
-  # The final set of arches is stored in `VLLM_GPU_ARCHES`.
-  #
-  override_gpu_arches(VLLM_GPU_ARCHES
-    ${VLLM_GPU_LANG}
-    "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
-endif()
+#
+# Override the GPU architectures detected by cmake/torch and filter them by
+# the supported versions for the current language.
+# The final set of arches is stored in `VLLM_GPU_ARCHES`.
+#
+override_gpu_arches(VLLM_GPU_ARCHES
+  ${VLLM_GPU_LANG}
+  "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")

 #
 # Query torch for additional GPU compilation flags for the given
@ -163,45 +131,9 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

-
 #
-# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
-# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
-# Each dependency that produces build artifacts should override its BINARY_DIR to avoid
-# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/<dependency>.
+# Define extension targets
 #
-include(FetchContent)
-file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
-message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
-
-#
-# Define other extension targets
-#
-
-#
-# cumem_allocator extension
-#
-
-set(VLLM_CUMEM_EXT_SRC
-  "csrc/cumem_allocator.cpp")
-
-set_gencode_flags_for_srcs(
-  SRCS "${VLLM_CUMEM_EXT_SRC}"
-  CUDA_ARCHS "${CUDA_ARCHS}")
-
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  message(STATUS "Enabling cumem allocator extension.")
-  # link against cuda driver library
-  list(APPEND CUMEM_LIBS cuda)
-  define_gpu_extension_target(
-    cumem_allocator
-    DESTINATION vllm
-    LANGUAGE CXX
-    SOURCES ${VLLM_CUMEM_EXT_SRC}
-    LIBRARIES ${CUMEM_LIBS}
-    USE_SABI 3.8
-    WITH_SOABI)
-endif()

 #
 # _C extension
@ -209,248 +141,58 @@ endif()

 set(VLLM_EXT_SRC
  "csrc/cache_kernels.cu"
-  "csrc/attention/paged_attention_v1.cu"
-  "csrc/attention/paged_attention_v2.cu"
+  "csrc/attention/attention_kernels.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
-  "csrc/layernorm_quant_kernels.cu"
+  "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
  "csrc/quantization/fp8/common.cu"
-  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
-  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/cuda_utils_kernels.cu"
+  "csrc/moe_align_block_size_kernels.cu"
  "csrc/prepare_inputs/advance_step.cu"
  "csrc/torch_bindings.cpp")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
-
-  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
-  set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")
-
-  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
-  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
-    set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR})
-  endif()
-
-  if(VLLM_CUTLASS_SRC_DIR)
-    if(NOT IS_ABSOLUTE VLLM_CUTLASS_SRC_DIR)
-      get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE)
-    endif()
-    message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation")
-    FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR})
-  else()
-    FetchContent_Declare(
+  include(FetchContent)
+  SET(CUTLASS_ENABLE_HEADERS_ONLY=ON)
+  FetchContent_Declare(
        cutlass
        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        GIT_TAG v3.7.0
-        GIT_PROGRESS TRUE
-
-        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
-        # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
-        # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
-        GIT_SHALLOW TRUE
-    )
-  endif()
+        # CUTLASS 3.5.0
+        GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
+  )
  FetchContent_MakeAvailable(cutlass)

  list(APPEND VLLM_EXT_SRC
-    "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
-    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
+    "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
+    "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
+    "csrc/quantization/gptq_marlin/gptq_marlin.cu"
+    "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
+    "csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
+    "csrc/quantization/fp8/fp8_marlin.cu"
    "csrc/custom_all_reduce.cu"
-    "csrc/permute_cols.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
-    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
-    "csrc/sparse/cutlass/sparse_compressor_entry.cu"
-    "csrc/cutlass_extensions/common.cpp")
-
-  set_gencode_flags_for_srcs(
-    SRCS "${VLLM_EXT_SRC}"
-    CUDA_ARCHS "${CUDA_ARCHS}")
-
-  # Only build Marlin kernels if we are building for at least some compatible archs.
-  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
-  # are not supported by Machete yet.
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
-  if (MARLIN_ARCHS)
-    set(MARLIN_SRCS
-       "csrc/quantization/fp8/fp8_marlin.cu"
-       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
-       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
-       "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
-       "csrc/quantization/gptq_marlin/gptq_marlin.cu"
-       "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
-       "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${MARLIN_SRCS}"
-      CUDA_ARCHS "${MARLIN_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
-    message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
-  else()
-    message(STATUS "Not building Marlin kernels as no compatible archs found"
-                   " in CUDA target architectures")
-  endif()
-
-  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
-  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-    set(SRCS 
-       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
-    message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
-  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-      message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
-                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
-                     "later if you intend on running FP8 quantized models on "
-                     "Hopper.")
-    else()
-      message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
-                     "in CUDA target architectures")
-    endif()
-
-    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
-    # build any 3x kernels
-    set(SCALED_MM_3X_ARCHS)
-  endif()
+    "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")

  #
-  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
-  # kernels for the remaining archs that are not already built for 3x.
-  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
-  # subtract out the archs that are already built for 3x
-  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
-  if (SCALED_MM_2X_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
-    message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
-  else()
-    if (SCALED_MM_3X_ARCHS)
-      message(STATUS "Not building scaled_mm_c2x as all archs are already built"
-                     " for and covered by scaled_mm_c3x")
-    else()
-      message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
-                    "in CUDA target architectures")
-    endif()
+  # The CUTLASS kernels for Hopper require sm90a to be enabled.
+  # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
+  # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
+    set_source_files_properties(
+          "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
+          PROPERTIES
+          COMPILE_FLAGS
+          "-gencode arch=compute_90a,code=sm_90a")
  endif()

-  #
-  # 2:4 Sparse Kernels
-
-  # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
-  # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
-    set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
-             "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
-    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
-  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
-      message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
-                     "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
-                     "if you intend on running FP8 sparse quantized models on Hopper.")
-    else()
-      message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found "
-                     "in CUDA target architectures")
-    endif()
-  endif()
-
-
-  #
-  # Machete kernels
-
-  # The machete kernels only work on hopper and require CUDA 12.0 or later.
-  # Only build Machete kernels if we are building for something compatible with sm90a
-  cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
-    #
-    # For the Machete kernels we automatically generate sources for various
-    # preselected input type pairs and schedules.
-    # Generate sources:
-    set(MACHETE_GEN_SCRIPT
-      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
-    file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
-
-    message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
-    message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")
-
-    if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
-        OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
-      execute_process(
-        COMMAND ${CMAKE_COMMAND} -E env
-        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
-          ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
-        RESULT_VARIABLE machete_generation_result
-        OUTPUT_VARIABLE machete_generation_output
-        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
-        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
-      )
-
-      if (NOT machete_generation_result EQUAL 0)
-        message(FATAL_ERROR "Machete generation failed."
-                            " Result: \"${machete_generation_result}\""
-                            "\nCheck the log for details: "
-                            "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
-      else()
-        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
-            CACHE STRING "Last run machete generate script hash" FORCE)
-        message(STATUS "Machete generation completed successfully.")
-      endif()
-    else()
-      message(STATUS "Machete generation script has not changed, skipping generation.")
-    endif()
-
-    # Add machete generated sources
-    file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
-    list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
-
-    # forward compatible
-    set_gencode_flags_for_srcs(
-      SRCS "${MACHETE_GEN_SOURCES}"
-      CUDA_ARCHS "${MACHETE_ARCHS}")
-
-    list(APPEND VLLM_EXT_SRC
-      csrc/quantization/machete/machete_pytorch.cu)
-
-    message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
-  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
-        AND MACHETE_ARCHS)
-      message(STATUS "Not building Machete kernels as CUDA Compiler version is "
-                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
-                     "later if you intend on running w4a16 quantized models on "
-                     "Hopper.")
-    else()
-      message(STATUS "Not building Machete kernels as no compatible archs "
-                     "found in CUDA target architectures")
-    endif()
-  endif()
-# if CUDA endif
 endif()

-message(STATUS "Enabling C extension.")
 define_gpu_extension_target(
  _C
  DESTINATION vllm
@ -462,51 +204,14 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)

-# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
-# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
-# driver API. This causes problems when linking with earlier versions of CUDA.
-# Setting this variable sidesteps the issue by calling the driver directly.
-target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
-
 #
 # _moe_C extension
 #

 set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
-  "csrc/moe/moe_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")

-set_gencode_flags_for_srcs(
-  SRCS "${VLLM_MOE_EXT_SRC}"
-  CUDA_ARCHS "${CUDA_ARCHS}")
-
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
-  if (MARLIN_MOE_ARCHS)
-    set(MARLIN_MOE_SRC
-        "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
-        "csrc/moe/marlin_moe_ops.cu")
-
-    set_gencode_flags_for_srcs(
-      SRCS "${MARLIN_MOE_SRC}"
-      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
-
-    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
-    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
-  else()
-    message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
-                   " in CUDA target architectures")
-  endif()
-endif()
-
-message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
  _moe_C
  DESTINATION vllm
@ -517,96 +222,90 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)

-if(VLLM_GPU_LANG STREQUAL "HIP")
-  #
-  # _rocm_C extension
-  #
-  set(VLLM_ROCM_EXT_SRC
-    "csrc/rocm/torch_bindings.cpp"
-    "csrc/rocm/attention.cu")
+#
+# _punica_C extension
+#

+set(VLLM_PUNICA_EXT_SRC
+  "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
+  "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
+  "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
+  "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
+  "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
+  "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
+  "csrc/punica/punica_ops.cu"
+  "csrc/punica/torch_bindings.cpp")
+
+#
+# Copy GPU compilation flags+update for punica
+#
+set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
+list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
+  "-D__CUDA_NO_HALF_OPERATORS__"
+  "-D__CUDA_NO_HALF_CONVERSIONS__"
+  "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
+  "-D__CUDA_NO_HALF2_OPERATORS__")
+
+#
+# Filter out CUDA architectures < 8.0 for punica.
+#
+if (${VLLM_GPU_LANG} STREQUAL "CUDA")
+  set(VLLM_PUNICA_GPU_ARCHES)
+  foreach(ARCH ${VLLM_GPU_ARCHES})
+    string_to_ver(CODE_VER ${ARCH})
+    if (CODE_VER GREATER_EQUAL 8.0)
+      list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
+    endif()
+  endforeach()
+  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
+elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
+  set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
+  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
+endif()
+
+if (VLLM_PUNICA_GPU_ARCHES)
  define_gpu_extension_target(
-    _rocm_C
+    _punica_C
    DESTINATION vllm
    LANGUAGE ${VLLM_GPU_LANG}
-    SOURCES ${VLLM_ROCM_EXT_SRC}
-    COMPILE_FLAGS ${VLLM_GPU_FLAGS}
-    ARCHITECTURES ${VLLM_GPU_ARCHES}
+    SOURCES ${VLLM_PUNICA_EXT_SRC}
+    COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
+    ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
    USE_SABI 3
    WITH_SOABI)
-endif()
-
-# vllm-flash-attn currently only supported on CUDA
-if (NOT VLLM_GPU_LANG STREQUAL "CUDA")
-  return()
-endif ()
-
-# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
-# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
-# arches in the CUDA case (and instead set the gencodes on a per file basis)
-# we need to manually set VLLM_GPU_ARCHES here.
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  foreach(_ARCH ${CUDA_ARCHS})
-    string(REPLACE "." "" _ARCH "${_ARCH}")
-    list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
-  endforeach()
-endif()
-
-#
-# Build vLLM flash attention from source
-#
-# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
-# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
-# They should be identical but if they aren't, this is a massive footgun.
-#
-# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
-# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
-# If no component is specified, vllm-flash-attn is still installed.
-
-# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
-# This is to enable local development of vllm-flash-attn within vLLM.
-# It can be set as an environment variable or passed as a cmake argument.
-# The environment variable takes precedence.
-if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
-  set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
-endif()
-
-if(VLLM_FLASH_ATTN_SRC_DIR)
-  FetchContent_Declare(
-          vllm-flash-attn SOURCE_DIR 
-          ${VLLM_FLASH_ATTN_SRC_DIR}
-          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
-  )
 else()
-  FetchContent_Declare(
-          vllm-flash-attn
-          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c
-          GIT_PROGRESS TRUE
-          # Don't share the vllm-flash-attn build between build types
-          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
-  )
+  message(WARNING "Unable to create _punica_C target because none of the "
+    "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
 endif()

+#
+# Add the `default` target which detects which extensions should be
+# built based on platform/architecture.  This is the same logic that
+# setup.py uses to select which extensions should be built and should
+# be kept in sync.
+#
+# The `default` target makes direct use of cmake easier since knowledge
+# of which extensions are supported has been factored in, e.g.
+#
+# mkdir build && cd build
+# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
+# cmake --build . --target default
+#
+add_custom_target(default)

-# Fetch the vllm-flash-attn library
-FetchContent_MakeAvailable(vllm-flash-attn)
-message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
+  message(STATUS "Enabling C extension.")
+  add_dependencies(default _C)

-# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
-# case only one is built, in the case both are built redundant work is done)
-install(
-  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
-  DESTINATION vllm_flash_attn
-  COMPONENT _vllm_fa2_C
-  FILES_MATCHING PATTERN "*.py"
-)
+  message(STATUS "Enabling moe extension.")
+  add_dependencies(default _moe_C)

-install(
-  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
-  DESTINATION vllm_flash_attn
-  COMPONENT _vllm_fa3_C
-  FILES_MATCHING PATTERN "*.py"
-)
-
-# Nothing after vllm-flash-attn, see comment about macros above
+  # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
+  # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
+  # there are supported target arches.
+  if (VLLM_PUNICA_GPU_ARCHES AND
+      (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS))
+    message(STATUS "Enabling punica extension.")
+    add_dependencies(default _punica_C)
+  endif()
+endif()
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@ -1,128 +0,0 @@
-
-# vLLM Code of Conduct
-
-## Our Pledge
-
-We as members, contributors, and leaders pledge to make participation in our
-community a harassment-free experience for everyone, regardless of age, body
-size, visible or invisible disability, ethnicity, sex characteristics, gender
-identity and expression, level of experience, education, socioeconomic status,
-nationality, personal appearance, race, caste, color, religion, or sexual
-identity and orientation.
-
-We pledge to act and interact in ways that contribute to an open, welcoming,
-diverse, inclusive, and healthy community.
-
-## Our Standards
-
-Examples of behavior that contributes to a positive environment for our
-community include:
-
-* Demonstrating empathy and kindness toward other people
-* Being respectful of differing opinions, viewpoints, and experiences
-* Giving and gracefully accepting constructive feedback
-* Accepting responsibility and apologizing to those affected by our mistakes,
-  and learning from the experience
-* Focusing on what is best not just for us as individuals, but for the overall
-  community
-
-Examples of unacceptable behavior include:
-
-* The use of sexualized language or imagery, and sexual attention or advances of
-  any kind
-* Trolling, insulting or derogatory comments, and personal or political attacks
-* Public or private harassment
-* Publishing others' private information, such as a physical or email address,
-  without their explicit permission
-* Other conduct which could reasonably be considered inappropriate in a
-  professional setting
-
-## Enforcement Responsibilities
-
-Community leaders are responsible for clarifying and enforcing our standards of
-acceptable behavior and will take appropriate and fair corrective action in
-response to any behavior that they deem inappropriate, threatening, offensive,
-or harmful.
-
-Community leaders have the right and responsibility to remove, edit, or reject
-comments, commits, code, wiki edits, issues, and other contributions that are
-not aligned to this Code of Conduct, and will communicate reasons for moderation
-decisions when appropriate.
-
-## Scope
-
-This Code of Conduct applies within all community spaces, and also applies when
-an individual is officially representing the community in public spaces.
-Examples of representing our community include using an official email address,
-posting via an official social media account, or acting as an appointed
-representative at an online or offline/IRL event.
-
-## Enforcement
-
-Instances of abusive, harassing, or otherwise unacceptable behavior may be
-reported to the community leaders responsible for enforcement in the #code-of-conduct
-channel in the [vLLM Slack](https://slack.vllm.ai).
-All complaints will be reviewed and investigated promptly and fairly.
-
-All community leaders are obligated to respect the privacy and security of the
-reporter of any incident.
-
-## Enforcement Guidelines
-
-Community leaders will follow these Community Impact Guidelines in determining
-the consequences for any action they deem in violation of this Code of Conduct:
-
-### 1. Correction
-
-**Community Impact**: Use of inappropriate language or other behavior deemed
-unprofessional or unwelcome in the community.
-
-**Consequence**: A private, written warning from community leaders, providing
-clarity around the nature of the violation and an explanation of why the
-behavior was inappropriate. A public apology may be requested.
-
-### 2. Warning
-
-**Community Impact**: A violation through a single incident or series of
-actions.
-
-**Consequence**: A warning with consequences for continued behavior. No
-interaction with the people involved, including unsolicited interaction with
-those enforcing the Code of Conduct, for a specified period of time. This
-includes avoiding interactions in community spaces as well as external channels
-like social media. Violating these terms may lead to a temporary or permanent
-ban.
-
-### 3. Temporary Ban
-
-**Community Impact**: A serious violation of community standards, including
-sustained inappropriate behavior.
-
-**Consequence**: A temporary ban from any sort of interaction or public
-communication with the community for a specified period of time. No public or
-private interaction with the people involved, including unsolicited interaction
-with those enforcing the Code of Conduct, is allowed during this period.
-Violating these terms may lead to a permanent ban.
-
-### 4. Permanent Ban
-
-**Community Impact**: Demonstrating a pattern of violation of community
-standards, including sustained inappropriate behavior, harassment of an
-individual, or aggression toward or disparagement of classes of individuals.
-
-**Consequence**: A permanent ban from any sort of public interaction within the
-community.
-
-## Attribution
-
-This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
-version 2.1, available at
-[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
-
-Community Impact Guidelines were inspired by
-[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).
-
-For answers to common questions about this code of conduct, see the
-[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
-[Contributor Covenant translations](https://www.contributor-covenant.org/translations).
-
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,3 +1,56 @@
 # Contributing to vLLM

-You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
+Thank you for your interest in contributing to vLLM!
+Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
+There are several ways you can contribute to the project:
+
+- Identify and report any issues or bugs.
+- Request or add a new model.
+- Suggest or implement new features.
+
+However, remember that contributions aren't just about code.
+We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
+
+Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
+Talk about it in your blog posts, highlighting how it's driving your incredible projects.
+Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
+
+
+## Setup for development
+
+### Build from source
+
+```bash
+pip install -e .  # This may take several minutes.
+```
+
+### Testing
+
+```bash
+pip install -r requirements-dev.txt
+
+# linting and formatting
+bash format.sh
+# Static type checking
+mypy
+# Unit tests
+pytest tests/
+```
+**Note:** Currently, the repository does not pass the mypy tests.
+
+
+## Contributing Guidelines
+
+### Issue Reporting
+
+If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
+If not, please file a new issue, providing as much relevant information as possible.
+
+### Pull Requests & Code Reviews
+
+Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution.
+
+### Thank You
+
+Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
+Your contributions make vLLM a great tool for everyone!
--- a/34
+++ b/34
@ -1,34 +0,0 @@
-Developer Certificate of Origin
-Version 1.1
-
-Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
-
-Everyone is permitted to copy and distribute verbatim copies of this
-license document, but changing it is not allowed.
-
-
-Developer's Certificate of Origin 1.1
-
-By making a contribution to this project, I certify that:
-
-(a) The contribution was created in whole or in part by me and I
-    have the right to submit it under the open source license
-    indicated in the file; or
-
-(b) The contribution is based upon previous work that, to the best
-    of my knowledge, is covered under an appropriate open source
-    license and I have the right under that license to submit that
-    work with modifications, whether created in whole or in part
-    by me, under the same open source license (unless I am
-    permitted to submit under a different license), as indicated
-    in the file; or
-
-(c) The contribution was provided directly to me by some other
-    person who certified (a), (b) or (c) and I have not modified
-    it.
-
-(d) I understand and agree that this project and the contribution
-    are public and that a record of the contribution (including all
-    personal information I submit with it, including my sign-off) is
-    maintained indefinitely and may be redistributed consistent with
-    this project or the open source license(s) involved.
--- a/217
+++ b/217
@ -2,39 +2,35 @@
 # to run the OpenAI compatible server.

 # Please update any changes made here to
-# docs/source/contributing/dockerfile/dockerfile.md and
-# docs/source/assets/contributing/dockerfile-stages-dependency.png
+# docs/source/dev/dockerfile/dockerfile.rst and
+# docs/source/assets/dev/dockerfile-stages-dependency.png

 ARG CUDA_VERSION=12.4.1
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
+
 ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.12
-ARG TARGETPLATFORM
+ARG PYTHON_VERSION=3.10
+
 ENV DEBIAN_FRONTEND=noninteractive

-# Install Python and other dependencies
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl sudo \
+    && apt-get install -y ccache software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
-    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
-    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
-    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
-    && python3 --version && python3 -m pip --version
+    && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
+    && python3 --version

-# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
-# as it was causing spam when compiling the CUTLASS kernels
-RUN apt-get install -y gcc-10 g++-10
-RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
-RUN <<EOF
-gcc --version
-EOF
+RUN apt-get update -y \
+    && apt-get install -y git curl sudo
+
+# Install pip s.t. it will be compatible with our PYTHON_VERSION
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
+RUN python3 -m pip --version

 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@ -45,35 +41,27 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 WORKDIR /workspace

 # install build and runtime dependencies
-
-# arm64 (GH200) build follows the practice of "use existing pytorch" build,
-# we need to install torch and torchvision from the nightly builds first,
-# pytorch will not appear as a vLLM dependency in all of the following steps
-# after this step
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121";  \
-    fi
-
 COPY requirements-common.txt requirements-common.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-cuda.txt

+COPY requirements-mamba.txt requirements-mamba.txt
+RUN python3 -m pip install packaging
+RUN python3 -m pip install -r requirements-mamba.txt
+
 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
 # see https://github.com/pytorch/pytorch/pull/123243
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
-# Override the arch list for flash-attn to reduce the binary size
-ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
-ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
 #################### BASE BUILD IMAGE ####################

 #################### WHEEL BUILD IMAGE ####################
 FROM base AS build
-ARG TARGETPLATFORM
+
+ARG PYTHON_VERSION=3.10

 # install build dependencies
 COPY requirements-build.txt requirements-build.txt
@ -81,10 +69,18 @@ COPY requirements-build.txt requirements-build.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-build.txt

-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+# install compiler cache to speed up compilation leveraging local or remote caching
+RUN apt-get update -y && apt-get install -y ccache
+
+# files and directories related to build wheels
+COPY csrc csrc
+COPY setup.py setup.py
+COPY cmake cmake
+COPY CMakeLists.txt CMakeLists.txt
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
+COPY pyproject.toml pyproject.toml
+COPY vllm vllm

 # max jobs used by Ninja to build extensions
 ARG max_jobs=2
@ -92,24 +88,27 @@ ENV MAX_JOBS=${max_jobs}
 # number of threads used by nvcc
 ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads
+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
+ARG buildkite_commit
+ENV BUILDKITE_COMMIT=${buildkite_commit}

 ARG USE_SCCACHE
-ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
-ARG SCCACHE_REGION_NAME=us-west-2
-ARG SCCACHE_S3_NO_CREDENTIALS=0
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
        && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
        && tar -xzf sccache.tar.gz \
        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
-        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
-        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
-        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
-        && export SCCACHE_IDLE_TIMEOUT=0 \
+        && if [ "$CUDA_VERSION" = "11.8.0" ]; then \
+            export SCCACHE_BUCKET=vllm-build-sccache-2; \
+           else \
+            export SCCACHE_BUCKET=vllm-build-sccache; \
+           fi \
+        && export SCCACHE_REGION=us-west-2 \
        && export CMAKE_BUILD_TYPE=Release \
        && sccache --show-stats \
        && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
@ -119,22 +118,14 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=.git,target=.git  \
    if [ "$USE_SCCACHE" != "1" ]; then \
        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
    fi

-# Check the size of the wheel if RUN_WHEEL_CHECK is true
+# check the size of the wheel, we cannot upload wheels larger than 100MB
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
-# sync the default value with .buildkite/check-wheel-size.py
-ARG VLLM_MAX_SIZE_MB=400
-ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
-ARG RUN_WHEEL_CHECK=true
-RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
-        python3 check-wheel-size.py dist; \
-    else \
-        echo "Skipping wheel size check."; \
-    fi
+RUN python3 check-wheel-size.py dist
+
 #################### EXTENSION Build IMAGE ####################

 #################### DEV IMAGE ####################
@ -145,35 +136,47 @@ COPY requirements-test.txt requirements-test.txt
 COPY requirements-dev.txt requirements-dev.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-dev.txt
+
 #################### DEV IMAGE ####################
+#################### MAMBA Build IMAGE ####################
+FROM dev as mamba-builder
+# max jobs used for build
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+
+WORKDIR /usr/src/mamba
+
+COPY requirements-mamba.txt requirements-mamba.txt
+
+# Download the wheel or build it if a pre-compiled release doesn't exist
+RUN pip --verbose wheel -r requirements-mamba.txt \
+    --no-build-isolation --no-deps --no-cache-dir
+
+#################### MAMBA Build IMAGE ####################

 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-# TODO: Restore to base image after FlashInfer AOT wheel fixed
-FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
+FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.12
+ARG PYTHON_VERSION=3.10
 WORKDIR /vllm-workspace
-ENV DEBIAN_FRONTEND=noninteractive
-ARG TARGETPLATFORM

-RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
-    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
-
-# Install Python and other dependencies
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+    && apt-get install -y ccache software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
-    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
-    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
-    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
-    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
-    && python3 --version && python3 -m pip --version
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
+    && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
+    && python3 --version
+
+RUN apt-get update -y \
+    && apt-get install -y python3-pip git vim curl libibverbs-dev
+
+# Install pip s.t. it will be compatible with our PYTHON_VERSION
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
+RUN python3 -m pip --version

 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@ -181,46 +184,20 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

-# arm64 (GH200) build follows the practice of "use existing pytorch" build,
-# we need to install torch and torchvision from the nightly builds first,
-# pytorch will not appear as a vLLM dependency in all of the following steps
-# after this step
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \
-    fi
-
-# Install vllm wheel first, so that torch etc will be installed.
+# install vllm wheel first, so that torch etc will be installed
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install dist/*.whl --verbose

-# How to build this FlashInfer wheel:
-# $ export FLASHINFER_ENABLE_AOT=1
-# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
-# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
-# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
-# $ cd flashinfer
-# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
-# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
+RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \
+    --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir

 RUN --mount=type=cache,target=/root/.cache/pip \
-. /etc/environment && \
-if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-    python3 -m pip install https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.0.post1-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
-fi
-COPY examples examples
-
-# Although we build Flashinfer with AOT mode, there's still
-# some issues w.r.t. JIT compilation. Therefore we need to
-# install build dependencies for JIT compilation.
-# TODO: Remove this once FlashInfer AOT wheel is fixed
-COPY requirements-build.txt requirements-build.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -r requirements-build.txt
-
+    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp310-cp310-linux_x86_64.whl
 #################### vLLM installation IMAGE ####################

+
 #################### TEST IMAGE ####################
 # image to run unit testing suite
 # note that this uses vllm installed by `pip`
@ -232,48 +209,24 @@ ADD . /vllm-workspace/
 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-dev.txt

-# install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -e tests/vllm_test_utils
-
-# enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER 1
-
-# Copy in the v1 package for testing (it isn't distributed yet)
-COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
-
 # doc requires source code
 # we hide them inside `test_docs/` , so that this source code
 # will not be imported by other tests
 RUN mkdir test_docs
 RUN mv docs test_docs/
 RUN mv vllm test_docs/
+
 #################### TEST IMAGE ####################

 #################### OPENAI API SERVER ####################
-# base openai image with additional requirements, for any subsequent openai-style images
-FROM vllm-base AS vllm-openai-base
+# openai api server alternative
+FROM vllm-base AS vllm-openai

 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
-    else \
-        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
-    fi
+    pip install accelerate hf_transfer 'modelscope!=1.15.0'

 ENV VLLM_USAGE_SOURCE production-docker-image

-# define sagemaker first, so it is not default from `docker build`
-FROM vllm-openai-base AS vllm-sagemaker
-
-COPY examples/online_serving/sagemaker-entrypoint.sh .
-RUN chmod +x sagemaker-entrypoint.sh
-ENTRYPOINT ["./sagemaker-entrypoint.sh"]
-
-FROM vllm-openai-base AS vllm-openai
-
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 #################### OPENAI API SERVER ####################
--- a/Dockerfile.arm
+++ b/Dockerfile.arm
@ -1,62 +0,0 @@
-# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
-
-FROM ubuntu:22.04 AS cpu-test-arm
-
-ENV CCACHE_DIR=/root/.cache/ccache
-
-ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
-
-RUN --mount=type=cache,target=/var/cache/apt \
-    apt-get update -y \
-    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
-    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
-
-# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install py-cpuinfo  # Use this to gather CPU info and optimize based on ARM Neoverse cores
-
-# Set LD_PRELOAD for tcmalloc on ARM
-ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
-
-RUN echo 'ulimit -c 0' >> ~/.bashrc
-
-WORKDIR /workspace
-
-ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
-ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
-    pip install --upgrade pip && \
-    pip install -r requirements-build.txt
-
-FROM cpu-test-arm AS build
-
-WORKDIR /workspace/vllm
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
-    pip install -v -r requirements-cpu.txt
-
-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
-
-# Disabling AVX512 specific optimizations for ARM
-ARG VLLM_CPU_DISABLE_AVX512="true"
-ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=bind,source=.git,target=.git \
-    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
-    pip install dist/*.whl && \
-    rm -rf dist
-
-WORKDIR /workspace/
-
-RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
-
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@ -2,68 +2,40 @@

 FROM ubuntu:22.04 AS cpu-test-1

-ENV CCACHE_DIR=/root/.cache/ccache
-
-ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
-
-RUN --mount=type=cache,target=/var/cache/apt \
-    apt-get update -y \
-    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+RUN apt-get update -y \
+    && apt-get install -y curl git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
 # intel-openmp provides additional performance improvement vs. openmp
 # tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install intel-openmp==2025.0.1
+RUN pip install intel-openmp

-ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"

 RUN echo 'ulimit -c 0' >> ~/.bashrc

-RUN pip install intel_extension_for_pytorch==2.5.0
+RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl

-WORKDIR /workspace
-
-ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
-ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
-    pip install --upgrade pip && \
-    pip install -r requirements-build.txt
+RUN pip install --upgrade pip \
+    && pip install wheel packaging ninja "setuptools>=49.4.0" numpy

 FROM cpu-test-1 AS build

+COPY ./ /workspace/vllm
+
 WORKDIR /workspace/vllm

-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
-    pip install -v -r requirements-cpu.txt
-
-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/test/cpu

 # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
 ARG VLLM_CPU_DISABLE_AVX512
 ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=bind,source=.git,target=.git \
-    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
-    pip install dist/*.whl && \
-    rm -rf dist
+RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install

 WORKDIR /workspace/

 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

-# install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -e tests/vllm_test_utils
-
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Dockerfile.hpu
+++ b/Dockerfile.hpu
@ -1,21 +0,0 @@
-FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
-
-COPY ./ /workspace/vllm
-
-WORKDIR /workspace/vllm
-
-RUN pip install -v -r requirements-hpu.txt
-
-ENV no_proxy=localhost,127.0.0.1
-ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
-
-RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
-
-# install development dependencies (for testing)
-RUN python3 -m pip install -e tests/vllm_test_utils
-
-WORKDIR /workspace/
-
-RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
-
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Show More
+++ b/Show More