ed6e9075d3
[Bugfix] Fix deepseekv3 grouped topk error ( #13474 )
...
Signed-off-by: Chen-XiaoBing <chenxb002@whu.edu.cn >
2025-02-20 06:47:01 -08:00
992e5c3d34
Merge similar examples in offline_inference into single basic example ( #12737 )
2025-02-20 04:53:51 -08:00
b69692a2d8
[Kernel] LoRA - Refactor sgmv kernels ( #13110 )
2025-02-20 07:28:06 -05:00
a64a84433d
[2/n][ci] S3: Use full model path ( #13564 )
...
Signed-off-by: <>
2025-02-20 01:20:15 -08:00
aa1e62d0db
[ci] Fix spec decode test ( #13600 )
2025-02-20 16:56:00 +08:00
497bc83124
[CI/Build] Use uv in the Dockerfile ( #13566 )
2025-02-19 23:05:44 -08:00
3738e6fa80
[API Server] Add port number range validation ( #13506 )
...
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
2025-02-20 15:05:13 +08:00
0023cd2b9d
[ROCm] MI300A compile targets deprecation ( #13560 )
2025-02-19 23:05:00 -08:00
041e294716
[Misc] add mm_processor_kwargs to extra_body for Qwen2.5-VL ( #13533 )
2025-02-19 23:04:30 -08:00
9621667874
[Misc] Warn if the vLLM version can't be retrieved ( #13501 )
...
Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com >
2025-02-20 06:24:48 +00:00
8c755c3b6d
[bugfix] spec decode worker get tp group only when initialized ( #13578 )
2025-02-20 04:46:28 +00:00
ba81163997
[core] add sleep and wake up endpoint and v1 support ( #12987 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Signed-off-by: cennn <2523403608@qq.com >
Co-authored-by: cennn <2523403608@qq.com >
2025-02-20 12:41:17 +08:00
0d243f2a54
[ROCm][MoE] mi300 mixtral8x7B perf for specific BS ( #13577 )
...
Signed-off-by: Divakar Verma <divakar.verma@amd.com >
2025-02-20 04:01:02 +00:00
88f6ba3281
[ci] Add AWS creds for AMD ( #13572 )
2025-02-20 03:56:06 +00:00
512368e34a
[Misc] Qwen2.5 VL support LoRA ( #13261 )
2025-02-19 18:37:55 -08:00
473f51cfd9
[3/n][CI] Load Quantization test models with S3 ( #13570 )
...
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal >
2025-02-20 10:12:30 +08:00
a4c402a756
[BugFix] Avoid error traceback in logs when V1 LLM terminates ( #13565 )
...
Signed-off-by: Nick Hill <nhill@redhat.com >
2025-02-20 00:49:01 +00:00
550d97eb58
[Misc] Avoid calling unnecessary hf_list_repo_files for local model path ( #13348 )
...
Signed-off-by: isotr0py <2037008807@qq.com >
2025-02-19 18:57:48 +00:00
fbbe1fbac6
[MISC] Logging the message about Ray teardown ( #13502 )
...
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com >
Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com >
2025-02-19 09:40:50 -08:00
01c184b8f3
Fix copyright year to auto get current year ( #13561 )
2025-02-19 16:55:34 +00:00
ad5a35c21b
[doc] clarify multi-node serving doc ( #13558 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-19 22:32:17 +08:00
5ae9f26a5a
[Bugfix] Fix device ordinal for multi-node spec decode ( #13269 )
...
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com >
2025-02-19 22:13:15 +08:00
377d10bd14
[VLM][Bugfix] Pass processor kwargs properly on init ( #13516 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-02-19 13:13:50 +00:00
52ce14d31f
[doc] clarify profiling is only for developers ( #13554 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-19 20:55:58 +08:00
81dabf24a8
[CI/Build] force writing version file ( #13544 )
...
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com >
2025-02-19 18:48:03 +08:00
423330263b
[Feature] Pluggable platform-specific scheduler ( #13161 )
...
Signed-off-by: Yannick Schnider <yannick.schnider1@ibm.com >
Signed-off-by: Yannick Schnider <Yannick.Schnider1@ibm.com >
2025-02-19 17:16:38 +08:00
caf7ff4456
[V1][Core] Generic mechanism for handling engine utility ( #13060 )
...
Signed-off-by: Nick Hill <nhill@redhat.com >
2025-02-19 17:09:22 +08:00
f525c0be8b
[Model][Speculative Decoding] DeepSeek MTP spec decode ( #12755 )
...
Signed-off-by: Lu Fang <fanglu@fb.com >
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com >
2025-02-19 17:06:23 +08:00
983a40a8bb
[Bugfix] Fix Positive Feature Layers in Llava Models ( #13514 )
...
Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com >
2025-02-19 08:50:07 +00:00
fdc5df6f54
use device param in load_model method ( #13037 )
2025-02-19 16:05:02 +08:00
3b05cd4555
[perf-benchmark] Fix ECR path for premerge benchmark ( #13512 )
...
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal >
2025-02-19 07:56:11 +00:00
d5d214ac7f
[1/n][CI] Load models in CI from S3 instead of HF ( #13205 )
...
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal >
2025-02-19 07:34:59 +00:00
fd84857f64
[Doc] Add clarification note regarding paligemma ( #13511 )
2025-02-18 22:24:03 -08:00
8aada19dfc
[ROCm][MoE configs] mi325 mixtral & mi300 qwen_moe ( #13503 )
2025-02-18 22:23:24 -08:00
9aa95b0e6a
[perf-benchmark] Allow premerge ECR ( #13509 )
...
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal >
2025-02-19 05:13:41 +00:00
d0a7a2769d
[Hardware][Gaudi][Feature] Support Contiguous Cache Fetch ( #12139 )
...
Signed-off-by: yuzhou <yuzhou@habana.ai >
Signed-off-by: zhouyu5 <yu.zhou@intel.com >
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com >
2025-02-18 19:40:19 -08:00
00b69c2d27
[Misc] Remove dangling references to --use-v2-block-manager ( #13492 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-02-19 03:37:26 +00:00
4c82229898
[V1][Spec Decode] Optimize N-gram matching with Numba ( #13365 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-18 13:19:58 -08:00
c8d70e2437
Pin Ray version to 2.40.0 ( #13490 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-18 12:50:31 -08:00
30172b4947
[V1] Optimize handling of sampling metadata and req_ids list ( #13244 )
...
Signed-off-by: Nick Hill <nhill@redhat.com >
2025-02-18 12:15:33 -08:00
a4d577b379
[V1][Tests] Adding additional testing for multimodal models to V1 ( #13308 )
...
Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com >
2025-02-18 09:53:14 -08:00
7b203b7694
[misc] fix debugging code ( #13487 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-18 09:37:11 -08:00
4fb8142a0e
[V1][PP] Enable true PP with Ray executor ( #13472 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-18 09:15:32 -08:00
a02c86b4dd
[CI/Build] migrate static project metadata from setup.py to pyproject.toml ( #8772 )
2025-02-18 08:02:49 -08:00
3809458456
[Bugfix] Fix invalid rotary embedding unit test ( #13431 )
...
Signed-off-by: Liangfu Chen <liangfc@amazon.com >
2025-02-18 11:52:03 +00:00
d3231cb436
[Bugfix] Handle content type with optional parameters ( #13383 )
...
Signed-off-by: Zifei Tong <zifeitong@gmail.com >
2025-02-18 11:29:13 +00:00
435b502a6e
[ROCm] Make amdsmi import optional for other platforms ( #13460 )
2025-02-18 03:15:56 -08:00
29fc5772c4
[Bugfix] Remove noisy error logging during local model loading ( #13458 )
2025-02-18 03:15:48 -08:00
2358ca527b
[Doc]: Improve feature tables ( #13224 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-02-18 18:52:39 +08:00
8cf97f8661
[Bugfix] Fix failing transformers dynamic module resolving with spawn multiproc method ( #13403 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2025-02-18 10:25:53 +00:00
e2603fefb8
[Bugfix] Ensure LoRA path from the request can be included in err msg ( #13450 )
...
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
2025-02-18 16:19:15 +08:00
b53d79983c
Add outlines fallback when JSON schema has enum ( #13449 )
...
Signed-off-by: mgoin <mgoin64@gmail.com >
2025-02-18 06:49:41 +00:00
9915912f7f
[V1][PP] Fix & Pin Ray version in requirements-cuda.txt ( #13436 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-17 21:58:06 -08:00
d1b649f1ef
[Quant] Aria SupportsQuant ( #13416 )
2025-02-17 21:51:09 -08:00
ac19b519ed
[core] fix sleep mode in pytorch 2.6 ( #13456 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-18 13:48:10 +08:00
a1074b3efe
[Bugfix] Only print out chat template when supplied ( #13444 )
2025-02-17 21:43:31 -08:00
00294e1bc6
[Quant] Arctic SupportsQuant ( #13366 )
2025-02-17 21:35:09 -08:00
88787bce1d
[Quant] Molmo SupportsQuant ( #13336 )
2025-02-17 21:34:47 -08:00
932b51cedd
[v1] fix parallel config rank ( #13445 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-18 12:33:45 +08:00
7c7adf81fc
[ROCm] fix get_device_name for rocm ( #13438 )
...
Signed-off-by: Divakar Verma <divakar.verma@amd.com >
2025-02-18 04:07:12 +00:00
67ef8f666a
[Model] Enable quantization support for transformers backend ( #12960 )
2025-02-17 19:52:47 -08:00
efbe854448
[Misc] Remove dangling references to SamplingType.BEAM ( #13402 )
2025-02-17 19:52:35 -08:00
b3942e157e
[Bugfix][CI][V1] Work around V1 + CUDA Graph + torch._scaled_mm fallback issue ( #13425 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2025-02-18 00:32:48 +00:00
cd4a72a28d
[V1][Spec decode] Move drafter to model runner ( #13363 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-17 15:40:12 -08:00
6ac485a953
[V1][PP] Fix intermediate tensor values ( #13417 )
...
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com >
2025-02-17 13:37:45 -08:00
4c21ce9eba
[V1] Get input tokens from scheduler ( #13339 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-17 11:01:07 -08:00
ce77eb9410
[Bugfix] Fix VLLM_USE_MODELSCOPE issue ( #13384 )
2025-02-17 14:22:01 +00:00
30513d1cb6
[Bugfix] fix xpu communicator ( #13368 )
...
Signed-off-by: yan ma <yan.ma@intel.com >
2025-02-17 20:59:18 +08:00
1f69c4a892
[Model] Support Mamba2 (Codestral Mamba) ( #9292 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
Co-authored-by: Yu Chin Fabian Lim <flim@sg.ibm.com >
2025-02-17 20:17:50 +08:00
7b623fca0b
[VLM] Check required fields before initializing field config in DictEmbeddingItems ( #13380 )
2025-02-17 01:36:07 -08:00
238dfc8ac3
[MISC] tiny fixes ( #13378 )
2025-02-17 00:57:13 -08:00
45186834a0
Run v1 benchmark and integrate with PyTorch OSS benchmark database ( #13068 )
...
Signed-off-by: Huy Do <huydhn@gmail.com >
2025-02-17 08:16:32 +00:00
f857311d13
Fix spelling error in index.md ( #13369 )
2025-02-17 06:53:20 +00:00
46cdd59577
[Feature][Spec Decode] Simplify the use of Eagle Spec Decode ( #12304 )
...
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com >
2025-02-16 19:32:26 -08:00
2010f04c17
[V1][Misc] Avoid unnecessary log output ( #13289 )
2025-02-16 19:26:24 -08:00
69e1d23e1e
[V1][BugFix] Clean up rejection sampler & Fix warning msg ( #13362 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-16 12:25:29 -08:00
d67cc21b78
[Bugfix][Platform][CPU] Fix cuda platform detection on CPU backend edge case ( #13358 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2025-02-16 18:55:27 +00:00
e18227b04a
[V1][PP] Cache Intermediate Tensors ( #13353 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-16 10:02:27 -08:00
7b89386553
[V1][BugFix] Add __init__.py to v1/spec_decode/ ( #13359 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-16 09:39:08 -08:00
da833b0aee
[Docs] Change myenv to vllm. Update python_env_setup.inc.md ( #13325 )
2025-02-16 16:04:21 +00:00
5d2965b7d7
[Bugfix] Fix 2 Node and Spec Decode tests ( #13341 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-02-16 22:20:22 +08:00
a0231b7c25
[platform] add base class for communicators ( #13208 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-16 22:14:22 +08:00
124776ebd5
[ci] skip failed tests for flashinfer ( #13352 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-16 22:09:15 +08:00
b7d309860e
[V1] Update doc and examples for H2O-VL ( #13349 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2025-02-16 10:35:54 +00:00
dc0f7ccf8b
[BugFix] Enhance test_pos_encoding to support execution on multi-devices ( #13187 )
...
Signed-off-by: wchen61 <wchen61@foxmail.com >
2025-02-16 08:59:49 +00:00
d3d547e057
[Bugfix] Pin xgrammar to 0.1.11 ( #13338 )
2025-02-15 19:42:25 -08:00
12913d17ba
[Quant] Add SupportsQuant to phi3 and clip ( #13104 )
2025-02-15 19:28:33 -08:00
80f63a3966
[V1][Spec Decode] Ngram Spec Decode ( #12193 )
...
Signed-off-by: LiuXiaoxuanPKU <lilyliupku@gmail.com >
2025-02-15 18:05:11 -08:00
367cb8ce8c
[Doc] [2/N] Add Fuyu E2E example for multimodal processor ( #13331 )
2025-02-15 07:06:23 -08:00
54ed913f34
[ci/build] update flashinfer ( #13323 )
2025-02-15 05:33:13 -08:00
9206b3d7ec
[V1][PP] Run engine busy loop with batch queue ( #13064 )
2025-02-15 03:59:01 -08:00
ed0de3e4b8
[AMD] [Model] DeepSeek tunings ( #13199 )
2025-02-15 03:58:09 -08:00
2ad1bc7afe
[V1][Metrics] Add iteration_tokens_total histogram from V0 ( #13288 )
2025-02-15 03:56:19 -08:00
7fdaaf48ef
[Bugfix] Fix qwen2.5-vl image processor ( #13286 )
2025-02-15 03:00:11 -08:00
067fa2255b
[Bugfix] Fix search start_index of stop_checker ( #13280 )
2025-02-14 21:39:42 -08:00
9076325677
[BugFix] Don't scan entire cache dir when loading model ( #13302 )
2025-02-14 21:33:31 -08:00
97a3d6d995
[Bugfix] Massage MLA's usage of flash attn for ROCm ( #13310 )
2025-02-14 21:33:25 -08:00
579d7a63b2
[Bugfix][Docs] Fix offline Whisper ( #13274 )
2025-02-14 21:32:37 -08:00
c9f9d5b397
[Bugfix][AMD] Update torch_bindings so that scaled_fp4_quant isn't build on ROCm ( #13235 )
2025-02-14 20:30:42 -08:00
0c73026844
[V1][PP] Fix memory profiling in PP ( #13315 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-14 20:17:25 -08:00
6a854c7a2b
[V1][Sampler] Don't apply temp for greedy-only ( #13311 )
...
Signed-off-by: Nick Hill <nhill@redhat.com >
2025-02-14 18:10:53 -08:00
e7eea5a520
[V1][CI] Fix failed v1-test because of min_p ( #13316 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-14 17:29:51 -08:00
a12934d3ec
[V1][Core] min_p sampling support ( #13191 )
...
Signed-off-by: Aoyu <aoyuzhan@amazon.com >
Co-authored-by: Aoyu <aoyuzhan@amazon.com >
2025-02-14 15:50:05 -08:00
3bcb8c75da
[Core] Reduce TTFT with concurrent partial prefills ( #10235 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com >
Co-authored-by: Prashant Gupta <prashantgupta@us.ibm.com >
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com >
2025-02-14 15:36:07 -08:00
5e5c8e091e
[Quant][Perf] Use moe_wna16 kernel by default for MoEs with many experts ( #13236 )
...
Signed-off-by: mgoin <mgoin64@gmail.com >
2025-02-14 12:53:42 -08:00
c9e2d644e7
[Hardware][Gaudi][Bugfix] Fix error for guided decoding ( #12317 )
2025-02-14 04:36:49 -08:00
7734e9a291
[Core] choice-based structured output with xgrammar ( #12632 )
2025-02-14 04:36:05 -08:00
6224a9f620
Support logit_bias in v1 Sampler ( #13079 )
2025-02-14 04:34:59 -08:00
085b7b2d6c
[V1] Simplify GPUModelRunner._update_states check ( #13265 )
2025-02-14 04:33:43 -08:00
4da1f667e9
[VLM] Keep track of whether prompt replacements have been applied ( #13215 )
2025-02-14 04:20:46 -08:00
556ef7f714
[Misc] Log time consumption of sleep and wake-up ( #13115 )
...
Signed-off-by: Jun Duan <jun.duan.phd@outlook.com >
2025-02-14 20:10:21 +08:00
83481ceb49
[Bugfix] Fix missing parentheses ( #13263 )
2025-02-14 01:07:10 -08:00
185cc19f92
[Frontend] Optionally remove memory buffer used for uploading to URLs in run_batch ( #12927 )
...
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io >
2025-02-14 08:22:42 +00:00
45f90bcbba
[WIP] TPU V1 Support Refactored ( #13049 )
2025-02-14 00:21:53 -08:00
b0ccfc565a
[Bugfix][V1] GPUModelRunner._update_states should return True when there is a finished request in batch ( #13126 )
2025-02-13 22:39:20 -08:00
ba59b78a9c
[ROCm][V1] Add initial ROCm support to V1 ( #12790 )
2025-02-13 22:21:50 -08:00
cbc40128eb
[V1] LoRA - Enable Serving Usecase ( #12883 )
...
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
2025-02-14 14:21:12 +08:00
f0b2da72a8
Expand MLA to support most types of quantization ( #13181 )
2025-02-13 22:19:22 -08:00
f2b20fe491
Consolidate Llama model usage in tests ( #13094 )
2025-02-13 22:18:03 -08:00
40932d7a05
[Misc] Remove redundant statements in scheduler.py ( #13229 )
2025-02-13 22:07:25 -08:00
84683fa271
[Bugfix] Offline example of disaggregated prefill ( #13214 )
2025-02-13 20:20:47 -08:00
067678262a
[Bugfix][CI] Inherit codespell settings from pyproject.toml in the pre-commit-config ( #13237 )
2025-02-13 20:19:43 -08:00
09545c0a94
[Bugfix/CI] Turn test_compressed_tensors_2of4_sparse back on ( #13250 )
2025-02-13 20:19:25 -08:00
dd5ede4440
[V1] Consolidate MM cache size to vllm.envs ( #13239 )
2025-02-13 20:19:03 -08:00
8c32b08a86
[Kernel] Fix awq error when n is not divisible by 128 ( #13227 )
2025-02-13 20:07:05 -08:00
410886950a
[ROCm] Avoid using the default stream on ROCm ( #13238 )
...
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com >
2025-02-14 09:29:26 +08:00
e38be640e6
Revert "Add label if pre-commit passes" ( #13242 )
2025-02-13 16:12:32 -08:00
c1e37bf71b
[Kernel][Bugfix] Refactor and Fix CUTLASS 2:4 Sparse Kernels ( #13198 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2025-02-14 00:01:14 +00:00
2344192a55
Optimize moe_align_block_size for deepseek_v3 ( #12850 )
...
Signed-off-by: mgoin <mgoin64@gmail.com >
2025-02-13 18:43:37 -05:00
bffddd9a05
Add label if pre-commit passes ( #12527 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-02-13 20:51:30 +00:00
d84cef76eb
[Frontend] Add /v1/audio/transcriptions OpenAI API endpoint ( #12909 )
2025-02-13 07:23:45 -08:00
37dfa60037
[Bugfix] Missing Content Type returns 500 Internal Server Error ( #13193 )
2025-02-13 06:52:22 -08:00
1bc3b5e71b
[VLM] Separate text-only and vision variants of the same model architecture ( #13157 )
2025-02-13 06:19:15 -08:00
02ed8a1fbe
[Misc] Qwen2.5-VL Optimization ( #13155 )
2025-02-13 06:17:57 -08:00
2092a6fa7d
[V1][Core] Add worker_base for v1 worker ( #12816 )
...
Signed-off-by: Aoyu <aoyuzhan@amazon.com >
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: Aoyu <aoyuzhan@amazon.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2025-02-13 20:35:18 +08:00
c9d3ecf016
[VLM] Merged multi-modal processor for Molmo ( #12966 )
2025-02-13 04:34:00 -08:00
fdcf64d3c6
[V1] Clarify input processing and multimodal feature caching logic ( #13211 )
2025-02-13 03:43:24 -08:00
578087e56c
[Frontend] Pass pre-created socket to uvicorn ( #13113 )
2025-02-13 00:51:46 -08:00
fa253f1a70
[VLM] Remove input processor from clip and siglip ( #13165 )
2025-02-13 00:31:37 -08:00
9605c1256e
[V1][core] Implement pipeline parallel on Ray ( #12996 )
2025-02-13 08:02:46 +00:00
0ccd8769fb
[CI/Build] Allow ruff to auto-fix some issues ( #13180 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2025-02-13 07:45:38 +00:00
cb944d5818
Allow Unsloth Dynamic 4bit BnB quants to work ( #12974 )
2025-02-12 23:13:08 -08:00
d46d490c27
[Frontend] Move CLI code into vllm.cmd package ( #12971 )
2025-02-12 23:12:21 -08:00
04f50ad9d1
[Bugfix] deepseek_r1_reasoning_parser put reason content in wrong field in certain edge case ( #13097 )
2025-02-12 23:11:26 -08:00
60c68df6d1
[Build] Automatically use the wheel of the base commit with Python-only build ( #13178 )
2025-02-12 23:10:28 -08:00
009439caeb
Simplify logic of locating CUDART so file path ( #13203 )
...
Signed-off-by: Lu Fang <lufang@fb.com >
2025-02-13 13:52:41 +08:00
bc55d13070
[VLM] Implement merged multimodal processor for Mllama ( #11427 )
2025-02-12 20:26:21 -08:00
d88c8666a1
[Bugfix][Example] Fix GCed profiling server for TPU ( #12792 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2025-02-13 11:52:11 +08:00
4fc5c23bb6
[NVIDIA] Support nvfp4 quantization ( #12784 )
2025-02-12 19:51:51 -08:00
9f9704dca6
[perf-benchmark] cleanup unused Docker images and volumes in H100 benchmark instance ( #12706 )
2025-02-12 19:51:33 -08:00
8eafe5eaea
[CI/Build] Ignore ruff warning up007 ( #13182 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2025-02-13 11:48:31 +08:00
4c0d93f4b2
[V1][Bugfix] Copy encoder input ids to fix set iteration issue during VLM abort ( #13173 )
...
Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com >
2025-02-12 12:58:11 -08:00
14b7899d10
[CI] Fix failing FP8 cpu offload test ( #13170 )
...
Signed-off-by: mgoin <mgoin64@gmail.com >
2025-02-12 19:16:06 +00:00
09972e716c
[Bugfix] Allow fallback to AWQ from AWQMarlin at per-layer granularity ( #13119 )
2025-02-12 09:19:53 -08:00
36a08630e8
[CORE] [QUANT] Support for GPTQModel's dynamic quantization per module override/control ( #7086 )
2025-02-12 09:19:43 -08:00
2c2b560f48
[CI/Build] Use mypy matcher for pre-commit CI job ( #13162 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2025-02-12 17:12:22 +00:00
042c3419fa
Introduce VLLM_CUDART_SO_PATH to allow users to specify the .so path ( #12998 )
...
Signed-off-by: Lu Fang <lufang@fb.com >
2025-02-12 09:06:13 -08:00
82cabf53a3
[Misc] Delete unused LoRA modules ( #13151 )
2025-02-12 08:58:24 -08:00
314cfade02
[Frontend] Generate valid tool call IDs when using tokenizer-mode=mistral ( #12332 )
2025-02-12 08:29:56 -08:00
985b4a2b19
[Bugfix] Fix num video tokens calculation for Qwen2-VL ( #13148 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-02-12 11:55:23 +00:00
f4d97e4fc2
[Bug] [V1] Try fetching stop_reason from EngineOutput before checking the request ( #13108 )
2025-02-12 02:39:16 -08:00
f1042e86f0
[Misc] AMD Build Improvements ( #12923 )
2025-02-12 02:36:10 -08:00
7c4033acd4
Further reduce the HTTP calls to huggingface.co ( #13107 )
2025-02-12 02:34:09 -08:00
d59def4730
Bump actions/setup-python from 5.3.0 to 5.4.0 ( #12672 )
2025-02-12 16:41:22 +08:00
0c7d9effce
Bump helm/chart-testing-action from 2.6.1 to 2.7.0 ( #12463 )
2025-02-12 16:41:06 +08:00
dd3b4a01f8
Bump actions/stale from 9.0.0 to 9.1.0 ( #12462 )
2025-02-12 00:40:25 -08:00
a0597c6b75
Bump helm/kind-action from 1.10.0 to 1.12.0 ( #11612 )
2025-02-12 00:40:19 -08:00
e92694b6fe
[Neuron][Kernel] Support Longer Sequences in NKI-based Flash PagedAttention and Improve Efficiency ( #12921 )
...
Signed-off-by: Lingfan Yu <lingfany@amazon.com >
2025-02-11 21:12:37 -08:00
842b0fd402
[ci] Add more source file dependencies for some tests ( #13123 )
...
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal >
2025-02-11 20:38:10 -08:00
974dfd4971
[Model] IBM/NASA Prithvi Geospatial model ( #12830 )
2025-02-11 20:34:30 -08:00
3ee696a63d
[RFC][vllm-API] Support tokenizer registry for customized tokenizer in vLLM ( #12518 )
...
Signed-off-by: Keyun Tong <tongkeyun@gmail.com >
2025-02-12 12:25:58 +08:00
72c2b68dc9
[Misc] Move pre-commit suggestion back to the end ( #13114 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2025-02-11 22:34:16 +00:00
14ecab5be2
[Bugfix] Guided decoding falls back to outlines when it fails to import xgrammar ( #12976 )
...
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
2025-02-11 18:17:44 +00:00
deb6c1c6b4
[Doc] Improve OpenVINO installation doc ( #13102 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-02-11 18:02:46 +00:00
565c1efa65
[CI/Build][Bugfix] Fix CPU backend default threads num ( #13077 )
2025-02-11 16:55:56 +00:00
2b25b7d2e1
Fix initializing GGUF weights for ColumnParallelLinear when using tensor parallel > 1 ( #13023 )
2025-02-11 08:38:48 -08:00
6c4dbe23eb
[BugFix] Pop instead of del CUDA_VISIBLE_DEVICES ( #12962 )
...
Signed-off-by: Hollow Man <hollowman@opensuse.org >
2025-02-12 00:21:50 +08:00
21f5d50fa5
[Bugfix] Do not use resource module on Windows ( #12858 ) ( #13029 )
2025-02-11 08:21:18 -08:00
bf3e05215c
[Misc] Fix typo at comments at metrics.py ( #13024 )
2025-02-11 08:20:37 -08:00
ad9776353e
Set torch_dtype in TransformersModel ( #13088 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-02-11 23:51:19 +08:00
75e6e14516
[V1][Metrics] Add several request timing histograms ( #12644 )
...
Signed-off-by: Mark McLoughlin <markmc@redhat.com >
2025-02-11 10:14:00 -05:00
110f59a33e
[Bugfix] fix flaky test ( #13089 )
...
Signed-off-by: மனோஜ்குமார் பழனிச்சாமி <smartmanoj42857@gmail.com >
2025-02-11 14:41:20 +00:00
2e3b969ec0
[Platform] add pre_register_and_update function ( #12432 )
...
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
2025-02-11 22:06:46 +08:00
da317197dd
[Build] Fix cuda link target of cumem_allocator in CPU env ( #12863 )
...
Signed-off-by: YuhongGuo <yuhong.gyh@antgroup.com >
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com >
2025-02-11 21:55:57 +08:00
7539bbc6a6
[ROCm] Using a more precise memory profiling ( #12624 )
...
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com >
2025-02-11 21:47:10 +08:00
9cf4759493
[executor] init local_rank as device index ( #13027 )
...
Signed-off-by: Mengqing Cao <cmq0113@163.com >
2025-02-11 21:20:53 +08:00
41c5dd45b9
[V1][Metrics] Add GPU prefix cache hit rate % gauge ( #12592 )
2025-02-11 08:27:25 +00:00
fc6485d277
[Bugfix]: Reasoning output bug according to the chat template change ( #13025 )
...
Signed-off-by: Ce Gao <cegao@tensorchord.ai >
2025-02-11 15:49:03 +08:00
78a141d768
[Misc] LoRA - Refactor Punica ops tests ( #12970 )
...
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
2025-02-11 07:26:03 +00:00
c320ca8edd
[Core] Don't do platform detection at import time ( #12933 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2025-02-11 07:25:25 +00:00
58047c6f04
[Benchmark] Add BurstGPT to benchmark_serving ( #13063 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com >
2025-02-10 21:25:30 -08:00
cb080f32e3
[Bugfix] Support missing tool parameters in mistral tokenizer ( #12884 )
...
Signed-off-by: Florian Greinacher <florian.greinacher@siemens.com >
2025-02-11 03:33:33 +00:00
2c0f58203c
[Docs] Announce Meta Meetup ( #13065 )
...
Signed-off-by: simon-mo <simon.mo@hey.com >
2025-02-10 18:24:29 -08:00
2ff4857678
[V1][Minor] Move scheduler outputs to a separate file ( #13062 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-11 02:10:06 +00:00
91e876750e
[misc] Fix setup.py condition to avoid AMD from being mistaken with CPU ( #13022 )
...
Signed-off-by: kevin <kevin@anyscale.com >
2025-02-10 18:06:16 -08:00
08b2d845d6
[Model] Ultravox Model: Support v0.5 Release ( #12912 )
...
Signed-off-by: Farzad Abdolhosseini <farzad@fixie.ai >
2025-02-10 22:02:48 +00:00
2ae889052c
Fix seed parameter behavior in vLLM ( #13007 )
...
Signed-off-by: மனோஜ்குமார் பழனிச்சாமி <smartmanoj42857@gmail.com >
2025-02-10 23:26:50 +08:00
51f0b5f7f6
[Bugfix] Clean up and fix multi-modal processors ( #13012 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-02-10 10:45:21 +00:00
fde71262e0
[misc] Add retries with exponential backoff for HF file existence check ( #13008 )
2025-02-10 01:15:02 -08:00
243137143c
[Doc] Add link to tool_choice tracking issue in tool_calling.md ( #13003 )
...
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
2025-02-10 06:09:33 +00:00
b2496bb07f
[core] fix sleep mode and pytorch checkpoint compatibility ( #13001 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-10 13:03:43 +08:00
44607e07d3
Check if selected backend is None in get_attn_backend_cls() ( #12975 )
...
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
2025-02-10 11:45:07 +08:00
67c4637ccf
[V1] Use msgpack for core request serialization ( #12918 )
...
Signed-off-by: Nick Hill <nhill@redhat.com >
2025-02-10 11:35:56 +08:00
aa0ca5ebb7
[core][rlhf] add colocate example for RLHF ( #12984 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-10 10:28:59 +08:00
59fff4a01a
[core] improve error handling when wake up from sleep mode ( #12981 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-10 09:38:57 +08:00
29f1d47e73
[MISC] Always import version library first in the vllm package ( #12979 )
...
Signed-off-by: Lu Fang <lufang@fb.com >
2025-02-09 18:56:40 +08:00
cf797aa856
[core] port pynvml into vllm codebase ( #12963 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-09 15:00:00 +08:00
24700c346b
[V1] Cache uses_mrope in GPUModelRunner ( #12969 )
2025-02-08 15:32:32 -08:00
d366ccc4e3
[RFC] [Mistral] FP8 format ( #10130 )
...
Signed-off-by: mgoin <mgoin64@gmail.com >
Co-authored-by: mgoin <mgoin64@gmail.com >
2025-02-08 14:12:53 -07:00
870c37481e
[V1][Minor] Remove outdated comment ( #12968 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-08 12:48:30 -08:00
86222a3dab
[VLM] Merged multi-modal processor for GLM4V ( #12449 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2025-02-08 20:32:16 +00:00
fe743b798d
[bugfix] fix early import of flash attention ( #12959 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-09 00:06:56 +08:00
913df14da3
[Bugfix] Remove unused seq_group_metadata_list from ModelInputForGPU ( #12935 )
...
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com >
2025-02-08 14:46:19 +00:00
8a69e0e20e
[CI/Build] Auto-fix Markdown files ( #12941 )
2025-02-08 04:25:15 -08:00
4c8dd12ef3
[Misc] Add qwen2.5-vl BNB support ( #12944 )
2025-02-08 04:24:47 -08:00
256a2d29dc
[Doc] Correct HF repository for TeleChat2 models ( #12949 )
2025-02-08 01:42:15 -08:00
c45d398e6f
[CI] Resolve transformers-neuronx version conflict ( #12925 )
2025-02-08 01:41:35 -08:00
011e612d92
[Misc] Log time consumption on weight downloading ( #12926 )
2025-02-08 09:16:42 +00:00
7e1837676a
[misc] Add LoRA to benchmark_serving ( #12898 )
...
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
2025-02-08 17:15:44 +08:00
2880e21e3d
[Hardware][Intel-Gaudi] Enable long-contexts + LoRA support for Intel Gaudi ( #12812 )
...
Signed-off-by: Sanju C Sudhakaran <scsudhakaran@habana.ai >
2025-02-08 17:15:30 +08:00
407b5537db
[Build] Make pypi install work on CPU platform ( #12874 )
2025-02-08 01:15:15 -08:00
4ea48fb35c
[V1][Minor] Move cascade attn logic outside _prepare_inputs ( #12943 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-08 00:39:09 -08:00
e31498bdcb
[Misc] Add offline test for disaggregated prefill ( #12418 )
2025-02-08 08:38:20 +00:00
91dd8f7aa6
[bugfix] respect distributed_executor_backend in world_size=1 ( #12934 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-08 16:17:08 +08:00
d01f66b039
[Bugfix] Fix multi-round chat error when mistral tokenizer is used ( #12859 )
...
Signed-off-by: Zifei Tong <zifeitong@gmail.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2025-02-08 07:04:34 +00:00
cc01223f3b
[Misc] Fix typo in the example file ( #12896 )
...
Signed-off-by: Zhao Ke <yingxiongraomingzk@gmail.com >
2025-02-08 06:56:43 +00:00
306923da82
[Bugfix] Fix Qwen2_5_VLForConditionalGeneration packed_modules_mapping ( #12905 )
2025-02-07 21:02:53 -08:00
3243158336
[V1] Move KV block hashes from Request to KVCacheManager ( #12922 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-07 19:14:10 -08:00
b21f0f9d17
[V1][Minor] Remove outdated comment ( #12928 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-07 19:07:37 -08:00
45cbc4991d
[Bugfix] Fix disagg hang caused by the prefill and decode communication issues ( #12723 )
...
Signed-off-by: Lu Fang <lufang@fb.com >
2025-02-07 16:39:50 -08:00
932c6b7461
[V1] LM Eval With Streaming Integration Tests ( #11590 )
2025-02-07 15:07:03 -08:00
eaa92d4437
[ROCm] [Feature] [Doc] [Dockerfile] [BugFix] Support Per-Token-Activation Per-Channel-Weight FP8 Quantization Inferencing ( #12501 )
2025-02-07 08:13:43 -08:00
0630d4537a
[V1] Logprobs and prompt logprobs support ( #9880 )
...
This PR is adding support for sample logprobs & prompt logprobs to vLLM v1.
New behavior:
- During model execution, model runner computes sample logprobs (if user-provided logprobs setting is not None) and prompt logprobs (if user-provided prompt_logprobs setting is not None). For both sample and prompt logprobs, the engine core returns 3 vectors: token ids, token logprob values, token ranks. Ranks reflect tokens' 1-indexed positions in the vocabulary vector after sorting the vocabulary by log probability in descending order.
- In scheduler.update_from_output(), sample and prompt logprobs are incorporated into the EngineCoreOutput data structure which is transferred to the engine client. If multiprocessing is enabled, then sample and prompt logprobs will be (de)serialized when the EngineCoreOutput data structure is (de)serialized.
- During output processing, the LogprobsProcessor transforms the triplet of token ids, token logprob values, and token ranks into the OpenAI-compatible List[Dict[token id, Logprob]] format (for sample and prompt logprobs, respectively).
- Each Logprob instance (whether sample- or prompt-) consists of a token's log-probability, rank, and detokenized string representation. Note that logprob detokenization is handled by the LogprobsProcessor not the detokenizer.
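The triplet-to-dict transformation described above can be sketched as follows (a minimal illustration, not the actual LogprobsProcessor code; `build_logprob_dicts` is a hypothetical helper and `detokenize` stands in for the real detokenization step):

```python
from dataclasses import dataclass
from typing import Callable, Dict, List

@dataclass
class Logprob:
    logprob: float          # token's log-probability
    rank: int               # 1-indexed rank in the descending-sorted vocabulary
    decoded_token: str      # detokenized string representation

def build_logprob_dicts(
    token_ids: List[List[int]],
    logprob_values: List[List[float]],
    ranks: List[List[int]],
    detokenize: Callable[[int], str],
) -> List[Dict[int, Logprob]]:
    """For each sequence position, map candidate token id -> Logprob entry."""
    return [
        {tid: Logprob(lp, rk, detokenize(tid))
         for tid, lp, rk in zip(ids, lps, rks)}
        for ids, lps, rks in zip(token_ids, logprob_values, ranks)
    ]
```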
Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com >
Signed-off-by: Nick Hill <nhill@redhat.com >
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com >
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com >
Co-authored-by: Nick Hill <nhill@redhat.com >
2025-02-07 07:26:20 -08:00
538fab93cd
PR #12718 ( #12718 )
2025-02-07 06:22:37 -08:00
ce26b16268
[Misc] Remove unnecessary detokenization in multimodal processing ( #12868 )
2025-02-07 06:21:17 -08:00
1918aa1b80
[MISC][EASY] Break check file names into entry and args in the pre-commit hooks ( #12880 )
...
Signed-off-by: Lu Fang <lufang@fb.com >
2025-02-07 13:04:39 +00:00
6e1fc61f0f
Prevent unnecessary requests to huggingface hub ( #12837 )
2025-02-06 21:37:41 -08:00
aa375dca9f
[Bugfix] Missing quant_config in deepseek embedding layer ( #12836 )
2025-02-06 21:35:09 -08:00
433c4a4923
Make vllm compatible with verl ( #12824 )
...
Co-authored-by: zhangshulai <zhangshulai@bytedance.com >
2025-02-07 11:54:20 +08:00
ef533d25fb
[Bugfix] FA2 illegal memory access ( #12848 )
2025-02-06 19:54:07 -08:00
b260782357
[misc] Revert #12833 ( #12857 )
...
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal >
2025-02-06 16:29:12 -08:00
741429a4cd
[MISC] Check space in the file names in the pre commit checks ( #12804 )
...
Signed-off-by: Lu Fang <lufang@fb.com >
2025-02-06 15:36:21 -08:00
aff404571b
Add Bamba Model ( #10909 )
...
Signed-off-by: Yu Chin Fabian Lim <flim@sg.ibm.com >
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com >
2025-02-06 15:22:42 -08:00
467a96a541
[V1] LoRA Support ( #10957 )
...
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
2025-02-06 09:32:51 -08:00
8108ac841d
[Bugfix] Fix unsupported FA version check for Turing GPU ( #12828 )
2025-02-06 09:18:22 -08:00
afe74f7a96
[Doc] double quote cmake package in build.inc.md ( #12840 )
2025-02-06 09:17:55 -08:00
09b95e36ab
[torch.compile] PyTorch 2.6 and nightly compatibility ( #12393 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-07 01:09:07 +08:00
85ac82d228
[Kernel] Make rotary_embedding ops more flexible with input shape ( #12777 )
2025-02-06 08:46:13 -08:00
1e57b1ee63
[Misc] Remove unnecessary decode call ( #12833 )
2025-02-06 08:45:44 -08:00
e152f29502
[misc] Reduce number of config file requests to HuggingFace ( #12797 )
...
Signed-off-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal >
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal >
2025-02-06 14:59:18 +00:00
c786e757fa
[Attention] Use FA3 for MLA on Hopper ( #12807 )
...
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com >
2025-02-06 11:43:12 +00:00
cefd56ee35
[Docs] Add Google Cloud Slides ( #12814 )
2025-02-06 01:02:38 -08:00
7ca9934fe7
[Misc] Update w2 scale loading for GPTQMarlinMoE ( #12757 )
2025-02-06 01:02:14 -08:00
0408efc6d0
[Misc] Improve error message for incorrect pynvml ( #12809 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-06 15:23:50 +08:00
449d1bce02
[Misc] Remove duplicated DeepSeek V2/V3 model definition ( #12793 )
2025-02-05 23:16:20 -08:00
1a6fcad4c9
Improve TransformersModel UX ( #12785 )
2025-02-05 22:24:57 -08:00
56534cd577
[Bugfix] Fix the test_ultravox.py's license ( #12806 )
...
Signed-off-by: Lu Fang <lufang@fb.com >
2025-02-06 13:25:54 +08:00
d88506dda4
[Model] LoRA Support for Ultravox model ( #11253 )
2025-02-05 19:54:13 -08:00
9cdea30b4f
[Misc][Easy] Remove the space from the file name
2025-02-05 19:23:35 -08:00
76abd0c881
[Bugfix] Better FP8 supported defaults
2025-02-05 19:22:19 -08:00
5b19b93082
[ROCm][Kernel] Using the correct warp_size value
2025-02-05 19:15:08 -08:00
75404d041b
[VLM] Update compatibility with transformers 4.49
2025-02-05 19:09:45 -08:00
bf3b79efb8
[VLM] Qwen2.5-VL
2025-02-05 13:31:38 -08:00
9a5b1554b4
[Docs] Drop duplicate [source] links
2025-02-05 13:30:50 -08:00
a4ce74c14a
[VLM] Use shared field to pass token ids to model
2025-02-05 13:30:46 -08:00
3b2005e1db
Add: Support for Sparse24Bitmask Compressed Models
2025-02-05 13:30:43 -08:00
af8486de49
[Hardware][Intel-Gaudi] Enable FusedSDPA support for Intel Gaudi (HPU)
2025-02-05 13:29:45 -08:00
4c3aac51e1
Merging PR #12536
...
Merged via CLI script
2025-02-05 13:24:26 -08:00
bc1bdecebf
[core][distributed] exact ray placement control ( #12732 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-06 02:03:19 +08:00
022bcc701a
[Bugfix] Fix 'ModuleNotFoundError: No module named 'intel_extension_for_pytorch'' for --tensor-parallel-size more than 1 ( #12546 )
2025-02-04 23:11:02 -08:00
c53dc466b1
[Doc] Remove performance warning for auto_awq.md ( #12743 )
2025-02-04 22:43:11 -08:00
3d09e592a8
[V1][Misc] Shorten FinishReason enum and use constant strings ( #12760 )
2025-02-04 22:43:02 -08:00
fcf2e3d7fc
[Bugfix] Fix OpenVINO model runner ( #12750 )
2025-02-04 22:42:46 -08:00
58b218d7ae
[Doc] Update PR Reminder with link to Developer Slack ( #12748 )
2025-02-04 22:42:09 -08:00
7ff7a638b6
[Model][Quant] Fix GLM, Fix fused module mappings for quantization ( #12634 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com >
Co-authored-by: mgoin <michael@neuralmagic.com >
2025-02-05 05:32:06 +00:00
686006a220
[Misc] Bump the compressed-tensors version ( #12736 )
2025-02-04 20:44:48 -08:00
98fd089fc9
[VLM] Add MLA with pure RoPE support for deepseek-vl2 models ( #12729 )
2025-02-04 20:44:26 -08:00
249824c3bf
Refactor Linear handling in TransformersModel ( #12727 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-02-05 04:31:12 +00:00
64862d106e
[ROCM][AMD][TRITON] Halving warps number for fw_prefill to reduce spilling ( #12713 )
...
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com >
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com >
2025-02-05 03:58:22 +00:00
b3a0d01e45
[Core] add and implement VLLM_LOGITS_PROCESSOR_THREADS ( #12368 )
...
Signed-off-by: Aviv Keshet <akeshet@scaledcognition.com >
2025-02-04 18:46:26 -08:00
75e94309e8
[Perf] Mem align KV caches for CUDA devices (MLA perf improvement) ( #12676 )
...
Signed-off-by: simon-mo <xmo@berkeley.edu >
Signed-off-by: Lucas Wilkinson <lcwilkins@redhat.com >
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com >
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com >
Co-authored-by: simon-mo <xmo@berkeley.edu >
2025-02-04 18:22:24 -08:00
233df6f5c4
[V1][Metrics] Add request_success_total counter, labelled with finish reason ( #12579 )
...
Signed-off-by: Mark McLoughlin <markmc@redhat.com >
2025-02-04 19:46:54 -05:00
18016a5e62
[Bugfix] Fix CI failures for InternVL and Mantis models ( #12728 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-02-04 23:54:23 +08:00
649550f27e
[Build] update requirements of no-device for plugin usage ( #12630 )
...
Signed-off-by: Sophie du Couédic <sop@zurich.ibm.com >
2025-02-04 21:19:12 +08:00
62467a834a
Avoid unnecessary multi-modal input data copy when len(batch) == 1 ( #12722 )
...
Signed-off-by: imkero <kerorek@outlook.com >
2025-02-04 21:03:19 +08:00
6469038b14
[Bugfix] Fix loading of fine-tuned models based on Phi-3-Small ( #12689 )
...
Signed-off-by: Michael Greenbaum <mgreenbaum@microsoft.com >
Co-authored-by: Michael Greenbaum <mgreenbaum@microsoft.com >
2025-02-04 20:58:48 +08:00
815079de8e
[VLM] merged multimodal processor and V1 support for idefics3 ( #12660 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2025-02-04 20:00:51 +08:00
18a88fcccc
[V1] Remove scheduling constraint on partial requests ( #12674 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-04 02:43:58 -08:00
d1ca7df84d
[VLM] Merged multi-modal processor for InternVL-based models ( #12553 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Isotr0py <2037008807@qq.com >
2025-02-04 16:44:52 +08:00
96b23621c1
[Misc] Add BNB quantization for Whisper ( #12381 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2025-02-04 16:27:36 +08:00
c36ac98d01
[AMD][ROCm] Enable DeepSeek model on ROCm ( #12662 )
...
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com >
Co-authored-by: Matthew Wong <Matthew.Wong2@amd.com >
2025-02-04 08:24:11 +00:00
4896d0c2dd
[Quant] Fix use_mla TypeError and support loading pure-sparsity Compressed Tensors configs ( #12711 )
2025-02-03 23:27:11 -08:00
bb392af434
[Doc] Replace ibm-fms with ibm-ai-platform ( #12709 )
...
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com >
2025-02-04 07:05:04 +00:00
5d98d56089
Support Pixtral-Large HF by using llava multimodal_projector_bias config ( #12710 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2025-02-04 11:55:46 +08:00
73b35cca7f
[Core] Improve hash collision avoidance in prefix caching ( #12621 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2025-02-03 16:28:20 -08:00
5095e96606
[V1] Revert uncache_blocks and support recaching full blocks ( #12415 )
...
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com >
2025-02-03 15:04:53 -08:00
cf58b9c4ca
[MISC] Remove model input dumping when exception ( #12582 )
...
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com >
2025-02-03 13:34:16 -08:00
4797dad3ec
[Model] Add Deepseek V3 fp8_w8a8 configs for B200 ( #12707 )
2025-02-03 13:30:39 -08:00
6dd5e52823
Squelch MLA warning for Compressed-Tensors Models ( #12704 )
...
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com >
2025-02-03 13:29:56 -08:00
c11de33dad
[Bugfix][Kernel] Fix per-token/per-channel quantization for Hopper scaled mm ( #12696 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2025-02-03 13:04:59 -08:00
33e0602e59
[Misc] Fix improper placement of SPDX header in scripts ( #12694 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2025-02-03 11:16:59 -08:00
a1a2aaadb9
[Model]: Add transformers backend support ( #11330 )
...
# Adds support for `transformers` as a backend
Following https://github.com/huggingface/transformers/pull/35235 , a
bunch of models should already be supported, and we are ramping up
support for more models.
Thanks @Isotr0py for the TP support, and @hmellor for his help as well!
This includes:
- `trust_remote_code=True` support: any model on the Hub that
implements attention the correct way can be natively supported!
- tensor parallel support
---------
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Isotr0py <41363108+Isotr0py@users.noreply.github.com >
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
Co-authored-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
Co-authored-by: Michael Goin <mgoin64@gmail.com >
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn >
2025-02-03 21:30:38 +08:00
1298a400e8
[ci/build] fix gh200 test ( #12681 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-03 15:59:49 +08:00
ad4a9dc817
[cuda] manually import the correct pynvml module ( #12679 )
...
fixes problems like https://github.com/vllm-project/vllm/pull/12635 and
https://github.com/vllm-project/vllm/pull/12636 and
https://github.com/vllm-project/vllm/pull/12565
---------
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-03 15:58:21 +08:00
b9986454fe
Fix for attention layers to remain unquantized during moe_wn16 quant ( #12570 )
...
Fix for AWQ quant loading of the new R1 model.
The new optimized MoE kernel for large numbers of experts, `moe_wn16`,
uses AWQ quantization, which requires the attention layers to be in
16-bit precision. The current merge broke this; `get_quant_method` must
return None for attention layers for it to work correctly again.
---------
Signed-off-by: Srikanth Srinivas <srikanth@astrum.ai >
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
Signed-off-by: Beim <beim2015@outlook.com >
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com >
Signed-off-by: mgoin <michael@neuralmagic.com >
Signed-off-by: npanpaliya <nishidha.panpaliya@partner.ibm.com >
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com >
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com >
Signed-off-by: simon-mo <xmo@berkeley.edu >
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com >
Signed-off-by: Chen Zhang <zhangch99@outlook.com >
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
Signed-off-by: Ryan N <ryan.nguyen@centml.ai >
Signed-off-by: Brian Dellabetta <bdellabe@redhat.com >
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
Signed-off-by: Rahul Tuli <rahul@neuralmagic.com >
Signed-off-by: Russell Bryant <rbryant@redhat.com >
Signed-off-by: simon-mo <simon.mo@hey.com >
Signed-off-by: Vicente Herrera <vicenteherrera@vicenteherrera.com >
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com >
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
Signed-off-by: Shawn Du <shawnd200@outlook.com >
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com >
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
Co-authored-by: Beim <805908499@qq.com >
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com >
Co-authored-by: mgoin <michael@neuralmagic.com >
Co-authored-by: simon-mo <xmo@berkeley.edu >
Co-authored-by: Nishidha <nishidha.panpaliya@partner.ibm.com >
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com >
Co-authored-by: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com >
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com >
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
Co-authored-by: simon-mo <simon.mo@hey.com >
Co-authored-by: Michael Goin <mgoin64@gmail.com >
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com >
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com >
Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com >
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com >
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com >
Co-authored-by: Chen Zhang <zhangch99@outlook.com >
Co-authored-by: Kevin H. Luu <kevin@anyscale.com >
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com >
Co-authored-by: Ryan Nguyen <96593302+xpbowler@users.noreply.github.com >
Co-authored-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com >
Co-authored-by: fade_away <1028552010@qq.com >
Co-authored-by: weilong.yu <weilong.yu@shopee.com >
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com >
Co-authored-by: Eldar Kurtic <eldarkurtic314@gmail.com >
Co-authored-by: Rahul Tuli <rahul@neuralmagic.com >
Co-authored-by: Russell Bryant <rbryant@redhat.com >
Co-authored-by: Vicente Herrera <vicenteherrera@vicenteherrera.com >
Co-authored-by: Jinzhen Lin <linjinzhen@hotmail.com >
Co-authored-by: Shawn Du <shawnd200@outlook.com >
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2025-02-03 13:46:19 +08:00
c5932e5dac
Properly check if all fused layers are in the list of targets ( #12666 )
...
Thanks @kylesayrs for catching this!
2025-02-03 13:42:18 +08:00
20579c0fae
make sure mistral_common not imported for non-mistral models ( #12669 )
...
When people use DeepSeek models, they find that they need to resolve a
cv2 version conflict, see https://zhuanlan.zhihu.com/p/21064432691 .
I added the check and made all imports of `cv2` lazy.
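The lazy-import pattern referred to here can be sketched generically (an illustration, not the actual vLLM helper; `lazy_import` and its proxy class are hypothetical names):

```python
import importlib

def lazy_import(name: str):
    """Return a proxy object that defers the real import until first use."""
    class _LazyModule:
        _module = None

        def __getattr__(self, attr):
            # Import happens only on the first attribute access.
            if _LazyModule._module is None:
                _LazyModule._module = importlib.import_module(name)
            return getattr(_LazyModule._module, attr)

    return _LazyModule()

# e.g. `cv2 = lazy_import("cv2")`: cv2 is only imported if something
# actually touches it, so text-only models never hit the version conflict.
```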
---------
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-03 13:40:25 +08:00
95460fc513
[Kernel] port sgl moe_align_block_size kernels ( #12574 )
...
sgl_moe_align_block_size is based on:
ded9fcd09a
moe_align_block_size is based on:
ba5112ff69
Signed-off-by: Yang Chen <yangche@fb.com >
2025-02-03 13:09:50 +08:00
326fcc8b9f
[Doc] Deprecate Discord ( #12668 )
2025-02-02 19:19:56 -08:00
e64330910b
[doc][misc] clarify VLLM_HOST_IP for multi-node inference ( #12667 )
...
As more and more people are trying DeepSeek models with multi-node
inference, https://github.com/vllm-project/vllm/issues/7815 becomes more
frequent. Let's give a clear message to users.
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-02-03 09:32:18 +08:00
e489ad7a21
[Misc] Add SPDX-License-Identifier headers to python source files ( #12628 )
...
- **Add SPDX license headers to python source files**
- **Check for SPDX headers using pre-commit**
commit 9d7ef44c3cfb72ca4c32e1c677d99259d10d4745
Author: Russell Bryant <rbryant@redhat.com >
Date: Fri Jan 31 14:18:24 2025 -0500
Add SPDX license headers to python source files
This commit adds SPDX license headers to python source files as
recommended to the project by the Linux Foundation. These headers
provide a concise way that is both human and machine readable for
communicating license information for each source file. It helps avoid
any ambiguity about the license of the code and can also be easily used
by tools to help manage license compliance.
The Linux Foundation runs license scans against the codebase to help
ensure we are in compliance with the licenses of the code we use,
including dependencies. Having these headers in place helps that tool
do its job.
More information can be found on the SPDX site:
- https://spdx.dev/learn/handling-license-info/
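A header in this style, plus a minimal check in the spirit of the pre-commit hook (the `has_spdx_header` helper is illustrative, not the repository's actual hook code):

```python
# SPDX-License-Identifier: Apache-2.0

def has_spdx_header(source: str) -> bool:
    """Check that an SPDX tag appears near the top of a Python file."""
    head = source.splitlines()[:5]  # allow a shebang/encoding line first
    return any(line.strip().startswith("# SPDX-License-Identifier:")
               for line in head)
```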
Signed-off-by: Russell Bryant <rbryant@redhat.com >
commit 5a1cf1cb3b80759131c73f6a9dddebccac039dea
Author: Russell Bryant <rbryant@redhat.com >
Date: Fri Jan 31 14:36:32 2025 -0500
Check for SPDX headers using pre-commit
Signed-off-by: Russell Bryant <rbryant@redhat.com >
---------
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2025-02-02 11:58:18 -08:00
f256ebe4df
[Hardware][Intel GPU] add XPU bf16 support ( #12392 )
...
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com >
2025-02-02 10:17:26 +00:00
f8ece6e17f
[Core][v1] Unify allocating slots in prefill and decode in KV cache manager ( #12608 )
...
As mentioned in RFC https://github.com/vllm-project/vllm/issues/12254 ,
this PR achieves the task: combine allocate_slots and append_slots.
There should be no functionality change, except that decode now also
raises an exception when num_tokens is zero (like prefill), and the
unit test case is changed accordingly.
@comaniac @rickyyx @WoosukKwon @youkaichao @heheda12345 @simon-mo
---------
Signed-off-by: Shawn Du <shawnd200@outlook.com >
2025-02-02 16:40:58 +08:00
abfcdcdf27
[V1][Minor] Avoid frequently creating ConstantList ( #12653 )
...
A small optimization to avoid creating a new `ConstantList` every time `request.kv_block_hashes` is used.
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-02-01 23:43:20 -08:00
e497f33491
[Core] Silence unnecessary deprecation warnings ( #12620 )
...
I noticed during testing that I was getting a lot of these deprecation
warnings about `local_lora_path`:
```
DeprecationWarning: The 'lora_local_path' attribute is deprecated
and will be removed in a future version.
Please use 'lora_path' instead.
```
The check used for emitting this warning was always True, even when the
parameter was not actually specified. It will always be in
`__struct_fields__`. We should be checking for a non-None value,
instead.
Signed-off-by: Russell Bryant <rbryant@redhat.com >
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2025-02-02 15:35:50 +08:00
baaa2b24da
[Bugfix] fix moe_wna16 get_quant_method ( #12648 )
...
Fix https://github.com/vllm-project/vllm/issues/12647
The `get_quant_method` of `moe_wna16` always returns the MoE method,
the GPTQ-based linear method, or the AWQ-based linear method, even when
the target module is an attention layer.
baeded2569/vllm/attention/layer.py (L86-L92)
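The intended behavior can be sketched like this (class and function names are illustrative, not the actual vLLM API):

```python
class MoEWNA16Method:
    """Stand-in for the MoE quant method."""

class LinearWNA16Method:
    """Stand-in for the GPTQ/AWQ-based linear quant method."""

def get_quant_method(layer_type: str):
    """Return a quant method only for MoE and linear layers; attention
    layers must get None so they stay in 16-bit precision."""
    if layer_type == "moe":
        return MoEWNA16Method()
    if layer_type == "linear":
        return LinearWNA16Method()
    return None  # e.g. layer_type == "attention": unquantized path
```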
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com >
2025-02-02 15:29:56 +08:00
b4e5c03306
doc: fixing minor typo in readme.md ( #12643 )
...
Word "evolved" was mistyped
Signed-off-by: Vicente Herrera <vicenteherrera@vicenteherrera.com >
---------
Signed-off-by: Vicente Herrera <vicenteherrera@vicenteherrera.com >
2025-02-01 17:17:29 +00:00
3194039c0e
Apply torch.compile to fused_moe/grouped_topk ( #12637 )
2025-02-01 16:16:19 +00:00
4f4d427ac2
Disable chunked prefill and/or prefix caching when MLA is enabled ( #12642 )
...
From @mgoin in https://github.com/vllm-project/vllm/pull/12638
I cannot push to that branch, so this is a new PR to unblock the release.
---------
Signed-off-by: mgoin <michael@neuralmagic.com >
Signed-off-by: simon-mo <simon.mo@hey.com >
Co-authored-by: mgoin <michael@neuralmagic.com >
2025-01-31 23:46:57 -08:00
1e3698393f
[CI/Build] Add label automation for structured-output, speculative-decoding, v1 ( #12280 )
...
We have `v1`, `structured-output`, and `speculative-decoding` labels on
github. This adds automation for applying these labels based on the
files touched by a PR.
Signed-off-by: Russell Bryant <rbryant@redhat.com >
---------
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2025-01-31 23:13:10 -08:00
baeded2569
[Attention] Deepseek v3 MLA support with FP8 compute ( #12601 )
...
This PR implements Deepseek V3 support by performing matrix absorption on the fp8 weights.
---------
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com >
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
Co-authored-by: simon-mo <simon.mo@hey.com >
Co-authored-by: Michael Goin <mgoin64@gmail.com >
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com >
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com >
Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com >
2025-01-31 21:52:51 -08:00
3e1c76cf3a
Fix: Respect sparsity_config.ignore in Cutlass Integration ( #12517 )
...
This PR addresses a bug in the Cutlass integration where the
`sparsity_config.ignore` list was not being respected. When only a
subset of modules were configured as Sparse24, the system incorrectly
selected Cutlass for non-sparse modules as well. This update ensures the
correct scheme is selected for non-sparse modules, fixing this behavior.
---
### Changes
- Updated logic to correctly respect `sparsity_config.ignore`.
- Ensured non-sparse modules use the appropriate scheme instead of
defaulting to Cutlass.
---
<details>
<summary>Testing Setup</summary>
The fix has been tested on top of [this
diff](https://github.com/vllm-project/vllm/pull/12097 ).
#### Steps to Test:
```bash
git checkout -b my-test-branch origin/rahul-bitmask-additions # compressed Cutlass support
git revert --no-edit aa2cd2c # revert Tyler's commit to turn off Cutlass for W16A16
git cherry-pick ca624cddb # this branch
```
#### Additional Patch Required:
```diff
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index a54177c1c..f916dd0c9 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -9,7 +9,7 @@ from compressed_tensors.quantization import (QuantizationArgs,
QuantizationStrategy,
QuantizationType)
from pydantic import BaseModel
-
+from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
UnquantizedLinearMethod)
@@ -27,7 +27,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
should_ignore_layer)
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.platforms import current_platform
-
+logger = init_logger(__name__)
__all__ = ["CompressedTensorsLinearMethod"]
SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config"
```
Apply using:
```bash
git apply logging-patch.patch
```
</details>
---
<details>
<summary>Models Tested</summary>
- `nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-partial-24`
- `nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-full-sparse24`
- `nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-partial-24-entire-fp8-compressed`
- `nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-partial-24-remaining-fp8-compressed`
</details>
---
<details>
<summary>Example Output</summary>
#### Layers 0-5 (Sparse24)
```
Using scheme: CompressedTensors24 for model.layers.0.self_attn.qkv_proj
Using scheme: CompressedTensors24 for model.layers.0.self_attn.o_proj
Using scheme: CompressedTensors24 for model.layers.0.mlp.gate_up_proj
Using scheme: CompressedTensors24 for model.layers.0.mlp.down_proj
...
```
#### Layers 6+ (Non-Sparse, FP8)
```
Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj
Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj
Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj
Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj
...
```
</details>
**Note:** This assumes all modules in fused layers such as `QKV_proj`
and `Gate_up_proj` follow the same quantization/pruning scheme.
---
For related tasks using the Asana app for GitHub, refer to [this
link](https://app.asana.com/0/0/1209227810815160 ).
Signed-off-by: Rahul Tuli <rahul@neuralmagic.com >
2025-02-01 13:41:59 +08:00
cfa134d247
[Bugfix/CI] Fixup benchmark_moe.py ( #12562 )
...
Fixes `is_marlin` not being passed into `get_default_config`
Also allow `--tensor-parallel-size` in addition to `-tp` and `--tp-size`
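The flag aliasing can be expressed with a single `argparse` argument (a minimal sketch; this parser setup is illustrative, not the benchmark script's actual code):

```python
import argparse

# Sketch: one argument, three accepted spellings, one destination.
parser = argparse.ArgumentParser(description="benchmark_moe-style flags")
parser.add_argument("-tp", "--tp-size", "--tensor-parallel-size",
                    dest="tp_size", type=int, default=1,
                    help="Tensor parallel size")

args = parser.parse_args(["--tensor-parallel-size", "4"])
```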
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2025-02-01 13:41:35 +08:00
35b7a05507
[ci] Upgrade transformers to 4.48.2 in CI dependencies ( #12599 )
2025-01-31 21:22:23 -08:00
1867c258bd
Fix target matching for fused layers with compressed-tensors ( #12617 )
...
Without this PR
---------------
Quantizing models with llm-compressor and a recipe that explicitly lists
names of layers produces a model that is not loadable by vLLM (i.e.
`vllm serve <model>` fails with `raise ValueError(f"Unable to find
matching target for {module} in the ...`).
Example recipe:
```
recipe = """
quantization_stage:
run_type: oneshot
quantization_modifiers:
GPTQModifier:
ignore: ["lm_head"]
config_groups:
group_0:
weights:
num_bits: 4
type: "int"
symmetric: true
strategy: "group"
group_size: 128
targets: [
"model.layers.0.mlp.down_proj",
"model.layers.2.mlp.down_proj",
"model.layers.3.mlp.down_proj",
"model.layers.4.mlp.down_proj",
"model.layers.5.mlp.down_proj",
"model.layers.6.mlp.down_proj",
"model.layers.7.mlp.down_proj",
"model.layers.8.mlp.down_proj",
"model.layers.9.mlp.down_proj",
"model.layers.10.mlp.down_proj",
"model.layers.11.mlp.down_proj",
"model.layers.12.mlp.down_proj",
"model.layers.13.mlp.down_proj",
"model.layers.14.mlp.down_proj",
"model.layers.15.mlp.down_proj",
"model.layers.16.mlp.down_proj",
"model.layers.17.mlp.down_proj",
"model.layers.19.mlp.down_proj",
"model.layers.21.mlp.down_proj",
"model.layers.22.mlp.down_proj",
.
.
.
]
"""
```
To reproduce the vLLM error:
```bash
vllm serve nm-testing/eldar-test
```
With this PR
------------
Models are loaded correctly without any errors.
2025-02-01 05:07:46 +00:00
cb3e73e4c8
[BugFix] fix wrong output when using lora and num_scheduler_steps=8 ( #11161 )
...
FIX issue https://github.com/vllm-project/vllm/issues/9688
https://github.com/vllm-project/vllm/issues/11086 #12487
---------
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
Co-authored-by: weilong.yu <weilong.yu@shopee.com >
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com >
2025-02-01 12:52:07 +08:00
b1340f9d55
[V1] Bugfix: Validate Model Input Length ( #12600 )
...
SUMMARY:
* avoid crashing the engine when we get an input longer than
max_model_len
FIX #12567
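A minimal sketch of this kind of up-front validation (illustrative only; `validate_prompt` is a hypothetical helper, not the engine's actual entry point):

```python
def validate_prompt(prompt_token_ids: list, max_model_len: int) -> None:
    """Reject an over-long input up front instead of crashing the engine."""
    if len(prompt_token_ids) > max_model_len:
        raise ValueError(
            f"Prompt length {len(prompt_token_ids)} exceeds "
            f"max_model_len of {max_model_len}.")
```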
2025-01-31 18:32:04 -08:00
44bbca78d7
[Doc] int4 w4a16 example ( #12585 )
...
Based on a request by @mgoin , with @kylesayrs we have added an example
doc for int4 w4a16 quantization, following the pre-existing int8 w8a8
quantization example and the example available in
[`llm-compressor`](https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w4a16/llama3_example.py )
FIX #n/a (no issue created)
@kylesayrs and I have discussed a couple additional improvements for the
quantization docs. We will revisit at a later date, possibly including:
- A section for "choosing the correct quantization scheme/ compression
technique"
- Additional vision or audio calibration datasets
---------
Signed-off-by: Brian Dellabetta <bdellabe@redhat.com >
Co-authored-by: Michael Goin <michael@neuralmagic.com >
2025-01-31 15:38:48 -08:00
60808bd4c7
[Doc] Improve installation signposting ( #12575 )
...
- Make device tab names more explicit
- Add comprehensive list of devices to
https://docs.vllm.ai/en/latest/getting_started/installation/index.html
- Add `attention` blocks to the intro of all devices that don't have
pre-built wheels/images
---------
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-01-31 15:38:35 -08:00
fc542144c4
[Feature] Fix guided decoding blocking bitmask memcpy ( #12563 )
...
**[Guided decoding performance optimization]** Sending the guided
decoding bitmask in xgrammar to the GPU
(`self.token_bitmask.to(scores.device)`) is a blocking operation that
prevents the CPU from pre-launching the sampler kernels. The CPU waits
until decode is complete, then copies the bitmask over. This PR changes
the operation to async via setting `non_blocking=True`.
(Current) The CPU is blocked on a `cudaStreamSynchronize` and only
pre-empts the sampling kernels after bitmask application. Below is the
Nsys profile for one decode phase from Llama 3.1 8B.

With the optimization, this is no longer the case:

---------
Signed-off-by: Ryan N <ryan.nguyen@centml.ai >
2025-01-31 15:37:30 -08:00
eb5741ad42
[Kernel][Quantization] Integrate block-quantized CUTLASS kernels for DeepSeekV3 ( #12587 )
...
Integrates the block-quantized kernels introduced in
https://github.com/vllm-project/vllm/pull/11868 for use in linear
layers.
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2025-01-31 15:29:11 -08:00
145c2ff648
[Bugfix] Revert MoE Triton Config Default ( #12629 )
...
SUMMARY:
* previous PR for pulling in block configs also changed defaults
(https://github.com/vllm-project/vllm/pull/11589/files ) for FP8
* this broke L4 MoE since there was not enough SHM for the default
configuration
* this reverts the non-block example to the default
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com >
2025-01-31 15:28:47 -08:00
415f19474d
[release] Add input step to ask for Release version ( #12631 )
...
Instead of having to create a new build with the release version passed
in as an env var.
2025-01-31 13:39:36 -08:00
89003c4082
[v1][Bugfix] Add extra_keys to block_hash for prefix caching ( #12603 )
...
This PR adds extra keys to the block hash, so that two blocks with the
same token IDs but different `extra_keys` in their parent blocks get
different hash values. For example, it produces different hashes for the
second block of the following two requests:
```python
request1 = make_request(
    request_id=0,
    prompt_token_ids=[_ for _ in range(6)],
    mm_positions=[{
        "offset": 0,
        "length": 3
    }, {
        "offset": 3,
        "length": 3
    }],
    mm_hashes=["hash1", "hash2"],
)
request2 = make_request(
    request_id=1,
    prompt_token_ids=[_ for _ in range(6)],
    mm_positions=[{
        "offset": 0,
        "length": 3
    }, {
        "offset": 3,
        "length": 3
    }],
    mm_hashes=["hash3", "hash2"],
)
```
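A minimal sketch of the idea (hypothetical helper names, not vLLM's actual implementation): each block hash chains the parent's hash, so differing extra keys in an ancestor block propagate to all descendant hashes.

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class Block:
    token_ids: tuple
    extra_keys: tuple = ()  # e.g. multi-modal content hashes covering this block

def block_hash(parent_hash, block):
    # Chaining the parent hash makes a block's identity depend on its full
    # prefix, including extra keys carried by ancestor blocks.
    return hash((parent_hash, block.token_ids, block.extra_keys))

def chain(blocks):
    h, hashes = 0, []
    for b in blocks:
        h = block_hash(h, b)
        hashes.append(h)
    return hashes

# Second block of each request: same token IDs and same mm hash ("hash2"),
# but the parent blocks carry different mm hashes ("hash1" vs "hash3").
req1 = [Block((0, 1, 2), ("hash1",)), Block((3, 4, 5), ("hash2",))]
req2 = [Block((0, 1, 2), ("hash3",)), Block((3, 4, 5), ("hash2",))]

h1, h2 = chain(req1), chain(req2)
assert h1[1] != h2[1]  # the second blocks no longer collide in the prefix cache
```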
---------
Signed-off-by: Chen Zhang <zhangch99@outlook.com >
2025-01-31 13:13:04 -08:00
60bcef000e
[Docs][V1] Prefix caching design ( #12598 )
...
- Create v1 design document section in docs.
- Add prefix caching design doc.
@WoosukKwon @ywang96
---------
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com >
2025-01-31 12:30:46 -08:00
847f883232
[Git] Automatically sign-off commits ( #12595 )
...
It's very annoying to forget `-s` in `git commit`, because fixing the
DCO then requires `git rebase HEAD~1 --signoff` and `git push -f`. This
PR adds a hook that signs off commits automatically when `-s` is
missing. The only user-facing change is that two hooks must now be
installed, so instead of just
```
pre-commit install
```
users now need to run
```
pre-commit install --hook-type pre-commit --hook-type commit-msg
```
Note that users who install only the pre-commit hook won't get any error
in `git commit`; the sign-off hook simply won't run.
cc @hmellor @youkaichao
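For illustration, the core of such a commit-msg hook can be sketched as below (not vLLM's actual hook; git passes the message file path as `$1`, simulated here with a local file, and the author name/email are placeholder values normally read from `git config`):

```shell
# Stand-in for the commit message file git hands to the commit-msg hook.
msg_file="commit_msg.txt"
printf 'Fix a bug in the scheduler\n' > "$msg_file"

author_name="Jane Doe"            # normally: git config user.name
author_email="jane@example.com"   # normally: git config user.email

# Append a Signed-off-by trailer only if one is not already present.
if ! grep -q '^Signed-off-by:' "$msg_file"; then
    printf '\nSigned-off-by: %s <%s>\n' "$author_name" "$author_email" >> "$msg_file"
fi
cat "$msg_file"
```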
---------
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com >
2025-01-31 12:30:33 -08:00
325f679f32
[BugFix] Fix Torch.Compile For DeepSeek ( #12594 )
...
Co-authored-by: simon-mo <xmo@berkeley.edu >
2025-01-31 12:06:39 -08:00
e3f7ff65e7
Add favicon to docs ( #12611 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-01-31 09:20:34 -08:00
7a8987dac5
[Bugfix] Gracefully handle huggingface hub http error ( #12571 )
2025-01-31 08:19:35 +00:00
cabaf4eff3
[Attention] MLA decode optimizations ( #12528 )
...
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com >
Signed-off-by: simon-mo <xmo@berkeley.edu >
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
Co-authored-by: simon-mo <simon.mo@hey.com >
Co-authored-by: Michael Goin <mgoin64@gmail.com >
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com >
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com >
Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com >
Co-authored-by: simon-mo <xmo@berkeley.edu >
2025-01-30 23:49:37 -08:00
a1fc18c030
[ROCm][AMD][Model] llama 3.2 support upstreaming ( #12421 )
...
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com >
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com >
2025-01-31 12:24:28 +08:00
9798b2fb00
[Kernel] Update cutlass_scaled_mm to support 2d group (blockwise) scaling ( #11868 )
2025-01-30 18:33:00 -08:00
4078052f09
[V1][Log] Add max request concurrency log to V1 ( #12569 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2025-01-30 23:07:19 +00:00
bd2107e30a
[CPU][PPC] Updated torch, torchvision, torchaudio dependencies ( #12555 )
...
Signed-off-by: npanpaliya <nishidha.panpaliya@partner.ibm.com >
2025-01-30 16:29:39 -05:00
9b0c4bab36
[Kernel] Triton Configs for Fp8 Block Quantization ( #11589 )
...
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com >
Signed-off-by: mgoin <michael@neuralmagic.com >
Co-authored-by: mgoin <michael@neuralmagic.com >
Co-authored-by: simon-mo <xmo@berkeley.edu >
2025-01-30 11:53:22 -08:00
41bf5612f5
[Misc] fix typo: add missing space in lora adapter error message ( #12564 )
...
Signed-off-by: Beim <beim2015@outlook.com >
2025-01-30 15:39:22 +00:00
a2769032ca
Set ?device={device} when changing tab in installation guides ( #12560 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-01-30 00:05:42 -08:00
f17f1d4608
[V1][Metrics] Add GPU cache usage % gauge ( #12561 )
...
Signed-off-by: Mark McLoughlin <markmc@redhat.com >
2025-01-29 18:31:01 -08:00
1c1bb0bbf2
[Misc][MoE] add Deepseek-V3 moe tuning support ( #12558 )
...
Signed-off-by: Divakar Verma <divakar.verma@amd.com >
2025-01-30 00:47:30 +00:00
e0cc5f259a
[V1][BugFix] Free encoder cache for aborted requests ( #12545 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-01-29 13:47:33 -08:00
73aa6cfdf7
Revert "[Build/CI] Fix libcuda.so linkage" ( #12552 )
2025-01-29 21:12:24 +00:00
27b78c73ca
[Kernel] add triton fused moe kernel for gptq/awq ( #12185 )
2025-01-29 09:07:09 -05:00
b02fd288b2
[Hardware][NV] Fix Modelopt model loading for k-v-scales for Llama models. ( #11787 )
...
Signed-off-by: Pavani Majety <pmajety@nvidia.com >
Co-authored-by: mgoin <michael@neuralmagic.com >
2025-01-29 01:46:12 -08:00
ff7424f491
[Frontend] Support override generation config in args ( #12409 )
...
Signed-off-by: liuyanyi <wolfsonliu@163.com >
2025-01-29 01:41:01 -08:00
d93bf4da85
[Model] Refactoring of MiniCPM-V and add MiniCPM-o-2.6 support for vLLM ( #12069 )
...
Signed-off-by: hzh <hezhihui_thu@163.com >
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com >
Signed-off-by: shaochangxu.scx <shaochangxu.scx@antgroup.com >
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Signed-off-by: NickLucche <nlucches@redhat.com >
Signed-off-by: Isotr0py <2037008807@qq.com >
Signed-off-by: Roger Wang <ywang@roblox.com >
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com >
Signed-off-by: Akshat Tripathi <akshat@krai.ai >
Signed-off-by: Oleg Mosalov <oleg@krai.ai >
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com >
Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu >
Signed-off-by: Chenguang Li <757486878@qq.com >
Signed-off-by: youkaichao <youkaichao@gmail.com >
Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com >
Signed-off-by: Chen Zhang <zhangch99@outlook.com >
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
Signed-off-by: Shanshan Shen <467638484@qq.com >
Signed-off-by: elijah <f1renze.142857@gmail.com >
Signed-off-by: Yikun <yikunkero@gmail.com >
Signed-off-by: mgoin <michael@neuralmagic.com >
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
Signed-off-by: Konrad Zawora <kzawora@habana.ai >
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com >
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
Signed-off-by: Rui Qiao <ruisearch42@gmail.com >
Co-authored-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com >
Co-authored-by: shaochangxu <85155497+shaochangxu@users.noreply.github.com >
Co-authored-by: shaochangxu.scx <shaochangxu.scx@antgroup.com >
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk >
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com >
Co-authored-by: sixgod <evethwillbeok@outlook.com >
Co-authored-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com >
Co-authored-by: Rafael Vasquez <rafvasq21@gmail.com >
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
Co-authored-by: Akshat Tripathi <Akshat.tripathi6568@gmail.com >
Co-authored-by: Oleg Mosalov <oleg@krai.ai >
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com >
Co-authored-by: Avshalom Manevich <12231371+avshalomman@users.noreply.github.com >
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com >
Co-authored-by: Yangcheng Li <liyangcheng.lyc@alibaba-inc.com >
Co-authored-by: Siyuan Li <94890248+liaoyanqing666@users.noreply.github.com >
Co-authored-by: Concurrensee <yida.wu@amd.com >
Co-authored-by: Chenguang Li <757486878@qq.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: Alex Brooks <alex.brooks@ibm.com >
Co-authored-by: Chen Zhang <zhangch99@outlook.com >
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
Co-authored-by: Shanshan Shen <467638484@qq.com >
Co-authored-by: elijah <30852919+e1ijah1@users.noreply.github.com >
Co-authored-by: Yikun Jiang <yikunkero@gmail.com >
Co-authored-by: Steve Luo <36296769+SunflowerAries@users.noreply.github.com >
Co-authored-by: mgoin <michael@neuralmagic.com >
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
Co-authored-by: Konrad Zawora <kzawora@habana.ai >
Co-authored-by: TJian <tunjian1996@gmail.com >
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com >
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com >
Co-authored-by: maang-h <55082429+maang-h@users.noreply.github.com >
Co-authored-by: Elfie Guo <164945471+elfiegg@users.noreply.github.com >
Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com >
Co-authored-by: Roger Wang <ywang@roblox.com >
2025-01-29 09:24:59 +00:00
036ca94c25
[Bugfix] handle alignment of arguments in convert_sparse_cross_attention_mask_to_dense ( #12347 )
...
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com >
Signed-off-by: Wallas Santos <wallashss@ibm.com >
Co-authored-by: Wallas Santos <wallashss@ibm.com >
2025-01-29 08:54:35 +00:00
ef001d98ef
Fix the pydantic logging validator ( #12420 )
...
Signed-off-by: Max de Bayser <mbayser@br.ibm.com >
2025-01-29 07:53:13 +00:00
5f671cb4c3
[V1] Improve Error Message for Unsupported Config ( #12535 )
...
Co-authored-by: Michael Goin <michael@neuralmagic.com >
2025-01-29 04:56:56 +00:00
bd02164cf9
Bugfix for whisper quantization due to fake k_proj bias ( #12524 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2025-01-29 04:49:03 +00:00
46fb056749
[V1][Metrics] Add TTFT and TPOT histograms ( #12530 )
...
Signed-off-by: Mark McLoughlin <markmc@redhat.com >
2025-01-29 04:11:16 +00:00
dd6a3a02cb
[Doc] Convert docs to use colon fences ( #12471 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-01-29 11:38:29 +08:00
a7e3eba66f
[Frontend] Support reasoning content for deepseek r1 ( #12473 )
...
Signed-off-by: Ce Gao <cegao@tensorchord.ai >
Co-authored-by: Rafael Vasquez <rafvasq21@gmail.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
Co-authored-by: Michael Goin <mgoin@redhat.com >
2025-01-29 11:38:08 +08:00
fbb5bd4cef
[TPU] Add example for profiling TPU inference ( #12531 )
...
Signed-off-by: mgoin <mgoin@redhat.com >
2025-01-29 03:16:47 +00:00
80fcc3ed1c
[Kernel] Pipe attn_logits_soft_cap through paged attention TPU kernels ( #12482 )
...
Signed-off-by: Fenghui Zhang <fhzhang@google.com >
2025-01-28 22:36:44 +00:00
c386c43ca3
[V1][Metrics] Add per-request prompt/generation_tokens histograms ( #12516 )
...
Signed-off-by: Mark McLoughlin <markmc@redhat.com >
2025-01-28 22:07:22 +00:00
f26d790718
Do not run suggestion pre-commit hook multiple times ( #12521 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-01-28 20:05:27 +00:00
0f657bdc52
Replace missed warning_once for rerank API ( #12472 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2025-01-28 19:06:32 +00:00
3fd1fb63ef
[V1][Metrics] Hook up IterationStats for Prometheus metrics ( #12478 )
...
Signed-off-by: Mark McLoughlin <markmc@redhat.com >
2025-01-28 16:38:38 +00:00
925d2f1908
[Doc] Fix typo for x86 CPU installation ( #12514 )
...
Signed-off-by: Jun Duan <jun.duan.phd@outlook.com >
2025-01-28 16:37:10 +00:00
8f58a51358
[VLM] Merged multi-modal processor and V1 support for Qwen-VL ( #12504 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-28 16:25:05 +00:00
2079e43bee
[Core] Make raw_request optional in ServingCompletion ( #12503 )
...
Signed-off-by: Sebastian Schönnenbeck <sebastian.schoennenbeck@comma-soft.com >
2025-01-28 10:56:45 +00:00
e29d4358ef
[V1] Include Engine Version in Logs ( #12496 )
...
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com >
2025-01-28 08:27:41 +00:00
8cbc424975
Update README.md with V1 alpha release ( #12495 )
2025-01-28 08:22:41 +00:00
dd66fd2b01
[CI] fix pre-commit error ( #12494 )
...
Signed-off-by: Mengqing Cao <cmq0113@163.com >
2025-01-28 06:11:05 +00:00
0f465ab533
[FEATURE] Enables offline /score for embedding models ( #12021 )
...
Signed-off-by: Gabriel Marinho <gmarinho@ibm.com >
2025-01-28 11:30:13 +08:00
23a7cbc88b
[CI/Build] Fixed the xla nightly issue report in #12451 ( #12453 )
2025-01-28 11:18:07 +08:00
426a5c3625
Fix bad path in prometheus example ( #12481 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2025-01-27 18:56:31 -07:00
ddee88d0ff
[Neuron][Kernel] NKI-based flash-attention kernel with paged KV cache ( #11277 )
...
Signed-off-by: Liangfu Chen <liangfc@amazon.com >
Co-authored-by: Jiangfei Duan <jfduan@outlook.com >
2025-01-27 17:31:16 -08:00
823ab79633
Update pre-commit hooks ( #12475 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-01-27 17:23:08 -07:00
6116ca8cd7
[Feature] [Spec decode]: Enable MLPSpeculator/Medusa and prompt_logprobs with ChunkedPrefill ( #10132 )
...
Signed-off-by: NickLucche <nlucches@redhat.com >
Signed-off-by: wallashss <wallashss@ibm.com >
Co-authored-by: wallashss <wallashss@ibm.com >
2025-01-27 13:38:35 -08:00
2bc3fbba0c
[FlashInfer] Upgrade to 0.2.0 ( #11194 )
...
Signed-off-by: Bowen Wang <abmfy@icloud.com >
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2025-01-27 18:19:24 +00:00
3f1fc7425a
[V1][CI/Test] Do basic test for top-p & top-k sampling ( #12469 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-01-27 09:40:04 -08:00
01ba927040
[V1][Metrics] Add initial Prometheus logger ( #12416 )
...
Signed-off-by: Mark McLoughlin <markmc@redhat.com >
2025-01-27 12:26:28 -05:00
103bd17ac5
[Build] Only build 9.0a for scaled_mm and sparse kernels ( #12339 )
...
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com >
2025-01-27 10:40:00 -05:00
ce69f7f754
[Bugfix] Fix gpt2 GGUF inference ( #12467 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2025-01-27 18:31:49 +08:00
624a1e4711
[V1][Minor] Minor optimizations for update_from_output ( #12454 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-01-27 01:09:27 -08:00
372bf0890b
[Bugfix] Fix missing seq_start_loc in xformers prefill metadata ( #12464 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2025-01-27 07:25:30 +00:00
5204ff5c3f
[Bugfix] Fix Granite 3.0 MoE model loading ( #12446 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-26 21:26:44 -08:00
0cc6b383d7
[Frontend] Support scores endpoint in run_batch ( #12430 )
...
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io >
2025-01-27 04:30:17 +00:00
28e0750847
[V1] Avoid list creation in input preparation ( #12457 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-01-26 19:57:56 -08:00
582cf78798
[DOC] Add link to vLLM blog ( #12460 )
...
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
2025-01-27 03:46:19 +00:00
0034b09ceb
[Frontend] Rerank API (Jina- and Cohere-compatible API) ( #12376 )
...
Signed-off-by: Kyle Mistele <kyle@mistele.com >
2025-01-26 19:58:45 -07:00
72bac73067
[Build/CI] Fix libcuda.so linkage ( #12424 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2025-01-26 21:18:19 +00:00
68f11149d8
[Bugfix][Kernel] Fix perf regression caused by PR #12405 ( #12434 )
...
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com >
2025-01-26 11:09:34 -08:00
72f4880425
[Bugfix/CI] Fix broken kernels/test_mha.py ( #12450 )
2025-01-26 10:39:03 -08:00
aa2cd2c43d
[Bugfix] Disable w16a16 2of4 sparse CompressedTensors24 ( #12417 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
Co-authored-by: mgoin <michael@neuralmagic.com >
2025-01-26 19:59:58 +08:00
9ddc35220b
[Frontend] generation_config.json for maximum tokens ( #12242 )
...
Signed-off-by: Matthew Hendrey <matthew.hendrey@gmail.com >
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com >
Signed-off-by: youkaichao <youkaichao@gmail.com >
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
Signed-off-by: Isotr0py <2037008807@qq.com >
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Signed-off-by: Chen Zhang <zhangch99@outlook.com >
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
Co-authored-by: shangmingc <caishangming@linux.alibaba.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
Co-authored-by: Yuan Tang <terrytangyuan@gmail.com >
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn >
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk >
Co-authored-by: Chen Zhang <zhangch99@outlook.com >
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com >
2025-01-26 19:59:25 +08:00
a5255270c3
[Misc] Revert FA on ViT #12355 and #12435 ( #12445 )
2025-01-26 03:56:34 -08:00
0ee349b553
[V1][Bugfix] Fix assertion when mm hashing is turned off ( #12439 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2025-01-26 00:47:42 -08:00
fa63e710c7
[V1][Perf] Reduce scheduling overhead in model runner after cuda sync ( #12094 )
...
Signed-off-by: Keyun Tong <tongkeyun@gmail.com >
2025-01-26 00:42:37 -08:00
2a0309a646
[Misc][Bugfix] FA3 support to ViT MHA layer ( #12435 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Isotr0py <2037008807@qq.com >
2025-01-26 05:00:31 +00:00
324960a95c
[TPU][CI] Update torchxla version in requirement-tpu.txt ( #12422 )
...
Signed-off-by: Siyuan Liu <lsiyuan@google.com >
2025-01-25 07:23:03 +00:00
f1fc0510df
[Misc] Add FA2 support to ViT MHA layer ( #12355 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2025-01-25 15:07:35 +08:00
bf21481dde
[ROCm][MoE] MI300 tuned configs Mixtral-8x(7B,22B) | fp16, fp8 ( #12408 )
...
Signed-off-by: Divakar Verma <divakar.verma@amd.com >
2025-01-25 12:17:19 +08:00
fb30ee92ee
[Bugfix] Fix BLIP-2 processing ( #12412 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-25 11:42:42 +08:00
221d388cc5
[Bugfix][Kernel] Fix moe align block issue for mixtral ( #12413 )
2025-01-25 01:49:28 +00:00
3132a933b6
[Bugfix][Kernel] FA3 Fix - RuntimeError: This flash attention build only supports pack_gqa (for build size reasons). ( #12405 )
...
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com >
2025-01-24 20:20:59 +00:00
df5dafaa5b
[Misc] Remove deprecated code ( #12383 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-24 14:45:20 -05:00
ab5bbf5ae3
[Bugfix][Kernel] Fix CUDA 11.8 being broken by FA3 build ( #12375 )
...
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com >
2025-01-24 15:27:59 +00:00
3bb8e2c9a2
[Misc] Enable proxy support in benchmark script ( #12356 )
...
Signed-off-by: Junichi Sato <junichi.sato@sbintuitions.co.jp >
2025-01-24 14:58:26 +00:00
e784c6b998
[ci/build] sync default value for wheel size ( #12398 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-24 17:54:29 +08:00
9a0f3bdbe5
[Hardware][Gaudi][Doc] Add missing step in setup instructions ( #12382 )
2025-01-24 09:43:49 +00:00
c7c9851036
[ci/build] fix wheel size check ( #12396 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-24 17:31:25 +08:00
3c818bdb42
[Misc] Use VisionArena Dataset for VLM Benchmarking ( #12389 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2025-01-24 00:22:04 -08:00
6dd94dbe94
[perf] fix perf regression from #12253 ( #12380 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-24 11:34:27 +08:00
0e74d797ce
[V1] Increase default batch size for H100/H200 ( #12369 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-01-24 03:19:55 +00:00
55ef66edf4
Update compressed-tensors version ( #12367 )
2025-01-24 11:19:42 +08:00
5e5630a478
[Bugfix] Path join when building local path for S3 clone ( #12353 )
...
Signed-off-by: Omer Dayan (SW-GPU) <omer@run.ai >
2025-01-24 11:06:07 +08:00
d3d6bb13fb
Set weights_only=True when using torch.load() ( #12366 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2025-01-24 02:17:30 +00:00
24b0205f58
[V1][Frontend] Coalesce bunched RequestOutputs ( #12298 )
...
Signed-off-by: Nick Hill <nhill@redhat.com >
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com >
2025-01-23 17:17:41 -08:00
c5cffcd0cd
[Docs] Update spec decode + structured output in compat matrix ( #12373 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2025-01-24 01:15:52 +00:00
682b55bc07
[Docs] Add meetup slides ( #12345 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-01-23 14:10:03 -08:00
9726ad676d
[Misc] Fix OpenAI API Compatibility Issues in Benchmark Script ( #12357 )
...
Signed-off-by: Junichi Sato <junichi.sato@sbintuitions.co.jp >
2025-01-23 17:02:13 -05:00
eb5cb5e528
[BugFix] Fix parameter names and process_after_weight_loading for W4A16 MoE Group Act Order ( #11528 )
...
Signed-off-by: ElizaWszola <eliza@neuralmagic.com >
Co-authored-by: ElizaWszola <eliza@neuralmagic.com >
Co-authored-by: Michael Goin <michael@neuralmagic.com >
2025-01-23 21:40:33 +00:00
2cbeedad09
[Docs] Document Phi-4 support ( #12362 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2025-01-23 19:18:51 +00:00
2c85529bfc
[TPU] Update TPU CI to use torchxla nightly on 20250122 ( #12334 )
...
Signed-off-by: Siyuan Liu <lsiyuan@google.com >
2025-01-23 18:50:16 +00:00
e97f802b2d
[FP8][Kernel] Dynamic kv cache scaling factors computation ( #11906 )
...
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com >
Co-authored-by: Micah Williamson <micah.williamson@amd.com >
2025-01-23 18:04:03 +00:00
6e650f56a1
[torch.compile] decouple compile sizes and cudagraph sizes ( #12243 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-24 02:01:30 +08:00
3f50c148fd
[core] add wake_up doc and some sanity check ( #12361 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-24 02:00:50 +08:00
8c01b8022c
[Bugfix] Fix broken internvl2 inference with v1 ( #12360 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2025-01-23 17:20:33 +00:00
99d01a5e3d
[V1] Simplify M-RoPE ( #12352 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
Co-authored-by: imkero <kerorek@outlook.com >
2025-01-23 23:13:23 +08:00
d07efb31c5
[Doc] Troubleshooting errors during model inspection ( #12351 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-23 22:46:58 +08:00
978b45f399
[Kernel] Flash Attention 3 Support ( #12093 )
...
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com >
2025-01-23 06:45:48 -08:00
c5b4b11d7f
[Bugfix] Fix k_proj's bias for whisper self attention ( #12342 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2025-01-23 10:15:33 +00:00
8ae5ff2009
[Hardware][Gaudi][BugFix] Fix dataclass error due to triton package update ( #12338 )
...
Signed-off-by: zhenwei <zhenweiliu@habana.ai >
2025-01-23 08:35:46 +00:00
511627445e
[doc] explain common errors around torch.compile ( #12340 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-23 14:56:02 +08:00
f0ef37233e
[V1] Add uncache_blocks ( #12333 )
2025-01-23 04:19:21 +00:00
7551a34032
[Docs] Document vulnerability disclosure process ( #12326 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2025-01-23 03:44:09 +00:00
01a55941f5
[Docs] Update FP8 KV Cache documentation ( #12238 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2025-01-23 11:18:09 +08:00
8d7aa9de71
[Bugfix] Fixing AMD LoRA CI test. ( #12329 )
...
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com >
2025-01-23 10:53:02 +08:00
68c4421b6d
[AMD][Quantization] Add TritonScaledMMLinearKernel since int8 is broken for AMD ( #12282 )
...
Signed-off-by: Randall Smith <Randall.Smith@amd.com >
2025-01-23 00:10:37 +00:00
aea94362c9
[Frontend][V1] Online serving performance improvements ( #12287 )
2025-01-22 22:22:12 +00:00
7206ce4ce1
[Core] Support reset_prefix_cache ( #12284 )
2025-01-22 18:52:27 +00:00
96f6a7596f
[Bugfix] Fix HPU multiprocessing executor ( #12167 )
...
Signed-off-by: Konrad Zawora <kzawora@habana.ai >
2025-01-23 02:07:07 +08:00
84bee4bd5c
[Misc] Improve the readability of BNB error messages ( #12320 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2025-01-22 16:56:54 +00:00
fc66dee76d
[Misc] Fix the error in the tip for the --lora-modules parameter ( #12319 )
...
Signed-off-by: wangerxiao <863579016@qq.com >
2025-01-22 16:48:41 +00:00
6609cdf019
[Doc] Add docs for prompt replacement ( #12318 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-22 14:56:29 +00:00
16366ee8bb
[Bugfix][VLM] Fix mixed-modality inference backward compatibility for V0 ( #12313 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2025-01-22 21:06:36 +08:00
528dbcac7d
[Model][Bugfix]: correct Aria model output ( #12309 )
...
Signed-off-by: xffxff <1247714429@qq.com >
2025-01-22 11:39:19 +00:00
cd7b6f0857
[VLM] Avoid unnecessary tokenization ( #12310 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-22 11:08:31 +00:00
68ad4e3a8d
[Core] Support fully transparent sleep mode ( #11743 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-22 14:39:32 +08:00
4004f144f3
[Build] update requirements of no-device ( #12299 )
...
Signed-off-by: Mengqing Cao <cmq0113@163.com >
2025-01-22 14:29:31 +08:00
66818e5b63
[core] separate builder init and builder prepare for each batch ( #12253 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-22 14:13:52 +08:00
222a9dc350
[Benchmark] More accurate TPOT calc in benchmark_serving.py ( #12288 )
...
Signed-off-by: Nick Hill <nhill@redhat.com >
2025-01-22 13:46:14 +08:00
cbdc4ad5a5
[Ci/Build] Fix mypy errors on main ( #12296 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-22 12:06:54 +08:00
016e3676e7
[CI] add docker volume prune to neuron CI ( #12291 )
...
Signed-off-by: Liangfu Chen <liangfc@amazon.com >
2025-01-22 10:47:49 +08:00
64ea24d0b3
[ci/lint] Add back default arg for pre-commit ( #12279 )
...
Signed-off-by: kevin <kevin@anyscale.com >
2025-01-22 01:15:27 +00:00
df76e5af26
[VLM] Simplify post-processing of replacement info ( #12269 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-21 16:48:13 -08:00
09ccc9c8f7
[Documentation][AMD] Add information about prebuilt ROCm vLLM docker for perf validation purpose ( #12281 )
...
Signed-off-by: Hongxia Yang <hongxyan@amd.com >
2025-01-22 07:49:22 +08:00
69196a9bc7
[BUGFIX] When skip_tokenize_init and multistep are set, execution crashes ( #12277 )
...
Signed-off-by: maleksan85 <maleksan@amd.com >
Co-authored-by: maleksan85 <maleksan@amd.com >
2025-01-21 23:30:46 +00:00
2acba47d9b
[bugfix] moe tuning. rm is_navi() ( #12273 )
...
Signed-off-by: Divakar Verma <divakar.verma@amd.com >
2025-01-21 22:47:32 +00:00
9c485d9e25
[Core] Free CPU pinned memory on environment cleanup ( #10477 )
2025-01-21 11:56:41 -08:00
fa9ee08121
[Misc] Set default backend to SDPA for get_vit_attn_backend ( #12235 )
...
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
2025-01-21 11:52:11 -08:00
347eeebe3b
[Misc] Remove experimental dep from tracing.py ( #12007 )
...
Signed-off-by: Adrian Cole <adrian.cole@elastic.co >
2025-01-21 11:51:55 -08:00
18fd4a8331
[Bugfix] Multi-sequence broken ( #11898 )
...
Signed-off-by: Andy Lo <andy@mistral.ai >
2025-01-21 11:51:35 -08:00
132a132100
[v1][stats][1/n] Add RequestStatsUpdate and RequestStats types ( #10907 )
...
Signed-off-by: rickyx <rickyx@anyscale.com >
2025-01-21 11:51:13 -08:00
1e60f87bb3
[Kernel] fix moe_align_block_size error condition ( #12239 )
...
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com >
2025-01-21 10:30:28 -08:00
9705b90bcf
[Bugfix] fix race condition that leads to wrong order of token returned ( #10802 )
...
Signed-off-by: Jannis Schönleber <joennlae@gmail.com >
2025-01-21 09:47:04 -08:00
3aec49e56f
[ci/build] update nightly torch for gh200 test ( #12270 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-21 23:03:17 +08:00
c64612802b
[Platform] improve platforms getattr ( #12264 )
...
Signed-off-by: Mengqing Cao <cmq0113@163.com >
2025-01-21 14:42:41 +00:00
9a7c3a0042
Remove pytorch comments for outlines + compressed-tensors ( #12260 )
...
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com >
2025-01-21 21:49:08 +08:00
b197a5ccfd
[V1][Bugfix] Fix data item ordering in mixed-modality inference ( #12259 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2025-01-21 13:18:43 +00:00
c81081fece
[torch.compile] transparent compilation with more logging ( #12246 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-21 19:32:55 +08:00
a94eee4456
[Bugfix] Fix mm_limits access for merged multi-modal processor ( #12252 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-21 10:09:39 +00:00
f2e9f2a3be
[Misc] Remove redundant TypeVar from base model ( #12248 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-21 08:40:39 +00:00
1f1542afa9
[Misc]Add BNB quantization for PaliGemmaForConditionalGeneration ( #12237 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2025-01-21 07:49:08 +00:00
96912550c8
[Misc] Rename MultiModalInputsV2 -> MultiModalInputs ( #12244 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-21 07:31:19 +00:00
2fc6944c5e
[ci/build] disable failed and flaky tests ( #12240 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-21 13:25:03 +08:00
5fe6bf29d6
[BugFix] Fix GGUF tp>1 when vocab_size is not divisible by 64 ( #12230 )
...
Signed-off-by: NickLucche <nlucches@redhat.com >
2025-01-21 12:23:14 +08:00
d4b62d4641
[AMD][Build] Porting dockerfiles from the ROCm/vllm fork ( #11777 )
...
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com >
2025-01-21 12:22:23 +08:00
ecf67814f1
Add quantization and guided decoding CODEOWNERS ( #12228 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2025-01-20 18:23:40 -07:00
750f4cabfa
[Kernel] optimize moe_align_block_size for cuda graph and large num_experts (e.g. DeepSeek-V3) ( #12222 )
...
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com >
Co-authored-by: Michael Goin <mgoin@redhat.com >
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com >
2025-01-20 16:42:16 -08:00
06a760d6e8
[bugfix] catch xgrammar unsupported array constraints ( #12210 )
...
Signed-off-by: Jason Cheng <jasoncky96@gmail.com >
2025-01-20 16:42:02 -08:00
da7512215f
[misc] add cuda runtime version to usage data ( #12190 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: Roger Wang <ywang@roblox.com >
2025-01-21 00:31:01 +00:00
af69a6aded
fix: update platform detection for M-series arm based MacBook processors ( #12227 )
...
Signed-off-by: isikhi <huseyin.isik000@gmail.com >
2025-01-20 22:23:28 +00:00
7bd3630067
[Misc] Update CODEOWNERS ( #12229 )
2025-01-20 22:19:09 +00:00
96663699b2
[CI] Pass local python version explicitly to pre-commit mypy.sh ( #12224 )
...
Signed-off-by: Chen Zhang <zhangch99@outlook.com >
2025-01-20 23:49:18 +08:00
18572e3384
[Bugfix] Fix HfExampleModels.find_hf_info ( #12223 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-20 15:35:36 +00:00
86bfb6dba7
[Misc] Pass attention to impl backend ( #12218 )
...
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
2025-01-20 23:25:28 +08:00
5f0ec3935a
[V1] Remove _get_cache_block_size ( #12214 )
...
Signed-off-by: Chen Zhang <zhangch99@outlook.com >
2025-01-20 21:54:16 +08:00
c222f47992
[core][bugfix] configure env var during import vllm ( #12209 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-20 19:35:59 +08:00
170eb35079
[misc] print a message to suggest how to bypass commit hooks ( #12217 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-20 18:06:24 +08:00
b37d82791e
[Model] Upgrade Aria to transformers 4.48 ( #12203 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-20 17:58:48 +08:00
3127e975fb
[CI/Build] Make pre-commit faster ( #12212 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-20 17:36:24 +08:00
4001ea1266
[CI/Build] Remove dummy CI steps ( #12208 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-20 16:41:57 +08:00
5c89a29c22
[misc] add placeholder format.sh ( #12206 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-20 16:04:49 +08:00
59a0192fb9
[Core] Interface for accessing model from VllmRunner ( #10353 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-20 15:00:59 +08:00
83609791d2
[Model] Add Qwen2 PRM model support ( #12202 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2025-01-20 14:59:46 +08:00
0974c9bc5c
[Bugfix] Fix incorrect types in LayerwiseProfileResults ( #12196 )
...
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
2025-01-20 14:59:20 +08:00
d2643128f7
[DOC] Add missing docstring in LLMEngine.add_request() ( #12195 )
...
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
2025-01-20 14:59:00 +08:00
c5c06209ec
[DOC] Fix typo in docstring and assert message ( #12194 )
...
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
2025-01-20 14:58:29 +08:00
3ea7b94523
Move linting to pre-commit ( #11975 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-01-20 14:58:01 +08:00
51ef828f10
[torch.compile] fix sym_tensor_indices ( #12191 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-20 11:37:50 +08:00
df450aa567
[Bugfix] Fix num_heads value for simple connector when tp enabled ( #12074 )
...
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com >
2025-01-20 02:56:43 +00:00
bbe5f9de7d
[Model] Support for fairseq2 Llama ( #11442 )
...
Signed-off-by: Martin Gleize <mgleize@meta.com >
Co-authored-by: mgleize user <mgleize@a100-st-p4de24xlarge-4.fair-a100.hpcaas >
2025-01-19 10:40:40 -08:00
81763c58a0
[V1] Add V1 support of Qwen2-VL ( #12128 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: imkero <kerorek@outlook.com >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-19 19:52:13 +08:00
edaae198e7
[Misc] Add BNB support to GLM4-V model ( #12184 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2025-01-19 19:49:22 +08:00
936db119ed
benchmark_serving support --served-model-name param ( #12109 )
...
Signed-off-by: zibai <zibai.gj@alibaba-inc.com >
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com >
2025-01-19 09:59:56 +00:00
e66faf4809
[torch.compile] store inductor compiled Python file ( #12182 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-19 16:27:26 +08:00
630eb5b5ce
[Bugfix] Fix multi-modal processors for transformers 4.48 ( #12187 )
2025-01-18 19:16:34 -08:00
4e94951bb1
[BUGFIX] Move scores to float32 in case of running xgrammar on cpu ( #12152 )
...
Signed-off-by: Michal Adamczyk <madamczyk@habana.ai >
2025-01-19 11:12:05 +08:00
7a8a48d51e
[V1] Collect env var for usage stats ( #12115 )
2025-01-19 03:07:15 +00:00
32eb0da808
[Misc] Support register quantization method out-of-tree ( #11969 )
2025-01-18 16:13:16 -08:00
6d0e3d3724
[core] clean up executor class hierarchy between v1 and v0 ( #12171 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-18 14:35:15 +08:00
02798ecabe
[Model] Port deepseek-vl2 processor, remove dependency ( #12169 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2025-01-18 13:59:39 +08:00
813f249f02
[Docs] Fix broken link in SECURITY.md ( #12175 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2025-01-18 04:35:21 +00:00
da02cb4b27
[core] further polish memory profiling ( #12126 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-18 12:25:08 +08:00
c09503ddd6
[AMD][CI/Build][Bugfix] use pytorch stale wheel ( #12172 )
...
Signed-off-by: hongxyan <hongxyan@amd.com >
2025-01-18 11:15:53 +08:00
2b83503227
[misc] fix cross-node TP ( #12166 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-18 10:53:27 +08:00
7b98a65ae6
[torch.compile] disable logging when cache is disabled ( #12043 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-17 20:29:31 +00:00
b5b57e301e
[AMD][FP8] Using MI300 FP8 format on ROCm for block_quant ( #12134 )
...
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com >
2025-01-17 17:12:26 +00:00
54cacf008f
[Bugfix] Mistral tokenizer encode accept list of str ( #12149 )
...
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com >
2025-01-17 16:47:53 +00:00
58fd57ff1d
[Bugfix] Fix score api for missing max_model_len validation ( #12119 )
...
Signed-off-by: Wallas Santos <wallashss@ibm.com >
2025-01-17 16:24:22 +00:00
87a0c076af
[core] allow callable in collective_rpc ( #12151 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-17 20:47:01 +08:00
d4e6194570
[CI/Build][CPU][Bugfix] Fix CPU CI ( #12150 )
...
Signed-off-by: jiang1.li <jiang1.li@intel.com >
2025-01-17 19:39:52 +08:00
07934cc237
[Misc][LoRA] Improve the readability of LoRA error messages ( #12102 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2025-01-17 19:32:28 +08:00
69d765f5a5
[V1] Move more control of kv cache initialization from model_executor to EngineCore ( #11960 )
...
Signed-off-by: Chen Zhang <zhangch99@outlook.com >
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com >
2025-01-17 07:39:35 +00:00
8027a72461
[ROCm][MoE] moe tuning support for rocm ( #12049 )
...
Signed-off-by: Divakar Verma <divakar.verma@amd.com >
2025-01-17 14:49:16 +08:00
d75ab55f10
[Misc] Add deepseek_vl2 chat template ( #12143 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2025-01-17 06:34:48 +00:00
d1adb9b403
[BugFix] add more is not None check in VllmConfig.__post_init__ ( #12138 )
...
Signed-off-by: Chen Zhang <zhangch99@outlook.com >
2025-01-17 05:33:22 +00:00
b8bfa46a18
[Bugfix] Fix issues in CPU build Dockerfile ( #12135 )
...
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
2025-01-17 12:54:01 +08:00
1475847a14
[Doc] Add instructions on using Podman when SELinux is active ( #12136 )
...
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
2025-01-17 04:45:36 +00:00
fead53ba78
[CI]add genai-perf benchmark in nightly benchmark ( #10704 )
...
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com >
2025-01-17 04:15:09 +00:00
ebc73f2828
[Bugfix] Fix a path bug in disaggregated prefill example script. ( #12121 )
...
Signed-off-by: Kuntai Du <kuntai@uchicago.edu >
2025-01-17 11:12:41 +08:00
d06e824006
[Bugfix] Set enforce_eager automatically for mllama ( #12127 )
...
Signed-off-by: Chen Zhang <zhangch99@outlook.com >
2025-01-16 15:30:08 -05:00
62b06ba23d
[Model] Add support for deepseek-vl2-tiny model ( #12068 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2025-01-16 17:14:48 +00:00
5fd24ec02e
[misc] Add LoRA kernel micro benchmarks ( #11579 )
2025-01-16 15:51:40 +00:00
874f7c292a
[Bugfix] Fix max image feature size for Llava-one-vision ( #12104 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2025-01-16 14:54:06 +00:00
92e793d91a
[core] LLM.collective_rpc interface and RLHF example ( #12084 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-16 20:19:52 +08:00
bf53e0c70b
Support torchrun and SPMD-style offline inference ( #12071 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-16 19:58:53 +08:00
dd7c9ad870
[Bugfix] Remove hardcoded head_size=256 for Deepseek v2 and v3 ( #12067 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2025-01-16 10:11:54 +00:00
9aa1519f08
Various cosmetic/comment fixes ( #12089 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2025-01-16 09:59:06 +00:00
f8ef146f03
[Doc] Add documentation for specifying model architecture ( #12105 )
2025-01-16 15:53:43 +08:00
fa0050db08
[Core] Default to using per_token quantization for fp8 when cutlass is supported. ( #8651 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
Co-authored-by: Michael Goin <mgoin@redhat.com >
Co-authored-by: mgoin <michael@neuralmagic.com >
2025-01-16 04:31:27 +00:00
cd9d06fb8d
Allow hip sources to be directly included when compiling for rocm. ( #12087 )
2025-01-15 16:46:03 -05:00
ebd8c669ef
[Bugfix] Fix _get_lora_device for HQQ marlin ( #12090 )
...
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
2025-01-15 19:59:42 +00:00
70755e819e
[V1][Core] Autotune encoder cache budget ( #11895 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2025-01-15 11:29:00 -08:00
edce722eaa
[Bugfix] use right truncation for non-generative tasks ( #12050 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
2025-01-16 00:31:01 +08:00
57e729e874
[Doc]: Update OpenAI-Compatible Server documents ( #12082 )
2025-01-15 16:07:45 +00:00
de0526f668
[Misc][Quark] Upstream Quark format to VLLM ( #10765 )
...
Signed-off-by: kewang-xlnx <kewang@xilinx.com >
Signed-off-by: kewang2 <kewang2@amd.com >
Co-authored-by: kewang2 <kewang2@amd.com >
Co-authored-by: Michael Goin <michael@neuralmagic.com >
2025-01-15 11:05:15 -05:00
5ecf3e0aaf
Misc: allow to use proxy in HTTPConnection ( #12042 )
...
Signed-off-by: Yuan Zhou <yuan.zhou@intel.com >
2025-01-15 13:16:40 +00:00
97eb97b5a4
[Model]: Support internlm3 ( #12037 )
2025-01-15 11:35:17 +00:00
3adf0ffda8
[Platform] Do not raise error if _Backend is not found ( #12023 )
...
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
Signed-off-by: Mengqing Cao <cmq0113@163.com >
Co-authored-by: Mengqing Cao <cmq0113@163.com >
2025-01-15 10:14:15 +00:00
ad388d25a8
Type-fix: make execute_model output type optional ( #12020 )
2025-01-15 09:44:56 +00:00
cbe94391eb
Fix: cases with empty sparsity config ( #12057 )
...
Signed-off-by: Rahul Tuli <rahul@neuralmagic.com >
2025-01-15 17:41:24 +08:00
994fc655b7
[V1][Prefix Cache] Move the logic of num_computed_tokens into KVCacheManager ( #12003 )
2025-01-15 07:55:30 +00:00
3f9b7ab9f5
[Doc] Update examples to remove SparseAutoModelForCausalLM ( #12062 )
...
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com >
2025-01-15 06:36:01 +00:00
ad34c0df0f
[core] platform agnostic executor via collective_rpc ( #11256 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-15 13:45:21 +08:00
f218f9c24d
[core] Turn off GPU communication overlap for Ray executor ( #12051 )
...
Signed-off-by: Rui Qiao <ruisearch42@gmail.com >
2025-01-15 05:19:55 +00:00
0794e7446e
[Misc] Add multipstep chunked-prefill support for FlashInfer ( #10467 )
2025-01-15 12:47:49 +08:00
b7ee940a82
[V1][BugFix] Fix edge case in VLM scheduling ( #12065 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-01-14 20:21:28 -08:00
9ddac56311
[Platform] move current_memory_usage() into platform ( #11369 )
...
Signed-off-by: Shanshan Shen <467638484@qq.com >
2025-01-15 03:38:25 +00:00
1a51b9f872
[HPU][Bugfix] Don't use /dev/accel/accel0 for HPU autodetection in setup.py ( #12046 )
...
Signed-off-by: Konrad Zawora <kzawora@habana.ai >
2025-01-15 02:59:18 +00:00
42f5e7c52a
[Kernel] Support MulAndSilu ( #11624 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2025-01-15 02:29:53 +00:00
a3a3ee4e6f
[Misc] Merge bitsandbytes_stacked_params_mapping and packed_modules_mapping ( #11924 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2025-01-15 07:49:49 +08:00
87054a57ab
[Doc]: Update the Json Example of the Engine Arguments document ( #12045 )
2025-01-14 17:03:04 +00:00
c9d6ff530b
Explain where the engine args go when using Docker ( #12041 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-01-14 16:05:50 +00:00
a2d2acb4c8
[Bugfix][Kernel] Give unique name to BlockSparseFlashAttention ( #12040 )
...
Signed-off-by: Chen Zhang <zhangch99@outlook.com >
2025-01-14 15:45:05 +00:00
2e0e017610
[Platform] Add output for Attention Backend ( #11981 )
...
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
2025-01-14 13:27:04 +00:00
1f18adb245
[Kernel] Revert the API change of Attention.forward ( #12038 )
...
Signed-off-by: Chen Zhang <zhangch99@outlook.com >
2025-01-14 20:59:32 +08:00
bb354e6b2d
[Bugfix] Fix various bugs in multi-modal processor ( #12031 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-14 12:16:11 +00:00
ff39141a49
[HPU][misc] add comments for explanation ( #12034 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-14 19:24:06 +08:00
8a1f938e6f
[Doc] Update Quantization Hardware Support Documentation ( #12025 )
...
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com >
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com >
2025-01-14 04:37:52 +00:00
078da31903
[HPU][Bugfix] set_forward_context and CI test execution ( #12014 )
...
Signed-off-by: Konrad Zawora <kzawora@habana.ai >
2025-01-14 11:04:18 +08:00
1a401252b5
[Docs] Add Sky Computing Lab to project intro ( #12019 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-01-13 17:24:36 -08:00
f35ec461fc
[Bugfix] Fix deepseekv3 gate bias error ( #12002 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
Co-authored-by: mgoin <michael@neuralmagic.com >
2025-01-13 13:43:51 -07:00
289b5191d5
[Doc] Fix build from source and installation link in README.md ( #12013 )
...
Signed-off-by: Yikun <yikunkero@gmail.com >
2025-01-13 17:23:59 +00:00
c6db21313c
bugfix: Fix signature mismatch in benchmark's get_tokenizer function ( #11982 )
...
Signed-off-by: elijah <f1renze.142857@gmail.com >
2025-01-13 15:22:07 +00:00
a7d59688fb
[Platform] Move get_punica_wrapper() function to Platform ( #11516 )
...
Signed-off-by: Shanshan Shen <467638484@qq.com >
2025-01-13 13:12:10 +00:00
458e63a2c6
[platform] add device_control env var ( #12009 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-13 20:59:09 +08:00
e8c23ff989
[Doc] Organise installation documentation into categories and tabs ( #11935 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-01-13 12:27:36 +00:00
cd8249903f
[Doc][V1] Update model implementation guide for V1 support ( #11998 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk >
2025-01-13 11:58:54 +00:00
0f8cafe2d1
[Kernel] unified_attention for Attention.forward ( #11967 )
...
Signed-off-by: Chen Zhang <zhangch99@outlook.com >
2025-01-13 19:28:53 +08:00
5340a30d01
Fix Max Token ID for Qwen-VL-Chat ( #11980 )
...
Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com >
2025-01-13 08:37:48 +00:00
89ce62a316
[platform] add ray_device_key ( #11948 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-13 16:20:52 +08:00
c3f05b09a0
[Misc]Minor Changes about Worker ( #11555 )
...
Signed-off-by: Chenguang Li <757486878@qq.com >
2025-01-13 15:47:05 +08:00
cf6bbcb493
[Misc] Fix Deepseek V2 fp8 kv-scale remapping ( #11947 )
...
Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu >
2025-01-12 23:05:06 -08:00
80ea3af1a0
[CI][Spec Decode] fix: broken test for EAGLE model ( #11972 )
...
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com >
2025-01-13 06:50:35 +00:00
9dd02d85ca
[Bug] Fix usage of .transpose() and .view() consecutively. ( #11979 )
2025-01-13 06:24:10 +00:00
f7b3ba82c3
[MISC] fix typo in kv transfer send recv test ( #11983 )
2025-01-13 05:07:48 +00:00
619ae268c3
[V1] [2/n] Logging and Metrics - OutputProcessor Abstraction ( #11973 )
...
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com >
2025-01-13 04:54:10 +00:00
d14e98d924
[Model] Support GGUF models newly added in transformers 4.46.0 ( #9685 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2025-01-13 00:13:44 +00:00
9597a095f2
[V1][Core][1/n] Logging and Metrics ( #11962 )
...
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com >
2025-01-12 21:02:02 +00:00
263a870ee1
[Hardware][TPU] workaround fix for MoE on TPU ( #11764 )
2025-01-12 10:53:51 -05:00
8bddb73512
[Hardware][CPU] Multi-LoRA implementation for the CPU backend ( #11100 )
...
Signed-off-by: Akshat Tripathi <akshat@krai.ai >
Signed-off-by: Oleg Mosalov <oleg@krai.ai >
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
Co-authored-by: Oleg Mosalov <oleg@krai.ai >
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com >
Co-authored-by: Isotr0py <2037008807@qq.com >
2025-01-12 13:01:52 +00:00
f967e51f38
[Model] Initialize support for Deepseek-VL2 models ( #11578 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2025-01-12 00:17:24 -08:00
43f3d9e699
[CI/Build] Add markdown linter ( #11857 )
...
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com >
2025-01-12 00:17:13 -08:00
b25cfab9a0
[V1] Avoid sending text prompt to core engine ( #11963 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2025-01-12 06:36:38 +00:00
4b657d3292
[Model] Add cogagent model support vLLM ( #11742 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Isotr0py <2037008807@qq.com >
2025-01-11 19:05:56 +00:00
d697dc01b4
[Bugfix] Fix RobertaModel loading ( #11940 )
...
Signed-off-by: NickLucche <nlucches@redhat.com >
2025-01-11 14:05:09 +00:00
a991f7d508
[Doc] Basic guide for writing unit tests for new models ( #11951 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-11 21:27:24 +08:00
7a3a83e3b8
[CI/Build] Move model-specific multi-modal processing tests ( #11934 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-11 13:50:05 +08:00
c32a7c7c0c
[Bugfix] fused_experts_impl wrong compute type for float32 ( #11921 )
...
Signed-off-by: shaochangxu.scx <shaochangxu.scx@antgroup.com >
Co-authored-by: shaochangxu.scx <shaochangxu.scx@antgroup.com >
2025-01-11 13:49:39 +08:00
2118d0565c
[Bugfix][SpecDecode] Adjust Eagle model architecture to align with intended design ( #11672 )
...
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com >
2025-01-10 20:49:38 -08:00
899136b857
[ci] fix broken distributed-tests-4-gpus ( #11937 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-11 09:07:24 +08:00
c9f09a4fe8
[mypy] Fix mypy warnings in api_server.py ( #11941 )
...
Signed-off-by: Fred Reiss <frreiss@us.ibm.com >
2025-01-11 01:04:58 +00:00
d45cbe70f5
[Bugfix] Check that number of images matches number of <|image|> tokens with mllama ( #11939 )
...
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com >
2025-01-10 23:26:00 +00:00
8a579408f3
[Misc] Update benchmark_prefix_caching.py fixed example usage ( #11920 )
...
Signed-off-by: Ren MinMin <renmm6@chinaunicom.cn >
Co-authored-by: Ren MinMin <renmm6@chinaunicom.cn >
2025-01-10 20:39:22 +00:00
46fa98ccad
[Misc] Clean up debug code in Deepseek-V3 ( #11930 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2025-01-10 19:19:15 +00:00
aa1e77a19c
[Hardware][CPU] Support MOE models on x86 CPU ( #11831 )
...
Signed-off-by: jiang1.li <jiang1.li@intel.com >
2025-01-10 11:07:58 -05:00
5959564f94
Doc fix in benchmark_long_document_qa_throughput.py ( #11933 )
...
Signed-off-by: Kuntai Du <kuntai@uchicago.edu >
2025-01-10 23:51:43 +08:00
f33e033e27
[Docs] Fix docstring in get_ip function ( #11932 )
...
Signed-off-by: Kuntai Du <kuntai@uchicago.edu >
2025-01-10 23:51:02 +08:00
482cdc494e
[Doc] Rename offline inference examples ( #11927 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-01-10 23:50:29 +08:00
20410b2fda
[platform] support custom torch.compile backend key ( #11318 )
...
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2025-01-10 23:46:51 +08:00
12664ddda5
[Doc] [1/N] Initial guide for merged multi-modal processor ( #11925 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-10 14:30:25 +00:00
241ad7b301
[ci] Fix sampler tests ( #11922 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-10 20:45:33 +08:00
d85c47d6ad
Replace "online inference" with "online serving" ( #11923 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-01-10 12:05:56 +00:00
ef725feafc
[platform] support pytorch custom op pluggable ( #11328 )
...
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
2025-01-10 10:02:38 +00:00
d907be7dc7
[misc] remove python function call for custom activation op ( #11885 )
...
Co-authored-by: youkaichao <youkaichao@gmail.com >
2025-01-10 17:18:25 +08:00
d53575a5f0
[ci] fix gh200 tests ( #11919 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-10 16:25:17 +08:00
61af633256
[BUGFIX] Fix UnspecifiedPlatform package name ( #11916 )
...
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com >
2025-01-10 16:20:46 +08:00
ac2f3f7fee
[Bugfix] Validate lora adapters to avoid crashing server ( #11727 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com >
2025-01-10 15:56:36 +08:00
cf5f000d21
[torch.compile] Hide KV cache behind torch.compile boundary ( #11677 )
...
Signed-off-by: Chen Zhang <zhangch99@outlook.com >
2025-01-10 13:14:42 +08:00
3de2b1eafb
[Doc] Show default pooling method in a table ( #11904 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-10 11:25:20 +08:00
b844b99ad3
[VLM] Enable tokenized inputs for merged multi-modal processor ( #11900 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-10 03:24:00 +00:00
c3cf54dda4
[Doc][5/N] Move Community and API Reference to the bottom ( #11896 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: Simon Mo <simon.mo@hey.com >
2025-01-10 03:10:12 +00:00
36f5303578
[Docs] Add Modal to deployment frameworks ( #11907 )
2025-01-09 23:26:37 +00:00
9a228348d2
[Misc] Provide correct Pixtral-HF chat template ( #11891 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-09 10:19:37 -07:00
bd82872211
[ci]try to fix flaky multi-step tests ( #11894 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-09 14:47:29 +00:00
405eb8e396
[platform] Allow platform specify attention backend ( #11609 )
...
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
Signed-off-by: Mengqing Cao <cmq0113@163.com >
Co-authored-by: Mengqing Cao <cmq0113@163.com >
2025-01-09 21:46:50 +08:00
65097ca0af
[Doc] Add model development API Reference ( #11884 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-09 09:43:40 +00:00
1d967acb45
[Bugfix] fix beam search input errors and latency benchmark script ( #11875 )
...
Signed-off-by: Ye Qi <yeq@meta.com >
Co-authored-by: yeq <yeq@devgpu004.lla3.facebook.com >
2025-01-09 17:36:39 +08:00
0bd1ff4346
[Bugfix] Override dunder methods of placeholder modules ( #11882 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-09 09:02:53 +00:00
310aca88c9
[perf]fix current stream ( #11870 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-09 07:18:21 +00:00
a732900efc
[Doc] Intended links Python multiprocessing library ( #11878 )
2025-01-09 05:39:39 +00:00
d848800e88
[Misc] Move print_*_once from utils to logger ( #11298 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com >
Co-authored-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com >
2025-01-09 12:48:12 +08:00
730e9592e9
[Doc] Recommend uv and python 3.12 for quickstart guide ( #11849 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2025-01-09 11:37:48 +08:00
1fe554bac3
treat do_lower_case in the same way as the sentence-transformers library ( #11815 )
...
Signed-off-by: Max de Bayser <mbayser@br.ibm.com >
2025-01-09 11:05:43 +08:00
615e4a5401
[CI] Turn on basic correctness tests for V1 ( #10864 )
2025-01-08 21:20:44 -05:00
3db0cafdf1
[Docs] Add Google Cloud Meetup ( #11864 )
2025-01-08 12:38:28 -08:00
526de822d5
[Kernel][Triton][AMD] Use block size heuristic for avg 2.8x speedup for int8 models ( #11698 )
...
Signed-off-by: Randall Smith <Randall.Smith@amd.com >
2025-01-08 20:23:15 +00:00
56fe4c297c
[TPU][Quantization] TPU W8A8 ( #11785 )
...
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-01-08 19:33:29 +00:00
47de8821d3
[Misc]add some explanations for BlockHashType ( #11847 )
2025-01-08 18:21:30 +00:00
5984499e47
[Doc] Expand Multimodal API Reference ( #11852 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-08 17:14:14 +00:00
ca47e176af
[Misc] Move some model utils into vision file ( #11848 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-08 17:04:46 +00:00
78f4590b60
[Bugfix][XPU] fix silu_and_mul ( #11823 )
...
Signed-off-by: yan ma <yan.ma@intel.com >
2025-01-09 00:11:50 +08:00
2f7024987e
[CI/Build][Bugfix] Fix CPU CI image clean up ( #11836 )
...
Signed-off-by: jiang1.li <jiang1.li@intel.com >
2025-01-08 15:18:28 +00:00
6cd40a5bfe
[Doc][4/N] Reorganize API Reference ( #11843 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-08 21:34:44 +08:00
aba8d6ee00
[Doc] Move examples into categories ( #11840 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-01-08 13:09:53 +00:00
2a0596bc48
[VLM] Reorganize profiling/processing-related code ( #11812 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-08 18:59:58 +08:00
f12141170a
[torch.compile] consider relevant code in compilation cache ( #11614 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-08 10:46:43 +00:00
cfd3219f58
[Hardware][Apple] Native support for macOS Apple Silicon ( #11696 )
...
Signed-off-by: Wallas Santos <wallashss@ibm.com >
Co-authored-by: Michael Goin <michael@neuralmagic.com >
2025-01-08 16:35:49 +08:00
a1b2b8606e
[Docs] Update sponsor name: 'Novita' to 'Novita AI' ( #11833 )
2025-01-07 23:05:46 -08:00
ad9f1aa679
[doc] update wheels url ( #11830 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-08 14:36:49 +08:00
889e662eae
[misc] improve memory profiling ( #11809 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2025-01-08 06:36:03 +00:00
ef68eb28d8
[Bug] Fix pickling of ModelConfig when RunAI Model Streamer is used ( #11825 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-08 13:40:09 +08:00
259abd8953
[Docs] reorganize sponsorship page ( #11639 )
...
Signed-off-by: simon-mo <simon.mo@hey.com >
2025-01-07 21:16:08 -08:00
f645eb6954
[Bugfix] Add checks for LoRA and CPU offload ( #11810 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2025-01-08 13:08:48 +08:00
f4923cb8bc
[OpenVINO] Fixed Docker.openvino build ( #11732 )
...
Signed-off-by: Ilya Lavrenov <ilya.lavrenov@intel.com >
2025-01-08 13:08:30 +08:00
b640b19cc0
Fixed docker build for ppc64le ( #11518 )
...
Signed-off-by: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com >
2025-01-08 13:05:37 +08:00
dc71af0a71
Remove the duplicate imports of MultiModalKwargs and PlaceholderRange… ( #11824 )
2025-01-08 04:09:25 +00:00
4d29e91be8
[Misc] sort torch profiler table by kernel timing ( #11813 )
2025-01-08 10:57:04 +08:00
91445c7bc8
[Bugfix] Fix image input for Pixtral-HF ( #11741 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-08 10:17:16 +08:00
5950f555a1
[Doc] Group examples into categories ( #11782 )
...
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com >
2025-01-08 09:20:12 +08:00
a4e2b26856
[Bugfix] Significant performance drop on CPUs with --num-scheduler-steps > 1 ( #11794 )
2025-01-07 16:15:50 -08:00
973f5dc581
[Doc]Add documentation for using EAGLE in vLLM ( #11417 )
...
Signed-off-by: Sourashis Roy <sroy@roblox.com >
2025-01-07 19:19:12 +00:00
c994223d56
[Bugfix] update the prefix for qwen2 ( #11795 )
...
Co-authored-by: jiadi.jjd <jiadi.jjd@antgroup.com >
2025-01-07 18:36:34 +00:00
869579a702
[optimization] remove python function call for custom op ( #11750 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-07 17:04:28 +00:00
c0efe92d8b
[Doc] Add note to gte-Qwen2 models ( #11808 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-07 21:50:58 +08:00
d9fa1c05ad
[doc] update how pip can install nightly wheels ( #11806 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-07 21:42:58 +08:00
2de197bdd4
[V1] Support audio language models on V1 ( #11733 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2025-01-07 19:47:36 +08:00
869e829b85
[doc] add doc to explain how to use uv ( #11773 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2025-01-07 18:41:17 +08:00
8f37be38eb
[Bugfix] Comprehensively test and fix LLaVA-NeXT feature size calculation ( #11800 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-07 18:25:02 +08:00
8082ad7950
[V1][Doc] Update V1 support for LLaVa-NeXT-Video ( #11798 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2025-01-07 09:55:39 +00:00
1e4ce295ae
[CI][CPU] adding build number to docker image name ( #11788 )
...
Signed-off-by: Yuan Zhou <yuan.zhou@intel.com >
2025-01-07 07:28:01 +00:00
ce1917fcf2
[Doc] Create a vulnerability management team ( #9925 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2025-01-06 22:57:32 -08:00
e512f76a89
fix init error for MessageQueue when n_local_reader is zero ( #11768 )
2025-01-07 06:12:48 +00:00
898cdf033e
[CI] Fix neuron CI and run offline tests ( #11779 )
...
Signed-off-by: Liangfu Chen <liangfc@amazon.com >
2025-01-06 21:36:10 -08:00
0f3f3c86ec
[Bugfix] Update attention interface in Whisper ( #11784 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2025-01-07 04:36:24 +00:00
b278557935
[Kernel][LoRA]Punica prefill kernels fusion ( #11234 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
Signed-off-by: Abatom <abzhonghua@gmail.com >
Co-authored-by: Zhonghua Deng <abatom@163.com >
2025-01-07 04:01:39 +00:00
8ceffbf315
[Doc][3/N] Reorganize Serving section ( #11766 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-07 11:20:01 +08:00
d93d2d74fd
[XPU] Make pp group initilized for pipeline-parallelism ( #11648 )
...
Signed-off-by: yisheng <yi.sheng@intel.com >
2025-01-07 11:09:58 +08:00
d0169e1b0f
[Model] Future-proof Qwen2-Audio multi-modal processor ( #11776 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-07 11:05:17 +08:00
08fb75c72e
[Bugfix] Fix LLaVA-NeXT feature size precision error (for real) ( #11772 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-07 01:10:54 +00:00
91b361ae89
[V1] Extend beyond image modality and support mixed-modality inference with Llava-OneVision ( #11685 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-06 19:58:16 +00:00
e20c92bb61
[Kernel] Move attn_type to Attention.__init__() ( #11690 )
...
Signed-off-by: Chen Zhang <zhangch99@outlook.com >
2025-01-07 00:11:28 +08:00
32c9eff2ff
[Bugfix][V1] Fix molmo text-only inputs ( #11676 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2025-01-06 15:22:25 +00:00
4ca5d40adc
[doc] explain how to add interleaving sliding window support ( #11771 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2025-01-06 21:57:44 +08:00
9279b9f83d
[Bugfix] Fix max image size for LLaVA-Onevision ( #11769 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2025-01-06 13:48:53 +00:00
ee77fdb5de
[Doc][2/N] Reorganize Models and Usage sections ( #11755 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-06 21:40:31 +08:00
996357e480
[VLM] Separate out profiling-related logic ( #11746 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-06 16:02:21 +08:00
2a622d704a
k8s-config: Update the secret to use stringData ( #11679 )
...
Signed-off-by: Suraj Deshmukh <surajd.service@gmail.com >
2025-01-06 08:01:22 +00:00
9c749713f6
[mypy] Forward pass function type hints in lora ( #11740 )
...
Signed-off-by: lucast2021 <lucast2021@headroyce.org >
Co-authored-by: lucast2021 <lucast2021@headroyce.org >
2025-01-06 07:59:36 +00:00
022c5c6944
[V1] Refactor get_executor_cls ( #11754 )
2025-01-06 07:59:16 +00:00
f8fcca100b
[Misc] Fix typo for valid_tool_parses ( #11753 )
...
Signed-off-by: Rui Qiao <ruisearch42@gmail.com >
2025-01-06 07:12:38 +00:00
06bfb51963
[V1] Add BlockTable class ( #11693 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-01-06 14:24:42 +09:00
408e560015
[Bugfix] Remove block size constraint ( #11723 )
2025-01-06 12:49:55 +08:00
402d378360
[Doc] [1/N] Reorganize Getting Started section ( #11645 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-06 02:18:33 +00:00
9e764e7b10
[distributed] remove pynccl's redundant change_state ( #11749 )
2025-01-06 09:05:48 +08:00
33fc1e2e86
[Frontend] Improve StreamingResponse Exception Handling ( #11752 )
2025-01-05 16:35:01 -05:00
eba17173d3
fix: [doc] fix typo ( #11751 )
...
Co-authored-by: Lancer <maruixiang6688@gmail.com >
2025-01-06 00:48:16 +08:00
635b897246
[distributed] remove pynccl's redundant stream ( #11744 )
2025-01-05 23:09:11 +08:00
4068f4b5b5
[MISC] Replace c10::optional with std::optional ( #11730 )
...
Signed-off-by: Lu Fang <lufang@fb.com >
2025-01-05 10:20:34 +09:00
47831430cc
[Bugfix][V1] Fix test_kv_cache_utils.py ( #11738 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2025-01-04 16:07:59 +00:00
65c08928c2
[Model] Remove unnecessary weight initialization logic ( #11736 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Isotr0py <2037008807@qq.com >
2025-01-04 23:46:21 +08:00
ba214dffbe
[Bugfix] Fix precision error in LLaVA-NeXT ( #11735 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-04 23:45:57 +08:00
eed11ebee9
[VLM] Merged multi-modal processors for LLaVA-NeXT-Video and LLaVA-OneVision ( #11717 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-04 11:40:53 +00:00
300acb8347
[Core][Bugfix] Use correct device to initialize GPU data during CUDA-graph-capture ( #11233 )
...
Signed-off-by: Yan Burman <yanburman@users.noreply.github.com >
Signed-off-by: Ido Asraff <idoa@atero.ai >
2025-01-04 14:50:16 +08:00
d91457d529
[V1] Add kv cache utils tests. ( #11513 )
...
Signed-off-by: xcnick <xcnick0412@gmail.com >
2025-01-04 14:49:46 +08:00
fbf2564554
[V1] Add RayExecutor support for AsyncLLM (api server) ( #11712 )
2025-01-04 06:41:31 +00:00
d1d49397e7
Update bnb.md with example for OpenAI ( #11718 )
2025-01-04 06:29:02 +00:00
9c93636d84
Update tool_calling.md ( #11701 )
2025-01-04 06:16:30 +00:00
e5d7ed0c53
[V1] log GPU blocks num for MultiprocExecutor ( #11656 )
2025-01-04 00:13:12 +00:00
ad0d567e1c
[V1] Chore: cruft removal ( #11724 )
2025-01-03 23:25:02 +00:00
bf0d97d786
Update requirements-tpu.txt to support python 3.9 and 3.11 ( #11695 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2025-01-03 22:36:46 +00:00
a655eb3025
[Misc] Add BNB quantization for Qwen2VL ( #11719 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Isotr0py <2037008807@qq.com >
2025-01-03 15:19:02 -07:00
1543914c04
[V1] Improve TP>1 Error Handling + Stack Trace ( #11721 )
...
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com >
2025-01-03 21:29:11 +00:00
61fed92c7e
[Bugfix] Fix ColumnParallelLinearWithLoRA slice ( #11708 )
...
Signed-off-by: ZincCat <zincchloride@outlook.com >
2025-01-03 21:02:34 +00:00
80c751e7f6
[V1] Simplify Shutdown ( #11659 )
2025-01-03 17:25:38 +00:00
e1a5c2f0a1
[Model] Whisper model implementation ( #11280 )
...
Co-authored-by: Aurick Qiao <aurick.qiao@snowflake.com >
2025-01-03 16:39:19 +08:00
fd3a62a122
[perf-benchmark] Fix dependency for steps in benchmark pipeline ( #11710 )
2025-01-02 22:38:37 -08:00
07064cb1d4
[Bugfix] Check chain_speculative_sampling before calling it ( #11673 )
...
Signed-off-by: Lu Fang <lufang@fb.com >
2025-01-02 16:58:56 -08:00
2f1e8e8f54
Update default max_num_batch_tokens for chunked prefill ( #11694 )
2025-01-03 00:25:53 +00:00
68d37809b9
[Misc] Minimum requirements for SageMaker compatibility ( #11576 )
2025-01-02 15:59:25 -08:00
5dba257506
Resolve race conditions in Marlin kernel ( #11493 )
...
Signed-off-by: wchen61 <wchen61@foxmail.com >
2025-01-02 22:58:56 +00:00
187e32997c
[Bugfix] Change kv scaling factor by param json on nvidia gpu ( #11688 )
...
Signed-off-by: bjmsong <bjmsong@126.com >
Co-authored-by: bjmsong <bjmsong@126.com >
2025-01-02 21:11:39 +00:00
b55ed6ef8a
[V1][Minor] Optimize token_ids_cpu copy ( #11692 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-01-02 12:04:58 -07:00
2f385183f3
[Bugfix] Free cross attention block table for preempted-for-recompute sequence group. ( #10013 )
...
Signed-off-by: Kathy Yu <feiyangyu@google.com >
2025-01-02 10:28:09 -08:00
84c35c374a
According to vllm.EngineArgs, the name should be distributed_executor_backend ( #11689 )
2025-01-02 18:14:16 +00:00
8c38ee7007
[VLM] Merged multi-modal processor for LLaVA-NeXT ( #11682 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-02 16:39:27 +00:00
b6087a6bee
[mypy] Pass type checking in vllm/inputs ( #11680 )
...
Signed-off-by: Tobias Pitters <tobias.pitters@gmail.com >
2025-01-02 16:18:15 +00:00
23c1b10a4c
[VLM][Bugfix] Multi-modal processor compatible with V1 multi-input ( #11674 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2025-01-02 17:00:00 +08:00
a115ac46b5
[VLM] Move supported limits and max tokens to merged multi-modal processor ( #11669 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Isotr0py <2037008807@qq.com >
2025-01-01 15:44:42 +00:00
73001445fb
[V1] Implement Cascade Attention ( #11635 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2025-01-01 21:56:46 +09:00
6d70198b17
[Doc] Fix typo ( #11666 )
...
Signed-off-by: Kazuhiro Serizawa <nserihiro@gmail.com >
2025-01-01 08:10:10 +00:00
f962f426bc
[Misc] Replace space with - in the file names ( #11667 )
...
Signed-off-by: Lu Fang <lufang@fb.com >
2025-01-01 07:39:30 +00:00
11d8a091c6
[Misc] Optimize Qwen2-VL LoRA test ( #11663 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2025-01-01 14:42:23 +08:00
365801fedd
[VLM] Add max-count checking in data parser for single image models ( #11661 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Signed-off-by: Roger Wang <ywang@roblox.com >
Co-authored-by: Roger Wang <ywang@roblox.com >
2024-12-31 22:15:21 -08:00
4db72e57f6
[Bugfix][Refactor] Unify model management in frontend ( #11660 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
2025-01-01 02:21:51 +00:00
0c6f998554
[Benchmark] Add benchmark script for CPU offloading ( #11533 )
...
Signed-off-by: ApostaC <yihua98@uchicago.edu >
Co-authored-by: KuntaiDu <kuntai@uchicago.edu >
2025-01-01 00:10:55 +00:00
e7c7c5e822
[V1][VLM] V1 support for selected single-image models. ( #11632 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: Isotr0py <2037008807@qq.com >
2024-12-31 21:17:22 +00:00
8c3230d8c1
[V1] Simplify vision block hash for prefix caching by removing offset from hash ( #11646 )
2024-12-31 08:56:01 +00:00
2c5718809b
[Bugfix] Move the _touch(computed_blocks) call in the allocate_slots method to after the check for allocating new blocks. ( #11565 )
2024-12-31 06:29:04 +00:00
82c49d3260
[Misc][LoRA] Support Rank Stabilized LoRA (RSLoRA) ( #6909 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-30 22:15:58 -08:00
74fa1d123c
[Bugfix] Fix OpenAI parallel sampling when using xgrammar ( #11637 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-12-31 03:43:54 +00:00
a2a40bcd0d
[Model][LoRA]LoRA support added for MolmoForCausalLM ( #11439 )
...
Signed-off-by: Matthias Vogler <matthias.vogler@joesecurity.org >
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
Co-authored-by: Matthias Vogler <matthias.vogler@joesecurity.org >
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-30 17:33:06 -08:00
ccb1aabcca
[benchmark] Remove dependency for H100 benchmark step ( #11572 )
2024-12-30 12:27:07 -08:00
36e7670045
[Bugfix] Validate and concatenate image embeddings in MiniCPMVBaseModel ( #11631 )
2024-12-30 18:51:04 +00:00
5886aa496e
[V1] [6/N] API Server: Better Shutdown ( #11586 )
2024-12-30 15:51:02 +00:00
8d9b6721e7
[VLM] Abstract out multi-modal data parsing in merged processor ( #11620 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-30 15:01:35 +00:00
b12e87f942
[platforms] enable platform plugins ( #11602 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-30 20:24:45 +08:00
5dbf854553
[CI/Build][CPU] Fix CPU CI by lazy importing triton FP8 kernels ( #11618 )
...
Signed-off-by: jiang1.li <jiang1.li@intel.com >
2024-12-30 10:17:04 +00:00
970d6d0776
[Build][Kernel] Update CUTLASS to v3.6.0 ( #11607 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-12-30 17:22:13 +08:00
628ec6c17b
[Docker] bump up neuron sdk v2.21 ( #11593 )
...
Signed-off-by: Liangfu Chen <liangfc@amazon.com >
2024-12-30 13:46:14 +08:00
3682e33f9f
[v1] fix compilation cache ( #11598 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-30 04:24:12 +00:00
0aa38d16f5
Remove print statement in DeepseekScalingRotaryEmbedding ( #11604 )
2024-12-29 20:16:46 +00:00
faef77c0d6
[Misc] KV cache transfer connector registry ( #11481 )
...
Signed-off-by: KuntaiDu <kuntai@uchicago.edu >
2024-12-29 16:08:09 +00:00
dba4d9dec6
[v1][bugfix] fix cudagraph with inplace buffer assignment ( #11596 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-29 09:03:49 +00:00
32b4c63f02
[Doc] Convert list tables to MyST ( #11594 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-29 15:56:22 +08:00
4fb8e329fd
[V1] [5/N] API Server: unify Detokenizer and EngineCore input ( #11545 )
...
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com >
2024-12-28 20:51:57 +00:00
328841d002
[bugfix] interleaving sliding window for cohere2 model ( #11583 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-28 16:55:42 +00:00
d427e5cfda
[Doc] Minor documentation fixes ( #11580 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-28 21:53:59 +08:00
42bb201fd6
[V1][Minor] Set pin_memory=False for token_ids_cpu tensor ( #11581 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-12-28 13:33:12 +00:00
59d6bb4c86
[Hardware][AMD]: Replace HIPCC version with more precise ROCm version ( #11515 )
...
Signed-off-by: hjwei <hjwei_xd@163.com >
2024-12-28 11:17:35 +00:00
b7dcc003dc
[Model] Remove hardcoded image tokens ids from Pixtral ( #11582 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-12-28 10:54:23 +00:00
d34be24bb1
[Model] Support InternLM2 Reward models ( #11571 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2024-12-28 06:14:10 +00:00
b5cbe8eeb3
[Bugfix] Last token measurement fix ( #11376 )
...
Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com >
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com >
2024-12-28 11:34:46 +08:00
df04dffade
[V1] [4/N] API Server: ZMQ/MP Utilities ( #11541 )
2024-12-28 01:45:08 +00:00
a60731247f
[Doc] Update mllama example based on official doc ( #11567 )
...
Signed-off-by: Chen Zhang <zhangch99@outlook.com >
2024-12-28 00:31:10 +00:00
ac79799403
[Bugfix] Fix for ROCM compressed tensor support ( #11561 )
2024-12-27 20:12:11 +00:00
dde1fa18c9
[Misc] Improve BNB loader to handle mixture of sharded and merged weights with same suffix ( #11566 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-12-27 19:45:13 +00:00
0240402c46
[Misc] Add BNB quantization for MolmoForCausalLM ( #11551 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-27 18:48:24 +00:00
55509c2114
[MODEL] LoRA support for Jamba model ( #11209 )
...
Signed-off-by: Erez Schwartz <erezs@ai21.com >
2024-12-27 17:58:21 +00:00
101418096f
[VLM] Support caching in merged multi-modal processor ( #11396 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-27 17:22:48 +00:00
5ce4627a7e
[Doc] Add xgrammar in doc ( #11549 )
...
Signed-off-by: ccjincong <chenjincong11@gmail.com >
2024-12-27 13:05:10 +00:00
7af553ea30
[Misc] Abstract the logic for reading and writing media content ( #11527 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-27 19:21:23 +08:00
2c9b8ea2b0
[Bugfix] Fix TeleChat2ForCausalLM weights mapper ( #11546 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-27 10:39:15 +00:00
d003f3ea39
Update deploying_with_k8s.md with AMD ROCm GPU example ( #11465 )
...
Signed-off-by: Alex He <alehe@amd.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2024-12-27 10:00:04 +00:00
6c6f7fe8a8
[Platform] Move model arch check to platform ( #11503 )
...
Signed-off-by: Mengqing Cao <cmq0113@163.com >
2024-12-27 08:45:25 +00:00
2339d59f92
[BugFix] Fix quantization for all other methods ( #11547 )
2024-12-26 22:23:29 -08:00
1b875a0ef3
[V1][3/N] API Server: Reduce Task Switching + Handle Abort Properly ( #11534 )
2024-12-26 21:19:21 -08:00
eb881ed006
[misc] fix typing ( #11540 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-27 11:05:08 +08:00
46d4359450
[CI] Fix broken CI ( #11543 )
2024-12-26 18:49:16 -08:00
81b979f2a8
[V1] Fix yapf ( #11538 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-12-27 09:47:10 +09:00
371d04d39b
[V1] Use FlashInfer Sampling Kernel for Top-P & Top-K Sampling ( #11394 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-12-27 09:32:38 +09:00
0c0c2015c5
Update openai_compatible_server.md ( #11536 )
...
Co-authored-by: Simon Mo <simon.mo@hey.com >
2024-12-26 16:26:18 -08:00
82d24f7aac
[Docs] Document Deepseek V3 support ( #11535 )
...
Signed-off-by: simon-mo <simon.mo@hey.com >
2024-12-26 16:21:56 -08:00
f49777ba62
Deepseek v3 ( #11502 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
Co-authored-by: mgoin <michael@neuralmagic.com >
Co-authored-by: robertgshaw2-neuralmagic <rshaw@neuralmagic.com >
2024-12-26 16:09:44 -08:00
55fb97f7bd
[2/N] API Server: Avoid ulimit footgun ( #11530 )
2024-12-26 23:43:05 +00:00
2072924d14
[Model] [Quantization] Support deepseek_v3 w8a8 fp8 block-wise quantization ( #11523 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
Signed-off-by: simon-mo <simon.mo@hey.com >
Signed-off-by: simon-mo <xmo@berkeley.edu >
Co-authored-by: simon-mo <simon.mo@hey.com >
Co-authored-by: simon-mo <xmo@berkeley.edu >
Co-authored-by: HandH1998 <1335248067@qq.com >
2024-12-26 15:33:30 -08:00
720b10fdc6
[1/N] API Server (Remove Proxy) ( #11529 )
2024-12-26 23:03:43 +00:00
b85a977822
[Doc] Add video example to openai client for multimodal ( #11521 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2024-12-26 17:31:29 +00:00
eec906d811
[Misc] Add placeholder module ( #11501 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-26 13:12:51 +00:00
f57ee5650d
[Model] Modify MolmoForCausalLM MLP ( #11510 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-26 13:12:05 +00:00
dcb1a944d4
[V1] Adding min tokens/repetition/presence/frequency penalties to V1 sampler ( #10681 )
...
Signed-off-by: Sourashis Roy <sroy@roblox.com >
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-12-26 19:02:58 +09:00
7492a36207
[Doc] Add QVQ and QwQ to the list of supported models ( #11509 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk >
2024-12-26 09:44:32 +00:00
aa25985bd1
[Misc][LoRA] Fix LoRA weight mapper ( #11495 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-26 15:52:48 +08:00
dbeac95dbb
Mypy checking for vllm/compilation ( #11496 )
...
Signed-off-by: lucast2021 <lucast2021@headroyce.org >
Co-authored-by: lucast2021 <lucast2021@headroyce.org >
2024-12-26 05:04:07 +00:00
51a624bf02
[Misc] Move some multimodal utils to modality-specific modules ( #11494 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-26 04:23:20 +00:00
6ad909fdda
[Doc] Improve GitHub links ( #11491 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-25 14:49:26 -08:00
b689ada91e
[Frontend] Enable decord to load video from base64 ( #11492 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-25 16:33:55 +00:00
fc601665eb
[Misc] Update disaggregation benchmark scripts and test logs ( #11456 )
...
Signed-off-by: Jiaxin Shan <seedjeffwan@gmail.com >
2024-12-25 06:58:48 +00:00
9832e5572a
[V1] Unify VLLM_ENABLE_V1_MULTIPROCESSING handling in RayExecutor ( #11472 )
2024-12-24 19:49:46 -08:00
3f3e92e1f2
[Model] Automatic conversion of classification and reward models ( #11469 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-24 18:22:22 +00:00
409475a827
[Bugfix] Fix issues in CPU build Dockerfile. Fixes #9182 ( #11435 )
...
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
2024-12-24 16:53:28 +00:00
196c34b0ac
[Misc] Move weights mapper ( #11443 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-24 13:05:25 +00:00
5c7963249d
[attn][tiny fix] fix attn backend in MultiHeadAttention ( #11463 )
...
Signed-off-by: Mengqing Cao <cmq0113@163.com >
2024-12-24 12:39:36 +00:00
461cde2080
[OpenVINO] Fixed installation conflicts ( #11458 )
...
Signed-off-by: Ilya Lavrenov <ilya.lavrenov@intel.com >
2024-12-24 11:38:21 +00:00
7a5286cc04
[Bugfix][Hardware][CPU] Fix CPU input_positions creation for text-only inputs with mrope ( #11434 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-12-24 17:59:51 +08:00
b1b1038fbd
[Bugfix] Fix Qwen2-VL LoRA weight loading ( #11430 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-24 09:56:10 +00:00
9edca6bf8f
[Frontend] Online Pooling API ( #11457 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-24 17:54:30 +08:00
4f074fbf53
[Misc] Suppress irrelevant exception stack trace information when CUDA… ( #11438 )
...
Co-authored-by: shiquan <shiquan>
2024-12-24 08:43:39 +00:00
a491d6f535
[V1] TP Ray executor ( #11107 )
...
Signed-off-by: Rui Qiao <ruisearch42@gmail.com >
2024-12-23 23:00:12 +00:00
32aa2059ad
[Docs] Convert rST to MyST (Markdown) ( #11145 )
...
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com >
2024-12-23 22:35:38 +00:00
94d545a1a1
[Doc] Fix typo in the help message of '--guided-decoding-backend' ( #11440 )
2024-12-23 20:20:44 +00:00
60fb4f3bcf
[Bugfix] Add kv cache scales to gemma2.py ( #11269 )
2024-12-23 19:30:45 +00:00
63afbe9215
[CI] Expand OpenAI test_chat.py guided decoding tests ( #11048 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-12-23 18:35:38 +00:00
8cef6e02dc
[Misc] add w8a8 asym models ( #11075 )
2024-12-23 13:33:20 -05:00
b866cdbd05
[Misc] Add assertion and helpful message for marlin24 compressed models ( #11388 )
2024-12-24 02:23:38 +08:00
2e726680b3
[Bugfix] torch nightly version in ROCm installation guide ( #11423 )
...
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
2024-12-23 17:20:22 +00:00
5bfb30a529
[Bugfix] Fix CFGGuide and use outlines for grammars that can't convert to GBNF ( #11389 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-12-23 23:06:20 +08:00
e51719ae72
mypy type checking for vllm/worker ( #11418 )
...
Signed-off-by: lucast2021 <lucast2021@headroyce.org >
Co-authored-by: lucast2021 <lucast2021@headroyce.org >
2024-12-23 13:55:49 +00:00
f30581c518
[misc][perf] remove old code ( #11425 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-23 08:01:08 +00:00
048fc57a0f
[CI] Unblock H100 Benchmark ( #11419 )
...
Signed-off-by: simon-mo <simon.mo@hey.com >
2024-12-22 14:17:43 -08:00
f1d1bf6288
[Bugfix] Fix fully sharded LoRAs with Mixtral ( #11390 )
...
Signed-off-by: Jason Greene <jason.greene@redhat.com >
2024-12-22 23:25:10 +08:00
72d9c316d3
[cd][release] fix race conditions ( #11407 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-22 00:39:11 -08:00
4a9139780a
[cd][release] add pypi index for every commit and nightly build ( #11404 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com >
2024-12-21 23:53:44 -08:00
29c748930e
[CI] Fix flaky entrypoint tests ( #11403 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-12-21 21:08:44 -08:00
c2d1b075ba
[Bugfix] Fix issues for Pixtral-Large-Instruct-2411 ( #11393 )
...
Signed-off-by: ywang96 <ywang@example.com >
Co-authored-by: ywang96 <ywang@example.com >
2024-12-21 10:15:03 +00:00
584f0ae40d
[V1] Make AsyncLLMEngine v1-v0 opaque ( #11383 )
...
Signed-off-by: Ricky Xu <xuchen727@hotmail.com >
2024-12-21 15:14:08 +08:00
51ff216d85
[Bugfix] update should_ignore_layer ( #11354 )
...
Signed-off-by: George Ohashi <george@neuralmagic.com >
2024-12-21 06:36:23 +00:00
dd2b5633dd
[V1][Bugfix] Skip hashing empty or None mm_data ( #11386 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-12-21 14:22:21 +09:00
47a0b615b4
Add ray[default] to wget to run distributed inference out of box ( #11265 )
...
Signed-off-by: Jiaxin Shan <seedjeffwan@gmail.com >
2024-12-20 13:54:55 -08:00
5d2248d81a
[doc] explain nccl requirements for rlhf ( #11381 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-20 13:00:56 -08:00
d573aeadcc
[Bugfix] Don't log OpenAI field aliases as ignored ( #11378 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-12-20 19:03:50 +00:00
995f56236b
[Core] Loading model from S3 using RunAI Model Streamer as optional loader ( #10192 )
...
Signed-off-by: OmerD <omer@run.ai >
2024-12-20 16:46:24 +00:00
7c7aa37c69
[CI/Build] fix pre-compiled wheel install for exact tag ( #11373 )
...
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com >
2024-12-21 00:14:40 +08:00
04139ade59
[V1] Fix profiling for models with merged input processor ( #11370 )
...
Signed-off-by: ywang96 <ywang@roblox.com >
2024-12-20 12:04:21 +00:00
1ecc645b8f
[doc] backward compatibility for 0.6.4 ( #11359 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-19 21:33:53 -08:00
c954f21ac0
[misc] add early error message for custom ops ( #11355 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-19 21:18:25 -08:00
86c2d8fd1c
[Bugfix] Fix spec decoding when seed is none in a batch ( #10863 )
...
Signed-off-by: Wallas Santos <wallashss@ibm.com >
2024-12-20 05:15:31 +00:00
b880ffb87e
[Misc] Add tqdm progress bar during graph capture ( #11349 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-12-20 04:35:18 +00:00
7801f56ed7
[ci][gh200] dockerfile clean up ( #11351 )
...
Signed-off-by: drikster80 <ed.sealing@gmail.com >
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: drikster80 <ed.sealing@gmail.com >
Co-authored-by: cenzhiyao <2523403608@qq.com >
2024-12-19 18:13:06 -08:00
48edab8041
[Bugfix][Hardware][POWERPC] Fix auto dtype failure in case of POWER10 ( #11331 )
...
Signed-off-by: Akash Kaothalkar <0052v2@linux.vnet.ibm.com >
2024-12-20 01:32:07 +00:00
a985f7af9f
[CI] Adding CPU docker pipeline ( #11261 )
...
Signed-off-by: Yuan Zhou <yuan.zhou@intel.com >
Co-authored-by: Kevin H. Luu <kevin@anyscale.com >
2024-12-19 11:46:55 -08:00
e461c262f0
[Misc] Remove unused vllm/block.py ( #11336 )
2024-12-19 17:54:24 +00:00
276738ce0f
[Bugfix] Fix broken CPU compressed-tensors test ( #11338 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-12-19 17:37:31 +00:00
cdf22afdda
[Misc] Clean up and consolidate LRUCache ( #11339 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-20 00:59:32 +08:00
e24113a8fe
[Model] Refactor Qwen2-VL to use merged multimodal processor ( #11258 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-19 16:28:00 +00:00
7379b3d4b2
[V1] Fix multimodal profiling for Molmo ( #11325 )
...
Signed-off-by: ywang96 <ywang@example.com >
Co-authored-by: ywang96 <ywang@example.com >
2024-12-19 16:27:22 +00:00
6c7f881541
[Model] Add JambaForSequenceClassification model ( #10860 )
...
Signed-off-by: Yehoshua Cohen <yehoshuaco@ai21.com >
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: Yehoshua Cohen <yehoshuaco@ai21.com >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-19 22:48:06 +08:00
a0f7d53beb
[Bugfix] Cleanup Pixtral HF code ( #11333 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-19 13:22:00 +00:00
5aef49806d
[Feature] Add load generation config from model ( #11164 )
...
Signed-off-by: liuyanyi <wolfsonliu@163.com >
Signed-off-by: Yanyi Liu <wolfsonliu@163.com >
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk >
2024-12-19 10:50:38 +00:00
98356735ac
[misc] benchmark_throughput: Add LoRA ( #11267 )
...
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
2024-12-19 15:43:16 +08:00
f26c4aeecb
[Misc] Optimize ray worker initialization time ( #11275 )
...
Signed-off-by: Rui Qiao <ruisearch42@gmail.com >
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com >
2024-12-18 23:38:02 -08:00
8936316d58
[Kernel] Refactor Cutlass c3x ( #10049 )
...
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
2024-12-19 07:00:18 +00:00
6142ef0ada
[VLM] Merged multimodal processor for Qwen2-Audio ( #11303 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-19 06:14:17 +00:00
c6b0a7d3ba
[V1] Simplify prefix caching logic by removing num_evictable_computed_blocks ( #11310 )
2024-12-19 04:17:12 +00:00
a30482f054
[CI] Expand test_guided_generate to test all backends ( #11313 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-12-19 04:00:38 +00:00
17ca964273
[Model] IBM Granite 3.1 ( #11307 )
...
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com >
2024-12-19 11:27:24 +08:00
5a9da2e6e9
[Bugfix][Build/CI] Fix sparse CUTLASS compilation on CUDA [12.0, 12.2) ( #11311 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-12-19 02:43:30 +00:00
fdea8ec167
[V1] VLM - enable processor cache by default ( #11305 )
...
Signed-off-by: Alexander Matveev <alexm@neuralmagic.com >
2024-12-18 18:54:46 -05:00
ca5f54a9b9
[Bugfix] fix minicpmv test ( #11304 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
2024-12-18 10:34:26 -08:00
f954fe0e65
[FIX] update openai version ( #11287 )
...
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com >
2024-12-18 10:17:05 -08:00
362cff1eb3
[CI][Misc] Remove Github Action Release Workflow ( #11274 )
2024-12-18 10:16:53 -08:00
996aa70f00
[Bugfix] Fix broken phi3-v mm_processor_kwargs tests ( #11263 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-12-18 10:16:40 -08:00
60508ffda9
[Kernel]: Cutlass 2:4 Sparsity + FP8/Int8 Quant Support ( #10995 )
...
Co-authored-by: Faraz Shahsavan <faraz.shahsavan@gmail.com >
Co-authored-by: ilmarkov <markovilya197@gmail.com >
Co-authored-by: Rahul Tuli <rahul@neuralmagic.com >
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com >
2024-12-18 09:57:16 -05:00
f04e407e6b
[MISC][XPU] update ipex link for CI fix ( #11278 )
2024-12-17 22:34:23 -08:00
8b79f9e107
[Bugfix] Fix guided decoding with tokenizer mode mistral ( #11046 )
2024-12-17 22:34:08 -08:00
866fa4550d
[Bugfix] Restore support for larger block sizes ( #11259 )
...
Signed-off-by: Konrad Zawora <kzawora@habana.ai >
2024-12-17 16:39:07 -08:00
bf8717ebae
[V1] Prefix caching for vision language models ( #11187 )
...
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com >
2024-12-17 16:37:59 -08:00
c77eb8a33c
[Bugfix] Set temperature=0.7 in test_guided_choice_chat ( #11264 )
2024-12-17 16:34:06 -08:00
2d1b9baa8f
[Bugfix] Fix request cancellation without polling ( #11190 )
2024-12-17 12:26:32 -08:00
f9ecbb18bf
[Misc] Allow passing logits_soft_cap for xformers backend ( #11252 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-12-17 00:37:04 -08:00
02222a0256
[Misc] Kernel Benchmark for RMSNorm ( #11241 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
Co-authored-by: Xiaoyu Zhang <BBuf@users.noreply.github.com >
2024-12-17 06:57:02 +00:00
2bfdbf2a36
[V1][Core] Use weakref.finalize instead of atexit ( #11242 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-12-16 22:11:33 -08:00
e88db68cf5
[Platform] platform agnostic for EngineArgs initialization ( #11225 )
...
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
2024-12-16 22:11:06 -08:00
59c9b6ebeb
[V1][VLM] Proper memory profiling for image language models ( #11210 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
Co-authored-by: ywang96 <ywang@example.com >
2024-12-16 22:10:57 -08:00
66d4b16724
[Frontend] Add OpenAI API support for input_audio ( #11027 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-16 22:09:58 -08:00
0064f697d3
[CI] Add test case with JSON schema using references + use xgrammar by default with OpenAI parse ( #10935 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-12-17 11:39:58 +08:00
35bae114a8
fix gh200 tests on main ( #11246 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-16 17:22:38 -08:00
88a412ed3d
[torch.compile] fast inductor ( #11108 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-12-16 16:15:22 -08:00
c301616ed2
[ci][tests] add gh200 tests ( #11244 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-16 15:53:18 -08:00
35ffa682b1
[Docs] hint to enable use of GPU performance counters in profiling tools for multi-node distributed serving ( #11235 )
...
Co-authored-by: Michael Goin <michael@neuralmagic.com >
2024-12-16 22:20:39 +00:00
551603feff
[core] overhaul memory profiling and fix backward compatibility ( #10511 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-16 13:32:25 -08:00
efbce85f4d
[misc] Layerwise profile updates ( #10242 )
...
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
2024-12-16 18:14:57 +00:00
2ca830dbaa
[Doc] Reorder vision language examples in alphabet order ( #11228 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-12-16 11:23:33 +00:00
d927dbcd88
[Model] Refactor Ultravox to use merged input processor ( #11198 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2024-12-16 10:09:53 +00:00
bddbbcb132
[Model] Support Cohere2ForCausalLM (Cohere R7B) ( #11203 )
2024-12-16 09:56:19 +00:00
b3b1526f03
WIP: [CI/Build] simplify Dockerfile build for ARM64 / GH200 ( #11212 )
...
Signed-off-by: drikster80 <ed.sealing@gmail.com >
Co-authored-by: drikster80 <ed.sealing@gmail.com >
2024-12-16 09:20:49 +00:00
17138af7c4
[Bugfix] Fix the default value for temperature in ChatCompletionRequest ( #11219 )
2024-12-16 00:15:40 -08:00
69ba344de8
[Bugfix] Fix block size validation ( #10938 )
2024-12-15 16:38:40 -08:00
da6f409246
Update deploying_with_k8s.rst ( #10922 )
2024-12-15 16:33:58 -08:00
25ebed2f8c
[V1][Minor] Cache np arange to reduce input preparation overhead ( #11214 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-12-15 13:33:00 -08:00
d263bd9df7
[Core] Support disaggregated prefill with Mooncake Transfer Engine ( #10884 )
...
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com >
2024-12-15 21:28:18 +00:00
38e599d6a8
[Doc] add documentation for disaggregated prefilling ( #11197 )
...
Signed-off-by: Kuntai Du <kuntai@uchicago.edu >
2024-12-15 13:31:16 -06:00
96d673e0f8
[Bugfix] Fix error handling of unsupported sliding window ( #11213 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-15 10:59:42 -07:00
b10609e6a1
[Misc] Clean up multi-modal processor ( #11207 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-15 06:30:28 +00:00
a1c02058ba
[torch.compile] allow tracking forward time ( #11081 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-14 19:45:00 -08:00
15859f2357
[Misc] Upgrade bitsandbytes to the latest version 0.45.0 ( #11201 )
2024-12-15 03:03:06 +00:00
886936837c
[Performance][Core] Optimize the performance of evictor v1 and v2 by applying a priority queue and lazy deletion ( #7209 )
2024-12-14 11:38:10 -08:00
6d917d0eeb
Enable mypy checking on V1 code ( #11105 )
...
Signed-off-by: Mark McLoughlin <markmc@redhat.com >
2024-12-14 09:54:04 -08:00
93abf23a64
[VLM] Fully dynamic prompt replacement in merged input processor ( #11199 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-14 17:52:18 +00:00
9c3dadd1c9
[Frontend] Add logits_processors as an extra completion argument ( #11150 )
...
Signed-off-by: Brad Hilton <brad.hilton.nw@gmail.com >
2024-12-14 16:46:42 +00:00
3cb5769883
[Misc] Minor improvements to the readability of PunicaWrapperBase ( #11200 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-14 16:38:27 +00:00
ea7bd68d10
[V1][Bugfix] Fix V1 TP trust-remote-code ( #11182 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-12-14 08:21:23 +00:00
48259264a4
[Core] Update outlines and increase its threadpool size ( #11140 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-12-14 07:46:18 +00:00
24a3d12b82
update compressed-tensors to latest version ( #11183 )
...
Co-authored-by: dhuangnm <dhuang@MacBook-Pro-2.local >
2024-12-14 03:22:44 +00:00
9855aea21b
[Bugfix][V1] Re-compute an entire block when fully cache hit ( #11186 )
...
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com >
2024-12-13 17:08:23 -08:00
4b5b8a6a3b
[V1][Bugfix] Fix EngineCoreProc profile ( #11185 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-12-13 17:02:35 -08:00
4863e5fba5
[Core] V1: Use multiprocessing by default ( #11074 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-12-13 16:27:32 -08:00
0d8451c3a4
[Distributed] Allow the placement group more time to wait for resources to be ready ( #11138 )
...
Signed-off-by: Jiaxin Shan <seedjeffwan@gmail.com >
2024-12-13 20:17:37 +00:00
0a56bcc03d
[Bugfix][Hardware][CPU] Enable Gemma2 with SDPA on CPU backend ( #11169 )
2024-12-13 18:00:40 +00:00
0920ab9131
[Doc] Reorganize online pooling APIs ( #11172 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-14 00:22:22 +08:00
238c0d93b4
[Misc] Add tokenizer_mode param to benchmark_serving.py ( #11174 )
...
Signed-off-by: Alexander Matveev <alexm@neuralmagic.com >
2024-12-13 16:19:10 +00:00
5b0ed8391d
[Bugfix] using len(tokenizer) instead of tokenizer.vocab_size in AllowedTokenIdsLogitsProcessor ( #11156 )
2024-12-13 15:56:19 +00:00
c31d4a57a6
[Core] support LoRA and prompt adapter in content-based hashing for Block Manager v2 prefix caching ( #8240 )
2024-12-13 07:51:25 -08:00
d1fa714cb1
[Refactor]A simple device-related refactor ( #11163 )
...
Signed-off-by: noemotiovon <noemotiovon@gmail.com >
Co-authored-by: noemotiovon <noemotiovon@gmail.com >
2024-12-13 13:39:00 +00:00
969da7d70b
[V1][VLM] Fix edge case bug for InternVL2 ( #11165 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-12-13 11:09:30 +00:00
eeec9e3390
[Frontend] Separate pooling APIs in offline inference ( #11129 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-13 10:40:07 +00:00
f93bf2b189
[Bugfix][CI][CPU] add missing datasets package to requirements-cpu.txt ( #11159 )
...
Signed-off-by: jiang1.li <jiang1.li@intel.com >
2024-12-13 08:50:35 +00:00
7cd7409142
PaliGemma 2 support ( #11142 )
2024-12-13 07:40:07 +00:00
be39e3cd18
[core] clean up cudagraph batchsize padding logic ( #10996 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-13 06:57:50 +00:00
34f1a806d5
[Bugfix][V1] Fix 'NoneType' object has no attribute 'hash_value' ( #11157 )
...
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com >
2024-12-13 06:30:06 +00:00
00c1bde5d8
[ROCm][AMD] Disable auto enabling chunked prefill on ROCm ( #11146 )
...
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com >
2024-12-13 05:31:26 +00:00
3989a79824
[Bugfix] Update starcoder2 to remap k/v scale names for kv_cache quantization ( #11148 )
2024-12-13 05:07:20 +00:00
1efce68605
[Bugfix] Use runner_type instead of task in GritLM ( #11144 )
...
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io >
2024-12-13 04:09:53 +00:00
30870b4f66
[torch.compile] Dynamic fp8 + rms_norm fusion ( #10906 )
...
Signed-off-by: luka <luka@neuralmagic.com >
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
2024-12-13 03:19:23 +00:00
78ed8f57d8
[Misc][V1] Fix type in v1 prefix caching ( #11151 )
2024-12-13 00:57:40 +00:00
db6c264a1e
[Bugfix] Fix value unpack error of simple connector for KVCache transfer. ( #11058 )
...
Signed-off-by: ShangmingCai <csmthu@gmail.com >
2024-12-12 21:19:17 +00:00
9f3974a319
Fix logging of the vLLM Config ( #11143 )
2024-12-12 12:05:57 -08:00
2c97eca1ff
[Misc] Validate grammar and fail early ( #11119 )
2024-12-12 18:34:26 +00:00
5d712571af
[Bugfix] Quick fix to make Pixtral-HF load correctly again after 39e227c7ae. ( #11024 )
2024-12-12 18:09:20 +00:00
d4d5291cc2
fix(docs): typo in helm install instructions ( #11141 )
...
Signed-off-by: Ramon Ziai <ramon.ziai@bettermarks.com >
2024-12-12 17:36:32 +00:00
4816d20aa4
[V1] Fix torch profiling for offline inference ( #11125 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-12-12 15:51:53 +00:00
85362f028c
[Misc][LoRA] Ensure Lora Adapter requests return adapter name ( #11094 )
...
Signed-off-by: Jiaxin Shan <seedjeffwan@gmail.com >
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-12 09:25:16 +00:00
62de37a38e
[core][distributed] initialization from StatelessProcessGroup ( #10986 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-12 09:04:19 +00:00
8195824206
[Hardware][Intel-Gaudi] Enable LoRA support for Intel Gaudi (HPU) ( #10565 )
...
Signed-off-by: Sanju C Sudhakaran <scsudhakaran@habana.ai >
2024-12-12 08:09:28 +00:00
f092153fbe
[V1] Use more persistent buffers to optimize input preparation overheads ( #11111 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-12-11 23:14:20 -08:00
1da8f0e1dd
[Model] Add support for embedding model GritLM ( #10816 )
...
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io >
2024-12-12 06:39:16 +00:00
ccede2b264
[Core] cleanup zmq ipc sockets on exit ( #11115 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-12-11 19:12:24 -08:00
24a36d6d5f
Update link to LlamaStack remote vLLM guide in serving_with_llamastack.rst ( #11112 )
...
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
2024-12-12 02:39:21 +00:00
8fb26dac61
[Docs] Add media kit ( #11121 )
2024-12-11 17:33:11 -08:00
7439a8b5fc
[Bugfix] Multiple fixes to tool streaming with hermes and mistral ( #10979 )
...
Signed-off-by: cedonley <clayton@donley.io >
2024-12-12 01:10:12 +00:00
4e11683368
[V1] VLM preprocessor hashing ( #11020 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
Signed-off-by: Alexander Matveev <alexm@neuralmagic.com >
Co-authored-by: Michael Goin <michael@neuralmagic.com >
Co-authored-by: Roger Wang <ywang@roblox.com >
2024-12-12 00:55:30 +00:00
452a723bf2
[V1][Core] Remove should_shutdown to simplify core process termination ( #11113 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-12-11 23:34:54 +00:00
d1e21a979b
[CI/Build] Split up VLM tests ( #11083 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-12 06:18:16 +08:00
72ff3a9686
[core] Bump ray to use _overlap_gpu_communication in compiled graph tests ( #10410 )
...
Signed-off-by: Rui Qiao <ubuntu@ip-172-31-15-128.us-west-2.compute.internal >
Signed-off-by: Rui Qiao <ruisearch42@gmail.com >
Co-authored-by: Rui Qiao <ubuntu@ip-172-31-15-128.us-west-2.compute.internal >
2024-12-11 11:36:35 -08:00
66aaa7722d
[torch.compile] remove graph logging in ci ( #11110 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-11 10:59:50 -08:00
d643c2aba1
[V1] Use input_ids as input for text-only models ( #11032 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-12-11 10:49:23 -08:00
91642db952
[torch.compile] use depyf to dump torch.compile internals ( #10972 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-11 10:43:05 -08:00
fd22220687
[Doc] Installed version of llmcompressor for int8/fp8 quantization ( #11103 )
...
Signed-off-by: Guangda Liu <bingps@users.noreply.github.com >
Co-authored-by: Guangda Liu <bingps@users.noreply.github.com >
2024-12-11 15:43:24 +00:00
b2f775456e
[CI/Build] Enable prefix caching test for AMD ( #11098 )
...
Signed-off-by: Hissu Hyvarinen <hissu.hyvarinen@amd.com >
2024-12-11 15:23:37 +00:00
cad5c0a6ed
[Doc] Update docs to refer to pooling models ( #11093 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-11 13:36:27 +00:00
8f10d5e393
[Misc] Split up pooling tasks ( #10820 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-11 01:28:00 -08:00
40766ca1b8
[Bugfix]: Clamp -inf logprob values in prompt_logprobs ( #11073 )
...
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com >
2024-12-11 01:27:39 -08:00
2e32f5d28d
[Bugfix] Fix Idefics3 fails during multi-image inference ( #11080 )
...
Signed-off-by: B-201 <Joy25810@foxmail.com >
2024-12-11 01:27:07 -08:00
61b1d2f6ae
[Core] v1: Use atexit to handle engine core client shutdown ( #11076 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-12-11 01:26:36 -08:00
9974fca047
[ci/build] Fix entrypoints test and pin outlines version ( #11088 )
2024-12-11 01:01:53 -08:00
3fb4b4f163
[ci/build] Fix AMD CI dependencies ( #11087 )
2024-12-11 00:39:53 -08:00
2e33fe4191
[CI/Build] Check transformers v4.47 ( #10991 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-11 05:02:02 +00:00
e39400a4b6
Fix streaming for granite tool call when <|tool_call|> is present ( #11069 )
...
Signed-off-by: Max de Bayser <mbayser@br.ibm.com >
2024-12-11 04:51:40 +00:00
ffa48c9146
[Model] PP support for Mamba-like models ( #10992 )
...
Signed-off-by: mzusman <mor.zusmann@gmail.com >
2024-12-10 21:53:37 -05:00
d5c5154fcf
[Misc] LoRA + Chunked Prefill ( #9057 )
2024-12-11 10:09:20 +08:00
9a93973708
[Bugfix] Fix Mamba multistep ( #11071 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-12-11 00:16:22 +00:00
134810b3d9
[V1][Bugfix] Always set enable_chunked_prefill = True for V1 ( #11061 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-12-10 14:41:23 -08:00
75f89dc44c
[torch.compile] add a flag to track batchsize statistics ( #11059 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-10 12:40:52 -08:00
e739194926
[Core] Update to outlines >= 0.1.8 ( #10576 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-12-10 12:08:16 -08:00
250ee65d72
[BUG] Remove token param #10921 ( #11022 )
...
Signed-off-by: Flavia Beo <flavia.beo@ibm.com >
2024-12-10 17:38:15 +00:00
9b9cef3145
[Bugfix] Backport request id validation to v0 ( #11036 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
2024-12-10 16:38:23 +00:00
d05f88679b
[Misc][LoRA] Add PEFTHelper for LoRA ( #11003 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-10 11:12:01 +00:00
beb16b2c81
[Bugfix] Handle <|tool_call|> token in granite tool parser ( #11039 )
...
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com >
2024-12-10 10:27:11 +00:00
fe2e10c71b
Add example of helm chart for vllm deployment on k8s ( #9199 )
...
Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com >
2024-12-10 09:19:27 +00:00
82c73fd510
[Bugfix] cuda error running llama 3.2 ( #11047 )
2024-12-10 07:41:11 +00:00
bfd610430c
Update README.md ( #11034 )
2024-12-09 23:08:10 -08:00
e35879c276
[Bugfix] Fix xgrammar failing to read a vocab_size from LlavaConfig on PixtralHF. ( #11043 )
2024-12-10 14:54:22 +08:00
ebf778061d
monitor metrics of tokens per step using cudagraph batchsizes ( #11031 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-09 22:35:36 -08:00
28b3a1c7e5
[V1] Multiprocessing Tensor Parallel Support for v1 ( #9856 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-12-10 06:28:14 +00:00
bc192a2b09
[Pixtral] Improve loading ( #11040 )
2024-12-10 06:09:32 +00:00
980ad394a8
[Frontend] Use request id from header ( #10968 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
2024-12-10 13:46:29 +08:00
391d7b2763
[Bugfix] Fix usage of deprecated decorator ( #11025 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-10 13:45:47 +08:00
d1f6d1c8af
[Model] Add has_weight to RMSNorm and re-enable weights loading tracker for Mamba ( #10739 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-12-10 10:23:07 +08:00
6d525288c1
[Docs] Add dedicated tool calling page to docs ( #10554 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-12-09 20:15:34 -05:00
6faec54505
[V1] Do not store None in self.generators ( #11038 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-12-09 15:08:19 -08:00
5ed5d5f128
Build tpu image in release pipeline ( #10936 )
...
Signed-off-by: Richard Liu <ricliu@google.com >
Co-authored-by: Kevin H. Luu <kevin@anyscale.com >
2024-12-09 23:07:48 +00:00
b63ba84832
[ROCm][bugfix] speculative decoding worker class ( #11035 )
...
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com >
2024-12-09 14:00:29 -08:00
9c6459e4cb
[Neuron] Upgrade neuron to 2.20.2 ( #11016 )
...
Signed-off-by: Jerzy Zagorski <jzagorsk@amazon.com >
Co-authored-by: Jerzy Zagorski <jzagorsk@amazon.com >
2024-12-09 13:53:24 -08:00
1a2f8fb828
[v1] fix use compile sizes ( #11000 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-09 13:47:24 -08:00
cbcbdb1ceb
[Bugfix][Hardware][Gaudi] Bump vllm_hpu_extension version ( #11028 )
...
Signed-off-by: Konrad Zawora <kzawora@habana.ai >
2024-12-09 13:21:06 -08:00
a811dd6608
[Model] merged input processor for Phi-3-Vision models ( #10977 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2024-12-09 12:55:10 -08:00
ca871491ed
[Misc][LoRA] Abstract PunicaWrapper ( #10955 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-09 12:54:44 -08:00
3b61cb450d
[V1] Further reduce CPU overheads in flash-attn ( #10989 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-12-09 12:38:46 -08:00
edc4fa3188
[ci/build] Recompile CI dependencies list with Python 3.12 ( #11013 )
...
Signed-off-by: kevin <kevin@anyscale.com >
2024-12-09 11:46:58 -08:00
25b79d9fd3
[V1] Input Batch Relocation ( #10962 )
...
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
2024-12-09 09:33:41 -08:00
aea2fc38c3
[Platform] Move async output check to platform ( #10768 )
...
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
2024-12-09 17:24:46 +00:00
e691b26f6f
[Core] Require xgrammar >= 0.1.6 ( #11021 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-12-09 16:44:27 +00:00
c690357928
[V1] Fix Detokenizer loading in AsyncLLM ( #10997 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-12-09 16:27:10 +00:00
d1c2e15eb3
[torch.compile] add dynamo time tracking ( #11005 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-08 23:09:04 -08:00
af7c4a92e6
[Doc][V1] Add V1 support column for multimodal models ( #10998 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-12-08 22:29:16 -08:00
46004e83a2
[misc] clean up and unify logging ( #10999 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-08 17:28:27 -08:00
43b05fa314
[torch.compile][misc] fix comments ( #10993 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-08 11:18:18 -08:00
a11f326528
[V1] Initial support of multimodal models for V1 re-arch ( #10699 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-12-08 12:50:51 +00:00
fd57d2b534
[torch.compile] allow candidate compile sizes ( #10984 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-08 11:05:21 +00:00
7be15d9356
[core][misc] remove use_dummy driver for _run_workers ( #10920 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-07 12:06:08 -08:00
1b62745b1d
[core][executor] simplify instance id ( #10976 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-07 09:33:45 -08:00
78029b34ed
[BugFix][Kernel]: fix illegal memory access in causal_conv1d when conv_states is None ( #10928 )
...
Signed-off-by: xffxff <1247714429@qq.com >
2024-12-08 01:21:18 +08:00
c889d5888b
[Doc] Explicitly state that PP isn't compatible with speculative decoding yet ( #10975 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-07 17:20:49 +00:00
39e227c7ae
[Model] Update multi-modal processor to support Mantis(LLaVA) model ( #10711 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-07 17:10:05 +00:00
1c768fe537
[Doc] Explicitly state that InternVL 2.5 is supported ( #10978 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-07 16:58:02 +00:00
bf0e382e16
[Model] Composite weight loading for multimodal Qwen2 ( #10944 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-07 07:22:52 -07:00
b26b4cd03c
[Misc][LoRA] Refactor and clean MergedQKVParallelLinearWithLora implementation ( #10958 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-12-07 18:33:49 +08:00
f13cf9ad50
[Build] Fix for the Wswitch-bool clang warning ( #10060 )
...
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com >
2024-12-07 09:03:44 +00:00
955fa9533a
[3/N] Support and implement merged input processor for LLaVA model ( #10676 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: Roger Wang <ywang@roblox.com >
2024-12-07 00:50:58 -08:00
acf092d348
[Bugfix] Fix test-pipeline.yaml ( #10973 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-07 12:08:54 +08:00
69d357ba12
[Core] Cleanup startup logging a bit ( #10961 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-12-07 02:30:23 +00:00
dcdc3fafe5
[ci] fix broken tests ( #10956 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-06 11:25:47 -08:00
c05cfb67da
[misc] fix typo ( #10960 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-06 11:25:20 -08:00
7406274041
[Doc] add KubeAI to serving integrations ( #10837 )
...
Signed-off-by: Sam Stoelinga <sammiestoel@gmail.com >
2024-12-06 17:03:56 +00:00
8b59631855
[Core] Support Lark grammars for XGrammar ( #10870 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-12-06 08:34:29 -07:00
a1887f2c96
[torch.compile] fix deprecated code ( #10948 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-06 11:01:23 +00:00
222f5b082a
[CI/Build] Fix broken multimodal test ( #10950 )
2024-12-06 10:41:23 +00:00
b031a455a9
[torch.compile] add logging for compilation time ( #10941 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-12-06 10:07:15 +00:00
db87eb6c67
[torch.compile] use size tuning for specific sizes ( #10933 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-05 20:30:41 -08:00
9743d64e4e
[ci][build] add tests for python only compilation ( #10915 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-05 08:54:47 -08:00
a43065272f
[Misc][Gaudi] Avoid torch.compile and enable lazy collectives ( #10897 )
...
Signed-off-by: Konrad Zawora <kzawora@habana.ai >
2024-12-05 08:47:46 -08:00
998eeafe58
[CI/Build] Bump test transformers version ( #10106 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-05 16:05:52 +00:00
571da8fc43
[Misc][LoRA] Clean up the function interface of Punica ( #10917 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-05 13:22:28 +00:00
39c89e71a8
[Misc] Update llama 3.2 template to support system prompt with images ( #10901 )
...
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com >
2024-12-05 05:54:06 +00:00
1f958a7d52
[Bugfix] Fix BNB loader target_modules ( #10720 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-05 13:20:26 +08:00
aa39a8e175
[Doc] Create a new "Usage" section ( #10827 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-05 11:19:35 +08:00
8d370e91cb
[Bugfix] Fallback to outlines for complex json schemas ( #10899 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-12-05 11:14:06 +08:00
7883c2bbe7
[benchmark] Make H100 benchmark optional ( #10908 )
2024-12-04 17:02:17 -08:00
2a56e1264f
[V1] Fix when max_model_len is not divisible by block_size ( #10903 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-12-04 16:54:05 -08:00
e4c34c23de
[CI/Build] improve python-only dev setup ( #9621 )
...
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com >
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2024-12-04 21:48:13 +00:00
82eb5ea8f3
Benchmark serving structured output ( #10880 )
...
Signed-off-by: Chendi Xue <chendi.xue@intel.com >
Co-authored-by: Michael Goin <michael@neuralmagic.com >
2024-12-04 16:28:21 -05:00
10398b4706
[Model] Consolidate ViTs attention implementation without mask ( #10893 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-12-04 18:11:08 +00:00
01d079fd8e
[LoRA] Change lora_tokenizers capacity ( #10796 )
...
Signed-off-by: Xin Yang <xyang19@gmail.com >
2024-12-04 17:40:16 +00:00
c92acb9693
[ci/build] Update vLLM postmerge ECR repo ( #10887 )
2024-12-04 09:01:20 +00:00
8db957ee3a
[bugfix] fix parameter “n” when parameter “best_of” is set > 1 ( #10854 )
...
Signed-off-by: jianzheng <57654625+o2363286@users.noreply.github.com >
2024-12-04 08:48:22 +00:00
c9ca4fce3f
[ci/build] Job to build and push release image ( #10877 )
2024-12-04 15:02:40 +08:00
fa2dea61df
[ci/build] Change queue name for Release jobs ( #10875 )
2024-12-04 15:02:16 +08:00
b5b647b084
Drop ROCm load format check ( #10767 )
...
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
2024-12-04 04:32:21 +00:00
d2bd88b122
[CI/Build] Replace mean with torch.all in test_pynccl.py ( #10876 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-12-04 03:23:21 +00:00
381ac93bb5
[Benchmark] Benchmark structured output with datasets ( #10557 )
...
Signed-off-by: Aaron Pham <contact@aarnphm.xyz >
Signed-off-by: Chendi Xue <chendi.xue@intel.com >
Co-authored-by: Aaron Pham <contact@aarnphm.xyz >
2024-12-03 17:21:06 -07:00
a061fe601e
[Build][Bugfix] Using the correct type hint ( #10866 )
...
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com >
2024-12-03 15:47:55 -05:00
7c32b6861e
[Frontend] correctly record prefill and decode time metrics ( #10853 )
...
Signed-off-by: Tomer Asida <tomera@ai21.com >
2024-12-03 19:13:31 +00:00
7090c27bb2
[Bugfix] Only require XGrammar on x86 ( #10865 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-12-03 10:32:21 -08:00
2f2cdc745a
[MISC][XPU] quick fix for XPU CI ( #10859 )
...
Signed-off-by: yan ma <yan.ma@intel.com >
2024-12-03 17:16:31 +00:00
3bc94cab69
[V1] VLM - Run the mm_mapper preprocessor in the frontend process ( #10640 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
Co-authored-by: Michael Goin <michael@neuralmagic.com >
Co-authored-by: Roger Wang <ywang@roblox.com >
2024-12-03 10:33:10 +00:00
f6084f6324
[Speculative Decoding] Move indices to device before filtering output ( #10850 )
...
Co-authored-by: Yang Zheng(SW)(Alex) <you@example.com >
2024-12-03 17:01:39 +08:00
9323a3153b
[Core][Performance] Add XGrammar support for guided decoding and set it as default ( #10785 )
...
Signed-off-by: Aaron Pham <contact@aarnphm.xyz >
Signed-off-by: mgoin <michael@neuralmagic.com >
Co-authored-by: mgoin <michael@neuralmagic.com >
2024-12-03 15:17:00 +08:00
3257d449fa
[Misc] Remove deprecated names ( #10817 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-03 06:52:57 +00:00
ef51831ee8
[Doc] Add github links for source code references ( #10672 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-03 06:46:07 +00:00
dc5ce861bf
[torch.compile] remove compilation_context and simplify code ( #10838 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-03 06:19:02 +00:00
21fe7b481a
[core][distributed] add pynccl broadcast ( #10843 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-03 04:53:23 +00:00
a4cf256159
[Bugfix] Fix QKVParallelLinearWithShardedLora bias bug ( #10844 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-03 12:10:29 +08:00
d746268e92
[Model] support bitsandbytes quantization with minicpm model ( #10842 )
...
Signed-off-by: Ubuntu <zixuanzhang@bytedance.com >
2024-12-03 03:06:41 +00:00
4433195ab7
[Bugfix] Prevent benchmark_throughput.py from using duplicated random prompts ( #10753 )
2024-12-03 02:26:15 +00:00
4c05edb33a
[Model] Add TP and BNB quantization support to LlavaMultiModalProjector ( #10834 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2024-12-02 23:06:09 +00:00
9b14d978aa
Fix openvino on GPU ( #10793 )
2024-12-02 18:52:19 +00:00
519cc6ca12
[Misc][XPU] Avoid torch compile for XPU platform ( #10747 )
...
Signed-off-by: yan ma <yan.ma@intel.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2024-12-02 17:53:55 +00:00
b45f0d7946
[Misc][LoRA] Move the implementation of lora bias to punica.py ( #10829 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-12-02 17:53:36 +00:00
a4c4daf364
[misc] use out argument for flash attention ( #10822 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-02 10:50:10 +00:00
e95f275f57
[CI/Build] Update mistral_common version for tests and docs ( #10825 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-02 10:26:10 +00:00
ef31eabc68
[Model] add some tests for aria model ( #10770 )
...
Signed-off-by: xffxff <1247714429@qq.com >
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Isotr0py <2037008807@qq.com >
2024-12-02 05:36:36 +00:00
995a148575
[doc] Update config docstring ( #10732 )
...
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
2024-12-02 04:14:45 +00:00
63a164172d
[misc] remove xverse modeling file ( #10814 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-02 03:27:13 +00:00
e25810ae29
Fill TorchSDPAAttentionMetadata seq_lens_field for prefill ( #10799 )
...
Signed-off-by: Max de Bayser <mbayser@br.ibm.com >
2024-12-02 10:05:32 +08:00
073a4bd1c0
[Kernel] Use out arg in flash_attn_varlen_func ( #10811 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-12-01 17:55:39 -08:00
b7954776fd
[core] Avoid metrics log noise when idle - include speculative decodi… ( #10809 )
2024-12-02 01:49:48 +00:00
b18c9bbaba
[Model] Add BNB support to Llava and Pixtral-HF ( #10795 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-12-02 01:31:09 +00:00
0590ec3fd9
[Core] Implement disagg prefill by StatelessProcessGroup ( #10502 )
...
This PR provides initial support for single-node disaggregated prefill in a 1P1D scenario.
Signed-off-by: KuntaiDu <kuntai@uchicago.edu >
Co-authored-by: ApostaC <yihua98@uchicago.edu >
Co-authored-by: YaoJiayi <120040070@link.cuhk.edu.cn >
2024-12-01 19:01:00 -06:00
c11f172187
[Misc] Adding MMMU-Pro vision dataset to serving benchmark ( #10804 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
Co-authored-by: Chen Zhang <zhangch99@outlook.com >
Co-authored-by: Isotr0py <2037008807@qq.com >
2024-12-01 08:47:05 +00:00
169a0ff911
[doc] add warning about comparing hf and vllm outputs ( #10805 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-12-01 00:41:38 -08:00
d2f058e76c
[Misc] Rename embedding classes to pooling ( #10801 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-01 14:36:51 +08:00
f877a7d12a
[Misc] Improve type annotations for support_torch_compile ( #10763 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-30 17:48:35 -08:00
133707123e
[Model] Replace embedding models with pooling adapter ( #10769 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-12-01 08:02:54 +08:00
7e4bbda573
[doc] format fix ( #10789 )
...
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
2024-11-30 11:38:40 +00:00
e7cfc4ef4c
[Interleaved ATTN] Support for Mistral-8B ( #10591 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2024-11-30 07:45:50 +00:00
16ee07f22a
[Model] Refactor Molmo weights loading to use AutoWeightsLoader ( #10771 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-30 04:19:14 +00:00
40bc242579
[Bugfix] Fix OpenVino/Neuron driver_worker init ( #10779 )
...
Signed-off-by: NickLucche <nlucches@redhat.com >
Signed-off-by: Cyrus Leung <tlleungac@connect.ust.hk >
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk >
2024-11-30 12:07:13 +08:00
661175bc82
[platform] Add verify_quantization in platform. ( #10757 )
...
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com >
2024-11-29 15:22:21 +00:00
3132aac043
[Bugfix] Fix Idefics3 bug ( #10778 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-29 13:56:46 +00:00
c82b432d4a
[Misc] Fix typo in sampling_metadata.py ( #10740 )
2024-11-29 05:17:57 +00:00
fa6ecb9aa7
[Model] Clean up MiniCPMV ( #10751 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-29 04:47:06 +00:00
c83919c7a6
[Model] Add Internlm2 LoRA support ( #5064 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-28 17:29:04 +00:00
98f47f2a40
[V1] Optimize the CPU overheads in FlashAttention custom op ( #10733 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-28 09:01:02 -08:00
8c1e77fb58
[Kernel] Update vllm-flash-attn version to reduce CPU overheads ( #10742 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-28 08:31:28 -08:00
5fc5ce0fe4
[Model] Add GLM-4 series HF-format model support (vllm==0.6.4) ( #10561 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Isotr0py <2037008807@qq.com >
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk >
2024-11-28 14:53:31 +00:00
3ed5e73146
[TPU] Update requirements-tpu ( #10726 )
...
Signed-off-by: Richard Liu <ricliu@google.com >
2024-11-28 02:30:48 -08:00
9a8bff0285
[Kernel] Update vllm-flash-attn version ( #10736 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-28 02:25:59 -08:00
a79b122400
[V1] Do not allocate beyond the max_model_len ( #10730 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-28 00:13:15 -08:00
d9b4b3f069
[Bug][CLI] Allow users to disable prefix caching explicitly ( #10724 )
...
Signed-off-by: rickyx <rickyx@anyscale.com >
2024-11-27 23:59:28 -08:00
278be671a3
[Doc] Update model in arch_overview.rst to match comment ( #10701 )
...
Signed-off-by: spacewander <spacewanderlzx@gmail.com >
2024-11-27 23:58:39 -08:00
70dc14fbd0
[Model] support bitsandbytes quantization with minicpm3 model ( #10682 )
...
Signed-off-by: Ubuntu <zixuanzhang@bytedance.com >
2024-11-27 23:58:02 -08:00
cb4e1c3f3a
[misc] upgrade filelock version ( #10731 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-27 19:54:58 -08:00
395b1c7454
[Frontend] don't block event loop in tokenization (preprocess) in OpenAI compatible server ( #10635 )
...
Signed-off-by: Tomer Asida <tomera@ai21.com >
2024-11-27 13:21:10 -08:00
9b4b150395
[Bugfix] Ignore lm_head when loading embedding models ( #10719 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-27 19:05:29 +00:00
197b4484a3
[Bugfix][Mamba] Fix Multistep on Mamba-like models ( #10705 )
...
Signed-off-by: mzusman <mor.zusmann@gmail.com >
2024-11-27 19:02:27 +00:00
b98c62ba49
[Bugfix] Fix GGUF inference with FP16 unquantized checkpoint ( #10675 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-27 10:43:17 -08:00
c411def234
[torch.compile] fix shape specialization ( #10722 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-27 10:16:10 -08:00
308cc5e21e
[ci] fix slow tests ( #10698 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-27 09:26:14 -08:00
9e0a147d50
[V1] Update interface for mistral-format Pixtral ( #10703 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-11-27 12:26:27 +00:00
418cb3b93f
[Bugfix][Hardware][CPU] Fix intel-omp version to avoid segfault ( #10700 )
...
Signed-off-by: jiang1.li <jiang1.li@intel.com >
2024-11-27 11:55:38 +00:00
1209261e93
[Model] Support telechat2 ( #10311 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
Co-authored-by: xiangw2 <xiangw2@chinatelecom.cn >
Co-authored-by: Isotr0py <2037008807@qq.com >
2024-11-27 11:32:35 +00:00
e2251109c7
[Kernel] Remove if-else with identical branches in marlin 2:4 ( #10687 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-11-26 22:55:32 -08:00
15cc2a9f1a
[Misc]Further reduce BNB static variable ( #10597 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-26 22:54:12 -08:00
e85250b1d1
[Hardware][Gaudi]add get_name method for HPUAttentionBackend ( #10667 )
...
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com >
2024-11-26 22:49:40 -08:00
cfb3bf25fb
[bugfix] fix the default value of llm_int8_threshold in BitsAndBytesConfig ( #10657 )
2024-11-27 13:55:23 +08:00
1bf905ddaa
[Bugfix][SpecDecode] apply sampling parameters to target probabilities for consistency in rejection sampling. ( #10198 )
...
Signed-off-by: jeongin601 <0200angela@gmail.com >
Signed-off-by: jeong_in.bae <jeong_in.bae@navercorp.com >
2024-11-27 05:07:30 +00:00
0a4d968500
[V1] Update interface for idefics3 ( #10680 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-11-27 10:04:01 +08:00
0a71900bc9
Remove hard-dependencies of Speculative decode to CUDA workers ( #10587 )
...
Signed-off-by: Chendi Xue <chendi.xue@intel.com >
2024-11-26 17:57:11 -08:00
2f0a0a17a4
[V1] Refactor model executable interface for multimodal models ( #10570 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-11-26 20:46:11 +00:00
7576cd38df
[Bugfix] Check bnb_4bit_quant_storage for bitsandbytes ( #10642 )
2024-11-26 12:29:00 -08:00
9a99273b48
[Bugfix] Fix using -O[0,3] with LLM entrypoint ( #10677 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-11-26 10:44:01 -08:00
f5792c7c4a
[Hardware][NVIDIA] Add non-NVML CUDA mode for Jetson ( #9735 )
...
Signed-off-by: Conroy Cheers <conroy@corncheese.org >
2024-11-26 10:26:28 -08:00
db66e018ea
[Bugfix] Fix for Spec model TP + Chunked Prefill ( #10232 )
...
Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com >
Signed-off-by: Sourashis Roy <sroy@roblox.com >
Co-authored-by: Sourashis Roy <sroy@roblox.com >
2024-11-26 09:11:16 -08:00
1f6584ee85
[V1] Enable profile for LLMEngine ( #10665 )
2024-11-26 10:36:45 +00:00
334d64d1e8
[ci] add vllm_test_utils ( #10659 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-26 00:20:04 -08:00
940635343a
[Misc] Remove outdated init protocols ( #10655 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-26 14:55:00 +08:00
9a88f89799
custom allreduce + torch.compile ( #10121 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2024-11-25 22:00:16 -08:00
519e8e4182
[v1] EngineArgs for better config handling for v1 ( #10382 )
...
Signed-off-by: rickyx <rickyx@anyscale.com >
2024-11-25 21:09:43 -08:00
a6760f6456
[Feature] vLLM ARM Enablement for AARCH64 CPUs ( #9228 )
...
Signed-off-by: Sanket Kale <sanketk.kale@fujitsu.com >
Co-authored-by: Sanket Kale <sanketk.kale@fujitsu.com >
Co-authored-by: mgoin <michael@neuralmagic.com >
2024-11-25 18:32:39 -08:00
45ac4ff270
[bugfix] fix aria model and add torch.compile ( #10645 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-25 18:32:09 -08:00
6e9ff050c8
[misc] do not read HOST_IP ( #10644 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-25 17:04:50 -08:00
9db713a1dc
[Model] Add OLMo November 2024 model ( #10503 )
2024-11-25 17:26:40 -05:00
1b583cfefa
[Doc] Fix typos in docs ( #10636 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-25 10:15:45 -08:00
cf73f0c95e
[Model] Enable optional prefix when loading embedding models ( #10639 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-25 18:14:33 +00:00
b1d920531f
[Model]: Add support for Aria model ( #10514 )
...
Signed-off-by: xffxff <1247714429@qq.com >
Co-authored-by: Isotr0py <2037008807@qq.com >
2024-11-25 18:10:55 +00:00
452a4e80c3
[Docs] Add Snowflake Slides ( #10641 )
...
Signed-off-by: simon-mo <simon.mo@hey.com >
2024-11-25 09:34:46 -08:00
c27df94e1f
[Bugfix] Fix chunked prefill with model dtype float32 on Turing Devices ( #9850 )
...
Signed-off-by: Wallas Santos <wallashss@ibm.com >
Co-authored-by: Michael Goin <michael@neuralmagic.com >
2024-11-25 12:23:32 -05:00
d04b13a380
[Bug]: Authorization ignored when root_path is set ( #10606 )
...
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com >
2024-11-25 16:21:41 +00:00
2b0879bfc2
Super tiny little typo fix ( #10633 )
2024-11-25 13:08:30 +00:00
ed46f14321
[Model] Support is_causal HF config field for Qwen2 model ( #10621 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-25 09:51:20 +00:00
05d1f8c9c6
[misc] move functions to config.py ( #10624 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-25 09:27:30 +00:00
25d806e953
[misc] add torch.compile compatibility check ( #10618 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-24 23:40:08 -08:00
65813781a2
[torch.compile] add warning for unsupported models ( #10622 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-24 23:27:51 -08:00
7c2134beda
[torch.compile] force inductor threads ( #10620 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-24 23:04:21 -08:00
a30a605d21
[Doc] Add encoder-based models to Supported Models page ( #10616 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-25 06:34:07 +00:00
571841b7fc
[torch.compile] support encoder based models ( #10613 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-25 05:24:33 +00:00
7ea3cd7c3e
[Refactor][MISC] del redundant code in ParallelConfig.postinit ( #10614 )
...
Signed-off-by: MengqingCao <cmq0113@163.com >
2024-11-25 05:14:56 +00:00
214efc2c3c
Support Cross encoder models ( #10400 )
...
Signed-off-by: Max de Bayser <maxdebayser@gmail.com >
Signed-off-by: Max de Bayser <mbayser@br.ibm.com >
Signed-off-by: Flavia Beo <flavia.beo@ibm.com >
Co-authored-by: Flavia Beo <flavia.beo@ibm.com >
2024-11-24 18:56:20 -08:00
49628fe13e
[Doc] Update README.md with Ray Summit talk links ( #10610 )
2024-11-24 16:45:09 -08:00
e4fbb14414
[doc] update the code to add models ( #10603 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2024-11-24 11:21:40 -08:00
c055747867
[model][utils] add extract_layer_index utility function ( #10599 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-23 22:22:54 -08:00
eda2b3589c
Revert "Print running script to enhance CI log readability" ( #10601 )
2024-11-23 21:31:47 -08:00
1c445dca51
[CI/Build] Print running script to enhance CI log readability ( #10594 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-24 03:57:13 +00:00
1700c543a5
[Bugfix] Fix LoRA weight sharding ( #10450 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2024-11-23 17:23:17 -08:00
17d8fc1806
[bugfix] Fix example/tensorize_vllm_model tests ( #10595 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-23 17:22:33 -08:00
04668ebe7a
[Bugfix] Avoid import AttentionMetadata explicitly in Mllama ( #10593 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-23 18:12:20 +00:00
651f6c31ac
For ppc64le, disabled tests for now and addressed space issues ( #10538 )
2024-11-23 09:33:53 +00:00
86a44fb896
[Platforms] Refactor openvino code ( #10573 )
...
Signed-off-by: statelesshz <hzji210@gmail.com >
2024-11-22 22:23:12 -08:00
4cfe5d2bca
[Bugfix] multi_modal_kwargs broadcast for CPU tensor parallel ( #10541 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-22 21:25:46 -08:00
c8acd80548
[2/N] handling placeholders in merged multi-modal processor ( #10485 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-22 21:25:09 -08:00
4634a89d18
Prefix Cache Aware Scheduling [1/n] ( #10128 )
...
Signed-off-by: rickyx <rickyx@anyscale.com >
2024-11-22 21:15:55 -08:00
7c25fe45a6
[AMD] Add support for GGUF quantization on ROCm ( #10254 )
2024-11-22 21:14:49 -08:00
02a43f82a9
Update default max_num_batch_tokens for chunked prefill to 2048 ( #10544 )
2024-11-22 21:14:19 -08:00
cfea9c04ef
[Model] Fix Baichuan BNB online quantization ( #10572 )
...
Signed-off-by: Chen Wu <cntryroa@gmail.com >
2024-11-22 21:13:59 -08:00
7d8ffb344f
[Bugfix] Internal Server Error when tool_choice is incorrect. ( #10567 )
...
Signed-off-by: Varun Shenoy <varun.vinayak.shenoy@oracle.com >
2024-11-22 21:13:29 -08:00
4aba6e3d1a
[core] gemma2 full context length support ( #10584 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-22 20:13:54 -08:00
978b39744b
[Misc] Add pynccl wrappers for all_gather and reduce_scatter ( #9432 )
2024-11-22 22:14:03 -05:00
ebda51968b
[Core] Fix broken log configuration ( #10458 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-23 10:23:51 +08:00
9195dbdbca
[Bugfix][Frontend] Update Llama Chat Templates to also support Non-Tool use ( #10164 )
...
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com >
2024-11-23 10:17:38 +08:00
d559979c54
[bugfix] fix cpu tests ( #10585 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-22 17:34:03 -08:00
d345f409b7
[V1] EngineCore supports profiling ( #10564 )
...
Signed-off-by: Abatom <abzhonghua@gmail.com >
2024-11-22 17:16:15 -08:00
28598f3939
[Core] remove temporary local variables in LLMEngine.__init__ ( #10577 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-22 16:22:53 -08:00
948c859571
support bitsandbytes quantization with qwen model ( #10549 )
...
Signed-off-by: Ubuntu <zixuanzhang@bytedance.com >
2024-11-22 16:16:14 -08:00
97814fbf0f
[v1] Refactor KVCacheManager for more hash input than token ids ( #10507 )
...
Signed-off-by: rickyx <rickyx@anyscale.com >
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com >
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com >
2024-11-22 23:27:25 +00:00
eebad39f26
[torch.compile] support all attention backends ( #10558 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-22 14:04:42 -08:00
db100c5cde
[bugfix] fix full graph tests ( #10581 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-22 10:02:14 -08:00
11fcf0e066
Remove token-adding chat embedding params ( #10551 )
...
Signed-off-by: Noam Gat <noamgat@gmail.com >
2024-11-21 23:59:47 -08:00
b6374e09b0
[Bugfix] Fix Phi-3 BNB quantization with tensor parallel ( #9948 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-22 15:01:56 +08:00
a111d0151f
[platforms] absorb worker cls difference into platforms folder ( #10555 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: Nick Hill <nhill@redhat.com >
2024-11-21 21:00:32 -08:00
446c7806b2
[Minor] Fix line-too-long ( #10563 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-21 19:40:40 -08:00
33e0a2540a
[9/N] torch.compile LLM usage ( #10552 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-21 19:13:31 -08:00
aed074860a
[Benchmark] Add new H100 machine ( #10547 )
2024-11-21 18:27:20 -08:00
9afa014552
Add small example to metrics.rst ( #10550 )
2024-11-21 23:43:43 +00:00
46fe9b46d8
[Minor] Revert change in offline inference example ( #10545 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-21 21:28:16 +00:00
cf656f5a02
[misc] improve error message ( #10553 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-21 13:13:17 -08:00
edec3385b6
[CI][Installation] Avoid uploading CUDA 11.8 wheel ( #10535 )
...
Signed-off-by: simon-mo <simon.mo@hey.com >
Co-authored-by: simon-mo <simon.mo@hey.com >
2024-11-21 13:03:58 -08:00
f9310cbd0c
[V1] Fix Compilation config & Enable CUDA graph by default ( #10528 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-21 12:53:39 -08:00
7560ae5caf
[8/N] enable cli flag without a space ( #10529 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-21 12:30:42 -08:00
e7a8341c7c
[Bugfix] Allow token ID-only inputs in Qwen2-Audio ( #10536 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-21 18:09:43 +00:00
c51e397fe8
[Misc] Suppress duplicated logging regarding multimodal input pipeline ( #10530 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-11-21 09:21:31 -08:00
2385b60d83
[Kernel] Register punica ops directly ( #10522 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-21 09:18:11 -08:00
da7e702c6f
[Bug]: When apply continue_final_message for OpenAI server, the "echo":false is ignored ( #10180 )
...
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com >
2024-11-21 16:24:32 +00:00
4d676f0852
[Bugfix] Embedding model pooling_type equals ALL and multi input's bug ( #10494 )
2024-11-21 14:40:02 +00:00
d5ec121f95
[Model] Expose dynamic_image_size as mm_processor_kwargs for InternVL2 models ( #10518 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-21 14:20:08 +00:00
8a93a598d9
fix the issue that len(tokenizer(prompt)["input_ids"]) > prompt_len ( #10524 )
...
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com >
2024-11-21 11:15:36 +00:00
1cfde82ffd
[Model] Add Support for Multimodal Granite Models ( #10291 )
...
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2024-11-21 10:46:20 +00:00
f0e0238016
[Doc] fix a small typo in docstring of llama_tool_parser ( #10513 )
2024-11-21 09:05:23 +00:00
aaddce5d26
[platforms] improve error message for unspecified platforms ( #10520 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-20 23:07:56 -08:00
3430857b64
[Misc] Increase default video fetch timeout ( #10495 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-20 23:06:42 -08:00
8b0fe06c89
[torch.compile] Inductor code caching fix ( #10273 )
...
Signed-off-by: luka <luka@neuralmagic.com >
Signed-off-by: Luka Govedic <luka.govedic@gmail.com >
2024-11-20 21:44:57 -08:00
9d827170a3
[Platforms] Add device_type in Platform ( #10508 )
...
Signed-off-by: MengqingCao <cmq0113@163.com >
2024-11-21 04:44:20 +00:00
6c1208d083
[Core] Add Sliding Window Support with Flashinfer ( #10462 )
...
Signed-off-by: Pavani Majety <pmajety@nvidia.com >
2024-11-20 19:56:47 -08:00
388ee3de66
[torch.compile] limit inductor threads and lazy import quant ( #10482 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-20 18:36:33 -08:00
2f77b6cfec
[TPU] Implement prefix caching for TPUs ( #10307 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-20 13:54:15 -08:00
c68f7ede6a
[Bugfix]: allow extra fields in requests to openai compatible server ( #10463 )
...
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com >
2024-11-20 16:42:21 -05:00
0cd3d9717e
[7/N] torch.compile, reduce compilation time ( #10460 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-20 11:20:38 -08:00
5f1d6af2b6
[perf bench] H200 development ( #9768 )
...
Signed-off-by: simon-mo <simon.mo@hey.com >
2024-11-20 11:06:56 -08:00
772a66732d
[platforms] restore xpu check for parallel config ( #10479 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-20 17:13:28 +00:00
63f1fde277
[Hardware][CPU] Support chunked-prefill and prefix-caching on CPU ( #10355 )
...
Signed-off-by: jiang1.li <jiang1.li@intel.com >
2024-11-20 10:57:39 +00:00
d5b28447e0
[Platforms] Refactor xpu code ( #10468 )
...
Signed-off-by: MengqingCao <cmq0113@163.com >
2024-11-19 22:52:13 -08:00
09dbf9ff16
[Bugfix] Handle conflicts between modern and legacy fields ( #10471 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-20 14:45:08 +08:00
343041c4c4
[model] Reduce medusa weight ( #10454 )
...
Signed-off-by: skylee-01 <497627264@qq.com >
2024-11-20 06:05:55 +00:00
ed701ca963
[ci/build] Combine nightly and optional ( #10465 )
2024-11-19 21:36:03 -08:00
7629a9c6e5
[CI/Build] Support compilation with local cutlass path ( #10423 ) ( #10424 )
2024-11-19 21:35:50 -08:00
709c9f1f25
[CI/Build] Add sphinx/rst linter for docs ( #10366 )
2024-11-19 21:35:31 -08:00
b4be5a8adb
[Bugfix] Enforce no chunked prefill for embedding models ( #10470 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-20 05:12:51 +00:00
ad44437ba3
[Bugfix] Fix Mamba model initialization and MLP Speculator weights loading ( #10456 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-20 05:04:05 +00:00
9e05252b46
[Misc] Add __setitem__ for LazyDict ( #10469 )
...
Signed-off-by: Yanyi Liu <wolfsonliu@163.com >
2024-11-20 04:44:57 +00:00
d200972e7f
[Bugfix] Marlin 2:4 temp fix for large M dim (>256) ( #10464 )
...
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com >
2024-11-19 19:40:33 -08:00
d5b68aba2f
[CI/Build] Update Dockerfile.rocm ( #10434 )
...
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com >
2024-11-19 17:19:59 -08:00
a324d3a1a7
Change granite chat template to keep json list formatting for tool calls ( #10452 )
...
Signed-off-by: Max de Bayser <maxdebayser@gmail.com >
2024-11-19 18:16:54 -07:00
b00b33d77e
[Model][Quantization] HQQ support through Marlin kernel expansion ( #9766 )
...
Signed-off-by: ElizaWszola <eliza@neuralmagic.com >
2024-11-19 13:31:12 -08:00
efa9084628
[Core] Avoid metrics log noise when idle ( #8868 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-19 21:05:25 +00:00
803f37eaaa
[6/N] torch.compile rollout to users ( #10437 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-19 10:09:03 -08:00
fd9f124971
[Doc] fix link for page that was renamed ( #10455 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-19 09:48:30 -08:00
1ea291a417
Fix: Build error seen on Power Architecture ( #10421 )
...
Signed-off-by: Manjul Mohan <manjul.mohan@ibm.com >
Signed-off-by: B-201 <Joy25810@foxmail.com >
Signed-off-by: Isotr0py <2037008807@qq.com >
Signed-off-by: youkaichao <youkaichao@gmail.com >
Signed-off-by: ismael-dm <ismaeldm99@gmail.com >
Signed-off-by: Andrew Nesbitt <andrewnez@gmail.com >
Signed-off-by: mgoin <michael@neuralmagic.com >
Signed-off-by: yan ma <yan.ma@intel.com >
Signed-off-by: Angus Wang <wangjadehao@gmail.com >
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com >
Signed-off-by: rickyx <rickyx@anyscale.com >
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
Signed-off-by: Mengqing Cao <cmq0113@163.com >
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com >
Co-authored-by: Manjul Mohan manjul.mohan@ibm.com <manjulmohan@ltcd97-lp2.aus.stglabs.ibm.com >
Co-authored-by: B-201 <Joy25810@foxmail.com >
Co-authored-by: Isotr0py <2037008807@qq.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: ismael-dm <ismaeldm99@gmail.com >
Co-authored-by: Andrew Nesbitt <andrewnez@gmail.com >
Co-authored-by: Michael Goin <michael@neuralmagic.com >
Co-authored-by: Yan Ma <yan.ma@intel.com >
Co-authored-by: Angus Wang <wangjadehao@gmail.com >
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com >
Co-authored-by: Ricky Xu <rickyx@anyscale.com >
Co-authored-by: Kevin H. Luu <kevin@anyscale.com >
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com >
Co-authored-by: Mengqing Cao <cmq0113@163.com >
Co-authored-by: Travis Johnson <tsjohnso@us.ibm.com >
Co-authored-by: Russell Bryant <rbryant@redhat.com >
2024-11-19 09:34:57 -08:00
11fd7ea639
[Pixtral-Large] Pixtral actually has no bias in vision-lang adapter ( #10449 )
2024-11-19 17:33:06 +00:00
f028dff33d
[BugFix] Fix hermes tool parser output error stream arguments in some cases ( #10395 ) ( #10398 )
...
Signed-off-by: xiyuan lee <lixiyuan@haier.com >
2024-11-19 13:42:50 +00:00
b4614656b8
[CI][CPU] adding numa node number as container name suffix ( #10441 )
...
Signed-off-by: Yuan Zhou <yuan.zhou@intel.com >
2024-11-19 13:16:43 +00:00
25f9c78961
[misc][plugin] improve plugin loading ( #10443 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-19 10:43:21 +00:00
5390d6664f
[Doc] Add the start of an arch overview page ( #10368 )
2024-11-19 09:52:11 +00:00
382b6a4852
[Misc] Avoid misleading warning messages ( #10438 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-19 08:54:58 +00:00
272e31c0bd
[Bugfix] Guard for negative counter metrics to prevent crash ( #10430 )
...
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com >
2024-11-19 04:57:10 +00:00
74f8c2cf5f
Add openai.beta.chat.completions.parse example to structured_outputs.rst ( #10433 )
2024-11-19 04:37:46 +00:00
8c1fb50705
[Platform][Refactor] Extract func get_default_attn_backend to Platform ( #10358 )
...
Signed-off-by: Mengqing Cao <cmq0113@163.com >
2024-11-19 11:22:26 +08:00
7eb719df13
[Bugfix]Fix Phi-3 BNB online quantization ( #10417 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-19 03:21:42 +00:00
284203f171
[ci/build] Have dependabot ignore all patch update ( #10436 )
...
We have too many dependencies and all patch updates can be a little noisy. This is to have dependabot ignore all patch version updates.
2024-11-19 01:04:25 +00:00
90a6c759ca
[misc] partial prefix & random input generation benchmark ( #9929 )
...
Signed-off-by: rickyx <rickyx@anyscale.com >
2024-11-18 15:39:14 -08:00
2298e69b5f
[ci][bugfix] fix kernel tests ( #10431 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-18 15:29:37 -08:00
a03ea40792
[3/N][torch.compile] consolidate custom op logging ( #10399 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-18 15:14:59 -08:00
96d999fbe8
[Kernel] Initial Machete W4A8 support + Refactors ( #9855 )
...
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com >
2024-11-18 12:59:29 -07:00
c2170a5b39
[Kernel] Explicitly specify other value in tl.load calls ( #9014 )
...
Signed-off-by: Angus Wang <wangjadehao@gmail.com >
2024-11-18 11:39:40 -08:00
6b2d25efc7
[Hardware][XPU] AWQ/GPTQ support for xpu backend ( #10107 )
...
Signed-off-by: yan ma <yan.ma@intel.com >
2024-11-18 11:18:05 -07:00
281cc4b3cd
[Model][Bugfix] Support TP for PixtralHF ViT ( #10405 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-11-18 10:04:14 -08:00
4f686d139f
Fix open_collective value in FUNDING.yml ( #10426 )
...
Signed-off-by: Andrew Nesbitt <andrewnez@gmail.com >
2024-11-18 09:52:42 -08:00
31894a2155
[Doc] Add documentation for Structured Outputs ( #9943 )
...
Signed-off-by: ismael-dm <ismaeldm99@gmail.com >
2024-11-18 09:52:12 -08:00
7851b45196
[5/N][torch.compile] torch.jit.script --> torch.compile ( #10406 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-18 23:20:06 +08:00
4186be8111
[Doc] Update doc for LoRA support in GLM-4V ( #10425 )
...
Signed-off-by: B-201 <Joy25810@foxmail.com >
2024-11-18 15:08:30 +00:00
e7ebb662d7
[Model] Remove transformers attention porting in VITs ( #10414 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-18 21:45:21 +08:00
5be4e52b65
[Model][LoRA]LoRA support added for glm-4v ( #10418 )
...
Signed-off-by: B-201 <Joy25810@foxmail.com >
2024-11-18 12:57:10 +00:00
01aae1cc68
[Model] Remove redundant softmax when using PoolingType.STEP ( #10415 )
2024-11-18 10:05:36 +00:00
c7dec926f6
[VLM] Report multi_modal_placeholders in output ( #10407 )
...
Signed-off-by: Linkun Chen <lkchen+anyscale@github.com >
2024-11-18 16:06:16 +08:00
51bb12d17b
[4/N][torch.compile] clean up set_torch_compile_backend ( #10401 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-17 23:57:20 -08:00
47826cacf0
[Bugfix] Ignore ray reinit error when current platform is ROCm or XPU ( #10375 )
...
Signed-off-by: Hollow Man <hollowman@opensuse.org >
2024-11-18 11:29:26 +08:00
c4e464333e
[Misc] Add uninitialized params tracking for AutoWeightsLoader ( #10327 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-18 09:07:46 +08:00
d1557e66d3
[Misc] Enhance offline_inference to support user-configurable paramet… ( #10392 )
...
Signed-off-by: wchen61 <wchen61@foxmail.com >
2024-11-17 11:32:40 +00:00
80d85c5d7b
[Bugfix] Fix mrope_position_delta in non-last prefill chunk ( #10403 )
...
Signed-off-by: imkero <kerorek@outlook.com >
2024-11-17 08:50:24 +00:00
76aab90ab6
[Hardware] [HPU]add mark_step for hpu ( #10239 )
...
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com >
2024-11-17 00:44:44 -08:00
8d74b5aee9
[platforms] refactor cpu code ( #10402 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-16 23:14:23 -08:00
cf349c4a97
[Bugfix][CPU] Fix CPU embedding runner with tensor parallel ( #10394 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-16 23:12:04 -08:00
905d0f0af4
[CI/Build] Fix IDC hpu [Device not found] issue ( #10384 )
...
Signed-off-by: Chendi Xue <chendi.xue@intel.com >
2024-11-17 14:58:22 +08:00
643ecf7b11
[V1] Refactor model executable interface for all text-only language models ( #10374 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-11-17 05:18:46 +00:00
4fd9375028
[2/N][torch.compile] make compilation cfg part of vllm cfg ( #10383 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-16 18:02:14 -08:00
661a34fd4f
[V1] Add code owners for V1 ( #10397 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-16 10:45:26 -08:00
361c29e174
[Bugfix] Fix M-RoPE position calculation when chunked prefill is enabled ( #10388 )
...
Signed-off-by: imkero <kerorek@outlook.com >
2024-11-17 02:10:00 +08:00
b98d89efd4
[Misc] Medusa supports custom bias ( #10361 )
2024-11-16 16:33:01 +00:00
8b6725b0cf
[Misc] Update benchmark to support image_url file or http ( #10287 )
...
Signed-off-by: rbbang <anjaehyun87@gmail.com >
2024-11-16 18:15:40 +08:00
1d75472626
[BugFix] [Kernel] Fix GPU SEGV occuring in fused_moe kernel ( #10385 )
...
Signed-off-by: Randall Smith <Randall.Smith@amd.com >
2024-11-16 09:55:05 +00:00
2f427c2d16
[misc][plugin] improve log messages ( #10386 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-16 01:23:20 -08:00
755b85359b
[doc] add doc for the plugin system ( #10372 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-15 21:46:27 -08:00
32e46e000f
[Frontend] Automatic detection of chat content format from AST ( #9919 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-16 13:35:40 +08:00
4f168f69a3
[Docs] Misc updates to TPU installation instructions ( #10165 )
2024-11-15 13:26:17 -08:00
3e8d14d8a1
[Doc] Move PR template content to docs ( #10159 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-15 13:20:20 -08:00
a067f85e08
[Frontend] Add --version flag to CLI ( #10369 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-15 13:13:53 -08:00
c76ac49d26
[Docs] Add Nebius as sponsors ( #10371 )
...
Signed-off-by: simon-mo <simon.mo@hey.com >
2024-11-15 12:47:40 -08:00
a6221a144a
[Misc] bump mistral common version ( #10367 )
...
Signed-off-by: simon-mo <simon.mo@hey.com >
2024-11-15 09:48:07 -08:00
79ee45b428
[Misc] Bump up test_fused_moe tolerance ( #10364 )
...
Signed-off-by: ElizaWszola <eliza@neuralmagic.com >
2024-11-15 16:31:18 +00:00
691a3ec047
[Bugfix] Ensure special tokens are properly filtered out for guided structured output with MistralTokenizer ( #10363 )
...
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com >
2024-11-15 14:50:40 +00:00
3a763ba0c3
[core][misc] keep compatibility for old-style classes ( #10356 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-15 13:55:51 +00:00
f2056f726d
[Misc] Fix some help info of arg_utils to improve readability ( #10362 )
2024-11-15 12:40:30 +00:00
1d65ec7eeb
[Bugfix] Fix fully sharded LoRA bug ( #10352 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-15 10:34:58 +00:00
26908554b2
[Doc] Remove float32 choice from --lora-dtype ( #10348 )
...
Signed-off-by: Xin Yang <xyang19@gmail.com >
2024-11-15 10:22:57 +00:00
b311efd0bd
[Misc] Fix import error in tensorizer tests and cleanup some code ( #10349 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-15 09:34:17 +00:00
3d158cdc8d
Add default value to avoid Falcon crash ( #5363 ) ( #10347 )
...
Signed-off-by: wchen61 <wchen61@foxmail.com >
2024-11-15 08:52:20 +00:00
02dbf30e9a
[Build] skip renaming files for release wheels pipeline ( #9671 )
...
Signed-off-by: simon-mo <simon.mo@hey.com >
2024-11-14 23:31:52 -08:00
2ac6d0e75b
[Misc] Consolidate pooler config overrides ( #10351 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-15 06:59:00 +00:00
2ec8827288
[Bugfix] Qwen-vl output is inconsistent in speculative decoding ( #10350 )
2024-11-15 05:40:10 +00:00
b40cf6402e
[Model] Support Qwen2 embeddings and use tags to select model tests ( #10184 )
2024-11-14 20:23:09 -08:00
2885ba0e24
[Misc] Change RedundantReshapesPass and FusionPass logging from info to debug ( #10308 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-11-15 02:44:26 +00:00
bf2ddc6610
[bugfix] Fix static asymmetric quantization case ( #10334 )
...
Signed-off-by: Daniël de Kok <me@danieldk.eu >
Signed-off-by: luka <luka@neuralmagic.com >
Co-authored-by: Daniël de Kok <me@danieldk.eu >
2024-11-15 09:35:11 +08:00
972112d82f
[Bugfix] Fix unable to load some models ( #10312 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-14 16:55:54 -08:00
11cd1ae6ad
[Tool parsing] Improve / correct mistral tool parsing ( #10333 )
2024-11-15 00:42:49 +00:00
554af9228d
[Bugfix] use AF_INET6 for OpenAI Compatible Server with ipv6 ( #9583 )
...
Signed-off-by: xiaozijin <xiaozijin@bytedance.com >
2024-11-14 16:38:53 -08:00
b2e0ad3b59
[Perf] Reduce peak memory usage of llama ( #10339 )
...
Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com >
2024-11-15 00:38:20 +00:00
4a18fd14ba
Support Roberta embedding models ( #9387 )
...
Signed-off-by: Max de Bayser <mbayser@br.ibm.com >
Signed-off-by: Flavia Beo <flavia.beo@ibm.com >
Co-authored-by: Flavia Beo <flavia.beo@ibm.com >
2024-11-14 21:23:29 +00:00
1dbae0329c
[Docs] Publish meetup slides ( #10331 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-14 16:19:38 +00:00
675d603400
[CI/Build] Make shellcheck happy ( #10285 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-14 09:47:53 +00:00
03025c023f
[CI/Build] Fix CPU CI online inference timeout ( #10314 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-14 16:45:32 +08:00
29f3ef26a3
[ci][distributed] disable hanging tests ( #10317 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-14 00:23:39 -08:00
294bf467ba
[Model] Add BNB quantization support for Idefics3 ( #10310 )
...
Signed-off-by: B-201 <Joy25810@foxmail.com >
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-14 06:31:44 +00:00
52b48c1ead
[BugFix]: properly deserialize tool_calls iterator before processing by mistral-common when MistralTokenizer is used ( #9951 )
...
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com >
2024-11-14 04:48:16 +00:00
f67ce05d0b
[Frontend] Pythonic tool parser ( #9859 )
...
Signed-off-by: Mike Depinet <mike@fixie.ai >
2024-11-14 04:14:34 +00:00
e0853b6508
[Misc] format.sh: Simplify tool_version_check ( #10305 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-14 11:12:35 +08:00
504ac53d18
[misc] error early for old-style class ( #10304 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-13 18:55:39 -08:00
15bb8330aa
[Bugfix] Fix tensor parallel for qwen2 classification model ( #10297 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-14 10:54:59 +08:00
ac49b59d8b
[Bugfix] bitsandbytes models fail to run pipeline parallel ( #10200 )
...
Signed-off-by: Hoang Cong Duc <hoangcongducltt@gmail.com >
2024-11-13 09:56:39 -07:00
0b8bb86bf1
[1/N] Initial prototype for multi-modal processor ( #10044 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-13 12:39:03 +00:00
bb7991aa29
[V1] Add missing tokenizer options for Detokenizer ( #10288 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-11-13 11:02:56 +00:00
d909acf9fe
[Model][LoRA]LoRA support added for idefics3 ( #10281 )
...
Signed-off-by: B-201 <Joy25810@foxmail.com >
2024-11-13 17:25:59 +08:00
b6dde33019
[Core] Flashinfer - Remove advance step size restriction ( #10282 )
2024-11-13 16:29:32 +08:00
1b886aa104
[Model] Adding Support for Qwen2VL as an Embedding Model. Using MrLight/dse-qwen2-2b-mrl-v1 ( #9944 )
...
Signed-off-by: FurtherAI <austin.veselka@lighton.ai >
Co-authored-by: FurtherAI <austin.veselka@lighton.ai >
2024-11-13 08:28:13 +00:00
3945c82346
[Model] Add support for Qwen2-VL video embeddings input & multiple image embeddings input with varied resolutions ( #10221 )
...
Signed-off-by: imkero <kerorek@outlook.com >
2024-11-13 07:07:22 +00:00
032fcf16ae
[Doc] Fix typo in arg_utils.py ( #10264 )
...
Signed-off-by: Xin Yang <xyang19@gmail.com >
2024-11-12 21:54:52 -08:00
56a955e774
Bump to compressed-tensors v0.8.0 ( #10279 )
...
Signed-off-by: Dipika <dipikasikka1@gmail.com >
2024-11-12 21:54:10 -08:00
bbd3e86926
[V1] Support VLMs with fine-grained scheduling ( #9871 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
Co-authored-by: Roger Wang <ywang@roblox.com >
2024-11-13 04:53:13 +00:00
0d4ea3fb5c
[core][distributed] use tcp store directly ( #10275 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-12 17:36:08 -08:00
112fa0bbe5
[V1] Fix CI tests on V1 engine ( #10272 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-12 16:17:20 -08:00
377b74fe87
Revert "[ci][build] limit cmake version" ( #10271 )
2024-11-12 15:06:48 -08:00
18081451f9
[doc] improve debugging doc ( #10270 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-12 14:43:52 -08:00
96ae0eaeb2
[doc] fix location of runllm widget ( #10266 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-12 14:34:39 -08:00
1f55e05713
[V1] Enable Inductor when using piecewise CUDA graphs ( #10268 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-12 13:39:56 -08:00
8a06428c70
[LoRA] Adds support for bias in LoRA ( #5733 )
...
Signed-off-by: Umesh Deshpande <udeshpa@us.ibm.com >
Co-authored-by: Umesh Deshpande <udeshpa@us.ibm.com >
2024-11-12 11:08:40 -08:00
b41fb9d3b1
[Encoder Decoder] Update Mllama to run with both FlashAttention and XFormers ( #9982 )
...
Signed-off-by: Sourashis Roy <sroy@roblox.com >
2024-11-12 10:53:57 -08:00
7c65527918
[V1] Use pickle for serializing EngineCoreRequest & Add multimodal inputs to EngineCoreRequest ( #10245 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-12 08:57:14 -08:00
47db6ec831
[Frontend] Add per-request number of cached token stats ( #10174 )
2024-11-12 16:42:28 +00:00
176fcb1c71
[Bugfix] Fix QwenModel argument ( #10262 )
...
Signed-off-by: Jie Fu <jiefu@tencent.com >
2024-11-12 16:36:51 +00:00
a838ba7254
[Misc]Fix Idefics3Model argument ( #10255 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-12 13:07:11 +00:00
36c513a076
[BugFix] Do not raise a ValueError when tool_choice is set to the supported none option and tools are not defined. ( #10000 )
...
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com >
2024-11-12 11:13:46 +00:00
d201d41973
[CI][CPU]refactor CPU tests to allow to bind with different cores ( #10222 )
...
Signed-off-by: Yuan Zhou <yuan.zhou@intel.com >
2024-11-12 10:07:32 +00:00
3a28f18b0b
[doc] explain the class hierarchy in vLLM ( #10240 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-11 22:56:44 -08:00
812c981fa0
Splitting attention kernel file ( #10091 )
...
Signed-off-by: maleksan85 <maleksan@amd.com >
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com >
2024-11-11 22:55:07 -08:00
7f5edb5900
[Misc][LoRA] Replace hardcoded cuda device with configurable argument ( #10223 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-12 11:10:15 +08:00
eea55cca5b
[1/N] torch.compile user interface design ( #10237 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-11 18:01:06 -08:00
9cdba9669c
[Doc] Update help text for --distributed-executor-backend ( #10231 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-12 09:55:09 +08:00
d1c6799b88
[doc] update debugging guide ( #10236 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-11 15:21:12 -08:00
6ace6fba2c
[V1] AsyncLLM Implementation ( #9826 )
...
Signed-off-by: Nick Hill <nickhill@us.ibm.com >
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com >
Signed-off-by: Nick Hill <nhill@redhat.com >
Co-authored-by: Nick Hill <nickhill@us.ibm.com >
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
Co-authored-by: Nick Hill <nhill@redhat.com >
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-11-11 23:05:38 +00:00
08f93e7439
Make shutil rename in python_only_dev ( #10233 )
...
Signed-off-by: shcheglovnd <shcheglovnd@avride.ai >
2024-11-11 14:29:19 -08:00
9d5b4e4dea
[V1] Enable custom ops with piecewise CUDA graphs ( #10228 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-11 11:58:07 -08:00
8a7fe47d32
[misc][distributed] auto port selection and disable tests ( #10226 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-11 11:54:59 -08:00
4800339c62
Add docs on serving with Llama Stack ( #10183 )
...
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com >
Co-authored-by: Russell Bryant <rbryant@redhat.com >
2024-11-11 11:28:55 -08:00
fe15729a2b
[V1] Use custom ops for piecewise CUDA graphs ( #10227 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-11 11:26:48 -08:00
330e82d34a
[v1][torch.compile] support managing cudagraph buffer ( #10203 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-11 11:10:27 -08:00
d7a4f2207b
[V1] Do not use inductor for piecewise CUDA graphs ( #10225 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-11 11:05:57 -08:00
f9dadfbee3
[V1] Fix detokenizer ports ( #10224 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-11 10:42:07 -08:00
25144ceed0
Bump actions/setup-python from 5.2.0 to 5.3.0 ( #10209 )
...
Signed-off-by: dependabot[bot] <support@github.com >
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-11-11 17:24:10 +00:00
e6de9784d2
[core][distributed] add stateless process group ( #10216 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-11 09:02:14 -08:00
36fc439de0
[Doc] fix doc string typo in block_manager swap_out function ( #10212 )
2024-11-11 08:53:07 -08:00
874f551b36
[Metrics] add more metrics ( #4464 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-12 00:17:38 +08:00
2cebda42bb
[Bugfix][Hardware][CPU] Fix broken encoder-decoder CPU runner ( #10218 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-11 12:37:58 +00:00
5fb1f935b0
[V1] Allow tokenizer_mode and trust_remote_code for Detokenizer ( #10211 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-11-11 18:01:18 +08:00
36e4acd02a
[LoRA][Kernel] Remove the unused libentry module ( #10214 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-11 09:43:23 +00:00
58170d6503
[Hardware][CPU] Add embedding models support for CPU backend ( #10193 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-11 08:54:28 +00:00
9804ac7c7c
Bump the patch-update group with 5 updates ( #10210 )
...
Signed-off-by: dependabot[bot] <support@github.com >
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-11-11 07:22:40 +00:00
f89d18ff74
[6/N] pass whole config to inner model ( #10205 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-11 06:41:46 +00:00
f0f2e5638e
[doc] improve debugging code ( #10206 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-10 17:49:40 -08:00
ad9a78bf64
[Doc] Fix typo error in vllm/entrypoints/openai/cli_args.py ( #10196 )
2024-11-11 00:14:22 +00:00
73b9083e99
[misc] improve cloudpickle registration and tests ( #10202 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-11 00:10:53 +00:00
20cf2f553c
[Misc] small fixes to function tracing file path ( #9543 )
...
Signed-off-by: Shawn Du <shawnd200@outlook.com >
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2024-11-10 15:21:06 -08:00
bfb7d61a7c
[doc] Polish the integration with huggingface doc ( #10195 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2024-11-10 10:22:04 -08:00
19682023b6
[Doc] Fix typo error in CONTRIBUTING.md ( #10190 )
...
Signed-off-by: FuryMartin <furymartin9910@outlook.com >
2024-11-10 07:47:24 +00:00
9fa4bdde9d
[ci][build] limit cmake version ( #10188 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-09 16:27:26 -08:00
51c2e1fcef
[CI/Build] Split up models tests ( #10069 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-09 11:39:14 -08:00
b09895a618
[Frontend][Core] Override HF config.json via CLI ( #5836 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-09 16:19:27 +00:00
d88bff1b96
[Frontend] add add_request_id middleware ( #9594 )
...
Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com >
2024-11-09 10:18:29 +00:00
9e37266420
bugfix: fix the bug that stream generate not work ( #2756 )
2024-11-09 10:09:48 +00:00
8a4358ecb5
[doc] explaining the integration with huggingface ( #10173 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-09 01:02:54 -08:00
bd46357ad9
[bugfix] fix broken tests of mlp speculator ( #10177 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-09 00:04:50 -08:00
f192aeba74
[Bugfix] Enable some fp8 and quantized fullgraph tests ( #10171 )
...
Signed-off-by: Bill Nell <bill@neuralmagic.com >
2024-11-09 08:01:27 +00:00
8e1529dc57
[CI/Build] Add run-hpu-test.sh script ( #10167 )
...
Signed-off-by: Chendi.Xue <chendi.xue@intel.com >
2024-11-09 06:26:52 +00:00
1a95f10ee7
[5/N] pass the whole config to model ( #9983 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-09 14:17:28 +08:00
49d2a41a86
[Doc] Adjust RunLLM location ( #10176 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-08 20:07:10 -08:00
47672f38b5
[CI/Build] Fix VLM broadcast tests tensor_parallel_size passing ( #10161 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-09 04:02:59 +00:00
f83feccd7f
[Bugfix] Ignore GPTQ quantization of Qwen2-VL visual module ( #10169 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-11-09 03:36:46 +00:00
e0191a95d8
[0/N] Rename MultiModalInputs to MultiModalKwargs ( #10040 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-09 11:31:02 +08:00
d7edca1dee
[CI/Build] Adding timeout in CPU CI to avoid CPU test queue blocking ( #6892 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-09 03:27:11 +00:00
127c07480e
[Kernel][Triton] Add Triton implementation for scaled_mm_triton to support fp8 and int8 SmoothQuant, symmetric case ( #9857 )
...
Signed-off-by: Randall Smith <Randall.Smith@amd.com >
2024-11-08 19:59:22 -05:00
10b67d865d
[Bugfix] SymIntArrayRef expected to contain concrete integers ( #10170 )
...
Signed-off-by: Bill Nell <bill@neuralmagic.com >
2024-11-08 14:44:18 -08:00
4f93dfe952
[torch.compile] Fuse RMSNorm with quant ( #9138 )
...
Signed-off-by: luka <luka@neuralmagic.com >
Co-authored-by: youkaichao <youkaichao@126.com >
2024-11-08 21:20:08 +00:00
e1b5a82179
Rename vllm.logging to vllm.logging_utils ( #10134 )
2024-11-08 20:53:24 +00:00
87713c6053
[CI/Build] Ignore .gitignored files for shellcheck ( #10162 )
...
Signed-off-by: luka <luka@neuralmagic.com >
2024-11-08 19:53:36 +00:00
b5815c8413
[V1] Fix non-cudagraph op name ( #10166 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-08 10:23:04 -08:00
6b30471586
[Misc] Improve Web UI ( #10090 )
...
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com >
2024-11-08 09:51:04 -08:00
f6778620a9
Disable spec-decode + chunked-prefill for draft models with tensor parallelism > 1 ( #10136 )
...
Signed-off-by: Sourashis Roy <sroy@roblox.com >
2024-11-08 15:56:18 +00:00
0535e5fe6c
Fix edge case Mistral tokenizer ( #10152 )
2024-11-08 15:42:27 +00:00
b489fc3c91
[CI/Build] Update CPU tests to include all "standard" tests ( #5481 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-08 23:30:04 +08:00
208ce622c7
[V1]Enable APC by default only for text models ( #10148 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-11-08 14:39:41 +00:00
1ff4aed5bd
[Model] Expose size to Idefics3 as mm_processor_kwargs ( #10146 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-08 09:56:58 +00:00
f10797c0ce
[Bugfix][XPU] Fix xpu tp by introducing XpuCommunicator ( #10144 )
...
Signed-off-by: yan ma <yan.ma@intel.com >
2024-11-08 09:41:03 +00:00
f4c2187e29
[Misc] Fix typo in #5895 ( #10145 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-08 09:07:01 +00:00
aea6ad629f
Add hf_transfer to testing image ( #10096 )
2024-11-08 08:35:25 +00:00
da07a9ead7
Fixes a typo about 'max_decode_seq_len' which causes crashes with cuda graph. ( #9285 )
...
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com >
2024-11-08 05:31:28 +00:00
3a7f15a398
[Doc] Move CONTRIBUTING to docs site ( #9924 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-08 05:15:12 +00:00
7371749d54
[Misc] Fix ImportError causing by triton ( #9493 )
2024-11-08 05:08:51 +00:00
ad39bd640c
[Bugfix] Add error handling when server cannot respond any valid tokens ( #5895 )
2024-11-08 04:58:37 +00:00
40d0e7411d
[Doc] Update FAQ links in spec_decode.rst ( #9662 )
...
Signed-off-by: whyiug <whyiug@hotmail.com >
2024-11-08 04:44:58 +00:00
6bb52b0f97
[CI/Build] Give PR cleanup job PR write access ( #10139 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-08 12:10:20 +08:00
201fc07730
[V1] Prefix caching (take 2) ( #9972 )
...
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com >
2024-11-07 17:34:44 -08:00
42b4f46b71
[V1] Add all_token_ids attribute to Request ( #10135 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-07 17:08:24 -08:00
073a472728
[Misc] report relevant env vars in collect_env.py tool ( #9293 )
2024-11-07 16:14:01 -08:00
93bff421bc
Bump actions/checkout from 4.2.1 to 4.2.2 ( #9746 )
...
Signed-off-by: dependabot[bot] <support@github.com >
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-11-07 21:44:58 +00:00
28b2877d30
Online video support for VLMs ( #10020 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: litianjian <litianjian@bytedance.com >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-07 20:25:59 +00:00
97b8475beb
Bump actions/setup-python from 5.2.0 to 5.3.0 ( #9745 )
...
Signed-off-by: dependabot[bot] <support@github.com >
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-11-07 18:55:35 +00:00
a2f1f3b089
[CI/Build] Automate PR body text cleanup ( #10082 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-07 18:26:28 +00:00
3be5b26a76
[CI/Build] Add shell script linting using shellcheck ( #7925 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-07 18:17:29 +00:00
de0e61a323
[CI/Build] Always run mypy ( #10122 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-07 16:43:16 +00:00
9d43afcc53
[Feature] [Spec decode]: Combine chunked prefill with speculative decoding ( #9291 )
...
Signed-off-by: NickLucche <nlucches@redhat.com >
2024-11-07 08:15:14 -08:00
ae62fd17c0
[Frontend] Tool calling parser for Granite 3.0 models ( #9027 )
...
Signed-off-by: Max de Bayser <mbayser@br.ibm.com >
2024-11-07 07:09:02 -08:00
a62bc0109c
[Misc] Add Gamma-Distribution Request Generation Support for Serving Benchmark. ( #10105 )
...
Signed-off-by: Mozhou <spli161006@gmail.com >
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com >
2024-11-07 11:20:30 +00:00
999df95b4e
[Bugfix] Make image processor respect mm_processor_kwargs for Qwen2-VL ( #10112 )
...
Signed-off-by: Jiahao Li <liplus17@163.com >
2024-11-07 10:50:44 +00:00
a6f332d0d9
[Hardware][CPU][bugfix] Fix half dtype support on AVX2-only target ( #10108 )
...
Signed-off-by: jiang1.li <jiang1.li@intel.com >
2024-11-07 18:42:50 +08:00
0dfba97b42
[Frontend] Fix multiple values for keyword argument error ( #10075 ) ( #10076 )
...
Signed-off-by: Lei <ylxx@live.com >
2024-11-07 09:07:19 +00:00
aa9078fa03
Adds method to read the pooling types from model's files ( #9506 )
...
Signed-off-by: Flavia Beo <flavia.beo@ibm.com >
Signed-off-by: Max de Bayser <mbayser@br.ibm.com >
Co-authored-by: Max de Bayser <mbayser@br.ibm.com >
2024-11-07 08:42:40 +00:00
e036e527a0
[CI/Build] Improve mypy + python version matrix ( #10041 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-07 07:54:16 +00:00
6192e9b8fe
[Core][Distributed] Refactor ipc buffer init in CustomAllreduce ( #10030 )
...
Signed-off-by: Hanzhi Zhou <hanzhi713@gmail.com >
2024-11-06 23:50:47 -08:00
d7263a1bb8
Doc: Improve benchmark documentation ( #9927 )
...
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com >
2024-11-06 23:50:35 -08:00
104d729656
[CI/Build] re-add codespell to CI ( #10083 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-06 22:54:46 -08:00
db7db4aab9
[Misc] Consolidate ModelConfig code related to HF config ( #10104 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-07 06:00:21 +00:00
1fa020c539
[V1][BugFix] Fix Generator construction in greedy + seed case ( #10097 )
...
Signed-off-by: Nick Hill <nhill@redhat.com >
2024-11-07 05:06:57 +00:00
e7b84c394d
[doc] add back Python 3.8 ABI ( #10100 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-06 21:06:41 -08:00
a4b3e0c1e9
[Hardware][CPU] Update torch 2.5 ( #9911 )
...
Signed-off-by: jiang1.li <jiang1.li@intel.com >
2024-11-07 04:43:08 +00:00
29862b884b
[Frontend] Adjust try/except blocks in API impl ( #10056 )
...
Signed-off-by: Nick Hill <nhill@redhat.com >
2024-11-06 20:07:51 -08:00
d3859f1891
[Misc][XPU] Upgrade to Pytorch 2.5 for xpu backend ( #9823 )
...
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com >
Signed-off-by: yan ma <yan.ma@intel.com >
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com >
2024-11-06 17:29:03 -08:00
4ab3256644
[Bugfix] Fix FP8 torch._scaled_mm fallback for torch>2.5 with CUDA<12.4 ( #10095 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-11-07 00:54:13 +00:00
719c1ca468
[core][distributed] add stateless_init_process_group ( #10072 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-06 16:42:09 -08:00
74f2f8a0f1
[CI/Build] Always run the ruff workflow ( #10092 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-06 22:25:23 +00:00
d58268c56a
[V1] Make v1 more testable ( #9888 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
2024-11-06 11:57:35 -08:00
87bd7e0515
[CI/Build] change conflict PR comment from mergify ( #10080 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-06 10:15:42 -08:00
098f94de42
[CI/Build] Drop Python 3.8 support ( #10038 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-06 14:31:01 +00:00
399c798608
Remove ScaledActivation for AWQ ( #10057 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-11-06 14:27:06 +00:00
406d4cc480
[Model][LoRA]LoRA support added for Qwen2VLForConditionalGeneration ( #10022 )
...
Signed-off-by: ericperfect <ericperfectttt@gmail.com >
2024-11-06 14:13:15 +00:00
a5bba7d234
[Model] Add Idefics3 support ( #9767 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
Signed-off-by: B-201 <Joy25810@foxmail.com >
Co-authored-by: B-201 <Joy25810@foxmail.com >
2024-11-06 11:41:17 +00:00
2003cc3513
[Model][LoRA]LoRA support added for LlamaEmbeddingModel ( #10071 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-06 09:49:19 +00:00
6a585a23d2
[Hotfix] Fix ruff errors ( #10073 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-06 01:24:28 -08:00
a02a50e6e5
[Hardware][Intel-Gaudi] Add Intel Gaudi (HPU) inference backend ( #6143 )
...
Signed-off-by: yuwenzho <yuwen.zhou@intel.com >
Signed-off-by: Chendi.Xue <chendi.xue@intel.com >
Signed-off-by: Bob Zhu <bob.zhu@intel.com >
Signed-off-by: zehao-intel <zehao.huang@intel.com >
Signed-off-by: Konrad Zawora <kzawora@habana.ai >
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com >
Co-authored-by: Sanju C Sudhakaran <scsudhakaran@habana.ai >
Co-authored-by: Michal Adamczyk <madamczyk@habana.ai >
Co-authored-by: Marceli Fylcek <mfylcek@habana.ai >
Co-authored-by: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com >
Co-authored-by: Vivek Goel <vgoel@habana.ai >
Co-authored-by: yuwenzho <yuwen.zhou@intel.com >
Co-authored-by: Dominika Olszewska <dolszewska@habana.ai >
Co-authored-by: barak goldberg <149692267+bgoldberg-habana@users.noreply.github.com >
Co-authored-by: Michal Szutenberg <37601244+szutenberg@users.noreply.github.com >
Co-authored-by: Jan Kaniecki <jkaniecki@habana.ai >
Co-authored-by: Agata Dobrzyniewicz <160237065+adobrzyniewicz-habana@users.noreply.github.com >
Co-authored-by: Krzysztof Wisniewski <kwisniewski@habana.ai >
Co-authored-by: Dudi Lester <160421192+dudilester@users.noreply.github.com >
Co-authored-by: Ilia Taraban <tarabanil@gmail.com >
Co-authored-by: Chendi.Xue <chendi.xue@intel.com >
Co-authored-by: Michał Kuligowski <mkuligowski@habana.ai >
Co-authored-by: Jakub Maksymczuk <jmaksymczuk@habana.ai >
Co-authored-by: Tomasz Zielinski <85164140+tzielinski-habana@users.noreply.github.com >
Co-authored-by: Sun Choi <schoi@habana.ai >
Co-authored-by: Iryna Boiko <iboiko@habana.ai >
Co-authored-by: Bob Zhu <41610754+czhu15@users.noreply.github.com >
Co-authored-by: hlin99 <73271530+hlin99@users.noreply.github.com >
Co-authored-by: Zehao Huang <zehao.huang@intel.com >
Co-authored-by: Andrzej Kotłowski <Andrzej.Kotlowski@intel.com >
Co-authored-by: Yan Tomsinsky <73292515+Yantom1@users.noreply.github.com >
Co-authored-by: Nir David <ndavid@habana.ai >
Co-authored-by: Yu-Zhou <yu.zhou@intel.com >
Co-authored-by: Ruheena Suhani Shaik <rsshaik@habana.ai >
Co-authored-by: Karol Damaszke <kdamaszke@habana.ai >
Co-authored-by: Marcin Swiniarski <mswiniarski@habana.ai >
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
Co-authored-by: Jacek Czaja <jacek.czaja@intel.com >
Co-authored-by: Jacek Czaja <jczaja@habana.ai >
Co-authored-by: Yuan <yuan.zhou@outlook.com >
2024-11-06 01:09:10 -08:00
a5fda50a10
[CI/Build] Fix large_gpu_mark reason ( #10070 )
...
Signed-off-by: Isotr0py <2037008807@qq.com >
2024-11-06 08:50:37 +00:00
21063c11c7
[CI/Build] drop support for Python 3.8 EOL ( #8464 )
...
Signed-off-by: Aaron Pham <contact@aarnphm.xyz >
2024-11-06 07:11:55 +00:00
4be3a45158
[distributed] add function to create ipc buffers directly ( #10064 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-05 22:35:03 -08:00
4089985552
[V1] Integrate Piecewise CUDA graphs ( #10058 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-11-05 22:16:04 -08:00
9d59b75593
[Bugfix] Remove CustomChatCompletionContentPartParam multimodal input type ( #10054 )
...
Signed-off-by: Zifei Tong <zifeitong@gmail.com >
2024-11-06 05:13:09 +00:00
ea928f608c
[Bugfix] Gpt-j-6B patch kv_scale to k_scale path ( #10063 )
...
Signed-off-by: Alex Rakowski <alex.rakowski@amd.com >
Signed-off-by: Alex Rakowski <182798202+arakowsk-amd@users.noreply.github.com >
2024-11-06 05:10:40 +00:00
2bcbae704c
[Bugfix] Fix edge-case crash when using chat with the Mistral Tekken Tokenizer ( #10051 )
...
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com >
2024-11-06 04:28:29 +00:00
ffc0f2b47a
[Model][OpenVINO] Fix regressions from #8346 ( #10045 )
...
Signed-off-by: Peter Salas <peter@fixie.ai >
2024-11-06 04:19:15 +00:00
82bfc38d07
[Misc] Sort the list of embedding models ( #10037 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-06 04:05:05 +00:00
c4cacbaa7f
[v1] reduce graph capture time for piecewise cudagraph ( #10059 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-05 18:19:50 -08:00
0c63c34f72
[Bugfix][SpecDecode] kv corruption with bonus tokens in spec decode ( #9730 )
...
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com >
2024-11-06 01:45:45 +00:00
966e31697b
[Bugfix] Fix pickle of input when async output processing is on ( #9931 )
...
Signed-off-by: Wallas Santos <wallashss@ibm.com >
2024-11-06 00:39:26 +00:00
43300bd98a
[Bugfix] Properly propagate trust_remote_code settings ( #10047 )
...
Signed-off-by: Zifei Tong <zifeitong@gmail.com >
2024-11-05 16:34:40 -08:00
ca9844b340
[bugfix] fix weak ref in piecewise cudagraph and tractable test ( #10048 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-05 14:49:20 -08:00
235366fe2e
[CI] Prune back the number of tests in tests/kernels/* ( #9932 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-11-05 16:02:32 -05:00
02462465ea
[CI] Prune tests/models/decoder_only/language/* tests ( #9940 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-11-05 16:02:23 -05:00
b9c64c0ca7
[Misc] Modify BNB parameter name ( #9997 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-05 14:40:08 -05:00
d2e80332a7
[Feature] Update benchmark_throughput.py to support image input ( #9851 )
...
Signed-off-by: Linkun Chen <github+anyscale@lkchen.net >
Co-authored-by: Linkun Chen <github+anyscale@lkchen.net >
2024-11-05 19:30:02 +00:00
a53046b16f
[Model] Support quantization of PixtralHFTransformer for PixtralHF ( #9921 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-11-05 10:42:20 -08:00
731aec5be7
[CI/Build] Limit github CI jobs based on files changed ( #9928 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-05 10:30:42 -08:00
09d3550372
[Misc] Add logging for CUDA memory ( #10027 )
...
Signed-off-by: Chenghao Yang <yangalan1996@gmail.com >
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: Chenghao Yang <yangalan1996@gmail.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2024-11-05 09:50:50 -08:00
cd34029e91
Refactor TPU requirements file and pin build dependencies ( #10010 )
...
Signed-off-by: Richard Liu <ricliu@google.com >
2024-11-05 16:48:44 +00:00
5952d81139
[Frontend] Fix tcp port reservation for api server ( #10012 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-11-05 07:50:57 -08:00
93dee88f6b
[Misc] vllm CLI flags should be ordered for better user readability ( #10017 )
...
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com >
2024-11-05 18:59:56 +08:00
7a83b1aec0
[BugFix] Lazy import ray ( #10021 )
2024-11-05 10:04:10 +00:00
ad23318928
[Bugfix] Fixup Mamba ( #10004 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-11-05 03:46:38 +00:00
bbc3619dc8
[Core] Make encoder-decoder inputs a nested structure to be more composable ( #9604 )
...
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-11-05 10:07:31 +08:00
04bbf38e05
[Core] Use os.sched_yield in ShmRingBuffer instead of time.sleep ( #9994 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-11-05 01:08:21 +00:00
8f0a9ca890
[Bugfix] Respect modules_to_not_convert within awq_marlin ( #9895 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-11-04 16:57:44 -07:00
2094062b4e
[4.5/N] bugfix for quant config in speculative decode ( #10007 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-04 15:11:59 -08:00
d93478b399
[Bugfix] Upgrade to pytorch 2.5.1 ( #10001 )
...
Signed-off-by: Bill Nell <bill@neuralmagic.com >
2024-11-04 15:11:28 -08:00
ac04a97a9f
[Frontend] Add max_tokens prometheus metric ( #9881 )
...
Signed-off-by: Tomer Asida <tomera@ai21.com >
2024-11-04 22:53:24 +00:00
9a5664d4a4
[Misc] Refactor benchmark_throughput.py ( #9779 )
...
Signed-off-by: Linkun Chen <github+anyscale@lkchen.net >
Co-authored-by: Linkun Chen <lkchen@github.com >
Co-authored-by: Linkun Chen <github+anyscale@lkchen.net >
2024-11-04 14:32:16 -08:00
04cef2c6ab
[Bugfix] Fix MQLLMEngine hanging ( #9973 )
...
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com >
2024-11-04 16:01:43 -05:00
6e056bcf04
[Doc] Update VLM doc about loading from local files ( #9999 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-11-04 19:47:11 +00:00
5208dc7a20
[Bugfix][CI/Build][Hardware][AMD] Shard ID parameters in AMD tests running parallel jobs ( #9279 )
...
Signed-off-by: Hissu Hyvarinen <hissu.hyvarinen@amd.com >
2024-11-04 11:37:46 -08:00
1c45f4c385
[CI] Basic Integration Test For TPU ( #9968 )
...
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com >
2024-11-04 11:34:26 -08:00
603a661ae8
[Model] factoring out MambaMixer out of Jamba ( #8993 )
...
Signed-off-by: mzusman <mor.zusmann@gmail.com >
2024-11-04 18:00:00 +00:00
fb2716d641
[Misc]Reduce BNB static variable ( #9987 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-04 17:04:40 +00:00
8d72bb20fa
[4/N] make quant config first-class citizen ( #9978 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-04 08:51:31 -08:00
ac6b8f19b9
[Frontend] Multi-Modality Support for Loading Local Image Files ( #9915 )
...
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com >
2024-11-04 15:34:57 +00:00
ccb5376a9a
[Bugfix][OpenVINO] Fix circular reference #9939 ( #9974 )
...
Signed-off-by: MengqingCao <cmq0113@163.com >
2024-11-04 18:14:13 +08:00
ea4adeddc1
[Bugfix] Fix E2EL mean and median stats ( #9984 )
...
Signed-off-by: daitran2k1 <tranquangdai7a@gmail.com >
2024-11-04 09:37:58 +00:00
4dbcbbeb09
[Misc] Compute query_start_loc/seq_start_loc on CPU ( #9447 )
...
Co-authored-by: Yang Zheng(SW)(Alex) <you@example.com >
2024-11-04 08:54:37 +00:00
b67feb1274
[Bugfix]Using the correct type hints ( #9885 )
...
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com >
2024-11-04 06:19:51 +00:00
c49f0407ba
[Bugfix] Fix MiniCPMV and Mllama BNB bug ( #9917 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-11-04 03:36:41 +00:00
91c9ebbb1b
[V1] Fix Configs ( #9971 )
2024-11-04 00:24:40 +00:00
54597724f4
[Model] Add support for H2OVL-Mississippi models ( #9747 )
...
Signed-off-by: Shanshan Wang <shanshan.wang@h2o.ai >
Signed-off-by: Roger Wang <ywang@roblox.com >
Co-authored-by: Roger Wang <ywang@roblox.com >
2024-11-04 00:15:36 +00:00
1f1b6d6eda
[V1] Support per-request seed ( #9945 )
...
Signed-off-by: Nick Hill <nickhill@us.ibm.com >
2024-11-03 09:14:17 -08:00
3bb4befea7
[bugfix] fix tests ( #9959 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-02 15:54:05 -07:00
ae5279a163
[torch.compile] Adding torch compile to vision-language models ( #9946 )
2024-11-02 12:56:05 -07:00
1b73ab2a1f
[CI/Build] Quoting around > ( #9956 )
2024-11-02 12:50:28 -07:00
cea808f325
[3/N] model runner pass the whole config to model ( #9958 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-02 12:08:49 -07:00
74b529ceee
[bugfix] fix chatglm dummy_data_for_glmv ( #9955 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-02 08:03:33 -07:00
d6459b4516
[V1] Fix EngineArgs refactor on V1 ( #9954 )
2024-11-02 07:44:38 -07:00
e893795443
[2/N] executor pass the complete config to worker/modelrunner ( #9938 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: Nick Hill <nhill@redhat.com >
2024-11-02 07:35:05 -07:00
1d4cfe2be1
[Doc] Updated tpu-installation.rst with more details ( #9926 )
...
Signed-off-by: Michael Green <mikegre@google.com >
2024-11-02 10:06:45 -04:00
eed92f12fc
[Docs] Update Granite 3.0 models in supported models table ( #9930 )
...
Signed-off-by: Nick Hill <nhill@redhat.com >
Signed-off-by: Nick Hill <nickhill@us.ibm.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2024-11-02 09:02:18 +00:00
af7380d83b
[torch.compile] fix cpu broken code ( #9947 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-01 23:35:47 -07:00
a78dd3303e
[Encoder Decoder] Add flash_attn kernel support for encoder-decoder models ( #9559 )
2024-11-01 23:22:49 -07:00
d522034c85
[ci/build] Have dependabot ignore pinned dependencies ( #9935 )
...
Signed-off-by: kevin <kevin@anyscale.com >
2024-11-01 23:56:13 +00:00
6c0b7f548d
[Core][VLM] Add precise multi-modal placeholder tracking ( #8346 )
...
Signed-off-by: Peter Salas <peter@fixie.ai >
2024-11-01 16:21:10 -07:00
d151fde834
[ci/build] Bump the patch-update group with 10 updates ( #9897 )
...
Signed-off-by: dependabot[bot] <support@github.com >
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Kevin H. Luu <kevin@anyscale.com >
2024-11-01 23:04:42 +00:00
27cd36e6e2
[Bugfix] PicklingError on RayTaskError ( #9934 )
...
Signed-off-by: Gene Su <e870252314@gmail.com >
2024-11-01 22:08:23 +00:00
18bd7587b7
[1/N] pass the complete config from engine to executor ( #9933 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-01 13:51:57 -07:00
598b6d7b07
[Bugfix/Core] Flashinfer k_scale and v_scale ( #9861 )
2024-11-01 12:15:05 -07:00
aff1fd8188
[torch.compile] use interpreter with stable api from pytorch ( #9889 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-11-01 11:50:37 -07:00
4581d2cc02
[Core] Refactor: Clean up unused argument in Scheduler._preempt ( #9696 )
...
Signed-off-by: André Jonasson <andre.jonasson@gmail.com >
2024-11-01 11:41:38 -07:00
1dd4cb2935
[Bugfix] Fix edge cases for MistralTokenizer ( #9625 )
...
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com >
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com >
Co-authored-by: Prashant Gupta <prashantgupta@us.ibm.com >
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com >
2024-11-01 10:33:15 -07:00
ba0d892074
[Frontend] Use a proper chat template for VLM2Vec ( #9912 )
2024-11-01 14:09:07 +00:00
30a2e80742
[CI/Build] Add Model Tests for PixtralHF ( #9813 )
2024-11-01 07:55:29 -06:00
06386a64dd
[Frontend] Chat-based Embeddings API ( #9759 )
2024-11-01 08:13:35 +00:00
d3aa2a8b2f
[Doc] Update multi-input support ( #9906 )
2024-11-01 07:34:49 +00:00
2b5bf20988
[torch.compile] Adding torch compile annotations to some models ( #9876 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2024-11-01 00:25:47 -07:00
93a76dd21d
[Model] Support bitsandbytes for MiniCPMV ( #9891 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-11-01 13:31:56 +08:00
566cd27797
[torch.compile] rework test plans ( #9866 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-10-31 22:20:17 -07:00
37a4947dcd
[Bugfix] Fix layer skip logic with bitsandbytes ( #9887 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-11-01 13:12:44 +08:00
96e0c9cbbd
[torch.compile] directly register custom op ( #9896 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-10-31 21:56:09 -07:00
031a7995f3
[Bugfix][Frontend] Reject guided decoding in multistep mode ( #9892 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
2024-11-01 01:09:46 +00:00
b63c64d95b
[ci/build] Configure dependabot to update pip dependencies ( #9811 )
...
Signed-off-by: kevin <kevin@anyscale.com >
2024-10-31 15:55:38 -07:00
9fb12f7848
[BugFix][Kernel] Fix Illegal memory access in causal_conv1d in H100 ( #9838 )
...
Signed-off-by: mzusman <mor.zusmann@gmail.com >
2024-10-31 20:06:25 +00:00
55650c83a0
[Bugfix] Fix illegal memory access error with chunked prefill, prefix caching, block manager v2 and xformers enabled together ( #9532 )
...
Signed-off-by: sasha0552 <admin@sasha0552.org >
2024-10-31 11:46:36 -07:00
77f7ef2908
[CI/Build] Adding a forced docker system prune to clean up space ( #9849 )
2024-11-01 01:02:58 +08:00
16b8f7a86f
[CI/Build] Add Model Tests for Qwen2-VL ( #9846 )
...
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-10-31 09:10:52 -07:00
5608e611c2
[Doc] Update Qwen documentation ( #9869 )
2024-10-31 08:54:18 +00:00
3ea2dc2ec4
[Misc] Remove deprecated arg for cuda graph capture ( #9864 )
...
Signed-off-by: Roger Wang <ywang@roblox.com >
2024-10-31 07:22:07 +00:00
d087bf863e
[Model] Support quantization of Qwen2VisionTransformer ( #9817 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-10-30 22:41:20 -07:00
890ca36072
Revert "[Bugfix] Use host argument to bind to interface ( #9798 )" ( #9852 )
2024-10-31 01:44:51 +00:00
abbfb6134d
[Misc][OpenAI] deprecate max_tokens in favor of new max_completion_tokens field for chat completion endpoint ( #9837 )
2024-10-30 18:15:56 -07:00
64384bbcdf
[torch.compile] upgrade tests ( #9858 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-10-30 16:34:22 -07:00
00d91c8a2c
[CI/Build] Simplify exception trace in api server tests ( #9787 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2024-10-30 14:52:05 -07:00
c2cd1a2142
[doc] update pp support ( #9853 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-10-30 13:36:51 -07:00
c787f2d81d
[Neuron] Update Dockerfile.neuron to fix build failure ( #9822 )
2024-10-30 12:22:02 -07:00
33d257735f
[Doc] link bug for multistep guided decoding ( #9843 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
2024-10-30 17:28:29 +00:00
3b3f1e7436
[Bugfix][core] replace heartbeat with pid check ( #9818 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
2024-10-30 09:34:07 -07:00
9ff4511e43
[Misc] Add chunked-prefill support on FlashInfer. ( #9781 )
2024-10-30 09:33:53 -07:00
81f09cfd80
[Model] Support math-shepherd-mistral-7b-prm model ( #9697 )
...
Signed-off-by: Went-Liang <wenteng_liang@163.com >
2024-10-30 09:33:42 -07:00
cc98f1e079
[CI/Build] VLM Test Consolidation ( #9372 )
...
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com >
2024-10-30 09:32:17 -07:00
211fe91aa8
[TPU] Correctly profile peak memory usage & Upgrade PyTorch XLA ( #9438 )
2024-10-30 09:41:38 +00:00
6aa6020f9b
[Misc] Specify minimum pynvml version ( #9827 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-10-29 23:05:43 -07:00
ff5ed6e1bc
[torch.compile] rework compile control with piecewise cudagraph ( #9715 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-10-29 23:03:49 -07:00
7b0365efef
[Doc] Add the DCO to CONTRIBUTING.md ( #9803 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
Co-authored-by: Michael Goin <michael@neuralmagic.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2024-10-30 05:22:23 +00:00
04a3ae0aca
[Bugfix] Fix multi nodes TP+PP for XPU ( #8884 )
...
Signed-off-by: YiSheng5 <syhm@mail.ustc.edu.cn >
Signed-off-by: yan ma <yan.ma@intel.com >
Co-authored-by: YiSheng5 <syhm@mail.ustc.edu.cn >
2024-10-29 21:34:45 -07:00
62fac4b9aa
[ci/build] Pin CI dependencies version with pip-compile ( #9810 )
...
Signed-off-by: kevin <kevin@anyscale.com >
2024-10-30 03:34:55 +00:00
226688bd61
[Bugfix][VLM] Make apply_fp8_linear work with >2D input ( #9812 )
2024-10-29 19:49:44 -07:00
64cb1cdc3f
Update README.md ( #9819 )
2024-10-29 17:28:43 -07:00
1ab6f6b4ad
[core][distributed] fix custom allreduce in pytorch 2.5 ( #9815 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-10-29 17:06:24 -07:00
bc73e9821c
[Bugfix] Fix prefix strings for quantized VLMs ( #9772 )
2024-10-29 16:02:59 -07:00
8d7724104a
[Docs] Add notes about Snowflake Meetup ( #9814 )
...
Signed-off-by: simon-mo <simon.mo@hey.com >
2024-10-29 15:19:02 -07:00
882a1ad0de
[Model] tool calling support for ibm-granite/granite-20b-functioncalling ( #8339 )
...
Signed-off-by: Max de Bayser <mbayser@br.ibm.com >
Co-authored-by: Max de Bayser <mbayser@br.ibm.com >
Co-authored-by: Maximilien de Bayser <maxdebayser@gmail.com >
2024-10-29 15:07:37 -07:00
67bdf8e523
[Bugfix][Frontend] Guard against bad token ids ( #9634 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
2024-10-29 14:13:20 -07:00
0ad216f575
[MISC] Set label value to timestamp over 0, to keep track of recent history ( #9777 )
...
Signed-off-by: Kunjan Patel <kunjanp@google.com >
2024-10-29 19:52:19 +00:00
7585ec996f
[CI/Build] mergify: fix rules for ci/build label ( #9804 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-10-29 19:24:42 +00:00
ab6f981671
[CI][Bugfix] Skip chameleon for transformers 4.46.1 ( #9808 )
2024-10-29 11:12:43 -07:00
ac3d748dba
[Model] Add LlamaEmbeddingModel as an embedding Implementation of LlamaModel ( #9806 )
2024-10-29 10:40:35 -07:00
0ce7798f44
[Misc]: Typo fix: Renaming classes (casualLM -> causalLM) ( #9801 )
...
Signed-off-by: Yannick Schnider <Yannick.Schnider1@ibm.com >
2024-10-29 10:39:20 -07:00
0f43387157
[Bugfix] Use host argument to bind to interface ( #9798 )
2024-10-29 10:37:59 -07:00
08600ddc68
Fix the log to correct guide user to install modelscope ( #9793 )
...
Signed-off-by: yuze.zyz <yuze.zyz@alibaba-inc.com >
2024-10-29 10:36:59 -07:00
74fc2d77ae
[Misc] Add metrics for request queue time, forward time, and execute time ( #9659 )
2024-10-29 10:32:56 -07:00
622b7ab955
[Hardware] using current_platform.seed_everything ( #9785 )
...
Signed-off-by: wangshuai09 <391746016@qq.com >
2024-10-29 14:47:44 +00:00
09500f7dde
[Model] Add BNB quantization support for Mllama ( #9720 )
2024-10-29 08:20:02 -04:00
ef7865b4f9
[Frontend] re-enable multi-modality input in the new beam search implementation ( #9427 )
...
Signed-off-by: Qishuai <Ferdinandzhong@gmail.com >
2024-10-29 11:49:47 +00:00
eae3d48181
[Bugfix] Use temporary directory in registry ( #9721 )
2024-10-28 22:08:20 -07:00
e74f2d448c
[Doc] Specify async engine args in docs ( #9726 )
2024-10-28 22:07:57 -07:00
7a4df5f200
[Model][LoRA]LoRA support added for Qwen ( #9622 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-10-29 04:14:07 +00:00
c5d7fb9ddc
[Doc] fix third-party model example ( #9771 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-10-28 19:39:21 -07:00
76ed5340f0
[torch.compile] add deepseek v2 compile ( #9775 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-10-28 14:35:17 -07:00
97b61bfae6
[misc] avoid circular import ( #9765 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-10-28 20:51:23 +00:00
aa0addb397
Adding "torch compile" annotations to moe models ( #9758 )
2024-10-28 13:49:56 -07:00
5f8d8075f9
[Model][VLM] Add multi-video support for LLaVA-Onevision ( #8905 )
...
Co-authored-by: litianjian <litianjian@bytedance.com >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-10-28 18:04:10 +00:00
8b0e4f2ad7
[CI/Build] Adopt Mergify for auto-labeling PRs ( #9259 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-10-28 09:38:09 -07:00
2adb4409e0
[Bugfix] Fix ray instance detect issue ( #9439 )
2024-10-28 07:13:03 +00:00
feb92fbe4a
Fix beam search eos ( #9627 )
2024-10-28 06:59:37 +00:00
32176fee73
[torch.compile] support moe models ( #9632 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-10-27 21:58:04 -07:00
4e2d95e372
[Hardware][ROCM] using current_platform.is_rocm ( #9642 )
...
Signed-off-by: wangshuai09 <391746016@qq.com >
2024-10-28 04:07:00 +00:00
34a9941620
[Bugfix] Fix load config when using bools ( #9533 )
2024-10-27 13:46:41 -04:00
e130c40e4e
Fix cache management in "Close inactive issues and PRs" actions workflow ( #9734 )
2024-10-27 10:30:03 -07:00
3cb07a36a2
[Misc] Upgrade to pytorch 2.5 ( #9588 )
...
Signed-off-by: Bill Nell <bill@neuralmagic.com >
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2024-10-27 09:44:24 +00:00
8549c82660
[core] cudagraph output with tensor weak reference ( #9724 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-10-27 00:19:28 -07:00
67a6882da4
[Misc] SpecDecodeWorker supports profiling ( #9719 )
...
Signed-off-by: Abatom <abatom@163.com >
2024-10-27 04:18:03 +00:00
6650e6a930
[Model] Add classification Task with Qwen2ForSequenceClassification ( #9704 )
...
Signed-off-by: Kevin-Yang <ykcha9@gmail.com >
Co-authored-by: Kevin-Yang <ykcha9@gmail.com >
2024-10-26 17:53:35 +00:00
07e981fdf4
[Frontend] Bad words sampling parameter ( #9717 )
...
Signed-off-by: Vasily Alexeev <alvasian@yandex.ru >
2024-10-26 16:29:38 +00:00
55137e8ee3
Fix: MI100 Support By Bypassing Custom Paged Attention ( #9560 )
2024-10-26 12:12:57 +00:00
5cbdccd151
[Hardware][openvino] is_openvino --> current_platform.is_openvino ( #9716 )
2024-10-26 10:59:06 +00:00
067e77f9a8
[Bugfix] Streaming continuous_usage_stats default to False ( #9709 )
...
Signed-off-by: Sam Stoelinga <sammiestoel@gmail.com >
2024-10-26 05:05:47 +00:00
6567e13724
[Bugfix] Fix crash with llama 3.2 vision models and guided decoding ( #9631 )
...
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com >
Co-authored-by: pavlo-ruban <pavlo.ruban@servicenow.com >
Co-authored-by: Nick Hill <nickhill@us.ibm.com >
2024-10-25 15:42:56 -07:00
228cfbd03f
[Doc] Improve quickstart documentation ( #9256 )
...
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com >
2024-10-25 14:32:10 -07:00
ca0d92227e
[Bugfix] Fix compressed_tensors_moe bad config.strategy ( #9677 )
2024-10-25 12:40:33 -07:00
9645b9f646
[V1] Support sliding window attention ( #9679 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-10-24 22:20:37 -07:00
a6f3721861
[Model] add a lora module for granite 3.0 MoE models ( #9673 )
2024-10-24 22:00:17 -07:00
9f7b4ba865
[ci/Build] Skip Chameleon for transformers 4.46.0 on broadcast test #9675 ( #9676 )
2024-10-24 20:59:00 -07:00
c91ed47c43
[Bugfix] Remove xformers requirement for Pixtral ( #9597 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-10-24 15:38:05 -07:00
59449095ab
[Performance][Kernel] Fused_moe Performance Improvement ( #9384 )
...
Signed-off-by: charlifu <charlifu@amd.com >
2024-10-24 15:37:52 -07:00
e26d37a185
[Log][Bugfix] Fix default value check for image_url.detail ( #9663 )
2024-10-24 10:44:38 -07:00
722d46edb9
[Model] Compute Llava Next Max Tokens / Dummy Data From Gridpoints ( #9650 )
...
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com >
2024-10-24 10:42:24 -07:00
c866e0079d
[CI/Build] Fix VLM test failures when using transformers v4.46 ( #9666 )
2024-10-25 01:40:40 +08:00
d27cfbf791
[torch.compile] Adding torch compile annotations to some models ( #9641 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2024-10-24 09:31:42 -07:00
de662d32b5
Increase operation per run limit for "Close inactive issues and PRs" workflow ( #9661 )
...
Signed-off-by: Harry Mellor <hej.mellor@gmail.com >
2024-10-24 12:17:45 -04:00
f58454968f
[Bugfix]Disable the post_norm layer of the vision encoder for LLaVA models ( #9653 )
2024-10-24 07:52:07 -07:00
b979143d5b
[Doc] Move additional tips/notes to the top ( #9647 )
2024-10-24 09:43:59 +00:00
ad6f78053e
[torch.compile] expanding support and fix allgather compilation ( #9637 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2024-10-24 01:32:15 -07:00
295a061fb3
[Kernel] add kernel for FATReLU ( #9610 )
...
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com >
2024-10-24 16:18:27 +08:00
8a02cd045a
[torch.compile] Adding torch compile annotations to some models ( #9639 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
Co-authored-by: youkaichao <youkaichao@gmail.com >
2024-10-24 00:54:57 -07:00
4fdc581f9e
[core] simplify seq group code ( #9569 )
...
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com >
2024-10-24 00:16:44 -07:00
3770071eb4
[V1][Bugfix] Clean up requests when aborted ( #9629 )
...
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
2024-10-23 23:33:22 -07:00
836e8ef6ee
[Bugfix] Fix PP for ChatGLM and Molmo ( #9422 )
2024-10-24 06:12:05 +00:00
056a68c7db
[XPU] avoid triton import for xpu ( #9440 )
...
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2024-10-24 05:14:00 +00:00
33bab41060
[Bugfix]: Make chat content text allow type content ( #9358 )
...
Signed-off-by: Vinay Damodaran <vrdn@hey.com >
2024-10-24 05:05:49 +00:00
b7df53cd42
[Bugfix] Use "vision_model" prefix for MllamaVisionModel ( #9628 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-10-24 10:07:44 +08:00
bb01f2915e
[Bugfix][Model] Fix Mllama SDPA illegal memory access for batched multi-image ( #9626 )
...
Signed-off-by: mgoin <michael@neuralmagic.com >
2024-10-24 10:03:44 +08:00
b548d7a5f4
[CI/Build] Add bot to close stale issues and PRs ( #9436 )
2024-10-23 15:45:26 -07:00
fc6c274626
[Model] Add Qwen2-Audio model support ( #9248 )
...
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-10-23 17:54:22 +00:00
150b779081
[Frontend] Enable Online Multi-image Support for MLlama ( #9393 )
...
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2024-10-23 17:28:57 +00:00
9013e24f7b
[torch.compile] Adding torch compile annotations to some models ( #9614 )
2024-10-23 10:07:48 -07:00
fd0e2cfdb2
[Misc] Separate total and output tokens in benchmark_throughput.py ( #8914 )
2024-10-23 16:47:20 +00:00
e5ac6a4199
[Bugfix] Fix divide by zero when serving Mamba models ( #9617 )
...
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-10-23 16:40:43 +00:00
dbdd3b5e5a
[misc] comment to avoid future confusion about baichuan ( #9620 )
...
Signed-off-by: youkaichao <youkaichao@gmail.com >
2024-10-23 09:14:44 -07:00
e7116c017c
[Bugfix] Fix _init_vision_model in NVLM_D model ( #9611 )
...
Co-authored-by: Isotr0py <2037008807@qq.com >
2024-10-23 14:09:04 +00:00
31a08f5bd2
[Model] Add min_pixels / max_pixels to Qwen2VL as mm_processor_kwargs ( #9612 )
...
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com >
2024-10-23 14:05:18 +00:00
c18e1a3418
[VLM] Enable overriding whether post layernorm is used in vision encoder + fix quant args ( #9217 )
...
Co-authored-by: Isotr0py <2037008807@qq.com >
2024-10-23 11:27:37 +00:00
3ff57ebfca
[Model] Initialize Florence-2 language backbone support ( #9555 )
2024-10-23 10:42:47 +00:00
2394962d70
[Hardware][XPU] using current_platform.is_xpu ( #9605 )
2024-10-23 08:28:21 +00:00
51c24c9736
[Build] Fix FetchContent multiple build issue ( #9596 )
...
Signed-off-by: luka <luka@neuralmagic.com >
2024-10-23 12:43:07 +08:00
831540cf04
[Model] Support E5-V ( #9576 )
2024-10-23 11:35:29 +08:00
29061ed9df
[Misc] Add an env var VLLM_LOGGING_PREFIX, if set, it will be prepend to all logging messages ( #9590 )
2024-10-23 11:17:28 +08:00
65050a40e6
[Bugfix] Generate exactly input_len tokens in benchmark_throughput ( #9592 )
2024-10-22 17:45:35 -07:00
208cb34c81
[Doc]: Update tensorizer docs to include vllm[tensorizer] ( #7889 )
...
Co-authored-by: Kaunil Dhruv <dhruv.kaunil@gmail.com >
2024-10-22 15:43:25 -07:00
b17046e298
[BugFix] Fix metrics error for --num-scheduler-steps > 1 ( #8234 )
2024-10-22 15:43:03 -07:00
d1e8240875
[Bugfix] Fix spurious "No compiled cutlass_scaled_mm ..." for W8A8 on Turing ( #9487 )
2024-10-22 15:41:13 -07:00
cb6fdaa0a0
[Misc] Make benchmarks use EngineArgs ( #9529 )
2024-10-22 15:40:38 -07:00
23b899a8e6
[Bugfix] fix detokenizer shallow copy ( #5919 )
2024-10-22 15:38:12 -07:00
17c79f3c36
[torch.compile] auto infer dynamic_arg_dims from type annotation ( #9589 )
2024-10-22 13:43:37 -07:00
cd5601ac37
[BugFix] Prevent exporting duplicate OpenTelemetry spans ( #9017 )
2024-10-22 11:11:53 -07:00
434984e665
[Frontend] Support custom request_id from request ( #9550 )
...
Co-authored-by: Yuhong Guo <yuhong.gyh@antgroup.com >
2024-10-22 18:07:30 +00:00
32a1ee74a0
[Hardware][Intel CPU][DOC] Update docs for CPU backend ( #6212 )
...
Signed-off-by: Yuan Zhou <yuan.zhou@intel.com >
Co-authored-by: Rafael Vasquez <rafvasq21@gmail.com >
Co-authored-by: Gubrud, Aaron D <aaron.d.gubrud@intel.com >
Co-authored-by: adgubrud <96072084+adgubrud@users.noreply.github.com >
2024-10-22 10:38:04 -07:00
08075c3448
[Bugfix] Eagle: change config name for fc bias ( #9580 )
2024-10-22 16:14:22 +00:00
bb392ea2d2
[Model][VLM] Initialize support for Mono-InternVL model ( #9528 )
2024-10-22 16:01:46 +00:00
9dbcce84a7
[Neuron] [Bugfix] Fix neuron startup ( #9374 )
...
Co-authored-by: Jerzy Zagorski <jzagorsk@amazon.com >
2024-10-22 12:51:41 +00:00
a48e3ec052
[CI/Build][LoRA] Temporarily fix long context failure issue ( #9579 )
2024-10-22 11:32:51 +00:00
6c5af09b39
[V1] Implement vLLM V1 [1/N] ( #9289 )
2024-10-22 01:24:07 -07:00
3ddbe25502
[Hardware][CPU] using current_platform.is_cpu ( #9536 )
2024-10-22 00:50:43 -07:00
0d02747f2e
support TP in qwen2 bnb ( #9574 )
2024-10-22 07:13:23 +00:00
f7db5f0fa9
[Doc] Use shell code-blocks and fix section headers ( #9508 )
...
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com >
2024-10-22 06:43:24 +00:00
ca30c3c84b
[Core] Remove evictor_v1 ( #9572 )
2024-10-22 04:55:49 +00:00
c0292211ce
[CI/Build] Replaced some models on tests for smaller ones ( #9570 )
...
Signed-off-by: Wallas Santos <wallashss@ibm.com >
2024-10-22 04:52:14 +00:00
74692421f7
[Bugfix]: phi.py get rope_theta from config file ( #9503 )
...
Co-authored-by: Isotr0py <2037008807@qq.com >
2024-10-22 02:53:36 +00:00
29acd2c34c
[Bugfix][OpenVINO] fix_dockerfile_openvino ( #9552 )
2024-10-21 19:47:52 -07:00
f085995a7b
[CI/Build] Remove unnecessary fork_new_process ( #9484 )
2024-10-21 19:47:29 -07:00
b729901139
[Bugfix]: serialize config by value for --trust-remote-code ( #6751 )
...
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com >
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2024-10-21 19:46:24 -07:00
76a5e13270
[core] move parallel sampling out from vllm core ( #9302 )
2024-10-22 00:31:44 +00:00
ef7faad1b8
🐛 Fixup more test failures from memory profiling ( #9563 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
2024-10-21 17:10:56 -07:00
575dcebe9a
[CI] Make format checker error message more user-friendly by using emoji ( #9564 )
...
This PR makes format checker error message more user-friendly by adding emojis.
2024-10-21 23:45:15 +00:00
711f3a7806
[Frontend] Don't log duplicate error stacktrace for every request in the batch ( #9023 )
...
Signed-off-by: Wallas Santos <wallashss@ibm.com >
2024-10-21 14:49:41 -07:00
15713e3b75
[BugFix] Update draft model TP size check to allow matching target TP size ( #9394 )
...
Co-authored-by: Baoyuan Qi <qibaoyuan@126.com >
2024-10-21 14:14:29 -07:00
d621c43df7
[doc] fix format ( #9562 )
2024-10-21 13:54:57 -07:00
9d9186be97
[Frontend] Reduce frequency of client cancellation checking ( #7959 )
2024-10-21 13:28:10 -07:00
5241aa1494
[Model][Bugfix] Fix batching with multi-image in PixtralHF ( #9518 )
2024-10-21 14:20:07 -04:00
ec6bd6c4c6
[BugFix] Use correct python3 binary in Docker.ppc64le entrypoint ( #9492 )
...
Signed-off-by: Varad Ahirwadkar <varad.ahirwadkar1@ibm.com >
2024-10-21 17:43:02 +00:00
8ca8954841
[Bugfix][Misc]: fix graph capture for decoder ( #9549 )
2024-10-21 17:33:30 +00:00
f6b97293aa
[Model] FalconMamba Support ( #9325 )
2024-10-21 12:50:16 -04:00
496e991da8
[Doc] Consistent naming of attention backends ( #9498 )
...
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com >
2024-10-21 22:29:57 +08:00
696b01af8f
[CI/Build] Split up decoder-only LM tests ( #9488 )
...
Co-authored-by: Nick Hill <nickhill@us.ibm.com >
2024-10-20 21:27:50 -07:00
855e0e6f97
[Frontend][Misc] Goodput metric support ( #9338 )
2024-10-20 18:39:32 +00:00
4fa3e33349
[Kernel] Support sliding window in flash attention backend ( #9403 )
2024-10-20 10:57:52 -07:00
962d2c6349
[Model][Pixtral] Use memory_efficient_attention for PixtralHFVision ( #9520 )
2024-10-20 05:29:14 +00:00
5b59fe0f08
[Bugfix] Pass json-schema to GuidedDecodingParams and make test stronger ( #9530 )
2024-10-20 00:05:02 +00:00
8e3e7f2713
[Model][Pixtral] Optimizations for input_processor_for_pixtral_hf ( #9514 )
2024-10-19 10:44:29 -04:00
263d8ee150
[Bugfix] Fix missing task for speculative decoding ( #9524 )
2024-10-19 06:49:40 +00:00
c5eea3c8ba
[Frontend] Support simpler image input format ( #9478 )
2024-10-18 23:17:07 -07:00
85dc92fc98
[CI/Build] Configure matcher for actionlint workflow ( #9511 )
...
Signed-off-by: Russell Bryant <russell.bryant@gmail.com >
2024-10-19 06:04:18 +00:00
dfd951ed9b
[CI/Build] Add error matching for ruff output ( #9513 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-10-19 05:42:20 +00:00
82c25151ec
[Doc] update gpu-memory-utilization flag docs ( #9507 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
2024-10-19 11:26:36 +08:00
1325872ec8
[Frontend] Avoid creating guided decoding LogitsProcessor unnecessarily ( #9521 )
2024-10-18 20:21:01 -07:00
380e18639f
🐛 fix torch memory profiling ( #9516 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
2024-10-18 21:25:19 -04:00
337ed76671
[Bugfix] Fix offline mode when using mistral_common ( #9457 )
2024-10-18 18:12:32 -07:00
0c9a5258f9
[Kernel] Add env variable to force flashinfer backend to enable tensor cores ( #9497 )
...
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com >
Co-authored-by: Chih-Chieh Yang <chih.chieh.yang@ibm.com >
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com >
2024-10-18 17:55:48 -07:00
d11bf435a0
[MISC] Consolidate cleanup() and refactor offline_inference_with_prefix.py ( #9510 )
2024-10-18 14:30:55 -07:00
9bb10a7d27
[MISC] Add lora requests to metrics ( #9477 )
...
Co-authored-by: Kunjan Patel <kunjanp_google_com@vllm.us-central1-a.c.kunjanp-gke-dev-2.internal >
2024-10-18 20:50:18 +00:00
3921a2f29e
[Model] Support Pixtral models in the HF Transformers format ( #9036 )
2024-10-18 13:29:56 -06:00
67a7e5ef38
[CI/Build] Add error matching config for mypy ( #9512 )
2024-10-18 12:17:53 -07:00
051eaf6db3
[Model] Add user-configurable task for models that support both generation and embedding ( #9424 )
2024-10-18 11:31:58 -07:00
7dbe738d65
[Misc] benchmark: Add option to set max concurrency ( #9390 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-10-18 11:15:28 -07:00
ae8b633ba3
[Bugfix] Fix offline_inference_with_prefix.py ( #9505 )
2024-10-18 16:59:19 +00:00
1bbbcc0b1d
[CI/Build] Fix lint errors in mistral tokenizer ( #9504 )
2024-10-19 00:09:35 +08:00
25aeb7d4c9
[BugFix] Fix and simplify completion API usage streaming ( #9475 )
2024-10-18 14:10:26 +00:00
d2b1bf55ec
[Frontend][Feature] Add jamba tool parser ( #9154 )
2024-10-18 10:27:48 +00:00
1ffc8a7362
[BugFix] Typing fixes to RequestOutput.prompt and beam search ( #9473 )
2024-10-18 07:19:53 +00:00
944dd8edaf
[CI/Build] Use commit hash references for github actions ( #9430 )
2024-10-17 21:54:58 -07:00
154a8ae880
[Qwen2.5] Support bnb quant for Qwen2.5 ( #9467 )
2024-10-18 04:40:14 +00:00
de4008e2ab
[Bugfix][Core] Use torch.cuda.memory_stats() to profile peak memory usage ( #9352 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
2024-10-17 22:47:27 -04:00
48138a8415
[BugFix] Stop silent failures on compressed-tensors parsing ( #9381 )
2024-10-17 18:54:00 -07:00
343f8e0905
Support BERTModel (first encoder-only embedding model) ( #9056 )
...
Signed-off-by: Max de Bayser <maxdebayser@gmail.com >
Signed-off-by: Max de Bayser <mbayser@br.ibm.com >
Co-authored-by: Andrew Feldman <afeldman@neuralmagic.com >
Co-authored-by: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com >
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu >
Co-authored-by: laishzh <laishengzhang@gmail.com >
Co-authored-by: Max de Bayser <maxdebayser@gmail.com >
Co-authored-by: Max de Bayser <mbayser@br.ibm.com >
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk >
2024-10-17 23:21:01 +00:00
bb76538bbd
[Hardware][Neuron] Simplify model load for transformers-neuronx library ( #9380 )
2024-10-17 15:39:39 -07:00
d615b5c9f8
[Bugfix] Print warnings related to mistral_common tokenizer only once ( #9468 )
2024-10-17 21:44:20 +00:00
d65049daab
[Bugfix] Add random_seed to sample_hf_requests in benchmark_serving script ( #9013 )
...
Co-authored-by: Isotr0py <2037008807@qq.com >
2024-10-17 21:11:11 +00:00
eca2c5f7c0
[Bugfix] Fix support for dimension like integers and ScalarType ( #9299 )
2024-10-17 19:08:34 +00:00
0f41fbe5a3
[torch.compile] Fine-grained CustomOp enabling mechanism ( #9300 )
2024-10-17 18:36:37 +00:00
7871659abb
[Misc] Remove commit id file ( #9470 )
2024-10-17 10:34:37 -07:00
a2c71c5405
[CI/Build] remove .github from .dockerignore, add dirty repo check ( #9375 )
2024-10-17 10:25:06 -07:00
81ede99ca4
[Core] Deprecating block manager v1 and make block manager v2 default ( #8704 )
...
Removing the block manager v1. This is the initial piece of prefix-caching-centric design. In order to achieve prefix-caching-centric design, we need to simplify the code path so that we only use v2 block manager (which has much higher performance on prefix caching).
2024-10-17 11:38:15 -05:00
5eda21e773
[Hardware][CPU] compressed-tensor INT8 W8A8 AZP support ( #9344 )
2024-10-17 12:21:04 -04:00
8e1cddcd44
[TPU] Call torch._sync(param) during weight loading ( #9437 )
2024-10-17 09:00:11 -07:00
5e443b594f
[Bugfix] Allow prefill of assistant response when using mistral_common ( #9446 )
2024-10-17 15:06:37 +00:00
9d30a056e7
[misc] CUDA Time Layerwise Profiler ( #8337 )
...
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
Co-authored-by: Michael Goin <michael@neuralmagic.com >
2024-10-17 10:36:09 -04:00
390be74649
[Misc] Print stack trace using logger.exception ( #9461 )
2024-10-17 13:55:48 +00:00
e312e52b44
[Kernel] Add Exllama as a backend for compressed-tensors ( #9395 )
2024-10-17 09:48:26 -04:00
dbfa8d31d5
Add notes on the use of Slack ( #9442 )
2024-10-17 04:46:46 +00:00
92d86da217
[BugFix] [Kernel] Fix GPU SEGV occurring in int8 kernels ( #9391 )
2024-10-17 01:34:06 +00:00
c3fab5f769
[Bugfix][Kernel] Prevent integer overflow in fp8 dynamic per-token quantize kernel ( #9425 )
2024-10-16 23:46:06 +00:00
776dbd74f1
[CI/Build] mypy: Resolve some errors from checking vllm/engine ( #9267 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-10-16 22:55:59 +00:00
8345045833
[Performance][Spec Decode] Optimize ngram lookup performance ( #9333 )
2024-10-16 13:37:45 -06:00
5b8a1fde84
[Model][Bugfix] Add FATReLU activation and support for openbmb/MiniCPM-S-1B-sft ( #9396 )
2024-10-16 16:40:24 +00:00
fb60ae9b91
[Kernel][Model] Improve continuous batching for Jamba and Mamba ( #9189 )
2024-10-16 12:12:43 -04:00
415f76a9cb
Support mistral interleaved attn ( #9414 )
2024-10-16 13:28:30 +00:00
cf1d62a644
[Model] Support SDPA attention for Molmo vision backbone ( #9410 )
2024-10-16 11:52:01 +00:00
59230ef32b
[Misc] Consolidate example usage of OpenAI client for multimodal models ( #9412 )
...
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-10-16 11:20:51 +00:00
cee711fdbb
[Core] Rename input data types ( #8688 )
2024-10-16 10:49:37 +00:00
1de76a0e55
[CI/Build] Test VLM embeddings ( #9406 )
2024-10-16 09:44:30 +00:00
7abba39ee6
[Model] VLM2Vec, the first multimodal embedding model in vLLM ( #9303 )
2024-10-16 14:31:00 +08:00
7e7eae338d
[Misc] Standardize RoPE handling for Qwen2-VL ( #9250 )
2024-10-16 13:56:17 +08:00
ed920135c8
[Bugfix] Molmo text-only input bug fix ( #9397 )
...
Co-authored-by: sanghol <sanghol@allenai.org >
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com >
Co-authored-by: Roger Wang <ywang@roblox.com >
2024-10-16 04:56:09 +00:00
717a5f82cd
[Bugfix][CI/Build] Fix CUDA 11.8 Build ( #9386 )
2024-10-16 00:15:21 +00:00
ba30942240
[Bugfix] Fix vLLM UsageInfo and logprobs None AssertionError with empty token_ids ( #9034 )
...
Co-authored-by: Nick Hill <nickhill@us.ibm.com >
2024-10-15 15:40:43 -07:00
22f8a69549
[Misc] Directly use compressed-tensors for checkpoint definitions ( #8909 )
...
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-10-15 15:40:25 -07:00
5d264f4ab8
pass ignore_eos parameter to all benchmark_serving calls ( #9349 )
2024-10-15 13:30:44 -07:00
e9d517f276
[BugFix] Fix chat API continuous usage stats ( #9357 )
2024-10-14 23:19:48 -07:00
55e081fbad
[Bugfix] Update InternVL input mapper to support image embeds ( #9351 )
2024-10-14 21:29:19 -07:00
8e836d982a
[Doc] Fix code formatting in spec_decode.rst ( #9348 )
2024-10-14 21:29:11 -07:00
44eaa5a5d9
[Frontend] Clarify model_type error messages ( #9345 )
2024-10-14 21:29:01 -07:00
169b530607
[Bugfix] Clean up some cruft in mamba.py ( #9343 )
2024-10-15 00:24:25 +00:00
f0fe4fe86d
[Model] Make llama3.2 support multiple and interleaved images ( #9095 )
2024-10-14 15:24:26 -07:00
4d31cd424b
[Frontend] merge beam search implementations ( #9296 )
2024-10-14 15:05:52 -07:00
473e7b3606
[TPU] Fix TPU SMEM OOM by Pallas paged attention kernel ( #9350 )
2024-10-14 15:02:06 -07:00
fd47e57f4b
[Docs] Remove PDF build from Readthedocs ( #9347 )
2024-10-14 11:57:47 -07:00
203ab8f80f
[CI/Build] setuptools-scm fixes ( #8900 )
2024-10-14 11:34:47 -07:00
4141608c6a
[Hardware][intel GPU] add async output process for xpu ( #8897 )
2024-10-14 12:23:33 -06:00
dfe43a2071
[Model] Molmo vLLM Integration ( #9016 )
...
Co-authored-by: sanghol <sanghol@allenai.org >
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com >
Co-authored-by: Roger Wang <ywang@roblox.com >
2024-10-14 07:56:24 -07:00
16b24e7dcd
[Bugfix] Bandaid fix for speculative decoding tests ( #9327 )
2024-10-13 23:02:11 +00:00
f519902c52
[CI] Fix merge conflict ( #9317 )
2024-10-13 06:41:23 +00:00
250e26a63e
[Bugfix] Fix MiniCPM's LoRA bug ( #9286 )
2024-10-12 09:36:47 -07:00
2b184ddd4f
[Misc][Installation] Improve source installation script and doc ( #9309 )
...
Co-authored-by: youkaichao <youkaichao@126.com >
2024-10-12 09:36:40 -07:00
00298e092c
[Bugfix] Fix bug of xformer prefill for encoder-decoder ( #9026 )
2024-10-12 15:00:43 +08:00
89feb4c84d
[SpecDec] Remove Batch Expansion (2/3) ( #9298 )
2024-10-12 05:13:37 +00:00
ec10cb8511
[BugFix] Fix tool call finish reason in streaming case ( #9209 )
...
Signed-off-by: Max de Bayser <mbayser@br.ibm.com >
2024-10-11 18:24:26 -07:00
d11b46f3a5
[bugfix] fix f-string for error ( #9295 )
...
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com >
2024-10-11 17:03:48 -07:00
c6cf9295e1
[Bugfix] Sets is_first_step_output for TPUModelRunner ( #9202 )
2024-10-11 13:28:10 -07:00
de9fb4bef8
[Bugfix][CI/Build] Fix docker build where CUDA archs < 7.0 are being detected ( #9254 )
2024-10-11 15:57:39 -04:00
8baf85e4e9
[Doc] Compatibility matrix for mutual exclusive features ( #8512 )
...
Signed-off-by: Wallas Santos <wallashss@ibm.com >
2024-10-11 11:18:50 -07:00
1a1823871d
[Doc] Remove outdated comment to avoid misunderstanding ( #9287 )
2024-10-11 18:02:03 +00:00
6cf1167c1a
[Model] Add GLM-4v support and meet vllm==0.6.2 ( #9242 )
2024-10-11 17:36:13 +00:00
f710090d8e
[Kernel] adding fused moe kernel config for L40S TP4 ( #9245 )
2024-10-11 08:54:22 -07:00
7342a7d7f8
[Model] Support Mamba ( #6484 )
2024-10-11 15:40:06 +00:00
df3dcdf49d
[Bugfix] Fix priority in multiprocessing engine ( #9277 )
2024-10-11 15:35:35 +00:00
36ea79079b
[Misc][LoRA] Support loading LoRA weights for target_modules in reg format ( #9275 )
2024-10-11 12:31:21 +00:00
e808156f30
[Misc] Collect model support info in a single process per model ( #9233 )
2024-10-11 11:08:11 +00:00
cbc2ef5529
[misc] hide best_of from engine ( #9261 )
...
Co-authored-by: Brendan Wong <bjwpokemon@gmail.com >
2024-10-10 21:30:44 -07:00
94bf9ae4e9
[Misc] Fix sampling from sonnet for long context case ( #9235 )
2024-10-11 00:33:16 +00:00
f990bab2a4
[Doc][Neuron] add note to neuron documentation about resolving triton issue ( #9257 )
...
Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com >
2024-10-10 23:36:32 +00:00
e00c094f15
[torch.compile] generic decorators ( #9258 )
2024-10-10 15:54:23 -07:00
a78c6ba7c8
[ci/build] Add placeholder command for custom models test ( #9262 )
2024-10-10 15:45:09 -07:00
fb870fd491
Bump actions/setup-python from 3 to 5 ( #9195 )
...
Signed-off-by: dependabot[bot] <support@github.com >
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-10-10 13:30:46 -07:00
270953bafb
Bump actions/checkout from 3 to 4 ( #9196 )
...
Signed-off-by: dependabot[bot] <support@github.com >
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-10-10 13:30:35 -07:00
9cc811c4ff
Bump actions/github-script from 6 to 7 ( #9197 )
...
Signed-off-by: dependabot[bot] <support@github.com >
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-10-10 13:30:24 -07:00
e4d652ea3e
[torch.compile] integration with compilation control ( #9058 )
2024-10-10 12:39:36 -07:00
78c0b4166c
Suggest codeowners for the core components ( #9210 )
2024-10-10 12:29:24 -07:00
21efb603f5
[CI/Build] Make the Dockerfile.cpu file's PIP_EXTRA_INDEX_URL Configurable as a Build Argument ( #9252 )
2024-10-10 18:18:18 +00:00
055f3270d4
[Doc] Improve debugging documentation ( #9204 )
...
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com >
2024-10-10 10:48:51 -07:00
18511aeda6
[Bugfix] Fix Machete unittests failing with NotImplementedError ( #9218 )
2024-10-10 17:39:56 +00:00
83ea5c72b9
[OpenVINO] Use torch 2.4.0 and newer optimum version ( #9121 )
...
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-10-10 11:18:58 -06:00
04de9057ab
[Model] support input image embedding for minicpmv ( #9237 )
2024-10-10 15:00:47 +00:00
07c11cf4d4
[Bugfix] Fix lm_head weights tying with lora for llama ( #9227 )
2024-10-10 21:11:56 +08:00
f3a507f1d3
[Core] Add an environment variable which needs to be set explicitly to allow BlockSpaceManagerV1 ( #9149 )
2024-10-10 14:17:17 +08:00
a64e7b9407
[Bugfix] Machete garbage results for some models (large K dim) ( #9212 )
2024-10-10 14:16:17 +08:00
ce00231a8b
[Bugfix] Fix Weight Loading Multiple GPU Test - Large Models ( #9213 )
2024-10-10 14:15:40 +08:00
de895f1697
[misc] improve model support check in another process ( #9208 )
2024-10-09 21:58:27 -07:00
cf25b93bdd
[Core] Fix invalid args to _process_request ( #9201 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-10-10 12:10:09 +08:00
d5fbb8706d
[CI/Build] Update Dockerfile install+deploy image to ubuntu 22.04 ( #9130 )
...
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-10-09 12:51:47 -06:00
cdca8994bd
[CI/Build] mypy: check vllm/entrypoints ( #9194 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-10-09 17:15:28 +00:00
ca77dd7a44
[Hardware][CPU] Support AWQ for CPU backend ( #7515 )
2024-10-09 10:28:08 -06:00
7dea289066
Add Dependabot configuration for GitHub Actions updates ( #1217 )
...
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-10-09 08:16:26 -07:00
cfaa6008e6
[Bugfix] Access get_vocab instead of vocab in tool parsers ( #9188 )
2024-10-09 08:59:57 -06:00
21906a6f50
[Bugfix] Fix lora loading for Compressed Tensors in #9120 ( #9179 )
2024-10-09 12:10:44 +00:00
dc4aea677a
[Doc] Fix VLM prompt placeholder sample bug ( #9170 )
2024-10-09 08:59:42 +00:00
c8627cd41b
[ci][test] use load dummy for testing ( #9165 )
2024-10-09 00:38:40 -07:00
8bfaa4e31e
[Bugfix] fix composite weight loading and EAGLE weight loading ( #9160 )
2024-10-09 00:36:55 -07:00
0b5b5d767e
[Frontend] Log the maximum supported concurrency ( #8831 )
2024-10-09 00:03:14 -07:00
cdc72e3c80
[Model] Remap FP8 kv_scale in CommandR and DBRX ( #9174 )
2024-10-09 06:43:06 +00:00
7627172bf4
[Bugfix][Doc] Report neuron error in output ( #9159 )
2024-10-08 22:43:34 -07:00
480b7f40cf
[Misc] Improve validation errors around best_of and n ( #9167 )
...
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com >
2024-10-09 04:54:48 +00:00
acce7630c1
Update link to KServe deployment guide ( #9173 )
2024-10-09 03:58:49 +00:00
ffc4b27ea8
Add classifiers in setup.py ( #9171 )
2024-10-08 19:30:48 -07:00
2f4117c38e
support bitsandbytes quantization with more models ( #9148 )
2024-10-08 19:52:19 -06:00
9ba0bd6aa6
Add lm-eval directly to requirements-test.txt ( #9161 )
2024-10-08 18:22:31 -07:00
2a131965a8
mypy: check additional directories ( #9162 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-10-08 22:08:22 +00:00
bd37b9fbe2
[Bugfix] Try to handle older versions of pytorch ( #9086 )
2024-10-08 14:28:12 -07:00
de24046fcd
[Doc] Improve contributing and installation documentation ( #9132 )
...
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com >
2024-10-08 20:22:08 +00:00
1874c6a1b0
[Doc] Update vlm.rst to include an example on videos ( #9155 )
...
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com >
2024-10-08 18:12:29 +00:00
9a94ca4a5d
[Bugfix] fix OpenAI API server startup with --disable-frontend-multiprocessing ( #8537 )
2024-10-08 09:38:40 -07:00
cfba685bd4
[CI/Build] Add examples folder into Docker image so that we can leverage the templates*.jinja when serving models ( #8758 )
...
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io >
2024-10-08 09:37:34 -07:00
069d3bd8d0
[Frontend] Add Early Validation For Chat Template / Tool Call Parser ( #9151 )
...
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com >
2024-10-08 14:31:26 +00:00
a3691b6b5e
[Core][Frontend] Add Support for Inference Time mm_processor_kwargs ( #9131 )
...
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com >
2024-10-08 14:12:56 +00:00
8c746226c9
[Frontend] API support for beam search for MQLLMEngine ( #9117 )
2024-10-08 05:51:43 +00:00
e1faa2a598
[misc] improve ux on readme ( #9147 )
2024-10-07 22:26:25 -07:00
80b57f00d5
[Intel GPU] Fix xpu decode input ( #9145 )
2024-10-08 03:51:14 +00:00
04c12f8157
[misc] update utils to support comparing multiple settings ( #9140 )
2024-10-08 02:51:49 +00:00
8eeb857084
Add Slack to README ( #9137 )
2024-10-07 17:06:21 -07:00
fa45513a51
[misc] fix comment and variable name ( #9139 )
2024-10-07 16:07:05 -07:00
c0d9a98d0c
[Doc] Include performance benchmark in README ( #9135 )
2024-10-07 15:04:06 -07:00
e0dbdb013d
[CI/Build] Add linting for github actions workflows ( #7876 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
2024-10-07 21:18:10 +00:00
93cf74a8a7
[Doc]: Add deploying_with_k8s guide ( #8451 )
2024-10-07 13:31:45 -07:00
151ef4efd2
[Model] Support NVLM-D and fix QK Norm in InternViT ( #9045 )
...
Co-authored-by: Roger Wang <ywang@roblox.com >
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn >
2024-10-07 11:55:12 +00:00
f19da64871
[Core] Refactor GGUF parameters packing and forwarding ( #8859 )
2024-10-07 10:01:46 +00:00
4f95ffee6f
[Hardware][CPU] Cross-attention and Encoder-Decoder models support on CPU backend ( #9089 )
2024-10-07 06:50:35 +00:00
8c6de96ea1
[Model] Explicit interface for vLLM models and support OOT embedding models ( #9108 )
2024-10-07 06:10:35 +00:00
18b296fdb2
[core] remove beam search from the core ( #9105 )
2024-10-07 05:47:04 +00:00
c8f26bb636
[BugFix][Core] Fix BlockManagerV2 when Encoder Input is None ( #9103 )
2024-10-07 03:52:42 +00:00
487678d046
[Bugfix][Hardware][CPU] Fix CPU model input for decode ( #9044 )
2024-10-06 19:14:27 -07:00
cb3b2b9ba4
[Bugfix] Fix incorrect updates to num_computed_tokens in multi-step scheduling ( #9038 )
...
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
2024-10-06 12:48:11 -07:00
fdf59d30ea
[Bugfix] fix tool_parser error handling when serving a model that does not support it ( #8709 )
2024-10-06 12:51:08 +00:00
b22b798471
[Model] PP support for embedding models and update docs ( #9090 )
...
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com >
2024-10-06 16:35:27 +08:00
f22619fe96
[Misc] Remove user-facing error for removed VLM args ( #9104 )
2024-10-06 01:33:52 -07:00
168cab6bbf
[Frontend] API support for beam search ( #9087 )
...
Co-authored-by: youkaichao <youkaichao@126.com >
2024-10-05 23:39:03 -07:00
23fea8714a
[Bugfix] Fix try-catch conditions to import correct Flash Attention Backend in Draft Model ( #9101 )
2024-10-06 13:00:04 +08:00
f4dd830e09
[core] use forward context for flash infer ( #9097 )
2024-10-05 19:37:31 -07:00
5df1834895
[Bugfix] Fix bug where argument order matters in config.yaml ( #8960 )
2024-10-05 17:35:11 +00:00
cfadb9c687
[Bugfix] Deprecate registration of custom configs to huggingface ( #9083 )
2024-10-05 21:56:40 +08:00
15986f598c
[Model] Support Gemma2 embedding model ( #9004 )
2024-10-05 06:57:05 +00:00
53b3a33027
[Bugfix] Fixes Phi3v & Ultravox Multimodal EmbeddingInputs ( #8979 )
2024-10-04 22:05:37 -07:00
dac914b0d6
[Bugfix] use blockmanagerv1 for encoder-decoder ( #9084 )
...
Co-authored-by: Roger Wang <ywang@roblox.com >
2024-10-05 04:45:38 +00:00
a95354a36e
[Doc] Update README.md with Ray summit slides ( #9088 )
2024-10-05 02:54:45 +00:00
663874e048
[torch.compile] improve allreduce registration ( #9061 )
2024-10-04 16:43:50 -07:00
cc90419e89
[Hardware][Neuron] Add on-device sampling support for Neuron ( #8746 )
...
Co-authored-by: Ashraf Mahgoub <ashymahg@amazon.com >
2024-10-04 16:42:20 -07:00
27302dd584
[Misc] Fix CI lint ( #9085 )
2024-10-04 16:07:54 -07:00
0cc566ca8f
[Misc] Add random seed for prefix cache benchmark ( #9081 )
2024-10-04 21:58:57 +00:00
05c531be47
[Misc] Improved prefix cache example ( #9077 )
2024-10-04 21:38:42 +00:00
fbb74420e7
[CI] Update performance benchmark: upgrade trt-llm to r24.07, and add SGLang ( #7412 )
2024-10-04 14:01:44 -07:00
05d686432f
[Kernel] Zero point support in fused MarlinMoE kernel + AWQ Fused MoE ( #8973 )
...
Co-authored-by: Dipika <dipikasikka1@gmail.com >
Co-authored-by: Dipika Sikka <ds3822@columbia.edu >
2024-10-04 12:34:44 -06:00
0dcc8cbe5a
Adds truncate_prompt_tokens param for embeddings creation ( #8999 )
...
Signed-off-by: Flavia Beo <flavia.beo@ibm.com >
2024-10-04 18:31:40 +00:00
26aa325f4f
[Core][VLM] Test registration for OOT multimodal models ( #8717 )
...
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-10-04 10:38:25 -07:00
e5dc713c23
[Hardware][PowerPC] Make oneDNN dependency optional for Power ( #9039 )
...
Signed-off-by: Varad Ahirwadkar <varad.ahirwadkar1@ibm.com >
2024-10-04 17:24:42 +00:00
36eecfbddb
Remove AMD Ray Summit Banner ( #9075 )
2024-10-04 10:17:16 -07:00
9ade8bbc8d
[Model] add a bunch of supported lora modules for mixtral ( #9008 )
...
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com >
2024-10-04 16:24:40 +00:00
22482e495e
[Bugfix] Flash attention arches not getting set properly ( #9062 )
2024-10-04 09:43:15 -06:00
3d826d2c52
[Bugfix] Reshape the dimensions of the input image embeddings in Qwen2VL ( #9071 )
2024-10-04 14:34:58 +00:00
0e36fd4909
[Misc] Move registry to its own file ( #9064 )
2024-10-04 10:01:37 +00:00
0f6d7a9a34
[Models] Add remaining model PP support ( #7168 )
...
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai >
Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai >
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-10-04 10:56:58 +08:00
303d44790a
[Misc] Enable multi-step output streaming by default ( #9047 )
2024-10-03 22:55:42 -04:00
aeb37c2a72
[CI/Build] Per file CUDA Archs (improve wheel size and dev build times) ( #8845 )
2024-10-03 22:55:25 -04:00
3dbb215b38
[Frontend][Feature] support tool calling for internlm/internlm2_5-7b-chat model ( #8405 )
2024-10-04 10:36:39 +08:00
2838d6b38e
[Bugfix] Weight loading fix for OPT model ( #9042 )
...
Co-authored-by: dvres <dvres@fri.uni-lj.si >
2024-10-03 19:53:29 -04:00
91add85ec4
Fix failing spec decode test ( #9054 )
2024-10-03 23:07:29 +00:00
9aaf14c62e
[misc] add forward context for attention ( #9029 )
2024-10-03 12:09:42 -07:00
63e39937f9
[Frontend] [Neuron] Parse literals out of override-neuron-config ( #8959 )
...
Co-authored-by: Jerzy Zagorski <jzagorsk@amazon.com >
2024-10-03 18:02:07 +00:00
f5d72b2fc6
[Core] Make BlockSpaceManagerV2 the default BlockManager to use. ( #8678 )
2024-10-03 09:44:21 -07:00
83caf35e08
[BugFix] Enforce Mistral ToolCall id constraint when using the Mistral tool call parser ( #9020 )
2024-10-03 16:44:52 +08:00
01843c89b8
[Misc] log when using default MoE config ( #8971 )
2024-10-03 04:31:07 +00:00
19a4dd0990
[Bugfix] example template should not add parallel_tool_prompt if tools is none ( #9007 )
2024-10-03 03:04:17 +00:00
18c2e30c57
[Doc] Update Granite model docs ( #9025 )
2024-10-03 02:42:24 +00:00
19f0d25796
[Model] Adding Granite MoE. ( #8206 )
...
Co-authored-by: Nick Hill <nickhill@us.ibm.com >
2024-10-03 09:33:57 +08:00
f58d4fccc9
[OpenVINO] Enable GPU support for OpenVINO vLLM backend ( #8192 )
2024-10-02 17:50:01 -04:00
afb050b29d
[Core] CUDA Graphs for Multi-Step + Chunked-Prefill ( #8645 )
...
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
2024-10-02 19:44:39 +00:00
7f60520deb
[Misc] Update Default Image Mapper Error Log ( #8977 )
...
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com >
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com >
2024-10-02 11:44:38 +00:00
563649aafe
[Core] Combined support for multi-step scheduling, chunked prefill & prefix caching ( #8804 )
...
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
Co-authored-by: Andrew Feldman <afeld2012@gmail.com >
2024-10-02 07:52:20 +00:00
1570203864
[Spec Decode] (1/2) Remove batch expansion ( #8839 )
2024-10-01 16:04:42 -07:00
22f5851b80
Update benchmark_serving.py to read and write JSON datasets and results in UTF-8, for better compatibility with Windows ( #8997 )
2024-10-01 11:07:06 -07:00
4f341bd4bf
[Doc] Update list of supported models ( #8987 )
2024-10-02 00:35:39 +08:00
35bd215168
[Core] [Frontend] Priority scheduling for embeddings and in the OpenAI-API ( #8965 )
2024-10-01 09:58:06 +00:00
1fe0a4264a
[Bugfix] Fix Token IDs Reference for MiniCPM-V When Images are Provided With No Placeholders ( #8991 )
...
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com >
2024-10-01 09:52:44 +00:00
bc4eb65b54
[Bugfix] Fix Fuyu tensor parallel inference ( #8986 )
2024-10-01 17:51:41 +08:00
82f3937e59
[Misc] add process_weights_after_loading for DummyLoader ( #8969 )
2024-10-01 03:46:41 +00:00
7da2487591
[torch.compile] fix tensor alias ( #8982 )
2024-10-01 03:40:48 +00:00
aaccca2b4d
[CI/Build] Fix machete generated kernel files ordering ( #8976 )
...
Signed-off-by: kevin <kevin@anyscale.com >
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com >
2024-10-01 03:33:12 +00:00
062c89e7c9
[Frontend][Core] Move guided decoding params into sampling params ( #8252 )
...
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com >
Co-authored-by: Nick Hill <nickhill@us.ibm.com >
2024-10-01 09:34:25 +08:00
bce324487a
[CI][SpecDecode] Fix spec decode tests, use flash attention backend for spec decode CI tests. ( #8975 )
2024-10-01 00:51:40 +00:00
1425a1bcf9
[ci] Add CODEOWNERS for test directories ( #8795 )
...
Signed-off-by: kevin <kevin@anyscale.com >
2024-10-01 00:47:08 +00:00
1cabfcefb6
[Misc] Adjust max_position_embeddings for LoRA compatibility ( #8957 )
2024-09-30 12:57:39 +00:00
be76e5aabf
[Core] Make scheduling policy settable via EngineArgs ( #8956 )
2024-09-30 12:28:44 +00:00
2ae25f79cf
[Model] Expose InternVL2 max_dynamic_patch as a mm_processor_kwarg ( #8946 )
2024-09-30 13:01:20 +08:00
8e60afa15e
[Model][LoRA] LoRA support added for MiniCPMV2.6 ( #8943 )
...
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk >
2024-09-30 04:31:55 +00:00
b6d7392579
[Misc][CI/Build] Include cv2 via mistral_common[opencv] ( #8951 )
2024-09-30 04:28:26 +00:00
e01ab595d8
[Model] support input embeddings for qwen2vl ( #8856 )
2024-09-30 03:16:10 +00:00
f13a07b1f8
[Kernel][Model] Varlen prefill + Prefill chunking support for mamba kernels and Jamba model ( #8533 )
2024-09-29 17:35:58 -04:00
6c9ba48fde
[Frontend] Added support for HF's new continue_final_message parameter ( #8942 )
2024-09-29 17:59:47 +00:00
1fb9c1b0bf
[Misc] Fix typo in BlockSpaceManagerV1 ( #8944 )
2024-09-29 15:05:54 +00:00
31f46a0d35
[BugFix] Fix seeded random sampling with encoder-decoder models ( #8870 )
...
Co-authored-by: Roger Wang <ywang@roblox.com >
2024-09-29 09:43:14 +00:00
3d49776bbb
[Model][LoRA] LoRA support added for MiniCPMV2.5 ( #7199 )
2024-09-29 06:59:45 +00:00
bc2ef1f77c
[Model] Support Qwen2.5-Math-RM-72B ( #8896 )
2024-09-28 21:19:39 -07:00
2e7fe7e79f
[Build/CI] Set FETCHCONTENT_BASE_DIR to one location for better caching ( #8930 )
2024-09-29 03:13:01 +00:00
26a68d5d7e
[CI/Build] Add test decorator for minimum GPU memory ( #8925 )
2024-09-29 02:50:51 +00:00
d081da0064
[Bugfix] Fix Marlin MoE act order when is_k_full == False ( #8741 )
...
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com >
2024-09-28 18:19:40 -07:00
5bf8789b2a
[Bugfix] Block manager v2 with preemption and lookahead slots ( #8824 )
2024-09-29 09:17:45 +08:00
d1537039ce
[Core] Improve choice of Python multiprocessing method ( #8823 )
...
Signed-off-by: Russell Bryant <rbryant@redhat.com >
Co-authored-by: youkaichao <youkaichao@126.com >
2024-09-29 09:17:07 +08:00
cc276443b5
[doc] organize installation doc and expose per-commit docker ( #8931 )
2024-09-28 17:48:41 -07:00
e585b583a9
[Bugfix] Support testing prefill throughput with benchmark_serving.py --hf-output-len 1 ( #8891 )
2024-09-28 18:51:22 +00:00
090e945e36
[Frontend] Make beam search emulator temperature modifiable ( #8928 )
...
Co-authored-by: Eduard Balzin <nfunctor@yahoo.fr >
2024-09-28 11:30:21 -07:00
e1a3f5e831
[CI/Build] Update models tests & examples ( #8874 )
...
Co-authored-by: Roger Wang <ywang@roblox.com >
2024-09-28 09:54:35 -07:00
19d02ff938
[Bugfix] Fix PP for Multi-Step ( #8887 )
2024-09-28 08:52:46 -07:00
39d3f8d94f
[Bugfix] Fix code for downloading models from modelscope ( #8443 )
2024-09-28 08:24:12 -07:00
b0298aa8cc
[Misc] Remove vLLM patch of BaichuanTokenizer ( #8921 )
2024-09-28 08:11:25 +00:00
260024a374
[Bugfix][Intel] Fix XPU Dockerfile Build ( #7824 )
...
Signed-off-by: tylertitsworth <tyler.titsworth@intel.com >
Co-authored-by: youkaichao <youkaichao@126.com >
2024-09-27 23:45:50 -07:00
d86f6b2afb
[misc] fix wheel name ( #8919 )
2024-09-27 22:10:44 -07:00
bd429f2b75
[Core] Priority-based scheduling in async engine ( #8850 )
2024-09-27 15:07:10 -07:00
18e60d7d13
[misc][distributed] add VLLM_SKIP_P2P_CHECK flag ( #8911 )
2024-09-27 14:27:56 -07:00
c2ec430ab5
[Core] Multi-Step + Single Step Prefills via Chunked Prefill code path ( #8378 )
...
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com >
2024-09-27 13:32:07 -07:00
c5d55356f9
[Bugfix] fix for deepseek w4a16 ( #8906 )
...
Co-authored-by: mgoin <michael@neuralmagic.com >
2024-09-27 13:12:34 -06:00
172d1cd276
[Kernel] AQ AZP 4/4: Integrate asymmetric quantization to linear method ( #7271 )
2024-09-27 14:25:10 -04:00
a9b15c606f
[torch.compile] use empty tensor instead of None for profiling ( #8875 )
2024-09-27 08:11:32 -07:00
8df2dc3c88
[TPU] Update pallas.py to support trillium ( #8871 )
2024-09-27 01:16:55 -07:00
6d792d2f31
[Bugfix][VLM] Fix Fuyu batching inference with max_num_seqs>1 ( #8892 )
2024-09-27 01:15:58 -07:00
0e088750af
[MISC] Fix invalid escape sequence '\' ( #8830 )
...
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io >
2024-09-27 01:13:25 -07:00
dc4e3df5c2
[misc] fix collect env ( #8894 )
2024-09-27 00:26:38 -07:00
3b00b9c26c
[Core] rename PromptInputs and inputs ( #8876 )
2024-09-26 20:35:15 -07:00
344cd2b6f4
[Feature] Add support for Llama 3.1 and 3.2 tool use ( #8343 )
...
Signed-off-by: Max de Bayser <mbayser@br.ibm.com >
2024-09-26 17:01:42 -07:00
1b49148e47
[Installation] Allow lower versions of FastAPI to maintain Ray 2.9 compatibility ( #8764 )
2024-09-26 16:54:09 -07:00
4b377d6feb
[BugFix] Fix test breakages from transformers 4.45 upgrade ( #8829 )
2024-09-26 16:46:43 -07:00
71d21c73ab
[Bugfix] Fixup advance_step.cu warning ( #8815 )
2024-09-26 16:23:45 -07:00
ee2da3e9ef
fix validation: Only set tool_choice auto if at least one tool is provided ( #8568 )
2024-09-26 16:23:17 -07:00
e2f6f26e86
[Bugfix] Fix print_warning_once's line info ( #8867 )
2024-09-26 16:18:26 -07:00
b28d2104de
[Misc] Change dummy profiling and BOS fallback warns to log once ( #8820 )
2024-09-26 16:18:14 -07:00
93d364da34
[Bugfix] Include encoder prompts len to non-stream api usage response ( #8861 )
2024-09-26 15:47:00 -07:00
d9cfbc891e
[ci] Soft fail Entrypoints, Samplers, LoRA, Decoder-only VLM ( #8872 )
...
Signed-off-by: kevin <kevin@anyscale.com >
2024-09-26 15:02:16 -07:00
70de39f6b4
[misc][installation] build from source without compilation ( #8818 )
2024-09-26 13:19:04 -07:00
68988d4e0d
[CI/Build] Fix missing ci dependencies ( #8834 )
2024-09-26 11:04:39 -07:00
520db4dbc1
[Docs] Add README to the build docker image ( #8825 )
2024-09-26 11:02:52 -07:00
f70bccac75
[Build/CI] Upgrade to gcc 10 in the base build Docker image ( #8814 )
2024-09-26 10:07:18 -07:00
4bb98f2190
[Misc] Update config loading for Qwen2-VL and remove Granite ( #8837 )
2024-09-26 07:45:30 -07:00