[Bugfix] Fix request cancellation without polling (#11190 )

[Misc] Allow passing logits_soft_cap for xformers backend (#11252 )
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-10-26 02:44:33 +08:00 · 2024-12-17 12:26:32 -08:00 · 2024-12-17 00:37:04 -08:00 · 2024-12-17 06:57:02 +00:00 · 2024-12-16 22:11:33 -08:00 · 2024-12-16 22:11:06 -08:00
1122 changed files with 107561 additions and 36023 deletions
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
+model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.764
+  - name: "exact_match,flexible-extract"
+    value: 0.764
+limit: 250
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
+model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.356
+  - name: "exact_match,flexible-extract"
+    value: 0.358
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@ -1,6 +1,7 @@
 Meta-Llama-3-8B-Instruct.yaml
 Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
-Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
 Minitron-4B-Base-FP8.yaml
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+#   pip install lm-eval==0.4.4

 usage() {
    echo``
@ -41,6 +41,6 @@ while getopts "m:b:l:f:" OPT; do
 done

 lm_eval --model hf \
-  --model_args pretrained=$MODEL,parallelize=True \
-  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
-  --batch_size $BATCH_SIZE
+  --model_args "pretrained=$MODEL,parallelize=True" \
+  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+  --batch_size "$BATCH_SIZE"
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install lm-eval==0.4.3
+#   pip install lm-eval==0.4.4

 usage() {
    echo``
@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done

 lm_eval --model vllm \
-  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
-  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
-  --batch_size $BATCH_SIZE
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
+  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+  --batch_size "$BATCH_SIZE"
--- a/.buildkite/lm-eval-harness/run-tests.sh
+++ b/.buildkite/lm-eval-harness/run-tests.sh
@ -30,7 +30,7 @@ while getopts "c:t:" OPT; do
 done

 # Parse list of configs.
-IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"

 for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
 do
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -49,10 +49,15 @@ def test_lm_eval_correctness():
    results = launch_lm_eval(eval_config)

    # Confirm scores match ground truth.
+    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured_value}')
-            assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
+            success = success and numpy.isclose(
+                ground_truth, measured_value, rtol=RTOL)
+
+    # Assert at the end, print all scores even on failure for debugging.
+    assert success
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@ -9,8 +9,11 @@ steps:
          - image: badouralix/curl-jq
            command:
            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+
  - wait
+
  - label: "A100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: A100
    plugins:
@ -18,7 +21,7 @@ steps:
        podSpec:
          priorityClassName: perf-benchmark
          containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
            command:
            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
            resources:
@ -41,20 +44,48 @@ steps:
          - name: devshm
            emptyDir:
              medium: Memory
-  # - label: "H100"
-  #   agents:
-  #     queue: H100
-  #   plugins:
-  #   - docker#v5.11.0:
-  #       image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-  #       command:
-  #       - bash
-  #       - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
-  #       mount-buildkite-agent: true
-  #       propagate-environment: true
-  #       ipc: host
-  #       gpus: all
-  #       environment:
-  #       - VLLM_USAGE_SOURCE
-  #       - HF_TOKEN

+  - label: "H200"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H200
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: 4,5,6,7
+        volumes:
+          - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
+
+  - block: "Run H100 Benchmark"
+    key: block-h100
+    depends_on: ~
+
+  - label: "H100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H100
+    depends_on: block-h100
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
+        volumes:
+          - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
--- a/.buildkite/nightly-benchmarks/nightly-annotation.md
+++ b/.buildkite/nightly-benchmarks/nightly-annotation.md
@ -0,0 +1,28 @@
+
+## Description
+
+This file contains the downloading link for benchmarking results.
+
+- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
+- [benchmarking results](artifact://results.zip)
+- [benchmarking code](artifact://nightly-benchmarks.zip)
+
+Please download the visualization scripts in the post
+
+
+## Results reproduction
+
+- Find the docker we use in `benchmarking pipeline`
+- Deploy the docker, and inside the docker:
+  - Download `nightly-benchmarks.zip`. 
+  - In the same folder, run the following code
+```
+export HF_TOKEN=<your HF token>
+apt update
+apt install -y git
+unzip nightly-benchmarks.zip
+VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+```
+
+And the results will be inside `./benchmarks/results`.
+
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@ -1,45 +1,39 @@

 # Nightly benchmark

-The main goal of this benchmarking is two-fold:
- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload.
- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().
+This benchmark aims to:
+- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
+- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
+
+Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
+
+Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)


-## Docker images
+## Setup

-We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
- vllm/vllm-openai:v0.5.0.post1
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
- openmmlab/lmdeploy:v0.5.0
- ghcr.io/huggingface/text-generation-inference:2.1
-
-<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
-
-
-## Hardware
-
-One AWS node with 8x NVIDIA A100 GPUs.
-
-
-## Workload description
-
-We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
-
- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 500 prompts.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
+- Docker images:
+  - vLLM: `vllm/vllm-openai:v0.6.2`
+  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
+  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
+  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
+    - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
+  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
+- Hardware
+  - 8x Nvidia A100 GPUs
+- Workload:
+  - Dataset
+    - ShareGPT dataset
+    - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
+    - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
+    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
+  - Models: llama-3 8B, llama-3 70B.
+    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
+  - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
+    - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).

-<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
+# Known issues

-## Plots
-
-In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
-
-<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
-
-## Results
-
-{nightly_results_benchmarking_table}
+- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
+- TGI does not support `ignore-eos` flag.
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@ -13,7 +13,7 @@ common_pod_spec: &common_pod_spec

 common_container_settings: &common_container_settings
  command:
-    - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
  resources:
    limits:
      nvidia.com/gpu: 8
@ -37,7 +37,10 @@ common_container_settings: &common_container_settings

 steps:
  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
-  - label: "A100 trt benchmark"
+
+
+
+  - label: "A100 vllm step 10"
    priority: 100
    agents:
      queue: A100
@ -46,7 +49,21 @@ steps:
          podSpec:
            <<: *common_pod_spec
            containers:
-              - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+              - image: vllm/vllm-openai:v0.6.2
+                <<: *common_container_settings
+
+
+
+  - label: "A100 sglang benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: lmsysorg/sglang:v0.3.2-cu121
                <<: *common_container_settings

  - label: "A100 lmdeploy benchmark"
@ -58,11 +75,13 @@ steps:
          podSpec:
            <<: *common_pod_spec
            containers:
-              - image: openmmlab/lmdeploy:v0.5.0
+              - image: openmmlab/lmdeploy:v0.6.1-cu12
                <<: *common_container_settings


-  - label: "A100 vllm benchmark"
+
+
+  - label: "A100 trt llama-8B"
    priority: 100
    agents:
      queue: A100
@ -71,10 +90,25 @@ steps:
          podSpec:
            <<: *common_pod_spec
            containers:
-              - image: vllm/vllm-openai:latest 
+              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
                <<: *common_container_settings
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                  - name: TEST_SELECTOR
+                    value: "llama8B"

-  - label: "A100 tgi benchmark"
+
+  - label: "A100 trt llama-70B"
    priority: 100
    agents:
      queue: A100
@ -83,12 +117,54 @@ steps:
          podSpec:
            <<: *common_pod_spec
            containers:
-              - image: ghcr.io/huggingface/text-generation-inference:2.1 
+              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
                <<: *common_container_settings
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                  - name: TEST_SELECTOR
+                    value: "llama70B"
+
+
+  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image 
+  # - label: "A100 trt benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           <<: *common_pod_spec
+  #           containers:
+  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+  #               <<: *common_container_settings
+
+
+  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
+  # - label: "A100 tgi benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           <<: *common_pod_spec
+  #           containers:
+  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
+  #               <<: *common_container_settings
        
  - wait

-  - label: "Plot"
+  - label: "Collect the results"
    priority: 100
    agents:
      queue: A100
@ -117,4 +193,4 @@ steps:
                    name: hf-token-secret
                    key: token

-  - wait
+  - block: ":rocket: check the results!"
--- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@ -1,76 +0,0 @@
-#!/bin/bash
-
-set -o pipefail
-set -x
-
-check_gpus() {
-    # check the number of GPUs and GPU type.
-    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-    if [[ $gpu_count -gt 0 ]]; then
-        echo "GPU found."
-    else
-        echo "Need at least 1 GPU to run benchmarking."
-        exit 1
-    fi
-    declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
-    echo "GPU type is $gpu_type"
-}
-
-check_hf_token() {
-    # check if HF_TOKEN is available and valid
-    if [[ -z "$HF_TOKEN" ]]; then
-        echo "Error: HF_TOKEN is not set."
-        exit 1
-    elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
-        echo "Error: HF_TOKEN does not start with 'hf_'."
-        exit 1
-    else
-        echo "HF_TOKEN is set and valid."
-    fi
-}
-
-main() {
-
-    check_gpus
-    check_hf_token
-
-    df -h
-
-    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-    (which jq) || (apt-get update && apt-get -y install jq)
-
-    cd $VLLM_SOURCE_CODE_LOC/benchmarks
-    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-    
-
-    # run lmdeploy
-    if which lmdeploy >/dev/null; then
-        echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
-        bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
-        exit 0
-    fi
-
-    # run tgi
-    if [ -e /tgi-entrypoint.sh ]; then
-        echo "tgi is available, redirect to run-tgi-nightly.sh"
-        bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
-        exit 0
-    fi
-
-    # run trt
-    if which trtllm-build >/dev/null; then
-        echo "trtllm is available, redirect to run-trt-nightly.sh"
-        bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
-        exit 0
-    fi
-
-    # run vllm
-    if [ -e /vllm-workspace ]; then
-        echo "vllm is available, redirect to run-vllm-nightly.sh"
-        bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
-        exit 0
-    fi
-
-}
-
-main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -56,7 +56,7 @@ serving_column_mapping = {

 def read_markdown(file):
    if os.path.exists(file):
-        with open(file, "r") as f:
+        with open(file) as f:
            return f.read() + "\n"
    else:
        return f"{file} not found.\n"
@ -75,14 +75,14 @@ if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):

-        with open(test_file, "r") as f:
+        with open(test_file) as f:
            raw_result = json.loads(f.read())

        if "serving" in str(test_file):
            # this result is generated via `benchmark_serving.py`

            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                command = json.loads(f.read())
            raw_result.update(command)

@ -97,7 +97,7 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_latency.py`

            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                command = json.loads(f.read())
            raw_result.update(command)

@ -119,7 +119,7 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_throughput.py`

            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                command = json.loads(f.read())
            raw_result.update(command)

@ -157,6 +157,18 @@ if __name__ == "__main__":
                                             throughput_results,
                                             serving_results)

+    for df in [latency_results, serving_results, throughput_results]:
+        if df.empty:
+            continue
+
+        # Sort all dataframes by their respective "Test name" columns
+        df.sort_values(by="Test name", inplace=True)
+
+        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
+        # we want to turn it into "8xGPUTYPE"
+        df["GPU"] = df["GPU"].apply(
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
+
    # get markdown tables
    latency_md_table = tabulate(latency_results,
                                headers='keys',
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@ -0,0 +1,95 @@
+import argparse
+import json
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from tabulate import tabulate
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description=
+        'Parse command line arguments for summary-nightly-results script.')
+    parser.add_argument('--results-folder',
+                        type=str,
+                        required=True,
+                        help='The folder where the results are stored.')
+    parser.add_argument('--description',
+                        type=str,
+                        required=True,
+                        help='Description of the results.')
+
+    args = parser.parse_args()
+    return args
+
+
+def get_perf(df, method, model, metric):
+
+    means = []
+
+    for qps in [2, 4, 8, 16, "inf"]:
+        target = df['Test name'].str.contains(model)
+        target = target & df['Engine'].str.contains(method)
+        target = target & df['Test name'].str.contains("qps_" + str(qps))
+        filtered_df = df[target]
+
+        if filtered_df.empty:
+            means.append(0.)
+        else:
+            means.append(filtered_df[metric].values[0])
+
+    return np.array(means)
+
+
+def get_perf_w_std(df, method, model, metric):
+
+    if metric in ["TTFT", "ITL"]:
+        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
+        mean = mean.tolist()
+        std = get_perf(df, method, model, "Std " + metric + " (ms)")
+        if std.mean() == 0:
+            std = None
+        success = get_perf(df, method, model, "Successful req.")
+        if std is not None:
+            std = std / np.sqrt(success)
+            std = std.tolist()
+
+    else:
+        assert metric == "Tput"
+        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
+            df, method, model, "Output Tput (tok/s)")
+        mean = mean.tolist()
+        std = None
+
+    return mean, std
+
+
+def main(args):
+    results_folder = Path(args.results_folder)
+
+    results = []
+
+    # collect results
+    for test_file in results_folder.glob("*_nightly_results.json"):
+        with open(test_file) as f:
+            results = results + json.loads(f.read())
+
+    # generate markdown table
+    df = pd.DataFrame.from_dict(results)
+
+    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
+
+    with open(args.description) as f:
+        description = f.read()
+
+    description = description.format(
+        nightly_results_benchmarking_table=md_table)
+
+    with open("nightly_results.md", "w") as f:
+        f.write(description)
+
+
+if __name__ == '__main__':
+    args = parse_arguments()
+    main(args)
--- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
@ -0,0 +1,228 @@
+#!/bin/bash
+
+# Currently FP8 benchmark is NOT enabled.
+
+set -x
+server_params=$1
+common_params=$2
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+launch_trt_server() {
+
+  model_path=$(echo "$common_params" | jq -r '.model')
+  model_name="${model_path#*/}"
+  model_type=$(echo "$server_params" | jq -r '.model_type')
+  model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
+  model_tp_size=$(echo "$common_params" | jq -r '.tp')
+  max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
+  max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
+  max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
+  max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
+  trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
+
+  # create model caching directory
+  cd ~
+  rm -rf models
+  mkdir -p models
+  cd models
+  models_dir=$(pwd)
+  trt_model_path=${models_dir}/${model_name}-trt-ckpt
+  trt_engine_path=${models_dir}/${model_name}-trt-engine
+
+  # clone tensorrt backend
+  cd /
+  rm -rf tensorrtllm_backend
+  git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
+  git lfs install
+  cd tensorrtllm_backend
+  git checkout "$trt_llm_version"
+  git submodule update --init --recursive
+
+  # build trtllm engine
+  cd /tensorrtllm_backend
+  cd "./tensorrt_llm/examples/${model_type}"
+  python3 convert_checkpoint.py \
+    --model_dir "${model_path}" \
+    --dtype "${model_dtype}" \
+    --tp_size "${model_tp_size}" \
+    --output_dir "${trt_model_path}"
+  trtllm-build \
+    --checkpoint_dir "${trt_model_path}" \
+    --use_fused_mlp \
+    --reduce_fusion disable \
+    --workers 8 \
+    --gpt_attention_plugin "${model_dtype}" \
+    --gemm_plugin "${model_dtype}" \
+    --tp_size "${model_tp_size}" \
+    --max_batch_size "${max_batch_size}" \
+    --max_input_len "${max_input_len}" \
+    --max_seq_len "${max_seq_len}" \
+    --max_num_tokens "${max_num_tokens}" \
+    --output_dir "${trt_engine_path}"
+
+  # handle triton protobuf files and launch triton server
+  cd /tensorrtllm_backend
+  mkdir triton_model_repo
+  cp -r all_models/inflight_batcher_llm/* triton_model_repo/
+  cd triton_model_repo
+  rm -rf ./tensorrt_llm/1/*
+  cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
+  python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
+  python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5"
+  python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false"
+  python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size"
+  python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1"
+  cd /tensorrtllm_backend
+  python3 scripts/launch_triton_server.py \
+    --world_size="${model_tp_size}" \
+    --model_repo=/tensorrtllm_backend/triton_model_repo &
+
+}
+
+launch_tgi_server() {
+  model=$(echo "$common_params" | jq -r '.model')
+  tp=$(echo "$common_params" | jq -r '.tp')
+  port=$(echo "$common_params" | jq -r '.port')
+  server_args=$(json2args "$server_params")
+
+  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
+    echo "Key 'fp8' exists in common params."
+    server_command="/tgi-entrypoint.sh \
+                --model-id $model \
+                --num-shard $tp \
+                --port $port \
+                --quantize fp8 \
+                $server_args"
+  else
+    echo "Key 'fp8' does not exist in common params."
+    server_command="/tgi-entrypoint.sh \
+                --model-id $model \
+                --num-shard $tp \
+                --port $port \
+                $server_args"
+  fi
+
+  echo "Server command: $server_command"
+  eval "$server_command" &
+
+}
+
+launch_lmdeploy_server() {
+  model=$(echo "$common_params" | jq -r '.model')
+  tp=$(echo "$common_params" | jq -r '.tp')
+  port=$(echo "$common_params" | jq -r '.port')
+  server_args=$(json2args "$server_params")
+
+  server_command="lmdeploy serve api_server $model \
+    --tp $tp \
+    --server-port $port \
+    $server_args"
+
+  # run the server
+  echo "Server command: $server_command"
+  bash -c "$server_command" &
+}
+
+launch_sglang_server() {
+
+  model=$(echo "$common_params" | jq -r '.model')
+  tp=$(echo "$common_params" | jq -r '.tp')
+  port=$(echo "$common_params" | jq -r '.port')
+  server_args=$(json2args "$server_params")
+
+  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
+    echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
+    model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
+    server_command="python3 \
+        -m sglang.launch_server \
+        --tp $tp \
+        --model-path $model \
+        --port $port \
+        $server_args"
+  else
+    echo "Key 'fp8' does not exist in common params."
+    server_command="python3 \
+        -m sglang.launch_server \
+        --tp $tp \
+        --model-path $model \
+        --port $port \
+        $server_args"
+  fi
+
+  # run the server
+  echo "Server command: $server_command"
+  eval "$server_command" &
+}
+
+launch_vllm_server() {
+
+  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+
+  model=$(echo "$common_params" | jq -r '.model')
+  tp=$(echo "$common_params" | jq -r '.tp')
+  port=$(echo "$common_params" | jq -r '.port')
+  server_args=$(json2args "$server_params")
+
+  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
+    echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
+    model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
+    server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
+        -tp $tp \
+        --model $model \
+        --port $port \
+        $server_args"
+  else
+    echo "Key 'fp8' does not exist in common params."
+    server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
+        -tp $tp \
+        --model $model \
+        --port $port \
+        $server_args"
+  fi
+
+  # run the server
+  echo "Server command: $server_command"
+  eval "$server_command" &
+}
+
+main() {
+
+  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then
+    launch_trt_server
+  fi
+
+  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then
+    launch_tgi_server
+  fi
+
+  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
+    launch_lmdeploy_server
+  fi
+
+  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then
+    launch_sglang_server
+  fi
+
+  if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then
+    launch_vllm_server
+  fi
+}
+
+main
--- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
@ -1,102 +0,0 @@
-#!/bin/bash
-
-
-server_params=$1
-common_params=$2
-
-
-
-model_path=$(echo "$common_params" | jq -r '.model')
-model_name="${model_path#*/}"
-model_type=$(echo "$server_params" | jq -r '.model_type')
-model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
-model_tp_size=$(echo "$common_params" | jq -r '.tp')
-max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
-max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
-max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
-trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
-
-cd ~
-rm -rf models
-mkdir -p models
-cd models
-models_dir=$(pwd)
-trt_model_path=${models_dir}/${model_name}-trt-ckpt
-trt_engine_path=${models_dir}/${model_name}-trt-engine
-
-cd ~
-rm -rf tensorrt-demo
-git clone https://github.com/neuralmagic/tensorrt-demo.git
-cd tensorrt-demo
-tensorrt_demo_dir=$(pwd)
-
-# make sure the parameter inside tensorrt_demo is consistent to envvar
-sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
-sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
-sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
-sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
-sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
-sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
-
-
-cd /
-rm -rf tensorrtllm_backend
-git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
-git lfs install
-cd tensorrtllm_backend
-git checkout $trt_llm_version
-tensorrtllm_backend_dir=$(pwd)
-git submodule update --init --recursive
-cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
-
-cd /tensorrtllm_backend
-cd ./tensorrt_llm/examples/${model_type}
-
-
-if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
-
-    echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
-    echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
-    python ../quantization/quantize.py \
-        --model_dir ${model_path} \
-        --dtype ${model_dtype} \
-        --tp_size ${model_tp_size} \
-        --output_dir ${trt_model_path} \
-        --qformat fp8 \
-        --kv_cache_dtype fp8 \
-        --calib_size 2
-
-else
-
-    echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
-    python3 convert_checkpoint.py \
-        --model_dir ${model_path} \
-        --dtype ${model_dtype} \
-        --tp_size ${model_tp_size} \
-        --output_dir ${trt_model_path}
-
-fi
-
-
-
-trtllm-build \
--checkpoint_dir=${trt_model_path} \
--gpt_attention_plugin=${model_dtype} \
--gemm_plugin=${model_dtype} \
--remove_input_padding=enable \
--paged_kv_cache=enable \
--tp_size=${model_tp_size} \
--max_batch_size=${max_batch_size} \
--max_input_len=${max_input_len} \
--max_output_len=${max_output_len} \
--max_num_tokens=${max_output_len} \
--opt_num_tokens=${max_output_len} \
--output_dir=${trt_engine_path} 
-
-cd /tensorrtllm_backend/triton_model_repo
-rm -rf ./tensorrt_llm/1/*
-cp -r ${trt_engine_path}/* ./tensorrt_llm/1
-cd /tensorrtllm_backend
-python3 scripts/launch_triton_server.py \
--world_size=${model_tp_size} \
--model_repo=/tensorrtllm_backend/triton_model_repo &
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@ -8,6 +8,7 @@ main() {

    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
    (which jq) || (apt-get update && apt-get -y install jq)
+    (which zip) || (apt-get install -y zip)

    if [ ! -f /workspace/buildkite-agent ]; then
        echo "buildkite-agent binary not found. Skip plotting the results."
@ -15,26 +16,63 @@ main() {
    fi

    # initial annotation
-    description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
+    #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"

    # download results
-    cd $VLLM_SOURCE_CODE_LOC/benchmarks
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    mkdir -p results/
    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
    ls
    ls results/

-    # generate figures
-    python3 -m pip install tabulate pandas matplotlib
-    python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-        --description $description \
-        --results-folder results/
+    # upload benchmark results
+    zip -r results.zip results/
+    /workspace/buildkite-agent artifact upload "results.zip"

-    # upload results and figures
-    /workspace/buildkite-agent artifact upload "nightly_results.png"
-    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
-    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
-    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
+    # upload benchmarking scripts
+    cd "$VLLM_SOURCE_CODE_LOC/"
+    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
+    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
+
+    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
+    # upload benchmarking pipeline
+    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
+
+    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
+    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
+    
+
+
+    # The figures should be genereated by a separate process outside the CI/CD pipeline
+
+    # # generate figures
+    # python3 -m pip install tabulate pandas matplotlib
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
+    #     --description $description \
+    #     --results-folder results/ 
+
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sharegpt
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sonnet_2048_128
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sonnet_128_2048
+    
+    # # upload results and figures
+    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
 }

 main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@ -1,135 +0,0 @@
-import argparse
-import json
-import math
-from pathlib import Path
-
-import matplotlib.pyplot as plt
-import pandas as pd
-from tabulate import tabulate
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(
-        description=
-        'Parse command line arguments for summary-nightly-results script.')
-    parser.add_argument('--results-folder',
-                        type=str,
-                        required=True,
-                        help='The folder where the results are stored.')
-    parser.add_argument('--description',
-                        type=str,
-                        required=True,
-                        help='Description of the results.')
-
-    args = parser.parse_args()
-    return args
-
-
-def main(args):
-    bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
-    results_folder = Path(args.results_folder)
-
-    results = []
-
-    # collect results
-    for test_file in results_folder.glob("*_nightly_results.json"):
-        with open(test_file, "r") as f:
-            results = results + json.loads(f.read())
-
-    # generate markdown table
-    df = pd.DataFrame.from_dict(results)
-
-    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
-
-    with open(args.description, "r") as f:
-        description = f.read()
-
-    description = description.format(
-        nightly_results_benchmarking_table=md_table)
-
-    with open("nightly_results.md", "w") as f:
-        f.write(description)
-
-    plt.rcParams.update({'font.size': 20})
-
-    # plot results
-    fig, axes = plt.subplots(3, 3, figsize=(16, 14))
-    fig.subplots_adjust(hspace=1)
-    methods = ["vllm", "trt", "lmdeploy", "tgi"]
-    for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
-        for j, metric in enumerate(["TTFT", "ITL"]):
-            means, stds = [], []
-            for method in methods:
-                target = df['Test name'].str.contains(model)
-                target = target & df['Engine'].str.contains(method)
-                filtered_df = df[target]
-
-                if filtered_df.empty:
-                    means.append(0.)
-                    stds.append(0.)
-                else:
-                    means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
-                    std = filtered_df[f"Std {metric} (ms)"].values[0]
-                    success = filtered_df["Successful req."].values[0]
-                    stds.append(std / math.sqrt(success))
-
-            print(model, metric)
-            print(means, stds)
-
-            ax = axes[i, j + 1]
-
-            bars = ax.bar(
-                ["vllm", "trt", "lmdeploy", "tgi"],
-                means,
-                yerr=stds,
-                capsize=10,
-            )
-            for idx, bar in enumerate(bars):
-                bar.set_color(bar_colors[idx])
-            ax.set_ylim(bottom=0)
-
-            ax.set_ylabel(f"{metric} (ms)")
-            ax.set_title(f"{model} {metric}")
-            ax.grid(axis='y')
-
-        metric = "Tput"
-        j = 0
-        if True:
-            tputs = []
-            for method in methods:
-                target = df['Test name'].str.contains(model)
-                target = target & df['Engine'].str.contains(method)
-                filtered_df = df[target]
-
-                if filtered_df.empty:
-                    tputs.append(0.)
-                else:
-                    input_tput = filtered_df["Input Tput (tok/s)"].values[0]
-                    output_tput = filtered_df["Output Tput (tok/s)"].values[0]
-                    tputs.append(input_tput + output_tput)
-
-            print(model, metric)
-            print(tputs)
-
-            ax = axes[i, j]
-
-            bars = ax.bar(
-                ["vllm", "trt", "lmdeploy", "tgi"],
-                tputs,
-            )
-            for idx, bar in enumerate(bars):
-                bar.set_color(bar_colors[idx])
-
-            ax.set_ylim(bottom=0)
-
-            ax.set_ylabel("Tput (token/s)")
-            ax.set_title(f"{model} {metric}")
-            ax.grid(axis='y')
-
-    fig.tight_layout()
-    fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
-
-
-if __name__ == '__main__':
-    args = parse_arguments()
-    main(args)
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@ -1,218 +0,0 @@
-#!/bin/bash
-
-set -o pipefail
-
-check_gpus() {
-  # check the number of GPUs and GPU type.
-  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  if [[ $gpu_count -gt 0 ]]; then
-    echo "GPU found."
-  else
-    echo "Need at least 1 GPU to run benchmarking."
-    exit 1
-  fi
-  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
-  echo "GPU type is $gpu_type"
-}
-
-kill_gpu_processes() {
-  pkill lmdeploy || true
-  # waiting for GPU processes to be fully killed
-  sleep 10
-  # Print the GPU memory usage
-  # so that we know if all GPU processes are killed.
-  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
-  # The memory usage should be 0 MB.
-  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
-}
-
-json2args() {
-  # transforms the JSON string to command line args, and '_' is replaced to '-'
-  # example:
-  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
-  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
-  local json_string=$1
-  local args=$(
-    echo "$json_string" | jq -r '
-      to_entries |
-      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
-      join(" ")
-    '
-  )
-  echo "$args"
-}
-
-wait_for_server() {
-  # wait for vllm server to start
-  # return 1 if vllm server crashes
-  timeout 1200 bash -c '
-    until curl -s localhost:8000/v1/completions > /dev/null; do
-      sleep 1
-    done' && return 0 || return 1
-}
-
-run_serving_tests() {
-  # run serving tests using `benchmark_serving.py`
-  # $1: a json file specifying serving test cases
-
-  local serving_test_file
-  serving_test_file=$1
-
-  # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-    
-    # append lmdeploy to the test name
-    test_name=lmdeploy_$test_name
-    
-    # get common parameters
-    common_params=$(echo "$params" | jq -r '.common_parameters')
-    model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tp')
-    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
-    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
-    port=$(echo "$common_params" | jq -r '.port')
-    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
-
-    
-
-    # get client and server arguments
-    server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
-    client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
-    server_args=$(json2args "$server_params")
-    client_args=$(json2args "$client_params")
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # check if there is enough GPU to run the test
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      continue
-    fi
-
-    # prepare tokenizer
-    rm -rf /tokenizer_cache
-    mkdir /tokenizer_cache
-    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
-      --model "$model" \
-      --cachedir /tokenizer_cache
-
-    server_command="lmdeploy serve api_server $model \
-      --tp $tp \
-      --server-port $port \
-      $server_args"
-
-    # run the server
-    echo "Running test case $test_name"
-    echo "Server command: $server_command"
-    bash -c "$server_command" &
-
-    # wait until the server is alive
-    wait_for_server
-    if [ $? -eq 0 ]; then
-      echo ""
-      echo "lmdeploy server is up and running."
-    else
-      echo ""
-      echo "lmdeploy failed to start within the timeout period."
-      break
-    fi
-
-    # get model name
-    model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps="inf"
-        echo "now qps is $qps"
-      fi
-
-      new_test_name=$test_name"_qps_"$qps
-
-      client_command="python3 benchmark_serving.py \
-        --backend lmdeploy \
-        --tokenizer /tokenizer_cache \
-        --dataset-name $dataset_name \
-        --dataset-path $dataset_path \
-        --num-prompts $num_prompts \
-        --port $port \
-        --save-result \
-        --result-dir $RESULTS_FOLDER \
-        --result-filename ${new_test_name}.json \
-        --request-rate $qps \
-        --model \"$model_name\" \
-        $client_args"
-
-      echo "Running test case $test_name with qps $qps"
-      echo "Client command: $client_command"
-
-      eval "$client_command"
-
-      # record the benchmarking commands
-      jq_output=$(jq -n \
-        --arg server "$server_command" \
-        --arg client "$client_command" \
-        --arg gpu "$gpu_type" \
-        --arg engine "lmdeploy" \
-        '{
-          server_command: $server,
-          client_command: $client,
-          gpu_type: $gpu,
-          engine: $engine
-        }')
-      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
-
-    done
-
-    # clean up
-    kill_gpu_processes
-    rm -rf /root/.cache/huggingface/*
-  done
-}
-
-
-upload_to_buildkite() {
-  # upload the benchmarking results to buildkite
-
-  # if the agent binary is not found, skip uploading the results, exit 0
-  if [ ! -f /workspace/buildkite-agent ]; then
-    echo "buildkite-agent binary not found. Skip uploading the results."
-    return 0
-  fi
-  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
-  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
-}
-
-
-main() {
-
-  check_gpus
-  # enter vllm directory
-  cd $VLLM_SOURCE_CODE_LOC/benchmarks
-
-  declare -g RESULTS_FOLDER=results/
-  mkdir -p $RESULTS_FOLDER
-  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
-
-  python -m pip install transformers==4.41.2
-
-  export CURRENT_LLM_SERVING_ENGINE=lmdeploy
-  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
-  python -m pip install tabulate pandas
-  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
-  upload_to_buildkite
-
-}
-
-main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@ -0,0 +1,355 @@
+#!/bin/bash
+
+set -o pipefail
+set -x
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
+  echo "GPU type is $gpu_type"
+}
+
+check_hf_token() {
+  # check if HF_TOKEN is available and valid
+  if [[ -z "$HF_TOKEN" ]]; then
+    echo "Error: HF_TOKEN is not set."
+    exit 1
+  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
+    echo "Error: HF_TOKEN does not start with 'hf_'."
+    exit 1
+  else
+    echo "HF_TOKEN is set and valid."
+  fi
+}
+
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+
+get_current_llm_serving_engine() {
+
+  if which lmdeploy >/dev/null; then
+    echo "Container: lmdeploy"
+    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
+    return
+  fi
+
+  if [ -e /tgi-entrypoint.sh ]; then
+    echo "Container: tgi"
+    export CURRENT_LLM_SERVING_ENGINE=tgi
+    return
+  fi
+
+  if which trtllm-build >/dev/null; then
+    echo "Container: tensorrt-llm"
+    export CURRENT_LLM_SERVING_ENGINE=trt
+    return
+  fi
+
+  if [ -e /sgl-workspace ]; then
+    echo "Container: sglang"
+    export CURRENT_LLM_SERVING_ENGINE=sglang
+    return
+  fi
+
+  if [ -e /vllm-workspace ]; then
+    echo "Container: vllm"
+    # move to a completely irrelevant directory, to avoid import vllm from current folder
+    export CURRENT_LLM_SERVING_ENGINE=vllm
+    
+    return
+  fi
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+kill_gpu_processes() {
+  pkill -f python
+  pkill -f python3
+  pkill -f tritonserver
+  pkill -f pt_main_thread
+  pkill -f text-generation
+  pkill -f lmdeploy
+
+  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+    sleep 1
+  done
+}
+
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  timeout 1200 bash -c '
+    until curl -s localhost:8000/v1/completions > /dev/null; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+ensure_installed() {
+  # Ensure that the given command is installed by apt-get
+  local cmd=$1
+  if ! which "$cmd" >/dev/null; then
+    apt-get update && apt-get install -y "$cmd"
+  fi
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # prepend the current serving engine to the test name
+    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if [[ $reuse_server == "true" ]]; then
+      echo "Reuse previous server for test case $test_name"
+    else
+      kill_gpu_processes
+      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+        "$server_params" "$common_params"
+    fi
+
+    if wait_for_server; then
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+    else
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+      break
+    fi
+
+    # prepare tokenizer
+    # this is required for lmdeploy.
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+    rm -rf /tokenizer_cache
+    mkdir /tokenizer_cache
+    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+      --model "$model" \
+      --cachedir /tokenizer_cache
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+
+
+    # change model name for lmdeploy (it will not follow standard hf name)
+    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
+      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      backend=$CURRENT_LLM_SERVING_ENGINE
+
+      if [[ $backend = "trt" ]]; then
+        backend="tensorrt-llm"
+      fi
+
+      if [[ "$backend" == *"vllm"* ]]; then
+        backend="vllm"
+      fi
+
+      if [[ "$dataset_name" = "sharegpt" ]]; then
+
+        client_command="python3 benchmark_serving.py \
+          --backend $backend \
+          --tokenizer /tokenizer_cache \
+          --model $model \
+          --dataset-name $dataset_name \
+          --dataset-path $dataset_path \
+          --num-prompts $num_prompts \
+          --port $port \
+          --save-result \
+          --result-dir $RESULTS_FOLDER \
+          --result-filename ${new_test_name}.json \
+          --request-rate $qps \
+          --ignore-eos \
+          $client_args"
+
+      elif [[ "$dataset_name" = "sonnet" ]]; then
+
+        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
+        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
+        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
+
+        client_command="python3 benchmark_serving.py \
+          --backend $backend \
+          --tokenizer /tokenizer_cache \
+          --model $model \
+          --dataset-name $dataset_name \
+          --dataset-path $dataset_path \
+          --num-prompts $num_prompts \
+          --sonnet-input-len $sonnet_input_len \
+          --sonnet-output-len $sonnet_output_len \
+          --sonnet-prefix-len $sonnet_prefix_len \
+          --port $port \
+          --save-result \
+          --result-dir $RESULTS_FOLDER \
+          --result-filename ${new_test_name}.json \
+          --request-rate $qps \
+          --ignore-eos \
+          $client_args"
+
+      else
+  
+        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
+        exit 1
+
+      fi
+
+        
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      server_command="None"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+  done
+
+  kill_gpu_processes
+}
+
+
+prepare_dataset() {
+
+  # download sharegpt dataset
+  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
+  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+  echo "" > sonnet_4x.txt
+  for _ in {1..4}
+  do
+    cat sonnet.txt >> sonnet_4x.txt
+  done
+  
+}
+
+main() {
+
+  # check if the environment variable is successfully injected from yaml
+
+  check_gpus
+  check_hf_token
+  get_current_llm_serving_engine
+
+  pip install -U transformers
+
+  # check storage
+  df -h
+
+  ensure_installed wget
+  ensure_installed curl
+  ensure_installed jq
+
+  prepare_dataset
+
+  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
+
+  # run the test
+  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
+
+  # upload benchmark results to buildkite
+  python3 -m pip install tabulate pandas
+  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
+  upload_to_buildkite
+
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -6,6 +6,7 @@

 # Do not set -e, as the mixtral 8x22B model tends to crash occasionally
 # and we still want to see other benchmarking results even when mixtral crashes.
+set -x
 set -o pipefail

 check_gpus() {
@ -17,7 +18,7 @@ check_gpus() {
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
-  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  echo "GPU type is $gpu_type"
 }

@ -85,15 +86,11 @@ kill_gpu_processes() {

  ps -aux
  lsof -t -i:8000 | xargs -r kill -9
-  pkill -f pt_main_thread
-  # this line doesn't work now
-  # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
-  pkill -f python3
-  pkill -f /usr/bin/python3
+  pgrep python3 | xargs -r kill -9


  # wait until GPU memory usage smaller than 1GB
-  while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
+  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
    sleep 1
  done

@ -117,7 +114,7 @@ upload_to_buildkite() {
  fi

  # Use the determined command to annotate and upload artifacts
-  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
+  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }

@ -150,7 +147,7 @@ run_latency_tests() {
    # check if there is enough GPU to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

@ -206,9 +203,9 @@ run_throughput_tests() {
    throughput_args=$(json2args "$throughput_params")

    # check if there is enough GPU to run the test
-    tp=$(echo $throughput_params | jq -r '.tensor_parallel_size')
+    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

@ -270,7 +267,7 @@ run_serving_tests() {
    # check if there is enough GPU to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

@ -278,7 +275,7 @@ run_serving_tests() {
    server_model=$(echo "$server_params" | jq -r '.model')
    client_model=$(echo "$client_params" | jq -r '.model')
    if [[ $server_model != "$client_model" ]]; then
-      echo "Server model and client model must be the same. Skip testcase $testname."
+      echo "Server model and client model must be the same. Skip testcase $test_name."
      continue
    fi

@ -289,12 +286,11 @@ run_serving_tests() {
    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
-    eval "$server_command" &
+    bash -c "$server_command" &
    server_pid=$!

    # wait until the server is alive
-    wait_for_server
-    if [ $? -eq 0 ]; then
+    if wait_for_server; then
      echo ""
      echo "vllm server is up and running."
    else
@ -323,7 +319,7 @@ run_serving_tests() {
      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

-      eval "$client_command"
+      bash -c "$client_command"

      # record the benchmarking commands
      jq_output=$(jq -n \
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@ -1,216 +0,0 @@
-#!/bin/bash
-
-set -o pipefail
-
-check_gpus() {
-  # check the number of GPUs and GPU type.
-  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  if [[ $gpu_count -gt 0 ]]; then
-    echo "GPU found."
-  else
-    echo "Need at least 1 GPU to run benchmarking."
-    exit 1
-  fi
-  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
-  echo "GPU type is $gpu_type"
-}
-
-kill_gpu_processes() {
-  pkill text-generation || true
-  # waiting for GPU processes to be fully killed
-  sleep 10
-  # Print the GPU memory usage
-  # so that we know if all GPU processes are killed.
-  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
-  # The memory usage should be 0 MB.
-  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
-}
-
-json2args() {
-  # transforms the JSON string to command line args, and '_' is replaced to '-'
-  # example:
-  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
-  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
-  local json_string=$1
-  local args=$(
-    echo "$json_string" | jq -r '
-      to_entries |
-      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
-      join(" ")
-    '
-  )
-  echo "$args"
-}
-
-wait_for_server() {
-  timeout 1200 bash -c '
-    until curl -s localhost:8000/generate_stream > /dev/null; do
-      sleep 1
-    done' && return 0 || return 1
-}
-
-run_serving_tests() {
-  # run serving tests using `benchmark_serving.py`
-  # $1: a json file specifying serving test cases
-
-  local serving_test_file
-  serving_test_file=$1
-
-  # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-    
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # append tgi to the test name
-    test_name=tgi_$test_name
-
-    # get common parameters
-    common_params=$(echo "$params" | jq -r '.common_parameters')
-    model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tp')
-    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
-    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
-    port=$(echo "$common_params" | jq -r '.port')
-    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
-
-    # get client and server arguments
-    server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
-    client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
-    server_args=$(json2args "$server_params")
-    client_args=$(json2args "$client_params")
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # check if there is enough GPU to run the test
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      continue
-    fi
-
-    if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
-      echo "Key 'fp8' exists in common params."
-      server_command="/tgi-entrypoint.sh \
-        --model-id $model \
-        --num-shard $tp \
-        --port $port \
-        --quantize fp8 \
-        $server_args"
-    else
-      echo "Key 'fp8' does not exist in common params."
-      server_command="/tgi-entrypoint.sh \
-        --model-id $model \
-        --num-shard $tp \
-        --port $port \
-        $server_args"
-    fi
-
-
-    
-
-    # run the server
-    echo "Running test case $test_name"
-    echo "Server command: $server_command"
-    eval "$server_command" &
-
-    # wait until the server is alive
-    wait_for_server
-    if [ $? -eq 0 ]; then
-      echo ""
-      echo "tgi server is up and running."
-    else
-      echo ""
-      echo "tgi failed to start within the timeout period."
-      break
-    fi
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps="inf"
-        echo "now qps is $qps"
-      fi
-
-      new_test_name=$test_name"_qps_"$qps
-
-      client_command="python3 benchmark_serving.py \
-        --backend tgi \
-        --model $model \
-        --dataset-name $dataset_name \
-        --dataset-path $dataset_path \
-        --num-prompts $num_prompts \
-        --port $port \
-        --save-result \
-        --result-dir $RESULTS_FOLDER \
-        --result-filename ${new_test_name}.json \
-        --request-rate $qps \
-        $client_args"
-
-      echo "Running test case $test_name with qps $qps"
-      echo "Client command: $client_command"
-
-      eval "$client_command"
-
-      # record the benchmarking commands
-      jq_output=$(jq -n \
-        --arg server "$server_command" \
-        --arg client "$client_command" \
-        --arg gpu "$gpu_type" \
-        --arg engine "tgi" \
-        '{
-          server_command: $server,
-          client_command: $client,
-          gpu_type: $gpu,
-          engine: $engine
-        }')
-      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
-
-    done
-
-    # clean up
-    kill_gpu_processes
-    rm -rf /root/.cache/huggingface/*
-  done
-}
-
-
-
-upload_to_buildkite() {
-  # upload the benchmarking results to buildkite
-
-  # if the agent binary is not found, skip uploading the results, exit 0
-  if [ ! -f /workspace/buildkite-agent ]; then
-    echo "buildkite-agent binary not found. Skip uploading the results."
-    return 0
-  fi
-  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
-  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
-}
-
-main() {
-
-  check_gpus
-  # enter vllm directory
-  cd $VLLM_SOURCE_CODE_LOC/benchmarks
-  declare -g RESULTS_FOLDER=results/
-  mkdir -p $RESULTS_FOLDER
-  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
-
-  export CURRENT_LLM_SERVING_ENGINE=tgi
-  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
-  python -m pip install tabulate pandas
-  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
-  upload_to_buildkite
-
-}
-
-main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@ -1,214 +0,0 @@
-#!/bin/bash
-
-set -o pipefail
-
-check_gpus() {
-  # check the number of GPUs and GPU type.
-  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  if [[ $gpu_count -gt 0 ]]; then
-    echo "GPU found."
-  else
-    echo "Need at least 1 GPU to run benchmarking."
-    exit 1
-  fi
-  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
-  echo "GPU type is $gpu_type"
-}
-
-kill_gpu_processes() {
-  pkill tritonserver || true
-  # waiting for GPU processes to be fully killed
-  sleep 20
-  # Print the GPU memory usage
-  # so that we know if all GPU processes are killed.
-  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
-  # The memory usage should be 0 MB.
-  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
-}
-
-json2args() {
-  # transforms the JSON string to command line args, and '_' is replaced to '-'
-  # example:
-  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
-  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
-  local json_string=$1
-  local args=$(
-    echo "$json_string" | jq -r '
-      to_entries |
-      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
-      join(" ")
-    '
-  )
-  echo "$args"
-}
-
-wait_for_server() {
-  timeout 1200 bash -c '
-    until curl -s localhost:8000/generate_stream > /dev/null; do
-      sleep 1
-    done' && return 0 || return 1
-}
-
-run_serving_tests() {
-  # run serving tests using `benchmark_serving.py`
-  # $1: a json file specifying serving test cases
-
-  local serving_test_file
-  serving_test_file=$1
-
-  # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-    
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # append trt to the test name
-    test_name=trt_$test_name
-
-    # get common parameters
-    common_params=$(echo "$params" | jq -r '.common_parameters')
-    model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tp')
-    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
-    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
-    port=$(echo "$common_params" | jq -r '.port')
-    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
-
-    # get client and server arguments
-    server_params=$(echo "$params" | jq -r '.trt_server_parameters')
-    client_params=$(echo "$params" | jq -r '.trt_client_parameters')
-    client_args=$(json2args "$client_params")
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # check if there is enough GPU to run the test
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      continue
-    fi
-
-
-
-    cd $VLLM_SOURCE_CODE_LOC/benchmarks
-
-
-    echo "Running test case $test_name"
-    bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
-
-    # wait until the server is alive
-    wait_for_server
-    if [ $? -eq 0 ]; then
-      echo ""
-      echo "trt server is up and running."
-    else
-      echo ""
-      echo "trt failed to start within the timeout period."
-      break
-    fi
-
-    # prepare tokenizer
-    cd $VLLM_SOURCE_CODE_LOC/benchmarks
-    rm -rf /tokenizer_cache
-    mkdir /tokenizer_cache
-    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
-      --model "$model" \
-      --cachedir /tokenizer_cache
-    cd $VLLM_SOURCE_CODE_LOC/benchmarks
-    
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps="inf"
-        echo "now qps is $qps"
-      fi
-
-      new_test_name=$test_name"_qps_"$qps
-
-      client_command="python3 benchmark_serving.py \
-        --backend tensorrt-llm \
-        --tokenizer /tokenizer_cache \
-        --model $model \
-        --dataset-name $dataset_name \
-        --dataset-path $dataset_path \
-        --num-prompts $num_prompts \
-        --port $port \
-        --save-result \
-        --result-dir $RESULTS_FOLDER \
-        --result-filename ${new_test_name}.json \
-        --request-rate $qps \
-        $client_args"
-
-      echo "Running test case $test_name with qps $qps"
-      echo "Client command: $client_command"
-
-      eval "$client_command"
-
-      server_command=""
-      # record the benchmarking commands
-      jq_output=$(jq -n \
-        --arg server "$server_command" \
-        --arg client "$client_command" \
-        --arg gpu "$gpu_type" \
-        --arg engine "trt" \
-        '{
-          server_command: $server,
-          client_command: $client,
-          gpu_type: $gpu,
-          engine: $engine
-        }')
-      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
-
-    done
-
-    # clean up
-    kill_gpu_processes
-    rm -rf /root/.cache/huggingface/*
-  done
-}
-
-upload_to_buildkite() {
-  # upload the benchmarking results to buildkite
-
-  # if the agent binary is not found, skip uploading the results, exit 0
-  if [ ! -f /workspace/buildkite-agent ]; then
-    echo "buildkite-agent binary not found. Skip uploading the results."
-    return 0
-  fi
-  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
-  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
-}
-
-
-main() {
-
-  check_gpus
-
-
-  # enter vllm directory
-  cd $VLLM_SOURCE_CODE_LOC/benchmarks
-
-  declare -g RESULTS_FOLDER=results/
-  mkdir -p $RESULTS_FOLDER
-  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
-
-  # update transformers package, to make sure mixtral tokenizer is available
-  python -m pip install transformers -U
-
-  export CURRENT_LLM_SERVING_ENGINE=trt
-  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
-  python -m pip install tabulate pandas
-  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
-  upload_to_buildkite
-
-}
-
-main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@ -1,221 +0,0 @@
-#!/bin/bash
-
-set -o pipefail
-
-check_gpus() {
-  # check the number of GPUs and GPU type.
-  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  if [[ $gpu_count -gt 0 ]]; then
-    echo "GPU found."
-  else
-    echo "Need at least 1 GPU to run benchmarking."
-    exit 1
-  fi
-  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
-  echo "GPU type is $gpu_type"
-}
-
-kill_gpu_processes() {
-  # kill all processes on GPU.
-  pkill pt_main_thread
-  sleep 10
-
-  # remove vllm config file
-  rm -rf ~/.config/vllm
-
-  # Print the GPU memory usage
-  # so that we know if all GPU processes are killed.
-  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
-  # The memory usage should be 0 MB.
-  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
-}
-
-json2args() {
-  # transforms the JSON string to command line args, and '_' is replaced to '-'
-  # example:
-  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
-  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
-  local json_string=$1
-  local args=$(
-    echo "$json_string" | jq -r '
-      to_entries |
-      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
-      join(" ")
-    '
-  )
-  echo "$args"
-}
-
-wait_for_server() {
-  # wait for vllm server to start
-  # return 1 if vllm server crashes
-  timeout 1200 bash -c '
-    until curl -s localhost:8000/v1/completions > /dev/null; do
-      sleep 1
-    done' && return 0 || return 1
-}
-
-run_serving_tests() {
-  # run serving tests using `benchmark_serving.py`
-  # $1: a json file specifying serving test cases
-
-  local serving_test_file
-  serving_test_file=$1
-
-  # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # append vllm to the test name
-    test_name=vllm_$test_name
-
-
-    # get common parameters
-    common_params=$(echo "$params" | jq -r '.common_parameters')
-    model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tp')
-    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
-    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
-    port=$(echo "$common_params" | jq -r '.port')
-    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
-
-    # get client and server arguments
-    server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
-    client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
-    server_args=$(json2args "$server_params")
-    client_args=$(json2args "$client_params")
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # check if there is enough GPU to run the test
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      continue
-    fi
-
-    if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
-      echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
-      model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-      server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
-        -tp $tp \
-        --model $model \
-        --port $port \
-        $server_args"
-    else
-      echo "Key 'fp8' does not exist in common params."
-      server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
-        -tp $tp \
-        --model $model \
-        --port $port \
-        $server_args"
-    fi
-
-    # run the server
-    echo "Running test case $test_name"
-    echo "Server command: $server_command"
-    eval "$server_command" &
-
-    # wait until the server is alive
-    wait_for_server
-    if [ $? -eq 0 ]; then
-      echo ""
-      echo "vllm server is up and running."
-    else
-      echo ""
-      echo "vllm failed to start within the timeout period."
-      break
-    fi
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps="inf"
-        echo "now qps is $qps"
-      fi
-
-      new_test_name=$test_name"_qps_"$qps
-
-      client_command="python3 benchmark_serving.py \
-        --backend vllm \
-        --model $model \
-        --dataset-name $dataset_name \
-        --dataset-path $dataset_path \
-        --num-prompts $num_prompts \
-        --port $port \
-        --save-result \
-        --result-dir $RESULTS_FOLDER \
-        --result-filename ${new_test_name}.json \
-        --request-rate $qps \
-        $client_args"
-
-      echo "Running test case $test_name with qps $qps"
-      echo "Client command: $client_command"
-
-      eval "$client_command"
-
-      # record the benchmarking commands
-      jq_output=$(jq -n \
-        --arg server "$server_command" \
-        --arg client "$client_command" \
-        --arg gpu "$gpu_type" \
-        --arg engine "vllm" \
-        '{
-          server_command: $server,
-          client_command: $client,
-          gpu_type: $gpu,
-          engine: $engine
-        }')
-      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
-
-    done
-
-    # clean up
-    kill_gpu_processes
-    rm -rf /root/.cache/huggingface/*
-  done
-}
-
-
-upload_to_buildkite() {
-  # upload the benchmarking results to buildkite
-
-  # if the agent binary is not found, skip uploading the results, exit 0
-  if [ ! -f /workspace/buildkite-agent ]; then
-    echo "buildkite-agent binary not found. Skip uploading the results."
-    return 0
-  fi
-  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
-  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
-}
-
-main() {
-
-  check_gpus
-  # enter vllm directory
-  cd $VLLM_SOURCE_CODE_LOC/benchmarks
-  declare -g RESULTS_FOLDER=results/
-  mkdir -p $RESULTS_FOLDER
-  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
-
-  export CURRENT_LLM_SERVING_ENGINE=vllm
-  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
-
-  python3 -m pip install tabulate pandas
-  python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
-  upload_to_buildkite
-
-}
-
-main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@ -17,10 +17,17 @@ serving_column_mapping = {
    "request_throughput": "Tput (req/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "std_ttft_ms": "Std TTFT (ms)",
+    "median_ttft_ms": "Median TTFT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "std_itl_ms": "Std ITL (ms)",
-    "input_throughput": "Input Tput (tok/s)",
+    "median_itl_ms": "Median ITL (ms)",
+    "mean_tpot_ms": "Mean TPOT (ms)",
+    "std_tpot_ms": "Std TPOT (ms)",
+    "median_tpot_ms": "Median TPOT (ms)",
+    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
+    "total_input_tokens": "Total input tokens",
+    "total_output_tokens": "Total output tokens",
    "engine": "Engine",
 }

@ -29,11 +36,11 @@ if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):

-        with open(test_file, "r") as f:
+        with open(test_file) as f:
            raw_result = json.loads(f.read())

        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
+        with open(test_file.with_suffix(".commands")) as f:
            command = json.loads(f.read())
        raw_result.update(command)

--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@ -1,12 +1,12 @@
 #!/bin/sh
-TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
-URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
+TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
+URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"

 TIMEOUT_SECONDS=10

 retries=0
 while [ $retries -lt 1000 ]; do
-    if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
+    if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
        exit 0
    fi

--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@ -1,16 +1,18 @@
 [
    {
-        "test_name": "llama8B_tp1",
-        "qps_list": [4],
+        "test_name": "llama8B_tp1_sharegpt",
+        "qps_list": [4,8,16,32,"inf"],
        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "tp": 1,
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 500,
-            "port": 8000
+            "port": 8000,
+            "reuse_server": false
        },
        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
@ -21,34 +23,158 @@
        },
        "trt_server_parameters": {
            "model_type": "llama",
-            "model_dtype": "float16",
-            "max_batch_size": 256,
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
            "max_input_len": 4096,
-            "max_output_len": 4096,
-            "trt_llm_version": "r24.04"
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
-            "disable_log_requests": ""
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
        },
        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "enable_torch_compile": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
        }
    },
    {
-        "test_name": "llama70B_tp4",
-        "qps_list": [2],
+        "test_name": "llama8B_tp1_sonnet_512_16",
+        "qps_list": [4,8,16,32,"inf"],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "tp": 1,
+            "dataset_name": "sonnet",
+            "dataset_path": "./sonnet_4x.txt",
+            "num_prompts": 500,
+            "port": 8000,
+            "sonnet_input_len": 512,
+            "sonnet_output_len": 16,
+            "sonnet_prefix_len": 50,
+            "reuse_server": true
+        },
+        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
+            "max_input_len": 4096,
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        }, 
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "enable_torch_compile": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama8B_tp1_sonnet_512_256",
+        "qps_list": [4,8,16,32,"inf"],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "tp": 1,
+            "dataset_name": "sonnet",
+            "dataset_path": "./sonnet_4x.txt",
+            "num_prompts": 500,
+            "port": 8000,
+            "sonnet_input_len": 512,
+            "sonnet_output_len": 256,
+            "sonnet_prefix_len": 50,
+            "reuse_server": true
+        },
+        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
+            "max_input_len": 4096,
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        }, 
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "enable_torch_compile": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama70B_tp4_sharegpt",
+        "qps_list": [4,8,16,32,"inf"],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tp": 4,
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 500,
-            "port": 8000
+            "port": 8000,
+            "reuse_server": false
        },
        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
@ -59,34 +185,50 @@
        },
        "trt_server_parameters": {
            "model_type": "llama",
-            "model_dtype": "float16",
-            "max_batch_size": 256,
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
            "max_input_len": 4096,
-            "max_output_len": 4096,
-            "trt_llm_version": "r24.04"
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
-            "disable_log_requests": ""
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
        },
        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
        }
    },
    {
-        "test_name": "mixtral8x7B_tp2",
-        "qps_list": [2],
+        "test_name": "llama70B_tp4_sonnet_512_16",
+        "qps_list": [4,8,16,32,"inf"],
        "common_parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tp": 2,
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tp": 4,
+            "dataset_name": "sonnet",
+            "dataset_path": "./sonnet_4x.txt",
            "num_prompts": 500,
-            "port": 8000
+            "port": 8000,
+            "sonnet_input_len": 512,
+            "sonnet_output_len": 16,
+            "sonnet_prefix_len": 50,
+            "reuse_server": true
        },
        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
@ -97,20 +239,85 @@
        },
        "trt_server_parameters": {
            "model_type": "llama",
-            "model_dtype": "float16",
-            "max_batch_size": 256,
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
            "max_input_len": 4096,
-            "max_output_len": 4096,
-            "trt_llm_version": "r24.04"
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
-            "disable_log_requests": ""
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
        },
        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama70B_tp4_sonnet_512_256",
+        "qps_list": [4,8,16,32,"inf"],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tp": 4,
+            "dataset_name": "sonnet",
+            "dataset_path": "./sonnet_4x.txt",
+            "num_prompts": 500,
+            "port": 8000,
+            "sonnet_input_len": 512,
+            "sonnet_output_len": 256,
+            "sonnet_prefix_len": 50,
+            "reuse_server": true
+        },
+        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
+            "max_input_len": 4096,
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        }, 
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
        }
    }
 ]
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,32 +1,57 @@
 steps:
  - label: "Build wheel - CUDA 12.1"
    agents:
-      queue: cpu_queue
+      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      # rename the files to change linux -> manylinux1
-      - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
+      - "bash .buildkite/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

-  - block: "Build CUDA 11.8 wheel"
-    key: block-build-cu118-wheel
+  # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
+  # However, this block can be uncommented to save some compute hours.
+  # - block: "Build CUDA 11.8 wheel"
+  #   key: block-build-cu118-wheel

  - label: "Build wheel - CUDA 11.8"
-    depends_on: block-build-cu118-wheel
+    # depends_on: block-build-cu118-wheel
    agents:
-      queue: cpu_queue
+      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      # rename the files to change linux -> manylinux1
-      - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
+      - "bash .buildkite/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  - block: "Build release image"
+    depends_on: ~
+    key: block-release-image-build
+
+  - label: "Build release image"
+    depends_on: block-release-image-build
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
+  - label: "Build and publish TPU release image"
+    depends_on: ~
+    if: build.env("NIGHTLY") == "1"
+    agents:
+      queue: tpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
+      - "docker push vllm/vllm-tpu:nightly"
+      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllm
+          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@ -1,3 +1,5 @@
+#!/bin/bash
+
 # This script runs test inside the corresponding ROCm docker container.
 set -o pipefail

@ -31,8 +33,8 @@ cleanup_docker() {
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
-    # Remove unused volumes
-    docker volume prune -f
+    # Remove unused volumes / force the system prune for old images as well.
+    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
@ -57,17 +59,17 @@ done
 echo "--- Pulling container" 
 image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-docker pull ${image_name}
+docker pull "${image_name}"

 remove_docker_container() {
-   docker rm -f ${container_name} || docker image rm -f ${image_name} || true
+   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
 }
 trap remove_docker_container EXIT

 echo "--- Running container"

 HF_CACHE="$(realpath ~)/huggingface"
-mkdir -p ${HF_CACHE}
+mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"

 commands=$@
@ -83,7 +85,6 @@ if [[ $commands == *" kernels "* ]]; then
  --ignore=kernels/test_encoder_decoder_attn.py \
  --ignore=kernels/test_flash_attn.py \
  --ignore=kernels/test_flashinfer.py \
-  --ignore=kernels/test_gguf.py \
  --ignore=kernels/test_int8_quant.py \
  --ignore=kernels/test_machete_gemm.py \
  --ignore=kernels/test_mamba_ssm.py \
@ -107,35 +108,36 @@ fi
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
 if [[ $commands == *"--shard-id="* ]]; then
-  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
-    #replace shard arguments
-    commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+  # assign job count as the number of shards used   
  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
-    echo "Shard ${GPU} commands:$commands"
+  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
+    # assign shard-id for each shard
+    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+    echo "Shard ${GPU} commands:$commands_gpu"
    docker run \
        --device /dev/kfd --device /dev/dri \
        --network host \
        --shm-size=16gb \
        --rm \
-        -e HIP_VISIBLE_DEVICES=${GPU} \
+        -e HIP_VISIBLE_DEVICES="${GPU}" \
        -e HF_TOKEN \
-        -v ${HF_CACHE}:${HF_MOUNT} \
-        -e HF_HOME=${HF_MOUNT} \
-        --name ${container_name}_${GPU}  \
-        ${image_name} \
-        /bin/bash -c "${commands}" \
+        -v "${HF_CACHE}:${HF_MOUNT}" \
+        -e "HF_HOME=${HF_MOUNT}" \
+        --name "${container_name}_${GPU}" \
+        "${image_name}" \
+        /bin/bash -c "${commands_gpu}" \
        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
    PIDS+=($!)
  done
  #wait for all processes to finish and collect exit codes
-  for pid in ${PIDS[@]}; do
-    wait ${pid}
+  for pid in "${PIDS[@]}"; do
+    wait "${pid}"
    STATUS+=($?)
  done
-  for st in ${STATUS[@]}; do
+  for st in "${STATUS[@]}"; do
    if [[ ${st} -ne 0 ]]; then
      echo "One of the processes failed with $st"
-      exit ${st}
+      exit "${st}"
    fi
  done
 else
@ -146,9 +148,9 @@ else
          --rm \
          -e HIP_VISIBLE_DEVICES=0 \
          -e HF_TOKEN \
-          -v ${HF_CACHE}:${HF_MOUNT} \
-          -e HF_HOME=${HF_MOUNT} \
-          --name ${container_name} \
-          ${image_name} \
+          -v "${HF_CACHE}:${HF_MOUNT}" \
+          -e "HF_HOME=${HF_MOUNT}" \
+          --name "${container_name}" \
+          "${image_name}" \
          /bin/bash -c "${commands}"
 fi
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@ -1,3 +1,5 @@
+#!/bin/bash
+
 # This script is run by buildkite to run the benchmarks and upload the results to buildkite

 set -ex
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@ -1,33 +1,14 @@
+#!/bin/bash
+
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex

-# Try building the docker image
-docker build -t cpu-test -f Dockerfile.ppc64le .
-
 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test || true; }
+remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
 trap remove_docker_container EXIT
 remove_docker_container

-# Run the image, setting --shm-size=4g for tensor parallel.
-source /etc/environment
-#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
+# Try building the docker image
+docker build -t cpu-test -f Dockerfile.ppc64le .

-# Run basic model test
-docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
-
-# online inference
-docker exec cpu-test bash -c "
-  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & 
-  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-  python3 benchmarks/benchmark_serving.py \
-    --backend vllm \
-    --dataset-name random \
-    --model facebook/opt-125m \
-    --num-prompts 20 \
-    --endpoint /v1/completions \
-    --tokenizer facebook/opt-125m"
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@ -1,44 +1,75 @@
+#!/bin/bash
+
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex

+# allow to bind to different cores
+CORE_RANGE=${CORE_RANGE:-48-95}
+NUMA_NODE=${NUMA_NODE:-1}
+
 # Try building the docker image
-numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
-numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .

 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
+remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
 trap remove_docker_container EXIT
 remove_docker_container

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
- --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
- --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
+
+function cpu_tests() {
+  set -e
+  export NUMA_NODE=$2

  # offline inference
-docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
+  docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
+    set -e
+    python3 examples/offline_inference.py"

  # Run basic model test
-docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
-  pytest -v -s tests/models/decoder_only/language \
-    --ignore=tests/models/test_fp8.py \
-    --ignore=tests/models/decoder_only/language/test_jamba.py \
-    --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pip install pytest pytest-asyncio \
+      decord einops librosa peft Pillow sentence-transformers soundfile \
+      transformers_stream_generator matplotlib datamodel_code_generator
+    pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+    pytest -v -s tests/models/decoder_only/language -m cpu_model
+    pytest -v -s tests/models/embedding/language -m cpu_model
+    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
+    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

  # Run compressed-tensor test
-docker exec cpu-test bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
    pytest -s -v \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
+
+  # Run AWQ test
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v \
+    tests/quantization/test_ipex_quant.py"
+
+  # Run chunked-prefill and prefix-cache test
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v -k cpu_model \
+    tests/basic_correctness/test_chunked_prefill.py"  

  # online inference
-docker exec cpu-test bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
    export VLLM_CPU_KVCACHE_SPACE=10 
-  export VLLM_CPU_OMP_THREADS_BIND=48-92 
-  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & 
+    export VLLM_CPU_OMP_THREADS_BIND=$1
+    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
    python3 benchmarks/benchmark_serving.py \
      --backend vllm \
@ -47,3 +78,8 @@ docker exec cpu-test bash -c "
      --num-prompts 20 \
      --endpoint /v1/completions \
      --tokenizer facebook/opt-125m"
+}
+
+# All of CPU tests are expected to be finished less than 25 mins.
+export -f cpu_tests
+timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@ -0,0 +1,25 @@
+#!/bin/bash
+
+# This script build the GH200 docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+DOCKER_BUILDKIT=1 docker build . \
+  --target vllm-openai \
+  --platform "linux/arm64" \
+  -t gh200-test \
+  --build-arg max_jobs=66 \
+  --build-arg nvcc_threads=2 \
+  --build-arg torch_cuda_arch_list="9.0+PTX" \
+  --build-arg vllm_fa_cmake_gpu_arches="90-real"
+
+# Setup cleanup
+remove_docker_container() { docker rm -f gh200-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and test offline inference
+docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+    python3 examples/offline_inference.py
+'
--- a/.buildkite/run-hpu-test.sh
+++ b/.buildkite/run-hpu-test.sh
@ -0,0 +1,16 @@
+#!/bin/bash
+
+# This script build the CPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t hpu-test-env -f Dockerfile.hpu .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f hpu-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and launch offline inference
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
--- a/.buildkite/run-multi-node-test.sh
+++ b/.buildkite/run-multi-node-test.sh
@ -14,7 +14,7 @@ DOCKER_IMAGE=$4

 shift 4
 COMMANDS=("$@")
-if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
+if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then
    echo "The number of commands must be equal to the number of nodes."
    echo "Number of nodes: $NUM_NODES"
    echo "Number of commands: ${#COMMANDS[@]}"
@ -23,7 +23,7 @@ fi

 echo "List of commands"
 for command in "${COMMANDS[@]}"; do
-    echo $command
+    echo "$command"
 done

 start_network() {
@ -36,7 +36,7 @@ start_nodes() {
        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
            GPU_DEVICES+=$(($DEVICE_NUM))
-            if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
+            if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
                GPU_DEVICES+=','
            fi
        done
@ -49,17 +49,20 @@ start_nodes() {
        # 3. map the huggingface cache directory to the container
        # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
        #    starting from 192.168.10.11)
-        docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
+        docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
+            -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
+            --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
+            /bin/bash -c "tail -f /dev/null"

        # organize containers into a ray cluster
-        if [ $node -eq 0 ]; then
+        if [ "$node" -eq 0 ]; then
            # start the ray head node
-            docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
+            docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block"
            # wait for the head node to be ready
            sleep 10
        else
            # start the ray worker nodes, and connect them to the head node
-            docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
+            docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
        fi
    done

@ -79,22 +82,22 @@ run_nodes() {
        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
            GPU_DEVICES+=$(($DEVICE_NUM))
-            if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
+            if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
                GPU_DEVICES+=','
            fi
        done
        GPU_DEVICES+='"'
        echo "Running node$node with GPU devices: $GPU_DEVICES"
-        if [ $node -ne 0 ]; then
-            docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
+        if [ "$node" -ne 0 ]; then
+            docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
        else
-            docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
+            docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
        fi
    done
 }
 cleanup() {
    for node in $(seq 0 $(($NUM_NODES-1))); do
-        docker stop node$node
+        docker stop "node$node"
    done
    docker network rm docker-net
 }
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@ -1,3 +1,5 @@
+#!/bin/bash
+
 # This script build the Neuron docker image and run the API server inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -e
@ -12,10 +14,10 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
    current_time=$(date +%s)
    if [ $((current_time - last_build)) -gt 86400 ]; then
        docker system prune -f
-        echo $current_time > /tmp/neuron-docker-build-timestamp
+        echo "$current_time" > /tmp/neuron-docker-build-timestamp
    fi
 else
-    echo $(date +%s) > /tmp/neuron-docker-build-timestamp
+    date "+%s" > /tmp/neuron-docker-build-timestamp
 fi

 docker build -t neuron -f Dockerfile.neuron .
@ -34,7 +36,7 @@ wait_for_server_to_start() {
    timeout=300
    counter=0

-    while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
+    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
        sleep 1
        counter=$((counter + 1))
        if [ $counter -ge $timeout ]; then
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@ -1,3 +1,5 @@
+#!/bin/bash
+
 # This script build the OpenVINO docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
@ -11,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e

 # Build the docker image.
@ -12,4 +14,4 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest  && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@ -1,3 +1,5 @@
+#!/bin/bash
+
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
@ -10,5 +12,8 @@ remove_docker_container() { docker rm -f xpu-test || true; }
 trap remove_docker_container EXIT
 remove_docker_container

-# Run the image and launch offline inference
-docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
+# Run the image and test offline inference/tensor parallel
+docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
+    python3 examples/offline_inference.py
+    python3 examples/offline_inference_cli.py -tp 2
+'
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -9,6 +9,7 @@
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
 # command(str): the single command to run for tests. incompatible with commands.
 # commands(list): the list of commands to run for test. incompatbile with command.
 # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
@ -39,7 +40,7 @@ steps:
  # Check API reference (if it fails, you may have missing mock imports)
  - grep \"sig sig-object py\" build/html/dev/sampling_params.html

- label: Async Engine, Inputs, Utils, Worker Test # 15min
+- label: Async Engine, Inputs, Utils, Worker Test # 24min
  fast_check: true
  source_file_dependencies:
  - vllm/
@ -49,7 +50,9 @@ steps:
  - tests/multimodal
  - tests/test_utils
  - tests/worker
+  - tests/standalone_tests/lazy_torch_compile.py
  commands:
+  - python3 standalone_tests/lazy_torch_compile.py
  - pytest -v -s mq_llm_engine # MQLLMEngine
  - pytest -v -s async_engine # AsyncLLMEngine
  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
@ -58,18 +61,33 @@ steps:
  - pytest -v -s test_utils.py # Utils
  - pytest -v -s worker # Worker

+- label: Python-only Installation Test
+  source_file_dependencies:
+  - tests/standalone_tests/python_only_compile.sh
+  - setup.py
+  commands:
+  - bash standalone_tests/python_only_compile.sh
+
 - label: Basic Correctness Test # 30min
  #mirror_hardwares: [amd]
  fast_check: true
  source_file_dependencies:
  - vllm/
-  - tests/basic_correctness
+  - tests/basic_correctness/test_basic_correctness
+  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_preemption
  commands:
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
+  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+
+- label: Chunked Prefill Test
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_chunked_prefill
+  commands:
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

 - label: Core Test # 10min
  mirror_hardwares: [amd]
@ -81,7 +99,7 @@ steps:
  commands:
  - pytest -v -s core

- label: Entrypoints Test # 20min
+- label: Entrypoints Test # 40min
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  mirror_hardwares: [amd]
@ -89,13 +107,13 @@ steps:
  - vllm/
  commands:
  - pip install -e ./plugins/vllm_add_dummy_model
-  - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s entrypoints/test_chat_utils.py
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

@ -108,7 +126,10 @@ steps:
  - vllm/core/
  - tests/distributed
  - tests/spec_decode/e2e/test_integration_dist_tp4
+  - tests/compile
  commands:
+  - pytest -v -s distributed/test_utils.py
+  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

@ -136,7 +157,9 @@ steps:
  source_file_dependencies:
  - vllm/
  - tests/test_regression
-  command: pytest -v -s test_regression.py
+  commands:
+  - pip install modelscope
+  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

 - label: Engine Test # 10min
@ -150,14 +173,22 @@ steps:
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization

- label: Examples Test # 12min
+- label: V1 Test
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    - VLLM_USE_V1=1 pytest -v -s v1
+
+- label: Examples Test # 25min
  working_dir: "/vllm-workspace/examples"
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/entrypoints
  - examples/
  commands:
-    - pip install awscli tensorizer # for llava example and tensorizer test
+    - pip install tensorizer # for tensorizer test
    - python3 offline_inference.py
    - python3 cpu_offload.py
    - python3 offline_inference_chat.py
@ -167,16 +198,20 @@ steps:
    - python3 offline_inference_vision_language_multi_image.py
    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference_encoder_decoder.py
+    - python3 offline_inference_classification.py
+    - python3 offline_inference_embedding.py
+    - python3 offline_inference_scoring.py
+    - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2

- label: Prefix Caching Test # 7min
-  #mirror_hardwares: [amd]
+- label: Prefix Caching Test # 9min
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/prefix_caching
  commands:
    - pytest -v -s prefix_caching

- label: Samplers Test # 18min
+- label: Samplers Test # 36min
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
@ -192,40 +227,41 @@ steps:
  - tests/test_logits_processor
  command: pytest -v -s test_logits_processor.py

- label: Speculative decoding tests # 22min
+- label: Speculative decoding tests # 30min
  source_file_dependencies:
  - vllm/spec_decode
  - tests/spec_decode
  commands:
-    # See https://github.com/vllm-project/vllm/issues/5152
-    - export VLLM_ATTENTION_BACKEND=XFORMERS
    - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-    - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
+    - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

- label: LoRA Test %N # 30min each
+- label: LoRA Test %N # 15min each
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
  parallelism: 4

- label: "PyTorch Fullgraph Smoke Test"
+- label: "PyTorch Fullgraph Smoke Test" # 9min
  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
-  - pytest -v -s compile/test_full_graph_smoke.py
+  - pytest -v -s compile/test_basic_correctness.py
+  # these tests need to be separated, cannot combine
+  - pytest -v -s compile/piecewise/test_simple.py
+  - pytest -v -s compile/piecewise/test_toy_llama.py

- label: "PyTorch Fullgraph Test"
+- label: "PyTorch Fullgraph Test" # 18min
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py

- label: Kernels Test %N # 30min each
+- label: Kernels Test %N # 1h each
  mirror_hardwares: [amd]
  source_file_dependencies:
  - csrc/
@ -252,15 +288,14 @@ steps:
  source_file_dependencies:
  - benchmarks/
  commands:
-  - pip install aiohttp
  - bash run-benchmarks.sh

- label: Quantization Test # 15min
+- label: Quantization Test # 33min
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
-  command: pytest -v -s quantization
+  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

 - label: LM Eval Small Models # 53min
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
@ -268,7 +303,6 @@ steps:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
-  - pip install lm-eval
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-small.txt -t 1

@ -290,42 +324,91 @@ steps:

 #####  models test  #####

- label: Basic Models Test # 3min
+- label: Basic Models Test # 24min
  source_file_dependencies:
  - vllm/
  - tests/models
  commands:
    - pip install -e ./plugins/vllm_add_dummy_model
    - pytest -v -s models/test_oot_registration.py # it needs a clean process
-    - pytest -v -s models/*.py --ignore=models/test_oot_registration.py
+    - pytest -v -s models/test_registry.py
+    - pytest -v -s models/test_initialization.py

- label: Decoder-only Language Models Test # 1h3min
+- label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/language
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
  commands:
-    - pytest -v -s models/decoder_only/language
+    - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
+    - pytest -v -s models/embedding/language -m core_model

- label: Decoder-only Multi-Modal Models Test # 56min
+- label: Language Models Test (Extended) # 1h10min
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/language
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
+  commands:
+    - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
+    - pytest -v -s models/embedding/language -m 'not core_model'
+
+- label: Multi-Modal Models Test (Standard) # 28min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/audio_language
  - tests/models/decoder_only/vision_language
+  - tests/models/embedding/vision_language
+  - tests/models/encoder_decoder/vision_language
  commands:
-    - pytest -v -s models/decoder_only/audio_language
-    - pytest -v -s models/decoder_only/vision_language
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+    - pytest -v -s models/embedding/vision_language -m core_model
+    - pytest -v -s models/encoder_decoder/language -m core_model
+    - pytest -v -s models/encoder_decoder/vision_language -m core_model

- label: Other Models Test # 5min
-  #mirror_hardwares: [amd]
+- label: Multi-Modal Models Test (Extended) 1 # 1h16m
+  optional: true
  source_file_dependencies:
  - vllm/
-  - tests/models/embedding/language
-  - tests/models/encoder_decoder/language
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  - tests/models/embedding/vision_language
+  - tests/models/encoder_decoder/vision_language
  commands:
-    - pytest -v -s models/embedding/language
-    - pytest -v -s models/encoder_decoder/language
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
+    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
+    # HACK - run phi3v tests separately to sidestep this transformers bug
+    # https://github.com/huggingface/transformers/issues/34307
+    - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+    - pytest -v -s models/embedding/vision_language -m 'not core_model'
+    - pytest -v -s models/encoder_decoder/language -m 'not core_model'
+    - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
+
+- label: Multi-Modal Models Test (Extended) 2 # 38m
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/vision_language
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
+
+# This test is used only in PR development phase to test individual models and should never run on main
+- label: Custom Models Test
+  optional: true
+  commands:
+    - echo 'Testing custom models...'
+    # PR authors can temporarily add commands below to test individual models
+    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

 #####  1 GPU test  #####
 #####  multi gpus test  #####
@ -352,13 +435,13 @@ steps:
  - tests/distributed/
  commands:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'

- label: Distributed Tests (2 GPUs) # 28min
+- label: Distributed Tests (2 GPUs) # 40min
  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
@ -369,20 +452,25 @@ steps:
  - vllm/model_executor/models/
  - tests/distributed/
  - vllm/compilation
+  - vllm/worker/worker_base.py
+  - vllm/worker/worker.py
+  - vllm/worker/model_runner.py
  commands:
-  - pytest -v -s ./compile/test_full_graph_multi_gpu.py
+  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
-  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
-  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py

- label: Multi-step Tests (4 GPUs) # 21min
+- label: Multi-step Tests (4 GPUs) # 36min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@ -400,7 +488,7 @@ steps:
  - pytest -v -s multi_step/test_correctness_async_llm.py
  - pytest -v -s multi_step/test_correctness_llm.py

- label: Pipeline Parallelism Test # 23min
+- label: Pipeline Parallelism Test # 45min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@ -413,20 +501,24 @@ steps:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py

- label: LoRA Long Context (Distributed) # 11min
-  # This test runs llama 13B, so it is required to run on 4 GPUs.
+- label: LoRA TP Test (Distributed)
  num_gpus: 4
-  soft_fail: true
  source_file_dependencies:
  - vllm/lora
-  - tests/lora/test_long_context
+  - tests/lora
  commands:
    # FIXIT: find out which code initialize cuda before running the test
    # before the fix, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    # This test runs llama 13B, so it is required to run on 4 GPUs.
    - pytest -v -s -x lora/test_long_context.py
+    # There is some Tensor Parallelism related processing logic in LoRA that 
+    # requires multi-GPU testing for validation.
+    - pytest -v -s -x lora/test_chatglm3_tp.py
+    - pytest -v -s -x lora/test_llama_tp.py

- label: Weight Loading Multiple GPU Test
+
+- label: Weight Loading Multiple GPU Test  # 33min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@ -452,6 +544,7 @@ steps:

 - label: Distributed Tests (A100) # optional
  gpu: a100
+  optional: true
  num_gpus: 4
  source_file_dependencies:
  - vllm/
@ -459,17 +552,18 @@ steps:
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
-  - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

 - label: LM Eval Large Models # optional
  gpu: a100
+  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
-  - pip install lm-eval
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-large.txt -t 4
--- a/.buildkite/upload-wheels.sh
+++ b/.buildkite/upload-wheels.sh
@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+set -ex
+
+# Assume wheels are in artifacts/dist/*.whl
+wheel_files=(artifacts/dist/*.whl)
+
+# Check that exactly one wheel is found
+if [[ ${#wheel_files[@]} -ne 1 ]]; then
+  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
+  exit 1
+fi
+
+# Get the single wheel file
+wheel="${wheel_files[0]}"
+
+# Rename 'linux' to 'manylinux1' in the wheel filename
+new_wheel="${wheel/linux/manylinux1}"
+mv -- "$wheel" "$new_wheel"
+wheel="$new_wheel"
+
+# Extract the version from the wheel
+version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
+echo "Version: $version"
+
+# If the version contains "dev", rename it to v1.0.0.dev for consistency
+if [[ $version == *dev* ]]; then
+    suffix="${version##*.}"
+    if [[ $suffix == cu* ]]; then
+        new_version="1.0.0.dev+${suffix}"
+    else
+        new_version="1.0.0.dev"
+    fi
+    new_wheel="${wheel/$version/$new_version}"
+    mv -- "$wheel" "$new_wheel"
+    wheel="$new_wheel"
+    version="$new_version"
+fi
+
+# Upload the wheel to S3
+aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
--- a/.dockerignore
+++ b/.dockerignore
@ -1,4 +1,33 @@
-vllm/*.so
 /.venv
 /build
 dist
+vllm/*.so
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+.mypy_cache
+
+# Distribution / packaging
+.Python
+/build/
+cmake-build-*/
+CMakeUserPresets.json
+develop-eggs/
+/dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -0,0 +1,33 @@
+# See https://help.github.com/articles/about-codeowners/
+# for more info about CODEOWNERS file
+
+# This lists cover the "core" components of vLLM that require careful review
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+CMakeLists.txt @tlrmchlsmth
+
+# vLLM V1
+/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic
+
+# Test ownership
+/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
+/tests/test_inputs.py @DarkLight1337 @ywang96
+/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
+/tests/models @DarkLight1337 @ywang96
+/tests/multimodal @DarkLight1337 @ywang96
+/tests/prefix_caching @comaniac @KuntaiDu
+/tests/spec_decode @njhill @LiuXiaoxuanPKU
+/tests/kernels @tlrmchlsmth @WoosukKwon
+/tests/quantization @mgoin @robertgshaw2-neuralmagic
+/.buildkite/lm-eval-harness @mgoin @simon-mo
+/tests/distributed/test_multi_node_assignment.py @youkaichao
+/tests/distributed/test_pipeline_parallel.py @youkaichao
+/tests/distributed/test_same_node.py @youkaichao
+/tests/multi_step @alexm-neuralmagic @comaniac
+/tests/weight_loading @mgoin @youkaichao
+/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@ -1,2 +1,2 @@
 github: [vllm-project]
-open_collective: [vllm]
+open_collective: vllm
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -2,73 +2,4 @@ FILL IN THE PR DESCRIPTION HERE

 FIX #xxxx (*link existing issues this PR will resolve*)

-**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**
-
---
-
-<details>
-<!-- inside this <details> section, markdown rendering does not work, so we use raw html here. -->
-<summary><b> PR Checklist (Click to Expand) </b></summary>
-
-<p>Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.</p>
-
-<h3>PR Title and Classification</h3>
-<p>Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:</p>
-<ul>
-    <li><code>[Bugfix]</code> for bug fixes.</li>
-    <li><code>[CI/Build]</code> for build or continuous integration improvements.</li>
-    <li><code>[Doc]</code> for documentation fixes and improvements.</li>
-    <li><code>[Model]</code> for adding a new model or improving an existing model. Model name should appear in the title.</li>
-    <li><code>[Frontend]</code> For changes on the vLLM frontend (e.g., OpenAI API server, <code>LLM</code> class, etc.) </li>
-    <li><code>[Kernel]</code> for changes affecting CUDA kernels or other compute kernels.</li>
-    <li><code>[Core]</code> for changes in the core vLLM logic (e.g., <code>LLMEngine</code>, <code>AsyncLLMEngine</code>, <code>Scheduler</code>, etc.)</li>
-    <li><code>[Hardware][Vendor]</code> for hardware-specific changes. Vendor name should appear in the prefix (e.g., <code>[Hardware][AMD]</code>).</li>
-    <li><code>[Misc]</code> for PRs that do not fit the above categories. Please use this sparingly.</li>
-</ul>
-<p><strong>Note:</strong> If the PR spans more than one category, please include all relevant prefixes.</p>
-
-<h3>Code Quality</h3>
-
-<p>The PR need to meet the following code quality standards:</p>
-
-<ul>
-    <li>We adhere to <a href="https://google.github.io/styleguide/pyguide.html">Google Python style guide</a> and <a href="https://google.github.io/styleguide/cppguide.html">Google C++ style guide</a>.</li>
-    <li>Pass all linter checks. Please use <a href="https://github.com/vllm-project/vllm/blob/main/format.sh"><code>format.sh</code></a> to format your code.</li>
-    <li>The code need to be well-documented to ensure future contributors can easily understand the code.</li>
-    <li>Include sufficient tests to ensure the project to stay correct and robust. This includes both unit tests and integration tests.</li>
-    <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li>
-</ul>
-
-<h3>Adding or changing kernels</h3>
-<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
-<ul>
-    <li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a></li>
-    <li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.</li>
-    <li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.libary.opcheck()</code></a> to test the function registration and meta-function for any registered ops.  See <code>tests/kernels</code> for examples.</li>
-    <li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
-    <li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.
-</ul>
-
-<h3>Notes for Large Changes</h3>
-<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
-
-<h3>What to Expect for the Reviews</h3>
-
-<p>The goal of the vLLM team is to be a <i>transparent reviewing machine</i>. We would like to make the review process transparent and efficient and make sure no contributor feel confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process: </p>
-
-<ul>
-    <li> After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.</li>
-    <li> After the PR is assigned, the reviewer will provide status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.</li>
-    <li> After the review, the reviewer will put an <code> action-required</code> label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.</li>
-    <li> Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.
- </li>
-</ul>
-
-<h3>Thank You</h3>
-
-<p> Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone! </p>
-
-
-</details>
-
-
+**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html **
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -0,0 +1,31 @@
+version: 2
+updates:
+  # Maintain dependencies for GitHub Actions
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    labels: ["dependencies"]
+    open-pull-requests-limit: 5
+    reviewers: ["khluu", "simon-mo"]
+    allow:
+      - dependency-type: "all"
+    ignore:
+      - dependency-name: "*"
+        update-types: ["version-update:semver-patch"]
+      - dependency-name: "torch"
+      - dependency-name: "torchvision"
+      - dependency-name: "xformers"
+      - dependency-name: "lm-format-enforcer"
+      - dependency-name: "gguf"
+      - dependency-name: "compressed-tensors"
+      - dependency-name: "ray[adag]"
+      - dependency-name: "lm-eval"
+    groups:
+      minor-update:
+        applies-to: version-updates
+        update-types: ["minor"]
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -0,0 +1,60 @@
+pull_request_rules:
+- name: label-documentation
+  description: Automatically apply documentation label
+  conditions:
+    - or:
+      - files~=^[^/]+\.md$
+      - files~=^docs/
+  actions:
+    label:
+      add:
+        - documentation
+
+- name: label-ci-build
+  description: Automatically apply ci/build label
+  conditions:
+    - or:
+      - files~=^\.github/
+      - files~=\.buildkite/
+      - files~=^cmake/
+      - files=CMakeLists.txt
+      - files~=^Dockerfile
+      - files~=^requirements.*\.txt
+      - files=setup.py
+  actions:
+    label:
+      add:
+        - ci/build
+
+- name: label-frontend
+  description: Automatically apply frontend label
+  conditions:
+    - files~=^vllm/entrypoints/
+  actions:
+    label:
+      add:
+        - frontend
+
+- name: ping author on conflicts and add 'needs-rebase' label
+  conditions:
+      - conflict
+      - -closed
+  actions:
+    label:
+      add:
+        - needs-rebase
+    comment:
+      message: |
+       This pull request has merge conflicts that must be resolved before it can be
+       merged. Please rebase the PR, @{{author}}.
+
+       https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
+
+- name: remove 'needs-rebase' label when conflict is resolved
+  conditions:
+      - -conflict
+      - -closed
+  actions:
+    label:
+      remove:
+        - needs-rebase
--- a/.github/scripts/cleanup_pr_body.sh
+++ b/.github/scripts/cleanup_pr_body.sh
@ -0,0 +1,50 @@
+#!/bin/bash
+
+set -eu
+
+# ensure 1 argument is passed
+if [ "$#" -ne 1 ]; then
+    echo "Usage: $0 <pr_number>"
+    exit 1
+fi
+
+PR_NUMBER=$1
+OLD=/tmp/orig_pr_body.txt
+NEW=/tmp/new_pr_body.txt
+
+gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
+cp "${OLD}" "${NEW}"
+
+# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
+sed -i '/FIX #xxxx.*$/d' "${NEW}"
+
+# Remove "FILL IN THE PR DESCRIPTION HERE"
+sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
+
+# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
+sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
+
+# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
+python3 - <<EOF
+import re
+
+with open("${NEW}", "r") as file:
+    content = file.read()
+
+pattern = re.compile(r'(---\n\n)?<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>', re.DOTALL)
+content = re.sub(pattern, '', content)
+
+with open("${NEW}", "w") as file:
+    file.write(content)
+EOF
+
+# Run this only if ${NEW} is different than ${OLD}
+if ! cmp -s "${OLD}" "${NEW}"; then
+    gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
+    echo
+    echo "Updated PR body:"
+    echo
+    cat "${NEW}"
+else
+    echo "No changes needed"
+fi
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@ -0,0 +1,40 @@
+name: Lint GitHub Actions workflows
+on:
+  push:
+    branches:
+      - "main"
+    paths:
+      - '.github/workflows/*.ya?ml'
+      - '.github/workflows/actionlint.*'
+      - '.github/workflows/matchers/actionlint.json'
+  pull_request:
+    branches:
+      - "main"
+    paths:
+      - '.github/workflows/*.ya?ml'
+      - '.github/workflows/actionlint.*'
+      - '.github/workflows/matchers/actionlint.json'
+
+env:
+  LC_ALL: en_US.UTF-8
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: read
+
+jobs:
+  actionlint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Checkout"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+
+      - name: "Run actionlint"
+        run: |
+          echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+          tools/actionlint.sh -color
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@ -8,7 +8,7 @@ jobs:
        runs-on: ubuntu-latest
        steps:
            -   name: Add label
-                uses: actions/github-script@v5
+                uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
                with:
                    script: |
                        github.rest.issues.addLabels({
--- a/.github/workflows/clang-format.yml
+++ b/.github/workflows/clang-format.yml
@ -6,9 +6,21 @@ on:
  push:
    branches:
      - main
+    paths:
+      - '**/*.h'
+      - '**/*.cpp'
+      - '**/*.cu'
+      - '**/*.cuh'
+      - '.github/workflows/clang-format.yml'
  pull_request:
    branches:
      - main
+    paths:
+      - '**/*.h'
+      - '**/*.cpp'
+      - '**/*.cu'
+      - '**/*.cuh'
+      - '.github/workflows/clang-format.yml'

 jobs:
  clang-format:
@ -17,9 +29,9 @@ jobs:
      matrix:
        python-version: ["3.11"]
    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@ -0,0 +1,26 @@
+name: Cleanup PR Body
+
+on:
+  pull_request_target:
+    types: [opened, reopened, edited]
+
+permissions:
+  pull-requests: write
+
+jobs:
+  update-description:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Set up Python
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: '3.12'
+
+      - name: Update PR description
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
--- a/.github/workflows/codespell.yml
+++ b/.github/workflows/codespell.yml
@ -0,0 +1,45 @@
+name: codespell
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+    paths:
+      - "**/*.py"
+      - "**/*.md"
+      - "**/*.rst"
+      - pyproject.toml
+      - requirements-lint.txt
+      - .github/workflows/codespell.yml
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "**/*.py"
+      - "**/*.md"
+      - "**/*.rst"
+      - pyproject.toml
+      - requirements-lint.txt
+      - .github/workflows/codespell.yml
+
+jobs:
+  codespell:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements-lint.txt
+    - name: Spelling check with codespell
+      run: |
+        codespell --toml pyproject.toml
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@ -0,0 +1,81 @@
+name: Lint and Deploy Charts
+
+on: pull_request
+
+jobs:
+  lint-and-deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+
+      - name: Set up Helm
+        uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
+        with:
+          version: v3.14.4
+
+       #Python is required because ct lint runs Yamale and yamllint which require Python.
+      - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: '3.13'
+
+      - name: Set up chart-testing
+        uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
+        with:
+          version: v3.10.1
+
+      - name: Run chart-testing (lint)
+        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm
+
+      - name: Setup minio
+        run: |
+          docker network create vllm-net
+          docker run -d -p 9000:9000 --name minio --net vllm-net \
+                     -e "MINIO_ACCESS_KEY=minioadmin" \
+                     -e "MINIO_SECRET_KEY=minioadmin" \
+                     -v /tmp/data:/data \
+                     -v /tmp/config:/root/.minio \
+                     minio/minio server /data
+          export AWS_ACCESS_KEY_ID=minioadmin
+          export AWS_SECRET_ACCESS_KEY=minioadmin
+          export AWS_EC2_METADATA_DISABLED=true
+          mkdir opt-125m
+          cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
+          aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
+          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
+
+      - name: Create kind cluster
+        uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
+
+      - name: Build the Docker image vllm cpu
+        run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
+
+      - name: Configuration of docker images, network and namespace for the kind cluster
+        run: |
+          docker pull amazon/aws-cli:2.6.4
+          kind load docker-image  amazon/aws-cli:2.6.4 --name chart-testing
+          kind load docker-image vllm-cpu-env:latest --name chart-testing
+          docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
+          kubectl create ns ns-vllm
+
+      - name: Run chart-testing (install)
+        run: |
+          export AWS_ACCESS_KEY_ID=minioadmin
+          export AWS_SECRET_ACCESS_KEY=minioadmin
+          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+    
+      - name: curl test
+        run: |
+          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
+          sleep 10
+          CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
+                  --header "Content-Type: application/json" \
+                  --data '{
+                          "model": "opt-125m",
+                          "prompt": "San Francisco is a",
+                          "max_tokens": 7,
+                          "temperature": 0
+                  }'):$CODE"
+          echo "$CODE"
--- a/.github/workflows/matchers/actionlint.json
+++ b/.github/workflows/matchers/actionlint.json
@ -0,0 +1,17 @@
+{
+  "problemMatcher": [
+    {
+      "owner": "actionlint",
+      "pattern": [
+        {
+          "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
+          "file": 1,
+          "line": 2,
+          "column": 3,
+          "message": 4,
+          "code": 5
+        }
+      ]
+    }
+  ]
+}
--- a/.github/workflows/matchers/mypy.json
+++ b/.github/workflows/matchers/mypy.json
@ -0,0 +1,16 @@
+{
+  "problemMatcher": [
+    {
+      "owner": "mypy",
+      "pattern": [
+        {
+          "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$",
+          "file": 1,
+          "line": 2,
+          "severity": 3,
+          "message": 4
+        }
+      ]
+    }
+  ]
+}
--- a/.github/workflows/matchers/ruff.json
+++ b/.github/workflows/matchers/ruff.json
@ -0,0 +1,17 @@
+{
+    "problemMatcher": [
+      {
+        "owner": "ruff",
+        "pattern": [
+          {
+            "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$",
+            "file": 1,
+            "line": 2,
+            "column": 3,
+            "code": 4,
+            "message": 5
+          }
+        ]
+      }
+    ]
+  }
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@ -6,20 +6,35 @@ on:
  push:
    branches:
      - main
+    paths:
+      - '**/*.py'
+      - '.github/workflows/mypy.yaml'
+      - 'tools/mypy.sh'
+      - 'pyproject.toml'
  pull_request:
    branches:
      - main
+    # This workflow is only relevant when one of the following files changes.
+    # However, we have github configured to expect and require this workflow
+    # to run and pass before github with auto-merge a pull request. Until github
+    # allows more flexible auto-merge policy, we can just run this on every PR.
+    # It doesn't take that long to run, anyway.
+    #paths:
+    #  - '**/*.py'
+    #  - '.github/workflows/mypy.yaml'
+    #  - 'tools/mypy.sh'
+    #  - 'pyproject.toml'

 jobs:
-  ruff:
+  mypy:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
@ -32,15 +47,5 @@ jobs:
        pip install types-setuptools
    - name: Mypy
      run: |
-        mypy
-        mypy tests --follow-imports skip
-        mypy vllm/attention --follow-imports skip
-        mypy vllm/distributed --follow-imports skip
-        mypy vllm/engine  --follow-imports skip
-        mypy vllm/executor --follow-imports skip
-        mypy vllm/lora --follow-imports skip
-        mypy vllm/model_executor  --follow-imports skip
-        mypy vllm/prompt_adapter --follow-imports skip
-        mypy vllm/spec_decode --follow-imports skip
-        mypy vllm/worker --follow-imports skip
-
+        echo "::add-matcher::.github/workflows/matchers/mypy.json"
+        tools/mypy.sh 1 ${{ matrix.python-version }}
--- a/.github/workflows/png-lint.yml
+++ b/.github/workflows/png-lint.yml
@ -0,0 +1,37 @@
+name: Lint PNG exports from excalidraw
+on:
+  push:
+    branches:
+      - "main"
+    paths:
+      - '*.excalidraw.png'
+      - '.github/workflows/png-lint.yml'
+  pull_request:
+    branches:
+      - "main"
+    paths:
+      - '*.excalidraw.png'
+      - '.github/workflows/png-lint.yml'
+
+env:
+  LC_ALL: en_US.UTF-8
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: read
+
+jobs:
+  actionlint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Checkout"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+
+      - name: "Run png-lint.sh to check excalidraw exported images"
+        run: |
+          tools/png-lint.sh
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@ -21,16 +21,16 @@ jobs:
      upload_url: ${{ steps.create_release.outputs.upload_url }}
    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Extract branch info
        shell: bash
        run: |
-          echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
+          echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"

      - name: Create Release
        id: create_release
-        uses: "actions/github-script@v6"
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        env:
          RELEASE_TAG: ${{ env.release_tag }}
        with:
@ -48,16 +48,16 @@ jobs:
      fail-fast: false
      matrix:
          os: ['ubuntu-20.04']
-          python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+          python-version: ['3.9', '3.10', '3.11', '3.12']
          pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
          cuda-version: ['11.8', '12.1']

    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Setup ccache
-        uses: hendrikmuhs/ccache-action@v1.2
+        uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
        with:
          create-symlink: true
          key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
@ -68,7 +68,7 @@ jobs:
          bash -x .github/workflows/scripts/env.sh

      - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
            python-version: ${{ matrix.python-version }}

@ -86,13 +86,13 @@ jobs:
          CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
        run: |
          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
-          wheel_name=$(ls dist/*whl | xargs -n 1 basename)
+          wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
          asset_name=${wheel_name//"linux"/"manylinux1"}
-          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
-          echo "asset_name=${asset_name}" >> $GITHUB_ENV
+          echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
+          echo "asset_name=${asset_name}" >> "$GITHUB_ENV"

      - name: Upload Release Asset
-        uses: actions/upload-release-asset@v1
+        uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@ -8,7 +8,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Remind to run full CI on PR
-        uses: actions/github-script@v6
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        with:
          script: |
            github.rest.issues.createComment({
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@ -6,20 +6,37 @@ on:
  push:
    branches:
      - main
+    paths:
+      - "**/*.py"
+      - pyproject.toml
+      - requirements-lint.txt
+      - .github/workflows/matchers/ruff.json
+      - .github/workflows/ruff.yml
  pull_request:
    branches:
      - main
+    # This workflow is only relevant when one of the following files changes.
+    # However, we have github configured to expect and require this workflow
+    # to run and pass before github with auto-merge a pull request. Until github
+    # allows more flexible auto-merge policy, we can just run this on every PR.
+    # It doesn't take that long to run, anyway.
+    #paths:
+    #  - "**/*.py"
+    #  - pyproject.toml
+    #  - requirements-lint.txt
+    #  - .github/workflows/matchers/ruff.json
+    #  - .github/workflows/ruff.yml

 jobs:
  ruff:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.12"]
    steps:
-    - uses: actions/checkout@v2
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
@ -28,10 +45,8 @@ jobs:
          pip install -r requirements-lint.txt
      - name: Analysing the code with ruff
        run: |
-        ruff check .
-    - name: Spelling check with codespell
-      run: |
-        codespell --toml pyproject.toml
+          echo "::add-matcher::.github/workflows/matchers/ruff.json"
+          ruff check --output-format github .
      - name: Run isort
        run: |
          isort . --check-only
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@ -1,4 +1,5 @@
 #!/bin/bash
+set -eux

 python_executable=python$1
 cuda_home=/usr/local/cuda-$2
@ -8,13 +9,15 @@ PATH=${cuda_home}/bin:$PATH
 LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH

 # Install requirements
-$python_executable -m pip install wheel packaging
-$python_executable -m pip install -r requirements-cuda.txt
+$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt

 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
 # Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
+
+bash tools/check_repo.sh
+
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
--- a/.github/workflows/scripts/cuda-install.sh
+++ b/.github/workflows/scripts/cuda-install.sh
@ -1,16 +1,16 @@
 #!/bin/bash

 # Replace '.' with '-' ex: 11.8 -> 11-8
-cuda_version=$(echo $1 | tr "." "-")
+cuda_version=$(echo "$1" | tr "." "-")
 # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
-OS=$(echo $2 | tr -d ".\-")
+OS=$(echo "$2" | tr -d ".\-")

 # Installs CUDA
-wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
+wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb"
 sudo dpkg -i cuda-keyring_1.1-1_all.deb
 rm cuda-keyring_1.1-1_all.deb
 sudo apt -qq update
-sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
+sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}"
 sudo apt clean

 # Test nvcc
--- a/.github/workflows/scripts/pytorch-install.sh
+++ b/.github/workflows/scripts/pytorch-install.sh
@ -6,7 +6,7 @@ cuda_version=$3

 # Install torch
 $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
-$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
+$python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}"

 # Print version information
 $python_executable --version
--- a/.github/workflows/shellcheck.yml
+++ b/.github/workflows/shellcheck.yml
@ -0,0 +1,37 @@
+name: Lint shell scripts
+on:
+  push:
+    branches:
+      - "main"
+    paths:
+      - '**/*.sh'
+      - '.github/workflows/shellcheck.yml'
+  pull_request:
+    branches:
+      - "main"
+    paths:
+      - '**/*.sh'
+      - '.github/workflows/shellcheck.yml'
+
+env:
+  LC_ALL: en_US.UTF-8
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: read
+
+jobs:
+  shellcheck:
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Checkout"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+
+      - name: "Check shell scripts"
+        run: |
+          tools/shellcheck.sh
--- a/.github/workflows/sphinx-lint.yml
+++ b/.github/workflows/sphinx-lint.yml
@ -0,0 +1,32 @@
+name: Lint documentation
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "docs/**"
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "docs/**"
+
+jobs:
+  sphinx-lint:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements-lint.txt
+      - name: Linting docs
+        run: tools/sphinx-lint.sh
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -0,0 +1,52 @@
+name: 'Close inactive issues and PRs'
+
+on:
+  schedule:
+    # Daily at 1:30 AM UTC
+    - cron: '30 1 * * *'
+
+jobs:
+  close-issues-and-pull-requests:
+    permissions:
+      issues: write
+      pull-requests: write
+      actions: write
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
+        with:
+          # Increasing this value ensures that changes to this workflow
+          # propagate to all issues and PRs in days rather than months
+          operations-per-run: 1000
+
+          exempt-draft-pr: true
+          exempt-issue-labels: 'keep-open'
+          exempt-pr-labels: 'keep-open'
+
+          labels-to-add-when-unstale: 'unstale'
+          labels-to-remove-when-stale: 'unstale'
+
+          days-before-issue-stale: 90
+          days-before-issue-close: 30
+          stale-issue-label: 'stale'
+          stale-issue-message: >
+            This issue has been automatically marked as stale because it has not
+            had any activity within 90 days. It will be automatically closed if no
+            further activity occurs within 30 days. Leave a comment if
+            you feel this issue should remain open. Thank you!
+          close-issue-message: >
+            This issue has been automatically closed due to inactivity. Please
+            feel free to reopen if you feel it is still relevant. Thank you!
+
+          days-before-pr-stale: 90
+          days-before-pr-close: 30
+          stale-pr-label: 'stale'
+          stale-pr-message: >
+            This pull request has been automatically marked as stale because it
+            has not had any activity within 90 days. It will be automatically
+            closed if no further activity occurs within 30 days. Leave a comment
+            if you feel this pull request should remain open. Thank you!
+          close-pr-message: >
+            This pull request has been automatically closed due to inactivity.
+            Please feel free to reopen if you intend to continue working on it.
+            Thank you!
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@ -6,19 +6,26 @@ on:
  push:
    branches:
      - main
+    paths:
+      - "**/*.py"
+      - .github/workflows/yapf.yml
  pull_request:
    branches:
      - main
+    paths:
+      - "**/*.py"
+      - .github/workflows/yapf.yml
+
 jobs:
  yapf:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.12"]
    steps:
-    - uses: actions/checkout@v2
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
--- a/.gitignore
+++ b/.gitignore
@ -33,6 +33,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+/.deps/

 # PyInstaller
 #  Usually these files are written by a python script from a template
@ -198,3 +199,7 @@ hip_compat.h

 # Benchmark dataset
 benchmarks/*.json
+
+# Linting
+actionlint
+shellcheck*/
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@ -6,15 +6,14 @@ version: 2
 build:
  os: ubuntu-22.04
  tools:
-    python: "3.8"
+    python: "3.12"

 sphinx:
  configuration: docs/source/conf.py
  fail_on_warning: true

 # If using Sphinx, optionally build your docs in additional formats such as PDF
-formats:
-   - pdf
+formats: []

 # Optionally declare the Python requirements required to build your docs
 python:
--- a/.shellcheckrc
+++ b/.shellcheckrc
@ -0,0 +1,9 @@
+# rules currently disabled:
+#
+#   SC1091 (info): Not following: <sourced file> was not specified as input (see shellcheck -x)
+#   SC2004 (style): $/${} is unnecessary on arithmetic variables.
+#   SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects.
+#   SC2155 (warning): Declare and assign separately to avoid masking return values.
+#   SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails.
+#
+disable=SC1091,SC2004,SC2129,SC2155,SC2164
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -31,13 +31,13 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12")
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")

 # Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
+set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")

 #
 # Supported/expected torch versions for CUDA/ROCm.
@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")

 #
 # Try to find python package with an executable that exactly matches
@ -83,24 +83,6 @@ endif()
 #
 find_package(Torch REQUIRED)

-#
-message(STATUS "Enabling core extension.")
-
-# Define _core_C extension
-#  built for (almost) every target platform, (excludes TPU and Neuron)
-
-set(VLLM_EXT_SRC
-  "csrc/core/torch_bindings.cpp")
-
-define_gpu_extension_target(
-  _core_C
-  DESTINATION vllm
-  LANGUAGE CXX
-  SOURCES ${VLLM_EXT_SRC}
-  COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
-  USE_SABI 3
-  WITH_SOABI)
-
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
@ -143,14 +125,32 @@ else()
  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
 endif()

+
+if(VLLM_GPU_LANG STREQUAL "CUDA")
  #
-# Override the GPU architectures detected by cmake/torch and filter them by
-# the supported versions for the current language.
+  # For cuda we want to be able to control which architectures we compile for on
+  # a per-file basis in order to cut down on compile time. So here we extract
+  # the set of architectures we want to compile for and remove the from the
+  # CMAKE_CUDA_FLAGS so that they are not applied globally.
+  #
+  clear_cuda_arches(CUDA_ARCH_FLAGS)
+  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
+  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
+  # Filter the target architectures by the supported supported archs
+  # since for some files we will build for all CUDA_ARCHS.
+  cuda_archs_loose_intersection(CUDA_ARCHS
+    "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
+  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
+else()
+  #
+  # For other GPU targets override the GPU architectures detected by cmake/torch
+  # and filter them by the supported versions for the current language.
  # The final set of arches is stored in `VLLM_GPU_ARCHES`.
  #
  override_gpu_arches(VLLM_GPU_ARCHES
    ${VLLM_GPU_LANG}
    "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
+endif()

 #
 # Query torch for additional GPU compilation flags for the given
@ -166,7 +166,16 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

+
+#
+# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
+# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
+# Each dependency that produces build artifacts should override its BINARY_DIR to avoid
+# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/<dependency>.
+#
 include(FetchContent)
+file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
+message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

 #
 # Define other extension targets
@ -178,15 +187,18 @@ include(FetchContent)

 set(VLLM_EXT_SRC
  "csrc/cache_kernels.cu"
-  "csrc/attention/attention_kernels.cu"
+  "csrc/attention/paged_attention_v1.cu"
+  "csrc/attention/paged_attention_v2.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
+  "csrc/layernorm_quant_kernels.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
  "csrc/quantization/fp8/common.cu"
+  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
+  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/cuda_utils_kernels.cu"
-  "csrc/moe_align_block_size_kernels.cu"
  "csrc/prepare_inputs/advance_step.cu"
  "csrc/torch_bindings.cpp")

@ -196,6 +208,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
  set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")

+  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
+  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
+    set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR})
+  endif()
+
+  if(VLLM_CUTLASS_SRC_DIR)
+    if(NOT IS_ABSOLUTE VLLM_CUTLASS_SRC_DIR)
+      get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE)
+    endif()
+    message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation")
+    FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR})
+  else()
    FetchContent_Declare(
        cutlass
        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
@ -207,6 +231,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
        # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
        GIT_SHALLOW TRUE
    )
+  endif()
  FetchContent_MakeAvailable(cutlass)

  list(APPEND VLLM_EXT_SRC
@ -214,30 +239,88 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
+    "csrc/custom_all_reduce.cu"
+    "csrc/permute_cols.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
+
+  set_gencode_flags_for_srcs(
+    SRCS "${VLLM_EXT_SRC}"
+    CUDA_ARCHS "${CUDA_ARCHS}")
+
+  # Only build Marlin kernels if we are building for at least some compatible archs.
+  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
+  # are not supported by Machete yet.
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS})
+  if (MARLIN_ARCHS)
+    set(MARLIN_SRCS
+       "csrc/quantization/fp8/fp8_marlin.cu"
       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
       "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
-    "csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
-    "csrc/quantization/gguf/gguf_kernel.cu"
-    "csrc/quantization/fp8/fp8_marlin.cu"
-    "csrc/custom_all_reduce.cu"
-    "csrc/permute_cols.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+       "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_SRCS}"
+      CUDA_ARCHS "${MARLIN_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
+    message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
+  else()
+    message(STATUS "Not building Marlin kernels as no compatible archs found"
+                   " in CUDA target architectures")
+  endif()

  #
-  # The CUTLASS kernels for Hopper require sm90a to be enabled.
-  # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
-  # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
-    set_source_files_properties(
-          "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
-          PROPERTIES
-          COMPILE_FLAGS
-          "-gencode arch=compute_90a,code=sm_90a")
+  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
+  # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
+  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
+    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
+    message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
+      message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
+                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
+                     "later if you intend on running FP8 quantized models on "
+                     "Hopper.")
+    else()
+      message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
+                     "in CUDA target architectures")
+    endif()
+
+    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
+    # build any 3x kernels
+    set(SCALED_MM_3X_ARCHS)
+  endif()
+
+  #
+  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
+  # kernels for the remaining archs that are not already built for 3x.
+  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
+    "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
+  # subtract out the archs that are already built for 3x
+  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
+  if (SCALED_MM_2X_ARCHS)
+    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
+    message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
+  else()
+    if (SCALED_MM_3X_ARCHS)
+      message(STATUS "Not building scaled_mm_c2x as all archs are already built"
+                     " for and covered by scaled_mm_c3x")
+    else()
+      message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
+                    "in CUDA target architectures")
+    endif()
  endif()


@ -245,15 +328,26 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Machete kernels

  # The machete kernels only work on hopper and require CUDA 12.0 or later.
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
+  # Only build Machete kernels if we are building for something compatible with sm90a
+  cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
    #
    # For the Machete kernels we automatically generate sources for various
    # preselected input type pairs and schedules.
    # Generate sources:
+    set(MACHETE_GEN_SCRIPT
+      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
+    file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
+
+    message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
+    message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")
+
+    if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
+        OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
-        ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py
+          ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
        RESULT_VARIABLE machete_generation_result
        OUTPUT_VARIABLE machete_generation_output
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
@ -266,26 +360,40 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                            "\nCheck the log for details: "
                            "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
      else()
+        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
+            CACHE STRING "Last run machete generate script hash" FORCE)
        message(STATUS "Machete generation completed successfully.")
      endif()
+    else()
+      message(STATUS "Machete generation script has not changed, skipping generation.")
+    endif()

    # Add machete generated sources
    file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
    list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
-    message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}")

-    set_source_files_properties(
-          ${MACHETE_GEN_SOURCES}
-          PROPERTIES
-          COMPILE_FLAGS
-          "-gencode arch=compute_90a,code=sm_90a")
-  endif()
+    # forward compatible
+    set_gencode_flags_for_srcs(
+      SRCS "${MACHETE_GEN_SOURCES}"
+      CUDA_ARCHS "${MACHETE_ARCHS}")

-  # Add pytorch binding for machete (add on even CUDA < 12.0 so that we can
-  #  raise an error if the user that this was built with an incompatible 
-  #  CUDA version)
    list(APPEND VLLM_EXT_SRC
      csrc/quantization/machete/machete_pytorch.cu)
+
+    message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
+        AND MACHETE_ARCHS)
+      message(STATUS "Not building Machete kernels as CUDA Compiler version is "
+                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
+                     "later if you intend on running w4a16 quantized models on "
+                     "Hopper.")
+    else()
+      message(STATUS "Not building Machete kernels as no compatible archs "
+                     "found in CUDA target architectures")
+    endif()
+  endif()
+# if CUDA endif
 endif()

 message(STATUS "Enabling C extension.")
@ -312,16 +420,36 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)

 set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
+  "csrc/moe/moe_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")

+set_gencode_flags_for_srcs(
+  SRCS "${VLLM_MOE_EXT_SRC}"
+  CUDA_ARCHS "${CUDA_ARCHS}")
+
 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  list(APPEND VLLM_MOE_EXT_SRC
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
+  if (MARLIN_MOE_ARCHS)
+    set(MARLIN_MOE_SRC
        "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
        "csrc/moe/marlin_moe_ops.cu")
+
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_MOE_SRC}"
+      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
+
+    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
+    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
+  else()
+    message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
+                   " in CUDA target architectures")
+  endif()
 endif()

 message(STATUS "Enabling moe extension.")
@ -359,6 +487,17 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
  return()
 endif ()

+# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
+# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
+# arches in the CUDA case (and instead set the gencodes on a per file basis)
+# we need to manually set VLLM_GPU_ARCHES here.
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  foreach(_ARCH ${CUDA_ARCHS})
+    string(REPLACE "." "" _ARCH "${_ARCH}")
+    list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
+  endforeach()
+endif()
+
 #
 # Build vLLM flash attention from source
 #
@ -384,8 +523,10 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
+          GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb
          GIT_PROGRESS TRUE
+          # Don't share the vllm-flash-attn build between build types
+          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
  )
 endif()

--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,56 +1,3 @@
 # Contributing to vLLM

-Thank you for your interest in contributing to vLLM!
-Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
-There are several ways you can contribute to the project:
-
- Identify and report any issues or bugs.
- Request or add a new model.
- Suggest or implement new features.
-
-However, remember that contributions aren't just about code.
-We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
-
-Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
-Talk about it in your blog posts, highlighting how it's driving your incredible projects.
-Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
-
-
-## Setup for development
-
-### Build from source
-
-```bash
-pip install -e .  # This may take several minutes.
-```
-
-### Testing
-
-```bash
-pip install -r requirements-dev.txt
-
-# linting and formatting
-bash format.sh
-# Static type checking
-mypy
-# Unit tests
-pytest tests/
-```
-**Note:** Currently, the repository does not pass the mypy tests.
-
-
-## Contributing Guidelines
-
-### Issue Reporting
-
-If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
-If not, please file a new issue, providing as much relevant information as possible.
-
-### Pull Requests & Code Reviews
-
-Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution.
-
-### Thank You
-
-Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
-Your contributions make vLLM a great tool for everyone!
+You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
--- a/34
+++ b/34
@ -0,0 +1,34 @@
+Developer Certificate of Origin
+Version 1.1
+
+Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+
+Everyone is permitted to copy and distribute verbatim copies of this
+license document, but changing it is not allowed.
+
+
+Developer's Certificate of Origin 1.1
+
+By making a contribution to this project, I certify that:
+
+(a) The contribution was created in whole or in part by me and I
+    have the right to submit it under the open source license
+    indicated in the file; or
+
+(b) The contribution is based upon previous work that, to the best
+    of my knowledge, is covered under an appropriate open source
+    license and I have the right under that license to submit that
+    work with modifications, whether created in whole or in part
+    by me, under the same open source license (unless I am
+    permitted to submit under a different license), as indicated
+    in the file; or
+
+(c) The contribution was provided directly to me by some other
+    person who certified (a), (b) or (c) and I have not modified
+    it.
+
+(d) I understand and agree that this project and the contribution
+    are public and that a record of the contribution (including all
+    personal information I submit with it, including my sign-off) is
+    maintained indefinitely and may be redistributed consistent with
+    this project or the open source license(s) involved.
--- a/76
+++ b/76
@ -11,6 +11,7 @@ ARG CUDA_VERSION=12.4.1
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
 ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
+ARG TARGETPLATFORM
 ENV DEBIAN_FRONTEND=noninteractive

 # Install Python and other dependencies
@ -27,6 +28,14 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

+# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
+# as it was causing spam when compiling the CUTLASS kernels
+RUN apt-get install -y gcc-10 g++-10
+RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
+RUN <<EOF
+gcc --version
+EOF
+
 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
@ -38,9 +47,14 @@ WORKDIR /workspace
 # install build and runtime dependencies
 COPY requirements-common.txt requirements-common.txt
 COPY requirements-cuda.txt requirements-cuda.txt
+COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-cuda.txt

+RUN --mount=type=cache,target=/root/.cache/pip \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        python3 -m pip install -r requirements-cuda-arm64.txt; \
+    fi

 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
@ -55,6 +69,7 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}

 #################### WHEEL BUILD IMAGE ####################
 FROM base AS build
+ARG TARGETPLATFORM

 # install build dependencies
 COPY requirements-build.txt requirements-build.txt
@ -62,15 +77,15 @@ COPY requirements-build.txt requirements-build.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-build.txt

-# files and directories related to build wheels
-COPY csrc csrc
-COPY setup.py setup.py
-COPY cmake cmake
-COPY CMakeLists.txt CMakeLists.txt
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cuda.txt requirements-cuda.txt
-COPY pyproject.toml pyproject.toml
-COPY vllm vllm
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        python3 -m pip install -r requirements-cuda-arm64.txt; \
+    fi
+
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

 # max jobs used by Ninja to build extensions
 ARG max_jobs=2
@ -131,15 +146,18 @@ COPY requirements-test.txt requirements-test.txt
 COPY requirements-dev.txt requirements-dev.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-dev.txt
-
 #################### DEV IMAGE ####################
+
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
+FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
+ARG TARGETPLATFORM
+
+COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt

 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
@ -165,16 +183,24 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

-# install vllm wheel first, so that torch etc will be installed
+# Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install dist/*.whl --verbose

 RUN --mount=type=cache,target=/root/.cache/pip \
-    . /etc/environment && \
-    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
-#################### vLLM installation IMAGE ####################
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        pip uninstall -y torch && \
+        python3 -m pip install -r requirements-cuda-arm64.txt; \
+    fi

+RUN --mount=type=cache,target=/root/.cache/pip \
+. /etc/environment && \
+if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
+    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
+fi
+COPY examples examples
+#################### vLLM installation IMAGE ####################

 #################### TEST IMAGE ####################
 # image to run unit testing suite
@ -187,13 +213,24 @@ ADD . /vllm-workspace/
 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-dev.txt

+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -e tests/vllm_test_utils
+
+# enable fast downloads from hf (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install hf_transfer
+ENV HF_HUB_ENABLE_HF_TRANSFER 1
+
+# Copy in the v1 package for testing (it isn't distributed yet)
+COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
+
 # doc requires source code
 # we hide them inside `test_docs/` , so that this source code
 # will not be imported by other tests
 RUN mkdir test_docs
 RUN mv docs test_docs/
 RUN mv vllm test_docs/
-
 #################### TEST IMAGE ####################

 #################### OPENAI API SERVER ####################
@ -202,8 +239,11 @@ FROM vllm-base AS vllm-openai

 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10
-
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \
+    else \
+        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \
+    fi
 ENV VLLM_USAGE_SOURCE production-docker-image

 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Dockerfile.arm
+++ b/Dockerfile.arm
@ -0,0 +1,62 @@
+# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
+
+FROM ubuntu:22.04 AS cpu-test-arm
+
+ENV CCACHE_DIR=/root/.cache/ccache
+
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install py-cpuinfo  # Use this to gather CPU info and optimize based on ARM Neoverse cores
+
+# Set LD_PRELOAD for tcmalloc on ARM
+ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
+
+RUN echo 'ulimit -c 0' >> ~/.bashrc
+
+WORKDIR /workspace
+
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
+    pip install --upgrade pip && \
+    pip install -r requirements-build.txt
+
+FROM cpu-test-arm AS build
+
+WORKDIR /workspace/vllm
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
+    pip install -v -r requirements-cpu.txt
+
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+# Disabling AVX512 specific optimizations for ARM
+ARG VLLM_CPU_DISABLE_AVX512="true"
+ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=bind,source=.git,target=.git \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
+    pip install dist/*.whl && \
+    rm -rf dist
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@ -16,35 +16,23 @@ RUN --mount=type=cache,target=/var/cache/apt \
 # intel-openmp provides additional performance improvement vs. openmp
 # tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install intel-openmp
+    pip install intel-openmp==2025.0.1

 ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"

 RUN echo 'ulimit -c 0' >> ~/.bashrc

-RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
+RUN pip install intel_extension_for_pytorch==2.5.0

 WORKDIR /workspace

-ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
    pip install --upgrade pip && \
    pip install -r requirements-build.txt

-# install oneDNN
-RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
-
-RUN --mount=type=cache,target=/root/.cache/ccache \
-    cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \ 
-    -DONEDNN_BUILD_DOC=OFF \ 
-    -DONEDNN_BUILD_EXAMPLES=OFF \ 
-    -DONEDNN_BUILD_TESTS=OFF \ 
-    -DONEDNN_BUILD_GRAPH=OFF \ 
-    -DONEDNN_ENABLE_WORKLOAD=INFERENCE \ 
-    -DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
-    cmake --build ./oneDNN/build --target install --config Release
-
 FROM cpu-test-1 AS build

 WORKDIR /workspace/vllm
@ -54,7 +42,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
    pip install -v -r requirements-cpu.txt

-COPY ./ ./
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

 # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
 ARG VLLM_CPU_DISABLE_AVX512
@ -71,4 +62,8 @@ WORKDIR /workspace/

 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -e tests/vllm_test_utils
+
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Dockerfile.hpu
+++ b/Dockerfile.hpu
@ -0,0 +1,21 @@
+FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install -v -r requirements-hpu.txt
+
+ENV no_proxy=localhost,127.0.0.1
+ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
+
+RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
+
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@ -1,5 +1,6 @@
 # default base image
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"
+# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"

 FROM $BASE_IMAGE

@ -17,7 +18,7 @@ RUN apt-get update && \
 # When launching the container, mount the code directory to /app
 ARG APP_MOUNT=/app
 VOLUME [ ${APP_MOUNT} ]
-WORKDIR ${APP_MOUNT}
+WORKDIR ${APP_MOUNT}/vllm

 RUN python3 -m pip install --upgrade pip
 RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
@ -25,17 +26,20 @@ RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
 RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U

-COPY . /app/vllm
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

-RUN cd /app/vllm \
-    && python3 -m pip install -U \
-        cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+RUN python3 -m pip install -U \
+        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
        -r requirements-neuron.txt

 ENV VLLM_TARGET_DEVICE neuron
 RUN --mount=type=bind,source=.git,target=.git \
-    cd /app/vllm \
-    && pip install --no-build-isolation -v -e . \
-    && cd ..
+    pip install --no-build-isolation -v -e .
+
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils

 CMD ["/bin/bash"]
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@ -9,23 +9,20 @@ RUN apt-get update -y && \
        ffmpeg libsm6 libxext6 libgl1
 WORKDIR /workspace

-# copy requirements
-COPY requirements-build.txt /workspace/vllm/
-COPY requirements-common.txt /workspace/vllm/
-COPY requirements-openvino.txt /workspace/vllm/
-
-COPY vllm/ /workspace/vllm/vllm
-COPY csrc/core /workspace/vllm/csrc/core
-COPY cmake/utils.cmake /workspace/vllm/cmake/
-COPY CMakeLists.txt /workspace/vllm/
-COPY setup.py /workspace/vllm/
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

 # install build requirements
-RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
 # build vLLM with OpenVINO backend
-RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace

-COPY examples/ /workspace/vllm/examples
-COPY benchmarks/ /workspace/vllm/benchmarks
+COPY examples/ /workspace/examples
+COPY benchmarks/ /workspace/benchmarks
+
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils

 CMD ["/bin/bash"]
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@ -14,11 +14,14 @@ RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p
 COPY ./ /workspace/vllm

 WORKDIR /workspace/vllm
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi

 # These packages will be in rocketce eventually
 RUN --mount=type=cache,target=/root/.cache/pip  \
    pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
-        cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
        torch==2.3.1 \
        -r requirements-cpu.txt \
        xformers uvloop==0.20.0
@ -26,8 +29,11 @@ RUN --mount=type=cache,target=/root/.cache/pip  \
 RUN --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py install

+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
 WORKDIR /workspace/

 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@ -51,9 +51,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
        *"rocm-6.2"*) \
            python3 -m pip uninstall -y torch torchvision \
            && python3 -m pip install --pre \
-                torch==2.6.0.dev20240918 \
-                setuptools-scm>=8 \
-                torchvision==0.20.0.dev20240918 \
+                torch==2.6.0.dev20241113+rocm6.2 \
+                'setuptools-scm>=8' \
+                torchvision==0.20.0.dev20241113+rocm6.2 \
                --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
        *) ;; esac

@ -117,6 +117,11 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
 FROM base AS final
 # Import the vLLM development directory from the build context
 COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+RUN python3 -m pip install --upgrade pip

 # Package upgrades for useful functionality or to avoid dependency issues
 RUN --mount=type=cache,target=/root/.cache/pip \
@ -163,4 +168,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    if ls libs/*.whl; then \
    python3 -m pip install libs/*.whl; fi

+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
 CMD ["/bin/bash"]
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@ -1,29 +1,28 @@
-ARG NIGHTLY_DATE="20240828"
+ARG NIGHTLY_DATE="20241017"
 ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"

 FROM $BASE_IMAGE
-WORKDIR /workspace
+WORKDIR /workspace/vllm

 # Install some basic utilities
 RUN apt-get update && apt-get install -y \
    git \
    ffmpeg libsm6 libxext6 libgl1

-# Install the TPU and Pallas dependencies.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-
 # Build vLLM.
-COPY . /workspace/vllm
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
+
 ENV VLLM_TARGET_DEVICE="tpu"
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git \
-     cd /workspace/vllm && \
    python3 -m pip install \
-        cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
        -r requirements-tpu.txt
-RUN cd /workspace/vllm && python3 setup.py develop
+RUN python3 setup.py develop
+
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils

 CMD ["/bin/bash"]
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@ -1,4 +1,4 @@
-FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
+FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base

 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
@ -8,19 +8,62 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
    chmod 644 /usr/share/keyrings/intel-graphics.gpg

 RUN apt-get update -y && \
-    apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
-
-COPY ./ /workspace/vllm
+    apt-get install -y --no-install-recommends --fix-missing \
+    curl \
+    ffmpeg \
+    git \
+    libsndfile1 \
+    libsm6 \
+    libxext6 \
+    libgl1 \
+    lsb-release \
+    numactl \
+    python3 \
+    python3-dev \
+    python3-pip \
+    # vim \
+    wget

 WORKDIR /workspace/vllm
+COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
+COPY requirements-common.txt /workspace/vllm/requirements-common.txt

 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -v --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
-        cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+    pip install --no-cache-dir \
    -r requirements-xpu.txt

+RUN git clone https://github.com/intel/pti-gpu && \
+    cd pti-gpu/sdk && \
+    git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
+    mkdir build && \
+    cd build && \
+    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
+    make -j && \
+    cmake --install . --config Release --prefix "/usr/local"
+
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
+
+COPY . .
+ARG GIT_REPO_CHECK
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
+
+ENV VLLM_TARGET_DEVICE=xpu
+
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git \
-    VLLM_TARGET_DEVICE=xpu python3 setup.py install
+    python3 setup.py install

 CMD ["/bin/bash"]
+
+FROM vllm-base AS vllm-openai
+
+# install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install accelerate hf_transfer 'modelscope!=1.15.0'
+
+ENV VLLM_USAGE_SOURCE production-docker-image \
+    TRITON_XPU_PROFILE 1
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/README.md
+++ b/README.md
@ -10,22 +10,16 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>

 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> |
-
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>

-
---
-
-**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco**
-
-We are excited to announce our special vLLM event in collaboration with AMD and Anyscale.
-Join us to learn more about recent advancements of vLLM on MI300X.
-Register [here](https://lu.ma/db5ld9n5) and be a part of the event!
-
 ---

 *Latest News* 🔥
+- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
+- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
+- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
+- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
 - [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
 - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
 - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
@ -51,7 +45,7 @@ vLLM is fast with:
 - Speculative decoding
 - Chunked prefill

-**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
+**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.

 vLLM is flexible and easy to use with:

@ -107,6 +101,7 @@ vLLM is a community project. Our compute resources for development and testing a
 - Dropbox
 - Google Cloud
 - Lambda Lab
+- Nebius
 - NVIDIA
 - Replicate
 - Roblox
@ -136,5 +131,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

 * For technical questions and feature requests, please use Github issues or discussions.
 * For discussing with fellow users, please use Discord.
+* For coordinating contributions and development, please use Slack.
 * For security disclosures, please use Github's security advisory feature.
 * For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
+
+## Media Kit
+
+* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
--- a/SECURITY.md
+++ b/SECURITY.md
@ -2,11 +2,10 @@

 ## Reporting a Vulnerability

-If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. 
-We will investigate all legitimate reports and do our best to quickly fix the problem.
+If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.

-Please report security issues using https://github.com/vllm-project/vllm/security/advisories/new
+Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).

 ---
-Please see PyTorch Security for more information how to securely interact with models: https://github.com/pytorch/pytorch/blob/main/SECURITY.md
-This document mostly references the recommendation from PyTorch, thank you! 
+
+Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -6,3 +6,14 @@ You can download the dataset by running:
 ```bash
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 ```
+
+## Downloading the ShareGPT4V dataset
+
+The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts
+will ignore a datapoint if the referred image is missing.
+```bash
+wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
+mkdir coco -p
+wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
+unzip coco/train2017.zip -d coco/
+```
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -23,9 +23,10 @@ class RequestFuncInput:
    output_len: int
    model: str
    best_of: int = 1
-    use_beam_search: bool = False
    logprobs: Optional[int] = None
+    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict] = None
+    ignore_eos: bool = False


@dataclass
@ -36,6 +37,7 @@ class RequestFuncOutput:
    ttft: float = 0.0  # Time to first token
    itl: List[float] = field(
        default_factory=list)  # List of inter-token latencies
+    tpot: float = 0.0  # avg next-token latencies
    prompt_len: int = 0
    error: str = ""

@ -48,13 +50,14 @@ async def async_request_tgi(
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
        params = {
            "best_of": request_func_input.best_of,
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
+            "truncate": request_func_input.prompt_len,
+            # TGI does not accept ignore_eos flag.
        }
        payload = {
            "inputs": request_func_input.prompt,
@ -79,7 +82,7 @@ async def async_request_tgi(
                        # any data, we should skip it.
                        if chunk_bytes.startswith(":"):
                            continue
-                        chunk = remove_prefix(chunk_bytes, "data:")
+                        chunk = chunk_bytes.removeprefix("data:")

                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
@ -119,7 +122,6 @@ async def async_request_trt_llm(
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
        assert request_func_input.best_of == 1
        payload = {
            "accumulate_tokens": True,
@ -129,6 +131,8 @@ async def async_request_trt_llm(
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
+        if request_func_input.ignore_eos:
+            payload["min_length"] = request_func_input.output_len
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

@ -143,7 +147,7 @@ async def async_request_trt_llm(
                        if not chunk_bytes:
                            continue

-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data:")

                        data = json.loads(chunk)
@ -183,7 +187,6 @@ async def async_request_deepspeed_mii(
 ) -> RequestFuncOutput:
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert request_func_input.best_of == 1
-        assert not request_func_input.use_beam_search

        payload = {
            "prompt": request_func_input.prompt,
@ -231,7 +234,6 @@ async def async_request_openai_completions(
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
        payload = {
            "model": request_func_input.model,
            "prompt": request_func_input.prompt,
@ -240,7 +242,10 @@ async def async_request_openai_completions(
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
+            "ignore_eos": request_func_input.ignore_eos,
        }
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }
@ -256,12 +261,13 @@ async def async_request_openai_completions(
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
+                    first_chunk_received = False
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
@ -274,7 +280,8 @@ async def async_request_openai_completions(
                            if data["choices"][0]["text"]:
                                timestamp = time.perf_counter()
                                # First token
-                                if ttft == 0.0:
+                                if not first_chunk_received:
+                                    first_chunk_received = True
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft

@ -285,9 +292,14 @@ async def async_request_openai_completions(

                                most_recent_timestamp = timestamp
                                generated_text += data["choices"][0]["text"]
-
-                    output.generated_text = generated_text
+                    if first_chunk_received:
                        output.success = True
+                    else:
+                        output.success = False
+                        output.error = (
+                            "Never received a valid chunk to calculate TTFT."
+                            "This response will be marked as failed!")
+                    output.generated_text = generated_text
                    output.latency = latency
                else:
                    output.error = response.reason or ""
@ -312,7 +324,6 @@ async def async_request_openai_chat_completions(
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
        content = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
@ -325,9 +336,12 @@ async def async_request_openai_chat_completions(
                },
            ],
            "temperature": 0.0,
-            "max_tokens": request_func_input.output_len,
+            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
+            "ignore_eos": request_func_input.ignore_eos,
        }
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
@ -349,7 +363,7 @@ async def async_request_openai_chat_completions(
                        if not chunk_bytes:
                            continue

-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
@ -389,14 +403,6 @@ async def async_request_openai_chat_completions(
    return output


-# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
-    if text.startswith(prefix):
-        return text[len(prefix):]
-    return text
-
-
 def get_model(pretrained_model_name_or_path: str) -> str:
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download
@ -430,4 +436,5 @@ ASYNC_REQUEST_FUNCS = {
    "openai-chat": async_request_openai_chat_completions,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
+    "sglang": async_request_openai_completions,
 }
--- a/benchmarks/benchmark_guided.py
+++ b/benchmarks/benchmark_guided.py
@ -0,0 +1,494 @@
+"""Benchmark guided decoding throughput."""
+import argparse
+import dataclasses
+import json
+import os
+import random
+import time
+from typing import List
+
+import datasets
+import pandas as pd
+import uvloop
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.entrypoints.openai.api_server import (
+    build_async_engine_client_from_engine_args)
+from vllm.sampling_params import GuidedDecodingParams
+from vllm.utils import FlexibleArgumentParser, merge_async_iterators
+
+
+@dataclasses.dataclass
+class SampleRequest:
+    """A class representing a single inference request for benchmarking.
+
+    Attributes:
+        prompt: The input text prompt for the model.
+        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
+            images).
+        prompt_len: The length of the prompt in tokens.
+        expected_output_len: The expected length of the output in tokens.
+    """
+    prompt: str
+    prompt_len: int
+    expected_output_len: int
+    schema: dict
+    structure_type: str = 'json'
+    completion: str = None
+
+
+def run_vllm(requests: List[SampleRequest],
+             engine_args: EngineArgs,
+             n: int,
+             guided_decoding_rate: float = 1.0,
+             warmup: bool = False) -> float:
+    from vllm import LLM, SamplingParams
+    llm = LLM(**vars(engine_args))
+
+    # Add the requests to the engine.
+    prompts: List[str] = []
+    sampling_params: List[SamplingParams] = []
+    # create a list containing random selected true or false
+    guided_decoding_req_idx = random.sample(
+        range(len(requests)), int(len(requests) * guided_decoding_rate))
+
+    if warmup:
+        print(">>>>> Running warmup prompt, for the first 5")
+        # We setup the first 5 requests to warmup FSM
+        # if using xgrammar dataset, we will skip warmup
+        warmup_requests = requests[:5]
+        for i, request in enumerate(warmup_requests):
+            prompts.append(request.prompt)
+            sampling_params.append(
+                SamplingParams(
+                    n=n,
+                    temperature=1.0,
+                    top_p=1.0,
+                    ignore_eos=True,
+                    max_tokens=request.expected_output_len,
+                    guided_decoding=GuidedDecodingParams(json=request.schema)
+                    if guided_decoding_rate > 0 else None,
+                ))
+        llm.generate(prompts, sampling_params, use_tqdm=False)
+
+    print(">>>>> Benchmark started...")
+    prompts = []
+    sampling_params = []
+    for i, request in enumerate(requests):
+        prompts.append(request.prompt)
+        sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=1.0,
+                top_p=1.0,
+                ignore_eos=True,
+                max_tokens=request.expected_output_len,
+                guided_decoding=GuidedDecodingParams(
+                    **{request.structure_type: request.schema})
+                if i in guided_decoding_req_idx else None,
+            ))
+
+    start = time.perf_counter()
+    outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
+    ret = []
+    for output, request in zip(outputs, requests):
+        generated_text = output.outputs[0].text
+        ret.append({
+            "generated": generated_text,
+            "expected": request.completion
+        })
+    end = time.perf_counter()
+    return end - start, ret
+
+
+async def run_vllm_async(
+        requests: List[SampleRequest],
+        engine_args: AsyncEngineArgs,
+        n: int,
+        guided_decoding_rate: float = 1.0,
+        warmup: bool = False,
+        disable_frontend_multiprocessing: bool = False) -> float:
+    from vllm import SamplingParams
+
+    async with build_async_engine_client_from_engine_args(
+            engine_args, disable_frontend_multiprocessing) as llm:
+
+        # Add the requests to the engine.
+        prompts: List[str] = []
+        sampling_params: List[SamplingParams] = []
+        guided_decoding_req_idx = random.sample(
+            range(len(requests)), int(len(requests) * guided_decoding_rate))
+
+        if warmup:
+            print(">>>>>> Running warmup prompt, for the first 5")
+            # We setup the first 5 requests to warmup FSM
+            # if using xgrammar dataset, we will skip warmup
+            warmup_requests = requests[:5]
+            for i, request in enumerate(warmup_requests):
+                prompts.append(request.prompt)
+                sampling_params.append(
+                    SamplingParams(
+                        n=n,
+                        temperature=1.0,
+                        top_p=1.0,
+                        ignore_eos=True,
+                        max_tokens=request.expected_output_len,
+                        guided_decoding=GuidedDecodingParams(
+                            json=request.schema)
+                        if guided_decoding_rate > 0 else None,
+                    ))
+            generators = []
+            for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
+                generator = llm.generate(prompt, sp, request_id=f"test{i}")
+                generators.append(generator)
+            all_gens = merge_async_iterators(*generators)
+            async for i, res in all_gens:
+                pass
+
+        print(">>>>> Benchmark started...")
+        prompts = []
+        sampling_params = []
+        for i, request in enumerate(requests):
+            prompts.append(request.prompt)
+            sampling_params.append(
+                SamplingParams(
+                    n=n,
+                    temperature=1.0,
+                    top_p=1.0,
+                    ignore_eos=True,
+                    max_tokens=request.expected_output_len,
+                    guided_decoding=GuidedDecodingParams(json=request.schema)
+                    if i in guided_decoding_req_idx else None,
+                ))
+
+        generators = []
+        start_time = []
+        latencies = []
+        start = time.perf_counter()
+        for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
+            generator = llm.generate(prompt, sp, request_id=f"test{i}")
+            generators.append(generator)
+            start_time.append(time.perf_counter())
+            latencies.append([])
+        all_gens = merge_async_iterators(*generators)
+        generated_texts = [''] * len(requests)
+        async for i, res in all_gens:
+            generated_texts[i] = res.outputs[0].text
+            lat = time.perf_counter() - start_time[i]
+            latencies[i].append(lat)
+        ret = [{
+            'generated': gt,
+            'expected': req.completion
+        } for gt, req in zip(generated_texts, requests)]
+        end = time.perf_counter()
+        first_latency = pd.Series([lat[0] * 1000 for lat in latencies])
+        next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000
+                                  for lat in latencies])
+        return end - start, ret, (first_latency, next_latency)
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+                    args: argparse.Namespace) -> List[SampleRequest]:
+    if args.dataset == 'json':
+        if args.json_schema_path is None:
+            dir_path = os.path.dirname(os.path.realpath(__file__))
+            args.json_schema_path = os.path.join(dir_path,
+                                                 "structured_schemas",
+                                                 "structured_schema_1.json")
+        with open(args.json_schema_path) as f:
+            schema = json.load(f)
+        prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}"  # noqa: E501
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=schema,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "grammar":
+        schema = """
+            ?start: select_statement
+
+            ?select_statement: "SELECT " column_list " FROM " table_name
+
+            ?column_list: column_name ("," column_name)*
+
+            ?table_name: identifier
+
+            ?column_name: identifier
+
+            ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+        """
+        prompt = "Generate an SQL query to show the 'username' \
+            and 'email' from the 'users' table."
+
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=schema,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "regex":
+        regex = r"\w+@\w+\.com\n"
+        args.regex = regex
+        prompt = "Generate an email address for Alan Turing, \
+            who works in Enigma. End in .com and new line. \
+                Example result: alan.turing@enigma.com\n"
+
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=regex,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "choice":
+        choice = ["Positive", "Negative"]
+        args.choice = choice
+        prompt = "Classify this sentiment: vLLM is wonderful!"
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=choice,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "xgrammar_bench":
+        args.warmup = False
+        requests: List[SampleRequest] = []
+        dataset = datasets.load_dataset("NousResearch/json-mode-eval",
+                                        split="train")
+        print(f"dataset has {len(dataset)} entries")
+        len_dataset = len(dataset)
+        for data_point_idx in range(args.num_prompts):
+            idx = data_point_idx
+            while idx >= len_dataset:
+                idx -= len_dataset
+            schema = dataset["schema"][idx]
+            prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
+                                                   tokenize=False)
+            input_len = len(tokenizer(prompt).input_ids)
+            completion = dataset["completion"][idx]
+
+            requests.append(
+                SampleRequest(prompt=prompt,
+                              prompt_len=input_len,
+                              expected_output_len=args.output_len,
+                              schema=schema,
+                              completion=completion))
+
+    return requests
+
+
+def evaluate(ret, args):
+
+    def _eval_correctness_json(expected, actual):
+        # extract json string from string using regex
+        import re
+        actual = actual.replace('\n', '').replace(' ', '').strip()
+        try:
+            actual = re.search(r'\{.*\}', actual).group()
+            actual = json.loads(actual)
+        except Exception:
+            return False
+
+        return True
+
+    def _eval_correctness_choice(expected, actual):
+        return actual in args.choice
+
+    def _eval_correctness_regex(expected, actual):
+        import re
+        return re.match(args.regex, actual) is not None
+
+    def _eval_correctness(expected, actual):
+        if args.structure_type == 'json':
+            return _eval_correctness_json(expected, actual)
+        elif args.structure_type == 'regex':
+            return _eval_correctness_regex(expected, actual)
+        elif args.structure_type == 'choice':
+            return _eval_correctness_choice(expected, actual)
+        else:
+            return None
+
+    scores = []
+    for res in ret:
+        score = _eval_correctness(res['expected'], res['generated'])
+        res['correctness'] = score
+        scores.append(score)
+
+    not_none_scores = [score for score in scores if score is not None]
+
+    return (sum(not_none_scores) / len(not_none_scores) *
+            100) if len(not_none_scores) > 0 else None
+
+
+def main(args: argparse.Namespace):
+    print(args)
+    random.seed(args.seed)
+
+    # async engine is working for 'regex', 'choice' and 'grammar'
+    if args.dataset == 'grammar':
+        args.structure_type = 'grammar'
+        args.async_engine = False
+    elif args.dataset == 'regex':
+        args.structure_type = 'regex'
+        args.async_engine = False
+    elif args.dataset == 'choice':
+        args.structure_type = 'choice'
+        args.async_engine = False
+    else:
+        args.structure_type = 'json'
+
+    if args.no_guided_decoding:
+        args.guided_decoding_ratio = 0
+    if args.save_results:
+        result_file_name = f'{args.guided_decoding_ratio}guided'
+        result_file_name += f"_{args.model.split('/')[-1]}"
+        result_file_name += f"_{args.dataset}"
+        result_file_name += f"_{args.num_prompts}"
+        result_file_name += f"_out{args.output_len}"
+        result_file_name += f"_async{args.async_engine}"
+        result_file_name += f"_warmup{args.warmup}"
+        result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}"
+        result_file_name += ".txt"
+    else:
+        result_file_name = None
+
+    # Synthesize a prompt with the given input length.
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer, trust_remote_code=args.trust_remote_code)
+    requests = sample_requests(tokenizer, args)
+
+    if args.async_engine:
+        engine_args = AsyncEngineArgs.from_cli_args(args)
+        elapsed_time, ret, (first_latency, next_latency) = uvloop.run(
+            run_vllm_async(requests, engine_args, args.n,
+                           args.guided_decoding_ratio, args.warmup,
+                           args.disable_frontend_multiprocessing))
+    else:
+        engine_args = EngineArgs.from_cli_args(args)
+        elapsed_time, ret = run_vllm(requests, engine_args, args.n,
+                                     args.guided_decoding_ratio, args.warmup)
+        first_latency, next_latency = None, None
+
+    score = evaluate(ret, args)
+    total_num_tokens = sum(request.prompt_len + request.expected_output_len
+                           for request in requests)
+    total_output_tokens = sum(request.expected_output_len
+                              for request in requests)
+    if first_latency is not None:
+        latency_breakdown = "\nFirst token latency(msecs):\n"
+        latency_breakdown += f"{first_latency.describe()}"
+        latency_breakdown += "\nNext token latency(msecs):\n"
+        latency_breakdown += f"{next_latency.describe()}"
+    print(
+        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
+        f"{total_output_tokens / elapsed_time:.2f} output tokens/s",
+        f"Correct rate is {score} %",
+        f"{latency_breakdown if first_latency is not None else ''}")
+
+    # Output JSON results if specified
+    if args.output_json or result_file_name:
+        results = {
+            "elapsed_time": elapsed_time,
+            "num_requests": len(requests),
+            "total_num_tokens": total_num_tokens,
+            "total_output_tokens": total_output_tokens,
+            "requests_per_second": len(requests) / elapsed_time,
+            "tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}",
+            "output_tokens_per_second":
+            f"{total_output_tokens / elapsed_time:.2f}",
+            "correct_rate(%)": score
+        }
+        results = {"outputs": ret, **results}
+        if first_latency is not None:
+            results["first_token_latency(msecs)"] = first_latency.describe(
+            ).to_dict()
+            results["next_token_latency(msecs)"] = next_latency.describe(
+            ).to_dict()
+        if args.output_json:
+            with open(args.output_json, "w") as f:
+                json.dump(results, f, indent=4)
+        elif result_file_name:
+            with open(result_file_name, "w") as f:
+                json.dump(results, f, indent=4)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(description="Benchmark guided decoding.")
+    parser = AsyncEngineArgs.add_cli_args(parser)
+
+    parser.add_argument("--output-len",
+                        type=int,
+                        default=512,
+                        help="Output length for each request. Overrides the "
+                        "output length from the dataset.")
+    parser.add_argument(
+        "--dataset",
+        default='json',
+        choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
+    parser.add_argument("--json_schema_path",
+                        type=str,
+                        default=None,
+                        help="Path to json schema.")
+    parser.add_argument("--n",
+                        type=int,
+                        default=1,
+                        help="Number of generated sequences per prompt.")
+    parser.add_argument("--num-prompts",
+                        type=int,
+                        default=10,
+                        help="Number of prompts to process.")
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the throughput results in JSON format.')
+    parser.add_argument("--async-engine",
+                        action='store_true',
+                        default=False,
+                        help="Use vLLM async engine rather than LLM class.")
+    parser.add_argument("--no-guided-decoding",
+                        action='store_true',
+                        default=False,
+                        help="Whether to disable JSON decoding or not.")
+    parser.add_argument("--guided-decoding-ratio",
+                        type=float,
+                        default=1.0,
+                        help="Ratio of Guided Decoding requests")
+    parser.add_argument("--disable-frontend-multiprocessing",
+                        action='store_true',
+                        default=False,
+                        help="Disable decoupled async engine frontend.")
+    parser.add_argument("--warmup",
+                        action="store_true",
+                        default=False,
+                        help="Run warmup prompts before benchmark.")
+    parser.add_argument("--save-results",
+                        action="store_true",
+                        default=False,
+                        help="save output results.")
+    args = parser.parse_args()
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+    main(args)
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@ -1,5 +1,6 @@
 """Benchmark the latency of processing a single batch of requests."""
 import argparse
+import dataclasses
 import json
 import time
 from pathlib import Path
@ -10,50 +11,24 @@ import torch
 from tqdm import tqdm

 from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
-from vllm.inputs import PromptInputs
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.engine.arg_utils import EngineArgs
+from vllm.inputs import PromptType
 from vllm.utils import FlexibleArgumentParser


 def main(args: argparse.Namespace):
    print(args)

+    engine_args = EngineArgs.from_cli_args(args)
+
    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
-    llm = LLM(
-        model=args.model,
-        speculative_model=args.speculative_model,
-        num_speculative_tokens=args.num_speculative_tokens,
-        speculative_draft_tensor_parallel_size=\
-            args.speculative_draft_tensor_parallel_size,
-        tokenizer=args.tokenizer,
-        quantization=args.quantization,
-        tensor_parallel_size=args.tensor_parallel_size,
-        trust_remote_code=args.trust_remote_code,
-        dtype=args.dtype,
-        max_model_len=args.max_model_len,
-        enforce_eager=args.enforce_eager,
-        kv_cache_dtype=args.kv_cache_dtype,
-        quantization_param_path=args.quantization_param_path,
-        device=args.device,
-        ray_workers_use_nsight=args.ray_workers_use_nsight,
-        use_v2_block_manager=args.use_v2_block_manager,
-        enable_chunked_prefill=args.enable_chunked_prefill,
-        download_dir=args.download_dir,
-        block_size=args.block_size,
-        gpu_memory_utilization=args.gpu_memory_utilization,
-        load_format=args.load_format,
-        distributed_executor_backend=args.distributed_executor_backend,
-        otlp_traces_endpoint=args.otlp_traces_endpoint,
-        enable_prefix_caching=args.enable_prefix_caching,
-    )
+    llm = LLM(**dataclasses.asdict(engine_args))

    sampling_params = SamplingParams(
        n=args.n,
-        temperature=0.0 if args.use_beam_search else 1.0,
+        temperature=1.0,
        top_p=1.0,
-        use_beam_search=args.use_beam_search,
        ignore_eos=True,
        max_tokens=args.output_len,
    )
@ -61,7 +36,7 @@ def main(args: argparse.Namespace):
    dummy_prompt_token_ids = np.random.randint(10000,
                                               size=(args.batch_size,
                                                     args.input_len))
-    dummy_inputs: List[PromptInputs] = [{
+    dummy_prompts: List[PromptType] = [{
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]

@ -74,13 +49,13 @@ def main(args: argparse.Namespace):
                    ],
                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
                        str(profile_dir))) as p:
-                llm.generate(dummy_inputs,
+                llm.generate(dummy_prompts,
                             sampling_params=sampling_params,
                             use_tqdm=False)
            print(p.key_averages())
        else:
            start_time = time.perf_counter()
-            llm.generate(dummy_inputs,
+            llm.generate(dummy_prompts,
                         sampling_params=sampling_params,
                         use_tqdm=False)
            end_time = time.perf_counter()
@ -127,19 +102,6 @@ if __name__ == '__main__':
    parser = FlexibleArgumentParser(
        description='Benchmark the latency of processing a single batch of '
        'requests till completion.')
-    parser.add_argument('--model', type=str, default='facebook/opt-125m')
-    parser.add_argument('--speculative-model', type=str, default=None)
-    parser.add_argument('--num-speculative-tokens', type=int, default=None)
-    parser.add_argument('--speculative-draft-tensor-parallel-size',
-                        '-spec-draft-tp',
-                        type=int,
-                        default=None)
-    parser.add_argument('--tokenizer', type=str, default=None)
-    parser.add_argument('--quantization',
-                        '-q',
-                        choices=[*QUANTIZATION_METHODS, None],
-                        default=None)
-    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    parser.add_argument('--input-len', type=int, default=32)
    parser.add_argument('--output-len', type=int, default=128)
    parser.add_argument('--batch-size', type=int, default=8)
@ -156,45 +118,6 @@ if __name__ == '__main__':
                        type=int,
                        default=30,
                        help='Number of iterations to run.')
-    parser.add_argument('--trust-remote-code',
-                        action='store_true',
-                        help='trust remote code from huggingface')
-    parser.add_argument(
-        '--max-model-len',
-        type=int,
-        default=None,
-        help='Maximum length of a sequence (including prompt and output). '
-        'If None, will be derived from the model.')
-    parser.add_argument(
-        '--dtype',
-        type=str,
-        default='auto',
-        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
-        help='data type for model weights and activations. '
-        'The "auto" option will use FP16 precision '
-        'for FP32 and FP16 models, and BF16 precision '
-        'for BF16 models.')
-    parser.add_argument('--enforce-eager',
-                        action='store_true',
-                        help='enforce eager mode and disable CUDA graph')
-    parser.add_argument(
-        '--kv-cache-dtype',
-        type=str,
-        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
-        default="auto",
-        help='Data type for kv cache storage. If "auto", will use model '
-        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
-        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
-    parser.add_argument(
-        '--quantization-param-path',
-        type=str,
-        default=None,
-        help='Path to the JSON file containing the KV cache scaling factors. '
-        'This should generally be supplied, when KV cache dtype is FP8. '
-        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
-        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
-        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
-        'instead supported for common inference criteria.')
    parser.add_argument(
        '--profile',
        action='store_true',
@ -205,79 +128,12 @@ if __name__ == '__main__':
        default=None,
        help=('path to save the pytorch profiler output. Can be visualized '
              'with ui.perfetto.dev or Tensorboard.'))
-    parser.add_argument("--device",
-                        type=str,
-                        default="auto",
-                        choices=DEVICE_OPTIONS,
-                        help='device type for vLLM execution')
-    parser.add_argument('--block-size',
-                        type=int,
-                        default=16,
-                        help='block size of key/value cache')
-    parser.add_argument(
-        '--enable-chunked-prefill',
-        action='store_true',
-        help='If True, the prefill requests can be chunked based on the '
-        'max_num_batched_tokens')
-    parser.add_argument("--enable-prefix-caching",
-                        action='store_true',
-                        help="Enable automatic prefix caching")
-    parser.add_argument('--use-v2-block-manager', action='store_true')
-    parser.add_argument(
-        "--ray-workers-use-nsight",
-        action='store_true',
-        help="If specified, use nsight to profile ray workers",
-    )
-    parser.add_argument('--download-dir',
-                        type=str,
-                        default=None,
-                        help='directory to download and load the weights, '
-                        'default to the default cache dir of huggingface')
    parser.add_argument(
        '--output-json',
        type=str,
        default=None,
        help='Path to save the latency results in JSON format.')
-    parser.add_argument('--gpu-memory-utilization',
-                        type=float,
-                        default=0.9,
-                        help='the fraction of GPU memory to be used for '
-                        'the model executor, which can range from 0 to 1.'
-                        'If unspecified, will use the default value of 0.9.')
-    parser.add_argument(
-        '--load-format',
-        type=str,
-        default=EngineArgs.load_format,
-        choices=[
-            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
-            'bitsandbytes'
-        ],
-        help='The format of the model weights to load.\n\n'
-        '* "auto" will try to load the weights in the safetensors format '
-        'and fall back to the pytorch bin format if safetensors format '
-        'is not available.\n'
-        '* "pt" will load the weights in the pytorch bin format.\n'
-        '* "safetensors" will load the weights in the safetensors format.\n'
-        '* "npcache" will load the weights in pytorch format and store '
-        'a numpy cache to speed up the loading.\n'
-        '* "dummy" will initialize the weights with random values, '
-        'which is mainly for profiling.\n'
-        '* "tensorizer" will load the weights using tensorizer from '
-        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
-        'section for more information.\n'
-        '* "bitsandbytes" will load the weights using bitsandbytes '
-        'quantization.\n')
-    parser.add_argument(
-        '--distributed-executor-backend',
-        choices=['ray', 'mp'],
-        default=None,
-        help='Backend to use for distributed serving. When more than 1 GPU '
-        'is used, will be automatically set to "ray" if installed '
-        'or "mp" (multiprocessing) otherwise.')
-    parser.add_argument(
-        '--otlp-traces-endpoint',
-        type=str,
-        default=None,
-        help='Target URL to which OpenTelemetry traces will be sent.')
+
+    parser = EngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@ -25,6 +25,7 @@ ShareGPT example usage:
        --input-length-range 128:256
 """

+import dataclasses
 import json
 import random
 import time
@ -33,6 +34,7 @@ from typing import List, Optional, Tuple
 from transformers import PreTrainedTokenizerBase

 from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
 from vllm.utils import FlexibleArgumentParser

 try:
@ -52,13 +54,30 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
    print(f"cost time {end_time - start_time}")


-def sample_requests(
+@dataclasses.dataclass
+class Request:
+    prompt: str
+    prompt_len: int
+    output_len: int
+
+
+def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str:
+    vocab = tokenizer.get_vocab()
+    # Remove the special tokens.
+    vocab = {
+        k: v
+        for k, v in vocab.items() if k not in tokenizer.all_special_ids
+    }
+    return random.choices(list(vocab.values()), k=length)
+
+
+def sample_requests_from_dataset(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    input_length_range: Tuple[int, int],
    fixed_output_len: Optional[int],
-) -> List[Tuple[str, int, int]]:
+) -> List[Request]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")

@ -75,31 +94,55 @@ def sample_requests(
    random.shuffle(dataset)

    min_len, max_len = input_length_range
+    assert min_len >= 0 and max_len >= min_len, "input_length_range too small"

    # Filter out sequences that are too long or too short
-    filtered_dataset: List[Tuple[str, int, int]] = []
+    filtered_requests: List[Request] = []
+
    for i in range(len(dataset)):
-        if len(filtered_dataset) == num_requests:
+        if len(filtered_requests) == num_requests:
            break

        # Tokenize the prompts and completions.
-        prompt = dataset[i][0]
-        prompt_token_ids = tokenizer(prompt).input_ids
+        prompt_token_ids = tokenizer(dataset[i][0]).input_ids
+        prompt = tokenizer.decode(prompt_token_ids)
        completion = dataset[i][1]
        completion_token_ids = tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
-        output_len = len(completion_token_ids
-                         ) if fixed_output_len is None else fixed_output_len
-        if prompt_len < 4 or output_len < 4:
-            # Prune too short sequences.
-            continue
+        output_len = (len(completion_token_ids)
+                      if fixed_output_len is None else fixed_output_len)
        if min_len <= prompt_len <= max_len:
-            filtered_dataset.append((prompt, prompt_len, output_len))
+            filtered_requests.append(Request(prompt, prompt_len, output_len))

-    return filtered_dataset
+    return filtered_requests


-def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
+def sample_requests_from_random(
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    input_length_range: Tuple[int, int],
+    fixed_output_len: Optional[int],
+    prefix_len: int,
+) -> List[Request]:
+
+    requests = []
+    prefix_token_ids = sample_tokens(tokenizer, prefix_len)
+    min_len, max_len = input_length_range
+
+    for i in range(num_requests):
+        unique_part_token_ids = sample_tokens(
+            tokenizer,
+            random.randint(min_len - prefix_len, max_len - prefix_len))
+        prompt_token_ids = prefix_token_ids + unique_part_token_ids
+        prompt = tokenizer.decode(prompt_token_ids)
+        prompt_len = len(prompt_token_ids)
+        assert (min_len <= prompt_len <= max_len
+                ), f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
+        requests.append(Request(prompt, prompt_len, fixed_output_len))
+    return requests
+
+
+def repeat_and_sort_requests(requests: List[Request],
                             repeat_count: int,
                             sort: bool = False) -> List[str]:
    repeated_requests = requests * repeat_count
@ -107,17 +150,20 @@ def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
        repeated_requests.sort(key=lambda x: x[1])
    else:
        random.shuffle(repeated_requests)
-    return [req[0] for req in repeated_requests]
+    return [req.prompt for req in repeated_requests]


 def main(args):
    tokenizer = get_tokenizer(args.model, trust_remote_code=True)
    input_length_range = tuple(map(int, args.input_length_range.split(':')))
-
+    random.seed(args.seed)
    if args.dataset_path is not None:
+        if args.prefix_len > 0:
+            raise ValueError("prefix-len is not supported when "
+                             "dataset-path is provided.")
        print(f"Start to sample {args.num_prompts} prompts "
-              "from {args.dataset_path}")
-        filtered_datasets = sample_requests(
+              f"from {args.dataset_path}")
+        filtered_requests = sample_requests_from_dataset(
            dataset_path=args.dataset_path,
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
@ -125,32 +171,34 @@ def main(args):
            fixed_output_len=args.output_len,
        )
    else:
-        prompt_len = len(tokenizer(PROMPT).input_ids)
-        filtered_datasets = [(PROMPT, prompt_len, args.output_len)
-                             ] * args.num_prompts
+        print(f"Start to sample {args.num_prompts} prompts from random")
+        filtered_requests = sample_requests_from_random(
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            input_length_range=input_length_range,
+            fixed_output_len=args.output_len,
+            prefix_len=args.prefix_len,
+        )

-    llm = LLM(model=args.model,
-              tokenizer_mode='auto',
-              trust_remote_code=True,
-              enforce_eager=True,
-              use_v2_block_manager=args.use_v2_block_manager,
-              tensor_parallel_size=args.tensor_parallel_size,
-              enable_prefix_caching=args.enable_prefix_caching)
+    # Print some helpful stats of the requests.
+    print(f"Sampled {len(filtered_requests)} requests.")
+    prompt_lens = [req.prompt_len for req in filtered_requests]
+    print(f"Average input length: {sum(prompt_lens) / len(prompt_lens)}")
+    print(f"P50 input length: {sorted(prompt_lens)[len(prompt_lens) // 2]}")
+    print(f"Min Prompt Length: {min(prompt_lens)}")
+    print(f"Max Prompt Length: {max(prompt_lens)}")
+
+    engine_args = EngineArgs.from_cli_args(args)
+
+    llm = LLM(**dataclasses.asdict(engine_args))

    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)

-    print("Testing filtered datasets")
-    prompts = repeat_and_sort_requests(filtered_datasets,
+    print("Testing filtered requests")
+    prompts = repeat_and_sort_requests(filtered_requests,
                                       repeat_count=args.repeat_count,
                                       sort=args.sort)

-    print("------warm up------")
-    test_prefix(
-        llm=llm,
-        prompts=prompts,
-        sampling_params=sampling_params,
-    )
-
    print("------start generating------")
    test_prefix(
        llm=llm,
@ -163,36 +211,37 @@ if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description=
        'Benchmark the performance with or without automatic prefix caching.')
-    parser.add_argument('--model',
-                        type=str,
-                        default='baichuan-inc/Baichuan2-13B-Chat')
    parser.add_argument("--dataset-path",
                        type=str,
                        default=None,
                        help="Path to the dataset.")
-    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    parser.add_argument('--output-len', type=int, default=10)
-    parser.add_argument('--enable-prefix-caching',
-                        action='store_true',
-                        help='enable prefix caching')
-    parser.add_argument('--use-v2-block-manager',
-                        action='store_true',
-                        help='Use BlockSpaceMangerV2')
    parser.add_argument('--num-prompts',
                        type=int,
-                        default=1,
+                        required=True,
                        help="Number of the prompts sampled from dataset")
    parser.add_argument('--repeat-count',
                        type=int,
-                        default=100,
+                        default=1,
                        help='Number of times to repeat each prompt')
    parser.add_argument('--sort',
                        action='store_true',
                        help='Sort prompts by input length')
    parser.add_argument('--input-length-range',
                        type=str,
-                        default='128:256',
+                        required=True,
                        help='Range of input lengths for sampling prompts,'
                        'specified as "min:max" (e.g., "128:256").')
+    parser.add_argument(
+        "--prefix-len",
+        type=int,
+        default=0,
+        help="Specifies the length of a common prefix to be "
+        "added to the input prompt. The input-length-range will "
+        "subtract this length when filtering prompts. Only used "
+        "when dataset-path is not provided.",
+    )
+
+    parser = EngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@ -1,5 +1,6 @@
 """Benchmark offline prioritization."""
 import argparse
+import dataclasses
 import json
 import random
 import time
@ -7,7 +8,8 @@ from typing import List, Optional, Tuple

 from transformers import AutoTokenizer, PreTrainedTokenizerBase

-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser


 def sample_requests(
@ -62,47 +64,11 @@ def sample_requests(

 def run_vllm(
    requests: List[Tuple[str, int, int]],
-    model: str,
-    tokenizer: str,
-    quantization: Optional[str],
-    tensor_parallel_size: int,
-    seed: int,
    n: int,
-    use_beam_search: bool,
-    trust_remote_code: bool,
-    dtype: str,
-    max_model_len: Optional[int],
-    enforce_eager: bool,
-    kv_cache_dtype: str,
-    quantization_param_path: Optional[str],
-    device: str,
-    enable_prefix_caching: bool,
-    enable_chunked_prefill: bool,
-    max_num_batched_tokens: int,
-    gpu_memory_utilization: float = 0.9,
-    download_dir: Optional[str] = None,
+    engine_args: EngineArgs,
 ) -> float:
    from vllm import LLM, SamplingParams
-    llm = LLM(
-        model=model,
-        tokenizer=tokenizer,
-        quantization=quantization,
-        tensor_parallel_size=tensor_parallel_size,
-        seed=seed,
-        trust_remote_code=trust_remote_code,
-        dtype=dtype,
-        max_model_len=max_model_len,
-        gpu_memory_utilization=gpu_memory_utilization,
-        enforce_eager=enforce_eager,
-        kv_cache_dtype=kv_cache_dtype,
-        quantization_param_path=quantization_param_path,
-        device=device,
-        enable_prefix_caching=enable_prefix_caching,
-        download_dir=download_dir,
-        enable_chunked_prefill=enable_chunked_prefill,
-        max_num_batched_tokens=max_num_batched_tokens,
-        disable_log_stats=False,
-    )
+    llm = LLM(**dataclasses.asdict(engine_args))

    # Add the requests to the engine.
    prompts = []
@ -114,9 +80,8 @@ def run_vllm(
        sampling_params.append(
            SamplingParams(
                n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                top_p=1.0,
-                use_beam_search=use_beam_search,
                ignore_eos=True,
                max_tokens=output_len,
            ))
@ -144,15 +109,8 @@ def main(args: argparse.Namespace):
                                   args.output_len)

    if args.backend == "vllm":
-        elapsed_time = run_vllm(
-            requests, args.model, args.tokenizer, args.quantization,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
-            args.trust_remote_code, args.dtype, args.max_model_len,
-            args.enforce_eager, args.kv_cache_dtype,
-            args.quantization_param_path, args.device,
-            args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.gpu_memory_utilization,
-            args.download_dir)
+        elapsed_time = run_vllm(requests, args.n,
+                                EngineArgs.from_cli_args(args))
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
    total_num_tokens = sum(prompt_len + output_len
@ -174,7 +132,7 @@ def main(args: argparse.Namespace):


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Benchmark the throughput.")
+    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument("--backend",
                        type=str,
                        choices=["vllm", "hf", "mii"],
@ -192,97 +150,21 @@ if __name__ == "__main__":
                        default=None,
                        help="Output length for each request. Overrides the "
                        "output length from the dataset.")
-    parser.add_argument("--model", type=str, default="facebook/opt-125m")
-    parser.add_argument("--tokenizer", type=str, default=None)
-    parser.add_argument('--quantization',
-                        '-q',
-                        choices=[*QUANTIZATION_METHODS, None],
-                        default=None)
-    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
    parser.add_argument("--n",
                        type=int,
                        default=1,
                        help="Number of generated sequences per prompt.")
-    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument("--num-prompts",
                        type=int,
                        default=200,
                        help="Number of prompts to process.")
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument('--trust-remote-code',
-                        action='store_true',
-                        help='trust remote code from huggingface')
-    parser.add_argument(
-        '--max-model-len',
-        type=int,
-        default=None,
-        help='Maximum length of a sequence (including prompt and output). '
-        'If None, will be derived from the model.')
-    parser.add_argument(
-        '--dtype',
-        type=str,
-        default='auto',
-        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
-        help='data type for model weights and activations. '
-        'The "auto" option will use FP16 precision '
-        'for FP32 and FP16 models, and BF16 precision '
-        'for BF16 models.')
-    parser.add_argument('--gpu-memory-utilization',
-                        type=float,
-                        default=0.9,
-                        help='the fraction of GPU memory to be used for '
-                        'the model executor, which can range from 0 to 1.'
-                        'If unspecified, will use the default value of 0.9.')
-    parser.add_argument("--enforce-eager",
-                        action="store_true",
-                        help="enforce eager execution")
-    parser.add_argument(
-        '--kv-cache-dtype',
-        type=str,
-        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
-        default="auto",
-        help='Data type for kv cache storage. If "auto", will use model '
-        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
-        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
-    parser.add_argument(
-        '--quantization-param-path',
-        type=str,
-        default=None,
-        help='Path to the JSON file containing the KV cache scaling factors. '
-        'This should generally be supplied, when KV cache dtype is FP8. '
-        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
-        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
-        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
-        'instead supported for common inference criteria.')
-    parser.add_argument(
-        "--device",
-        type=str,
-        default="cuda",
-        choices=["cuda", "cpu"],
-        help='device type for vLLM execution, supporting CUDA and CPU.')
-    parser.add_argument(
-        "--enable-prefix-caching",
-        action='store_true',
-        help="enable automatic prefix caching for vLLM backend.")
-    parser.add_argument("--enable-chunked-prefill",
-                        action='store_true',
-                        help="enable chunked prefill for vLLM backend.")
-    parser.add_argument('--max-num-batched-tokens',
-                        type=int,
-                        default=None,
-                        help='maximum number of batched tokens per '
-                        'iteration')
-    parser.add_argument('--download-dir',
-                        type=str,
-                        default=None,
-                        help='directory to download and load the weights, '
-                        'default to the default cache dir of huggingface')
    parser.add_argument(
        '--output-json',
        type=str,
        default=None,
        help='Path to save the throughput results in JSON format.')

+    parser = EngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -1,4 +1,4 @@
-"""Benchmark online serving throughput.
+r"""Benchmark online serving throughput.

 On the server side, run one of the following commands:
    vLLM OpenAI API server
@ -53,6 +53,8 @@ try:
 except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser

+MILLISECONDS_TO_SECONDS_CONVERSION = 1000
+

@dataclass
 class BenchmarkMetrics:
@ -60,6 +62,7 @@ class BenchmarkMetrics:
    total_input: int
    total_output: int
    request_throughput: float
+    request_goodput: float
    output_throughput: float
    total_token_throughput: float
    mean_ttft_ms: float
@ -89,10 +92,8 @@ def sample_sharegpt_requests(
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, int, int, None]]:
-    if fixed_output_len is not None and fixed_output_len < 4:
-        raise ValueError("output_len too small")
    # Load the dataset.
-    with open(dataset_path) as f:
+    with open(dataset_path, encoding='utf-8') as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
@ -117,7 +118,7 @@ def sample_sharegpt_requests(
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
-        if prompt_len < 4 or output_len < 4:
+        if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
@ -141,7 +142,7 @@ def sample_sonnet_requests(
    ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."

    # Load the dataset.
-    with open(dataset_path) as f:
+    with open(dataset_path, encoding='utf-8') as f:
        poem_lines = f.readlines()

    # Tokenize the poem lines.
@ -178,9 +179,9 @@ def sample_sonnet_requests(
    # Sample the rest of lines per request.
    sampled_requests: List[Tuple[str, int, int]] = []
    for _ in range(num_requests):
-        sampled_lines = "".join(
-            prefix_lines +
-            random.sample(poem_lines, num_input_lines - num_prefix_lines))
+        num_lines_needed = num_input_lines - num_prefix_lines
+        sampled_lines = "".join(prefix_lines +
+                                random.choices(poem_lines, k=num_lines_needed))

        prompt = f"{base_prompt}{sampled_lines}"
        message = [
@ -198,22 +199,88 @@ def sample_sonnet_requests(
    return sampled_requests


+def sample_mmmu_pro_vision_requests(
+    dataset,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
+) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
+    sampled_requests: List[Tuple[str, int, int, Dict[str,
+                                                     Collection[str]]]] = []
+    for data in dataset:
+        if len(sampled_requests) == num_requests:
+            break
+
+        # MMMU-Pro vision direct prompt
+        # Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5
+        prompt = (
+            "Answer with the option letter from the given choices directly. "
+            "The last line of your response should be of the following "
+            "format: 'Answer: $LETTER' (without quotes) where LETTER is one of "
+            "options.")
+
+        prompt_token_ids = tokenizer(prompt).input_ids
+        if fixed_output_len is None:
+            # Default max output len is set to 128
+            print("--hf-output-len is not provided. Using default value 128.")
+            fixed_output_len = 128
+
+        prompt_len = len(prompt_token_ids)
+        output_len = fixed_output_len
+
+        assert isinstance(
+            data["image"],
+            Image), ("Input image format must be `PIL.Image.Image`, "
+                     f"given {type(data['image'])}.")
+        image: Image = data["image"]
+        image = image.convert("RGB")
+        image_data = io.BytesIO()
+        image.save(image_data, format='JPEG')
+        image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
+        mm_content = {
+            "type": "image_url",
+            "image_url": {
+                "url": f"data:image/jpeg;base64,{image_base64}"
+            },
+        }
+
+        sampled_requests.append((prompt, prompt_len, output_len, mm_content))
+
+    return sampled_requests
+
+
 def sample_hf_requests(
    dataset_path: str,
    dataset_subset: str,
    dataset_split: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
+    random_seed: int,
    fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
+
+    # Special case for MMMU-Pro vision dataset
+    if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision':
+        assert dataset_split == "test"
+        dataset = load_dataset(dataset_path,
+                               name=dataset_subset,
+                               split=dataset_split,
+                               streaming=True)
+        assert "image" in dataset.features, (
+            "MMMU/MMMU_Pro vision dataset must have 'image' column.")
+        filter_func = lambda x: isinstance(x["image"], Image)
+        dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
+        return sample_mmmu_pro_vision_requests(dataset, num_requests,
+                                               tokenizer, fixed_output_len)
+
    dataset = load_dataset(dataset_path,
                           name=dataset_subset,
                           split=dataset_split,
                           streaming=True)
    assert "conversations" in dataset.features, (
        "HF Dataset must have 'conversations' column.")
-    filtered_dataset = dataset.shuffle().filter(
-        lambda x: len(x["conversations"]) >= 2)
+    filter_func = lambda x: len(x["conversations"]) >= 2
+    filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
    sampled_requests: List[Tuple[str, int, int, Dict[str,
                                                     Collection[str]]]] = []
    for data in filtered_dataset:
@ -228,10 +295,11 @@ def sample_hf_requests(
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
-        if prompt_len < 4 or output_len < 4:
+        if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
            # Prune too short sequences.
            continue
-        if prompt_len > 1024 or prompt_len + output_len > 2048:
+        if fixed_output_len is None and \
+            (prompt_len > 1024 or prompt_len + output_len > 2048):
            # Prune too long sequences.
            continue

@ -248,6 +316,19 @@ def sample_hf_requests(
                    "url": f"data:image/jpeg;base64,{image_base64}"
                },
            }
+        elif "image" in data and isinstance(data["image"], str):
+            if (data["image"].startswith("http://") or \
+                data["image"].startswith("file://")):
+                image_url = data["image"]
+            else:
+                image_url = f"file://{data['image']}"
+
+            mm_content = {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                },
+            }
        else:
            mm_content = None

@ -294,8 +375,33 @@ def sample_random_requests(
 async def get_request(
    input_requests: List[Tuple[str, int, int]],
    request_rate: float,
+    burstiness: float = 1.0,
 ) -> AsyncGenerator[Tuple[str, int, int], None]:
+    """
+    Asynchronously generates requests at a specified rate 
+    with OPTIONAL burstiness.
+    
+    Args:
+        input_requests: 
+            A list of input requests, each represented as a tuple.
+        request_rate: 
+            The rate at which requests are generated (requests/s).
+        burstiness (optional): 
+            The burstiness factor of the request generation. 
+            Only takes effect when request_rate is not inf.
+            Default value is 1, which follows a Poisson process.
+            Otherwise, the request intervals follow a gamma distribution.
+            A lower burstiness value (0 < burstiness < 1) results 
+            in more bursty requests, while a higher burstiness value 
+            (burstiness > 1) results in a more uniform arrival of requests.
+    """
    input_requests = iter(input_requests)
+
+    # Calculate scale parameter theta to maintain the desired request_rate.
+    assert burstiness > 0, (
+        f"A positive burstiness factor is expected, but given {burstiness}.")
+    theta = 1.0 / (request_rate * burstiness)
+
    for request in input_requests:
        yield request

@ -303,8 +409,9 @@ async def get_request(
            # If the request rate is infinity, then we don't need to wait.
            continue

-        # Sample the request interval from the exponential distribution.
-        interval = np.random.exponential(1.0 / request_rate)
+        # Sample the request interval from the gamma distribution.
+        # If burstiness is 1, it follows exponential distribution.
+        interval = np.random.gamma(shape=burstiness, scale=theta)
        # The next request will be sent after the interval.
        await asyncio.sleep(interval)

@ -316,12 +423,15 @@ def calculate_metrics(
    tokenizer: PreTrainedTokenizerBase,
    selected_percentile_metrics: List[str],
    selected_percentiles: List[float],
+    gootput_config_dict: Dict[str, float],
 ) -> Tuple[BenchmarkMetrics, List[int]]:
    actual_output_lens: List[int] = []
    total_input = 0
    completed = 0
+    good_completed = 0
    itls: List[float] = []
    tpots: List[float] = []
+    all_tpots: List[float] = []
    ttfts: List[float] = []
    e2els: List[float] = []
    for i in range(len(outputs)):
@ -335,9 +445,13 @@ def calculate_metrics(
                          add_special_tokens=False).input_ids)
            actual_output_lens.append(output_len)
            total_input += input_requests[i][1]
+            tpot = 0
            if output_len > 1:
-                tpots.append(
-                    (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
+                tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
+                                                                 1)
+                tpots.append(tpot)
+            # Note: if output_len <= 1, we regard tpot as 0 for goodput
+            all_tpots.append(tpot)
            itls += outputs[i].itl
            ttfts.append(outputs[i].ttft)
            e2els.append(outputs[i].latency)
@ -345,6 +459,28 @@ def calculate_metrics(
        else:
            actual_output_lens.append(0)

+    if gootput_config_dict:
+        valid_metrics = []
+        slo_values = []
+
+        if "ttft" in gootput_config_dict:
+            valid_metrics.append(ttfts)
+            slo_values.append(gootput_config_dict["ttft"] /
+                              MILLISECONDS_TO_SECONDS_CONVERSION)
+        if "tpot" in gootput_config_dict:
+            valid_metrics.append(all_tpots)
+            slo_values.append(gootput_config_dict["tpot"] /
+                              MILLISECONDS_TO_SECONDS_CONVERSION)
+        if "e2el" in gootput_config_dict:
+            valid_metrics.append(e2els)
+            slo_values.append(gootput_config_dict["e2el"] /
+                              MILLISECONDS_TO_SECONDS_CONVERSION)
+
+        for req_metric in zip(*valid_metrics):
+            is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
+            if is_good_req:
+                good_completed += 1
+
    if completed == 0:
        warnings.warn(
            "All requests failed. This is likely due to a misconfiguration "
@ -355,6 +491,7 @@ def calculate_metrics(
        total_input=total_input,
        total_output=sum(actual_output_lens),
        request_throughput=completed / dur_s,
+        request_goodput=good_completed / dur_s,
        output_throughput=sum(actual_output_lens) / dur_s,
        total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
        mean_ttft_ms=np.mean(ttfts or 0) *
@ -373,9 +510,9 @@ def calculate_metrics(
        median_itl_ms=np.median(itls or 0) * 1000,
        percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
                            for p in selected_percentiles],
-        mean_e2el_ms=np.median(e2els or 0) * 1000,
+        mean_e2el_ms=np.mean(e2els or 0) * 1000,
        std_e2el_ms=np.std(e2els or 0) * 1000,
-        median_e2el_ms=np.mean(e2els or 0) * 1000,
+        median_e2el_ms=np.median(e2els or 0) * 1000,
        percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
                             for p in selected_percentiles],
    )
@ -392,12 +529,15 @@ async def benchmark(
    input_requests: List[Tuple[str, int, int]],
    logprobs: Optional[int],
    best_of: int,
-    use_beam_search: bool,
    request_rate: float,
+    burstiness: float,
    disable_tqdm: bool,
    profile: bool,
    selected_percentile_metrics: List[str],
    selected_percentiles: List[str],
+    ignore_eos: bool,
+    gootput_config_dict: Dict[str, float],
+    max_concurrency: Optional[int],
 ):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
@ -419,8 +559,8 @@ async def benchmark(
        output_len=test_output_len,
        logprobs=logprobs,
        best_of=best_of,
-        use_beam_search=use_beam_search,
        multi_modal_content=test_mm_content,
+        ignore_eos=ignore_eos,
    )
    test_output = await request_func(request_func_input=test_input)
    if not test_output.success:
@ -432,43 +572,61 @@ async def benchmark(

    if profile:
        print("Starting profiler...")
-        profile_input = RequestFuncInput(
-            model=model_id,
+        profile_input = RequestFuncInput(model=model_id,
                                         prompt=test_prompt,
                                         api_url=base_url + "/start_profile",
                                         prompt_len=test_prompt_len,
                                         output_len=test_output_len,
                                         logprobs=logprobs,
                                         best_of=best_of,
-            use_beam_search=use_beam_search,
                                         multi_modal_content=test_mm_content,
-        )
+                                         ignore_eos=ignore_eos)
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
            print("Profiler started")

+    if burstiness == 1.0:
+        distribution = "Poisson process"
+    else:
+        distribution = "Gamma distribution"
+
    print(f"Traffic request rate: {request_rate}")
+    print(f"Burstiness factor: {burstiness} ({distribution})")
+    print(f"Maximum request concurrency: {max_concurrency}")

    pbar = None if disable_tqdm else tqdm(total=len(input_requests))

+    # This can be used once the minimum Python version is 3.10 or higher,
+    # and it will simplify the code in limited_request_func.
+    #    semaphore = (asyncio.Semaphore(max_concurrency)
+    #                 if max_concurrency else contextlib.nullcontext())
+    semaphore = (asyncio.Semaphore(max_concurrency)
+                 if max_concurrency else None)
+
+    async def limited_request_func(request_func_input, pbar):
+        if semaphore is None:
+            return await request_func(request_func_input=request_func_input,
+                                      pbar=pbar)
+        async with semaphore:
+            return await request_func(request_func_input=request_func_input,
+                                      pbar=pbar)
+
    benchmark_start_time = time.perf_counter()
    tasks: List[asyncio.Task] = []
-    async for request in get_request(input_requests, request_rate):
+    async for request in get_request(input_requests, request_rate, burstiness):
        prompt, prompt_len, output_len, mm_content = request
-        request_func_input = RequestFuncInput(
-            model=model_id,
+        request_func_input = RequestFuncInput(model=model_id,
                                              prompt=prompt,
                                              api_url=api_url,
                                              prompt_len=prompt_len,
                                              output_len=output_len,
                                              logprobs=logprobs,
                                              best_of=best_of,
-            use_beam_search=use_beam_search,
                                              multi_modal_content=mm_content,
-        )
+                                              ignore_eos=ignore_eos)
        tasks.append(
            asyncio.create_task(
-                request_func(request_func_input=request_func_input,
+                limited_request_func(request_func_input=request_func_input,
                                     pbar=pbar)))
    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)

@ -482,7 +640,6 @@ async def benchmark(
            output_len=test_output_len,
            logprobs=logprobs,
            best_of=best_of,
-            use_beam_search=use_beam_search,
        )
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
@ -500,6 +657,7 @@ async def benchmark(
        tokenizer=tokenizer,
        selected_percentile_metrics=selected_percentile_metrics,
        selected_percentiles=selected_percentiles,
+        gootput_config_dict=gootput_config_dict,
    )

    print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@ -511,6 +669,9 @@ async def benchmark(
                                 metrics.total_output))
    print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
                                    metrics.request_throughput))
+    if gootput_config_dict:
+        print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
+                                        metrics.request_goodput))
    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
                                    metrics.output_throughput))
    print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
@ -522,6 +683,8 @@ async def benchmark(
        "total_input_tokens": metrics.total_input,
        "total_output_tokens": metrics.total_output,
        "request_throughput": metrics.request_throughput,
+        "request_goodput:":
+        metrics.request_goodput if gootput_config_dict else None,
        "output_throughput": metrics.output_throughput,
        "total_token_throughput": metrics.total_token_throughput,
        "input_lens": [output.prompt_len for output in outputs],
@ -540,7 +703,7 @@ async def benchmark(
        # E.g., "Time to First Token"
        metric_header: str,
    ):
-        # This function print and add statistics of the specified
+        # This function prints and adds statistics of the specified
        # metric.
        if metric_attribute_name not in selected_percentile_metrics:
            return
@ -575,6 +738,41 @@ async def benchmark(
    return result


+def check_goodput_args(args):
+    # Check and parse goodput arguments
+    gootput_config_dict = {}
+    VALID_NAMES = ["ttft", "tpot", "e2el"]
+    if args.goodput:
+        gootput_config_dict = parse_goodput(args.goodput)
+        for slo_name, slo_val in gootput_config_dict.items():
+            if slo_name not in VALID_NAMES:
+                raise ValueError(
+                    f"Invalid metric name found, {slo_name}: {slo_val}. "
+                    "The service level objective name should be one of "
+                    f"{str(VALID_NAMES)}. ")
+            if slo_val < 0:
+                raise ValueError(
+                    f"Invalid value found, {slo_name}: {slo_val}. "
+                    "The service level objective value should be "
+                    "non-negative.")
+    return gootput_config_dict
+
+
+def parse_goodput(slo_pairs):
+    gootput_config_dict = {}
+    try:
+        for slo_pair in slo_pairs:
+            slo_name, slo_val = slo_pair.split(":")
+            gootput_config_dict[slo_name] = float(slo_val)
+    except ValueError as err:
+        raise argparse.ArgumentTypeError(
+            "Invalid format found for service level objectives. "
+            "Specify service level objectives for goodput as \"KEY:VALUE\" "
+            "pairs, where the key is a metric name, and the value is a "
+            "number in milliseconds.") from err
+    return gootput_config_dict
+
+
 def main(args: argparse.Namespace):
    print(args)
    random.seed(args.seed)
@ -583,6 +781,7 @@ def main(args: argparse.Namespace):
    backend = args.backend
    model_id = args.model
    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+    tokenizer_mode = args.tokenizer_mode

    if args.base_url is not None:
        api_url = f"{args.base_url}{args.endpoint}"
@ -592,6 +791,7 @@ def main(args: argparse.Namespace):
        base_url = f"http://{args.host}:{args.port}"

    tokenizer = get_tokenizer(tokenizer_id,
+                              tokenizer_mode=tokenizer_mode,
                              trust_remote_code=args.trust_remote_code)

    if args.dataset is not None:
@ -652,6 +852,7 @@ def main(args: argparse.Namespace):
            dataset_split=args.hf_split,
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
+            random_seed=args.seed,
            fixed_output_len=args.hf_output_len,
        )

@ -668,6 +869,8 @@ def main(args: argparse.Namespace):
    else:
        raise ValueError(f"Unknown dataset: {args.dataset_name}")

+    gootput_config_dict = check_goodput_args(args)
+
    benchmark_result = asyncio.run(
        benchmark(
            backend=backend,
@ -678,14 +881,17 @@ def main(args: argparse.Namespace):
            input_requests=input_requests,
            logprobs=args.logprobs,
            best_of=args.best_of,
-            use_beam_search=args.use_beam_search,
            request_rate=args.request_rate,
+            burstiness=args.burstiness,
            disable_tqdm=args.disable_tqdm,
            profile=args.profile,
            selected_percentile_metrics=args.percentile_metrics.split(","),
            selected_percentiles=[
                float(p) for p in args.metric_percentiles.split(",")
            ],
+            ignore_eos=args.ignore_eos,
+            gootput_config_dict=gootput_config_dict,
+            max_concurrency=args.max_concurrency,
        ))

    # Save config and results to json
@ -699,7 +905,6 @@ def main(args: argparse.Namespace):
        result_json["model_id"] = model_id
        result_json["tokenizer_id"] = tokenizer_id
        result_json["best_of"] = args.best_of
-        result_json["use_beam_search"] = args.use_beam_search
        result_json["num_prompts"] = args.num_prompts

        # Metadata
@ -716,18 +921,22 @@ def main(args: argparse.Namespace):
        # Traffic
        result_json["request_rate"] = (
            args.request_rate if args.request_rate < float("inf") else "inf")
+        result_json["burstiness"] = args.burstiness
+        result_json["max_concurrency"] = args.max_concurrency

        # Merge with benchmark result
        result_json = {**result_json, **benchmark_result}

        # Save to file
        base_model_id = model_id.split("/")[-1]
-        file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"  #noqa
+        max_concurrency_str = (f"-concurrency{args.max_concurrency}"
+                               if args.max_concurrency is not None else "")
+        file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  #noqa
        if args.result_filename:
            file_name = args.result_filename
        if args.result_dir:
            file_name = os.path.join(args.result_dir, file_name)
-        with open(file_name, "w") as outfile:
+        with open(file_name, "w", encoding='utf-8') as outfile:
            json.dump(result_json, outfile)


@ -773,6 +982,19 @@ if __name__ == "__main__":
                        default=None,
                        help="Path to the sharegpt/sonnet dataset. "
                        "Or the huggingface dataset ID if using HF dataset.")
+    parser.add_argument(
+        "--max-concurrency",
+        type=int,
+        default=None,
+        help="Maximum number of concurrent requests. This can be used "
+        "to help simulate an environment where a higher level component "
+        "is enforcing a maximum number of concurrent requests. While the "
+        "--request-rate argument controls the rate at which requests are "
+        "initiated, this argument will control how many are actually allowed "
+        "to execute at a time. This means that when used in combination, the "
+        "actual request rate may be lower than specified with --request-rate, "
+        "if the server is not processing requests fast enough to keep up.")
+
    parser.add_argument(
        "--model",
        type=str,
@ -815,8 +1037,20 @@ if __name__ == "__main__":
        default=float("inf"),
        help="Number of requests per second. If this is inf, "
        "then all the requests are sent at time 0. "
-        "Otherwise, we use Poisson process to synthesize "
-        "the request arrival times.",
+        "Otherwise, we use Poisson process or gamma distribution "
+        "to synthesize the request arrival times.",
+    )
+    parser.add_argument(
+        "--burstiness",
+        type=float,
+        default=1.0,
+        help="Burstiness factor of the request generation. "
+        "Only take effect when request_rate is not inf. "
+        "Default value is 1, which follows Poisson process. "
+        "Otherwise, the request intervals follow a gamma distribution. "
+        "A lower burstiness value (0 < burstiness < 1) results in more "
+        "bursty requests. A higher burstiness value (burstiness > 1) "
+        "results in a more uniform arrival of requests.",
    )
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
@ -864,6 +1098,11 @@ if __name__ == "__main__":
        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
        " format.",
    )
+    parser.add_argument(
+        "--ignore-eos",
+        action="store_true",
+        help="Set ignore_eos flag when sending the benchmark request."
+        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
    parser.add_argument(
        "--percentile-metrics",
        type=str,
@ -881,6 +1120,17 @@ if __name__ == "__main__":
        "Default value is \"99\". "
        "Use \"--percentile-metrics\" to select metrics.",
    )
+    parser.add_argument(
+        "--goodput",
+        nargs="+",
+        required=False,
+        help="Specify service level objectives for goodput as \"KEY:VALUE\" "
+        "pairs, where the key is a metric name, and the value is in "
+        "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
+        "separated by spaces. Allowed request level metric names are "
+        "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
+        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
+        "and the blog: https://hao-ai-lab.github.io/blogs/distserve")

    # group for dataset specific arguments
    sonnet_group = parser.add_argument_group("sonnet dataset options")
@ -962,5 +1212,15 @@ if __name__ == "__main__":
        "from the sampled HF dataset.",
    )

+    parser.add_argument(
+        '--tokenizer-mode',
+        type=str,
+        default="auto",
+        choices=['auto', 'slow', 'mistral'],
+        help='The tokenizer mode.\n\n* "auto" will use the '
+        'fast tokenizer if available.\n* "slow" will '
+        'always use the slow tokenizer. \n* '
+        '"mistral" will always use the `mistral_common` tokenizer.')
+
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_serving_guided.py
+++ b/benchmarks/benchmark_serving_guided.py
@ -0,0 +1,881 @@
+r"""Benchmark online serving throughput with guided decoding.
+
+On the server side, run one of the following commands:
+    (vLLM OpenAI API server)
+    vllm serve <your_model> --disable-log-requests
+
+    (TGI backend)
+    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
+
+On the client side, run:
+    python benchmarks/benchmark_serving.py \
+        --backend <backend> \
+        --model <your_model> \
+        --dataset json \
+        --guided-decoding-ratio 1.0 \
+        --guided-decoding-backend xgrammar \
+        --request-rate 10 \
+        --num-prompts 1000
+
+    when using tgi backend, add
+        --endpoint /generate_stream
+    to the end of the command above.
+"""
+import argparse
+import asyncio
+import dataclasses
+import json
+import os
+import random
+import time
+import warnings
+from dataclasses import dataclass
+from typing import AsyncGenerator, List, Optional, Tuple
+
+import datasets
+import numpy as np
+import pandas as pd
+from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
+                                  RequestFuncOutput)
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
+try:
+    from vllm.transformers_utils.tokenizer import get_tokenizer
+except ImportError:
+    from backend_request_func import get_tokenizer
+
+try:
+    from vllm.utils import FlexibleArgumentParser
+except ImportError:
+    from argparse import ArgumentParser as FlexibleArgumentParser
+
+MILLISECONDS_TO_SECONDS_CONVERSION = 1000
+
+
+@dataclass
+class BenchmarkMetrics:
+    completed: int
+    total_input: int
+    total_output: int
+    request_throughput: float
+    request_goodput: float
+    output_throughput: float
+    total_token_throughput: float
+    mean_ttft_ms: float
+    median_ttft_ms: float
+    std_ttft_ms: float
+    percentiles_ttft_ms: List[Tuple[float, float]]
+    mean_tpot_ms: float
+    median_tpot_ms: float
+    std_tpot_ms: float
+    percentiles_tpot_ms: List[Tuple[float, float]]
+    mean_itl_ms: float
+    median_itl_ms: float
+    std_itl_ms: float
+    percentiles_itl_ms: List[Tuple[float, float]]
+    # E2EL stands for end-to-end latency per request.
+    # It is the time taken on the client side from sending
+    # a request to receiving a complete response.
+    mean_e2el_ms: float
+    median_e2el_ms: float
+    std_e2el_ms: float
+    percentiles_e2el_ms: List[Tuple[float, float]]
+
+
+@dataclasses.dataclass
+class SampleRequest:
+    """A class representing a single inference request for benchmarking.
+
+    Attributes:
+        prompt: The input text prompt for the model.
+        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
+            images).
+        prompt_len: The length of the prompt in tokens.
+        expected_output_len: The expected length of the output in tokens.
+    """
+    prompt: str
+    prompt_len: int
+    expected_output_len: int
+    schema: dict
+    structure_type: str
+    completion: str = None
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+                    args: argparse.Namespace) -> List[SampleRequest]:
+    if args.dataset == 'json':
+        if args.json_schema_path is None:
+            dir_path = os.path.dirname(os.path.realpath(__file__))
+            args.json_schema_path = os.path.join(dir_path,
+                                                 "structured_schemas",
+                                                 "structured_schema_1.json")
+        with open(args.json_schema_path) as f:
+            schema = json.load(f)
+        prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}"  # noqa: E501
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=schema,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "grammar":
+        schema = """
+            ?start: select_statement
+
+            ?select_statement: "SELECT " column_list " FROM " table_name
+
+            ?column_list: column_name ("," column_name)*
+
+            ?table_name: identifier
+
+            ?column_name: identifier
+
+            ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+        """
+        prompt = "Generate an SQL query to show the 'username' \
+            and 'email' from the 'users' table."
+
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=schema,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "regex":
+        regex = r"\w+@\w+\.com\n"
+        args.regex = regex
+        prompt = "Generate an email address for Alan Turing, \
+            who works in Enigma. End in .com and new line. \
+                Example result: alan.turing@enigma.com\n"
+
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=regex,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "choice":
+        choice = ["Positive", "Negative"]
+        args.choice = choice
+        prompt = "Classify this sentiment: vLLM is wonderful!"
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=choice,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "xgrammar_bench":
+        requests: List[SampleRequest] = []
+        dataset = datasets.load_dataset("NousResearch/json-mode-eval",
+                                        split="train")
+        print(f"dataset has {len(dataset)} entries")
+        len_dataset = len(dataset)
+        for data_point_idx in range(args.num_prompts):
+            idx = data_point_idx
+            while idx >= len_dataset:
+                idx -= len_dataset
+            schema = dataset["schema"][idx]
+            prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
+                                                   tokenize=False)
+            input_len = len(tokenizer(prompt).input_ids)
+            completion = dataset["completion"][idx]
+
+            requests.append(
+                SampleRequest(prompt=prompt,
+                              prompt_len=input_len,
+                              expected_output_len=args.output_len,
+                              schema=schema,
+                              structure_type=args.structure_type,
+                              completion=completion))
+
+    return requests
+
+
+async def get_request(
+    input_requests: List[SampleRequest],
+    request_rate: float,
+    burstiness: float = 1.0,
+) -> AsyncGenerator[Tuple[int, SampleRequest], None]:
+    """
+    Asynchronously generates requests at a specified rate 
+    with OPTIONAL burstiness.
+    
+    Args:
+        input_requests: 
+            A list of input requests, each represented as a tuple.
+        request_rate: 
+            The rate at which requests are generated (requests/s).
+        burstiness (optional): 
+            The burstiness factor of the request generation. 
+            Only takes effect when request_rate is not inf.
+            Default value is 1, which follows a Poisson process.
+            Otherwise, the request intervals follow a gamma distribution.
+            A lower burstiness value (0 < burstiness < 1) results 
+            in more bursty requests, while a higher burstiness value 
+            (burstiness > 1) results in a more uniform arrival of requests.
+    """
+    input_requests = iter(input_requests)
+
+    # Calculate scale parameter theta to maintain the desired request_rate.
+    assert burstiness > 0, (
+        f"A positive burstiness factor is expected, but given {burstiness}.")
+    theta = 1.0 / (request_rate * burstiness)
+
+    for i, request in enumerate(input_requests):
+        yield i, request
+
+        if request_rate == float("inf"):
+            # If the request rate is infinity, then we don't need to wait.
+            continue
+
+        # Sample the request interval from the gamma distribution.
+        # If burstiness is 1, it follows exponential distribution.
+        interval = np.random.gamma(shape=burstiness, scale=theta)
+        # The next request will be sent after the interval.
+        await asyncio.sleep(interval)
+
+
+def calculate_metrics(
+    input_requests: List[Tuple[str, int, int]],
+    outputs: List[RequestFuncOutput],
+    dur_s: float,
+    tokenizer: PreTrainedTokenizerBase,
+    selected_percentile_metrics: List[str],
+    selected_percentiles: List[float],
+) -> Tuple[BenchmarkMetrics, List[int]]:
+    actual_output_lens: List[int] = []
+    total_input = 0
+    completed = 0
+    good_completed = 0
+    itls: List[float] = []
+    tpots: List[float] = []
+    all_tpots: List[float] = []
+    ttfts: List[float] = []
+    e2els: List[float] = []
+    for i in range(len(outputs)):
+        if outputs[i].success:
+            # We use the tokenizer to count the number of output tokens for all
+            # serving backends instead of looking at len(outputs[i].itl) since
+            # multiple output tokens may be bundled together
+            # Note : this may inflate the output token count slightly
+            output_len = len(
+                tokenizer(outputs[i].generated_text,
+                          add_special_tokens=False).input_ids)
+            actual_output_lens.append(output_len)
+            total_input += input_requests[i].prompt_len
+            tpot = 0
+            if output_len > 1:
+                tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
+                                                                 1)
+                tpots.append(tpot)
+            outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0
+            # Note: if output_len <= 1, we regard tpot as 0 for goodput
+            all_tpots.append(tpot)
+            itls += outputs[i].itl
+            ttfts.append(outputs[i].ttft)
+            e2els.append(outputs[i].latency)
+            completed += 1
+        else:
+            actual_output_lens.append(0)
+
+    if completed == 0:
+        warnings.warn(
+            "All requests failed. This is likely due to a misconfiguration "
+            "on the benchmark arguments.",
+            stacklevel=2)
+    metrics = BenchmarkMetrics(
+        completed=completed,
+        total_input=total_input,
+        total_output=sum(actual_output_lens),
+        request_throughput=completed / dur_s,
+        request_goodput=good_completed / dur_s,
+        output_throughput=sum(actual_output_lens) / dur_s,
+        total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
+        mean_ttft_ms=np.mean(ttfts or 0) *
+        1000,  # ttfts is empty if streaming is not supported by backend
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
+        median_ttft_ms=np.median(ttfts or 0) * 1000,
+        percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
+                             for p in selected_percentiles],
+        mean_tpot_ms=np.mean(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
+        median_tpot_ms=np.median(tpots or 0) * 1000,
+        percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
+                             for p in selected_percentiles],
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
+                            for p in selected_percentiles],
+        mean_e2el_ms=np.mean(e2els or 0) * 1000,
+        std_e2el_ms=np.std(e2els or 0) * 1000,
+        median_e2el_ms=np.median(e2els or 0) * 1000,
+        percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
+                             for p in selected_percentiles],
+    )
+
+    return metrics, actual_output_lens
+
+
+async def benchmark(
+    backend: str,
+    api_url: str,
+    base_url: str,
+    model_id: str,
+    tokenizer: PreTrainedTokenizerBase,
+    input_requests: List[SampleRequest],
+    request_rate: float,
+    burstiness: float,
+    disable_tqdm: bool,
+    profile: bool,
+    selected_percentile_metrics: List[str],
+    selected_percentiles: List[str],
+    ignore_eos: bool,
+    max_concurrency: Optional[int],
+    guided_decoding_ratio: float,
+    guided_decoding_backend: str,
+):
+    if backend in ASYNC_REQUEST_FUNCS:
+        request_func = ASYNC_REQUEST_FUNCS[backend]
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    def prepare_extra_body(request) -> dict:
+        extra_body = {}
+        # Add the schema to the extra_body
+        extra_body[request.structure_type] = request.schema
+        # Add the specific guided_decoding_backend
+        extra_body["guided_decoding_backend"] = guided_decoding_backend
+        return extra_body
+
+    print("Starting initial single prompt test run...")
+    guided_decoding_req_idx = random.sample(
+        range(len(input_requests)),
+        int(len(input_requests) * guided_decoding_ratio))
+
+    test_request = input_requests[0]
+    test_input = RequestFuncInput(
+        model=model_id,
+        prompt=test_request.prompt,
+        api_url=api_url,
+        prompt_len=test_request.prompt_len,
+        output_len=test_request.expected_output_len,
+        ignore_eos=ignore_eos,
+        extra_body=prepare_extra_body(test_request),
+    )
+    test_output = await request_func(request_func_input=test_input)
+    if not test_output.success:
+        raise ValueError(
+            "Initial test run failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {test_output.error}")
+    else:
+        print("Initial test run completed. Starting main benchmark run...")
+
+    if profile:
+        print("Starting profiler...")
+        profile_input = RequestFuncInput(
+            model=model_id,
+            prompt=test_request.prompt,
+            api_url=base_url + "/start_profile",
+            prompt_len=test_request.prompt_len,
+            output_len=test_request.expected_output_len,
+            ignore_eos=ignore_eos,
+            extra_body=prepare_extra_body(test_request),
+        )
+        profile_output = await request_func(request_func_input=profile_input)
+        if profile_output.success:
+            print("Profiler started")
+
+    if burstiness == 1.0:
+        distribution = "Poisson process"
+    else:
+        distribution = "Gamma distribution"
+
+    print(f"Traffic request rate: {request_rate}")
+    print(f"Burstiness factor: {burstiness} ({distribution})")
+    print(f"Maximum request concurrency: {max_concurrency}")
+
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
+    # This can be used once the minimum Python version is 3.10 or higher,
+    # and it will simplify the code in limited_request_func.
+    #    semaphore = (asyncio.Semaphore(max_concurrency)
+    #                 if max_concurrency else contextlib.nullcontext())
+    semaphore = (asyncio.Semaphore(max_concurrency)
+                 if max_concurrency else None)
+
+    async def limited_request_func(request_func_input, pbar):
+        if semaphore is None:
+            return await request_func(request_func_input=request_func_input,
+                                      pbar=pbar)
+        async with semaphore:
+            return await request_func(request_func_input=request_func_input,
+                                      pbar=pbar)
+
+    benchmark_start_time = time.perf_counter()
+    tasks: List[asyncio.Task] = []
+    expected: List[str] = []
+    async for i, request in get_request(input_requests, request_rate,
+                                        burstiness):
+        extra_body = prepare_extra_body(
+            request) if i in guided_decoding_req_idx else None
+        request_func_input = RequestFuncInput(
+            model=model_id,
+            prompt=request.prompt,
+            api_url=api_url,
+            prompt_len=request.prompt_len,
+            output_len=request.expected_output_len,
+            ignore_eos=ignore_eos,
+            extra_body=extra_body,
+        )
+        expected.append(request.completion)
+        tasks.append(
+            asyncio.create_task(
+                limited_request_func(request_func_input=request_func_input,
+                                     pbar=pbar)))
+    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+
+    if profile:
+        print("Stopping profiler...")
+        profile_input = RequestFuncInput(
+            model=model_id,
+            prompt=test_request.prompt,
+            api_url=base_url + "/stop_profile",
+            prompt_len=test_request.prompt_len,
+            output_len=test_request.expected_output_len,
+            extra_body={test_request.structure_type: test_request.schema},
+        )
+        profile_output = await request_func(request_func_input=profile_input)
+        if profile_output.success:
+            print("Profiler stopped")
+
+    if pbar is not None:
+        pbar.close()
+
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+
+    metrics, actual_output_lens = calculate_metrics(
+        input_requests=input_requests,
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+        selected_percentile_metrics=selected_percentile_metrics,
+        selected_percentiles=selected_percentiles,
+    )
+
+    print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
+    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
+                                    benchmark_duration))
+    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total generated tokens:",
+                                 metrics.total_output))
+    print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
+                                    metrics.request_throughput))
+    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
+                                    metrics.output_throughput))
+    print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
+                                    metrics.total_token_throughput))
+
+    result = {
+        "duration":
+        benchmark_duration,
+        "completed":
+        metrics.completed,
+        "total_input_tokens":
+        metrics.total_input,
+        "total_output_tokens":
+        metrics.total_output,
+        "request_throughput":
+        metrics.request_throughput,
+        "output_throughput":
+        metrics.output_throughput,
+        "total_token_throughput":
+        metrics.total_token_throughput,
+        "ttft_description":
+        pd.Series([output.ttft for output in outputs]).describe().to_dict(),
+        "tpot_description":
+        pd.Series([output.tpot for output in outputs]).describe().to_dict(),
+        "input_lens": [output.prompt_len for output in outputs],
+        "output_lens":
+        actual_output_lens,
+        "ttfts": [output.ttft for output in outputs],
+        "itls": [output.itl for output in outputs],
+        "errors": [output.error for output in outputs],
+    }
+
+    ret = [{
+        'generated': output.generated_text,
+        'expected': gt
+    } for output, gt in zip(outputs, expected)]
+
+    def process_one_metric(
+        # E.g., "ttft"
+        metric_attribute_name: str,
+        # E.g., "TTFT"
+        metric_name: str,
+        # E.g., "Time to First Token"
+        metric_header: str,
+    ):
+        # This function prints and adds statistics of the specified
+        # metric.
+        if metric_attribute_name not in selected_percentile_metrics:
+            return
+        print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
+        print("{:<40} {:<10.2f}".format(
+            f"Mean {metric_name} (ms):",
+            getattr(metrics, f"mean_{metric_attribute_name}_ms")))
+        print("{:<40} {:<10.2f}".format(
+            f"Median {metric_name} (ms):",
+            getattr(metrics, f"median_{metric_attribute_name}_ms")))
+        result[f"mean_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"mean_{metric_attribute_name}_ms")
+        result[f"median_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"median_{metric_attribute_name}_ms")
+        result[f"std_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"std_{metric_attribute_name}_ms")
+        for p, value in getattr(metrics,
+                                f"percentiles_{metric_attribute_name}_ms"):
+            p_word = str(int(p)) if int(p) == p else str(p)
+            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
+                                            value))
+            result[f"p{p_word}_{metric_attribute_name}_ms"] = value
+
+    process_one_metric("ttft", "TTFT", "Time to First Token")
+    process_one_metric("tpot", "TPOT",
+                       "Time per Output Token (excl. 1st token)")
+    process_one_metric("itl", "ITL", "Inter-token Latency")
+    process_one_metric("e2el", "E2EL", "End-to-end Latency")
+
+    print("=" * 50)
+
+    return result, ret
+
+
+def evaluate(ret, args):
+
+    def _eval_correctness_json(expected, actual):
+        # extract json string from string using regex
+        import re
+        actual = actual.replace('\n', '').replace(' ', '').strip()
+        try:
+            actual = re.search(r'\{.*\}', actual).group()
+            actual = json.loads(actual)
+        except Exception:
+            return False
+
+        return True
+
+    def _eval_correctness_choice(expected, actual):
+        return actual in args.choice
+
+    def _eval_correctness_regex(expected, actual):
+        import re
+        return re.match(args.regex, actual) is not None
+
+    def _eval_correctness(expected, actual):
+        if args.structure_type == 'guided_json':
+            return _eval_correctness_json(expected, actual)
+        elif args.structure_type == 'guided_regex':
+            return _eval_correctness_regex(expected, actual)
+        elif args.structure_type == 'guided_choice':
+            return _eval_correctness_choice(expected, actual)
+        else:
+            return None
+
+    scores = []
+    for res in ret:
+        score = _eval_correctness(res['expected'], res['generated'])
+        res['correctness'] = score
+        scores.append(score)
+
+    not_none_scores = [score for score in scores if score is not None]
+
+    return (sum(not_none_scores) / len(not_none_scores) *
+            100) if len(not_none_scores) > 0 else None
+
+
+def main(args: argparse.Namespace):
+    print(args)
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    backend = args.backend
+    model_id = args.model
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+
+    if args.base_url is not None:
+        api_url = f"{args.base_url}{args.endpoint}"
+        base_url = f"{args.base_url}"
+    else:
+        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
+        base_url = f"http://{args.host}:{args.port}"
+
+    tokenizer = get_tokenizer(tokenizer_id,
+                              trust_remote_code=args.trust_remote_code)
+
+    if args.dataset == 'grammar':
+        args.structure_type = 'guided_grammar'
+    elif args.dataset == 'regex':
+        args.structure_type = 'guided_regex'
+    elif args.dataset == 'choice':
+        args.structure_type = 'guided_choice'
+    else:
+        args.structure_type = 'guided_json'
+
+    if args.no_guided_decoding:
+        args.guided_decoding_ratio = 0
+    if args.save_results:
+        result_file_name = f'{args.guided_decoding_ratio}guided'
+        result_file_name += f"_{backend}"
+        result_file_name += f"_{args.request_rate}qps"
+        result_file_name += f"_{args.model.split('/')[-1]}"
+        result_file_name += f"_{args.dataset}"
+        result_file_name += f"_{args.num_prompts}"
+        result_file_name += f"_out{args.output_len}"
+        result_file_name += ".txt"
+    else:
+        result_file_name = None
+
+    input_requests = sample_requests(tokenizer, args)
+
+    benchmark_result, ret = asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            base_url=base_url,
+            model_id=model_id,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            request_rate=args.request_rate,
+            burstiness=args.burstiness,
+            disable_tqdm=args.disable_tqdm,
+            profile=args.profile,
+            selected_percentile_metrics=args.percentile_metrics.split(","),
+            selected_percentiles=[
+                float(p) for p in args.metric_percentiles.split(",")
+            ],
+            ignore_eos=args.ignore_eos,
+            max_concurrency=args.max_concurrency,
+            guided_decoding_ratio=args.guided_decoding_ratio,
+            guided_decoding_backend=args.guided_decoding_backend,
+        ))
+
+    # Save config and results to json
+    score = evaluate(ret, args)
+    print("correct_rate(%)", score, '\n')
+    if args.save_results:
+        results = {
+            "backend":
+            backend,
+            "model_id":
+            model_id,
+            "tokenizer_id":
+            tokenizer_id,
+            "num_prompts":
+            args.num_prompts,
+            "request_rate":
+            args.request_rate if args.request_rate < float("inf") else "inf",
+            "burstiness":
+            args.burstiness,
+            "max_concurrency":
+            args.max_concurrency,
+            "correct_rate(%)":
+            score
+        }
+        results = {"outputs": ret, **results, **benchmark_result}
+
+        # Save to file
+        if args.result_filename:
+            result_file_name = args.result_filename
+        if args.result_dir:
+            result_file_name = os.path.join(args.result_dir, result_file_name)
+        with open(result_file_name, "w", encoding='utf-8') as outfile:
+            json.dump(results, outfile, indent=4)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description="Benchmark the online serving throughput.")
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="vllm",
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+    )
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server or API base url if not using http host and port.",
+    )
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument(
+        "--endpoint",
+        type=str,
+        default="/v1/completions",
+        help="API endpoint.",
+    )
+    parser.add_argument(
+        "--dataset",
+        default='json',
+        choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
+    parser.add_argument("--json_schema_path",
+                        type=str,
+                        default=None,
+                        help="Path to json schema.")
+    parser.add_argument(
+        "--max-concurrency",
+        type=int,
+        default=None,
+        help="Maximum number of concurrent requests. This can be used "
+        "to help simulate an environment where a higher level component "
+        "is enforcing a maximum number of concurrent requests. While the "
+        "--request-rate argument controls the rate at which requests are "
+        "initiated, this argument will control how many are actually allowed "
+        "to execute at a time. This means that when used in combination, the "
+        "actual request rate may be lower than specified with --request-rate, "
+        "if the server is not processing requests fast enough to keep up.")
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Name of the model.",
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        help=
+        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+    )
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1000,
+        help="Number of prompts to process.",
+    )
+    parser.add_argument(
+        "--output-len",
+        type=int,
+        default=128,
+        help="Number of output tokens.",
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, "
+        "then all the requests are sent at time 0. "
+        "Otherwise, we use Poisson process or gamma distribution "
+        "to synthesize the request arrival times.",
+    )
+    parser.add_argument(
+        "--burstiness",
+        type=float,
+        default=1.0,
+        help="Burstiness factor of the request generation. "
+        "Only take effect when request_rate is not inf. "
+        "Default value is 1, which follows Poisson process. "
+        "Otherwise, the request intervals follow a gamma distribution. "
+        "A lower burstiness value (0 < burstiness < 1) results in more "
+        "bursty requests. A higher burstiness value (burstiness > 1) "
+        "results in a more uniform arrival of requests.",
+    )
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Trust remote code from huggingface",
+    )
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disable tqdm progress bar.",
+    )
+    parser.add_argument(
+        "--save-results",
+        action="store_true",
+        help="Specify to save benchmark results to a json file",
+    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Use Torch Profiler. The endpoint must be launched with "
+        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+    )
+    parser.add_argument(
+        "--result-dir",
+        type=str,
+        default=None,
+        help="Specify directory to save benchmark json results."
+        "If not specified, results are saved in the current directory.",
+    )
+    parser.add_argument(
+        "--result-filename",
+        type=str,
+        default=None,
+        help="Specify the filename to save benchmark json results."
+        "If not specified, results will be saved in "
+        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+        " format.",
+    )
+    parser.add_argument(
+        "--ignore-eos",
+        action="store_true",
+        help="Set ignore_eos flag when sending the benchmark request."
+        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
+    parser.add_argument(
+        "--percentile-metrics",
+        type=str,
+        default="ttft,tpot,itl",
+        help="Comma-seperated list of selected metrics to report percentils. "
+        "This argument specifies the metrics to report percentiles. "
+        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
+        "Default value is \"ttft,tpot,itl\".")
+    parser.add_argument(
+        "--metric-percentiles",
+        type=str,
+        default="99",
+        help="Comma-seperated list of percentiles for selected metrics. "
+        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
+        "Default value is \"99\". "
+        "Use \"--percentile-metrics\" to select metrics.",
+    )
+    parser.add_argument("--no-guided-decoding",
+                        action='store_true',
+                        default=False,
+                        help="Whether to disable JSON decoding or not.")
+    parser.add_argument("--guided-decoding-ratio",
+                        type=float,
+                        default=1.0,
+                        help="Ratio of Guided Decoding requests")
+    parser.add_argument("--guided-decoding-backend",
+                        type=str,
+                        choices=["outlines", "lm-format-enforcer", "xgrammar"],
+                        default="xgrammar",
+                        help="Backend to use for guided decoding")
+
+    args = parser.parse_args()
+    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -1,29 +1,71 @@
 """Benchmark offline inference throughput."""
 import argparse
+import dataclasses
 import json
 import random
 import time
-from typing import List, Optional, Tuple
+from typing import List, Optional

 import torch
 import uvloop
+from PIL import Image
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)

-from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.inputs import TextPrompt
+from vllm.multimodal import MultiModalDataDict
+from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators


-def sample_requests(
-    dataset_path: str,
-    num_requests: int,
-    tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int],
-) -> List[Tuple[str, int, int]]:
+@dataclasses.dataclass
+class SampleRequest:
+    """A class representing a single inference request for benchmarking.
+
+    Attributes:
+        prompt: The input text prompt for the model.
+        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
+            images).
+        prompt_len: The length of the prompt in tokens.
+        expected_output_len: The expected length of the output in tokens.
+    """
+    prompt: str
+    prompt_len: int
+    expected_output_len: int
+    multi_modal_data: Optional[MultiModalDataDict] = None
+
+
+def _get_prompt_for_image_model(question: str, *, model: str) -> str:
+    """Prepend and append special tokens around the question to form a prompt.
+
+    Args:
+        question: The input question text to wrap with special tokens
+        model: The name of the model being used, to determine which special
+            tokens to add
+
+    Returns:
+        The formatted prompt string with appropriate special tokens for the
+            model
+
+    Raises:
+        ValueError: If an unsupported model name is provided
+    """
+    model = model.lower()
+    if "pixtral" in model:
+        return f"<s>[INST]{question}\n[IMG][/INST]"
+    raise ValueError(f"Unsupported model {model}")
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+                    args: argparse.Namespace) -> List[SampleRequest]:
+    dataset_path: str = args.dataset
+    num_requests: int = args.num_prompts
+    fixed_output_len: Optional[int] = args.output_len
+    model: str = args.model
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")

@ -32,23 +74,36 @@ def sample_requests(
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
-    # Only keep the first two turns of each conversation.
-    dataset = [(data["conversations"][0]["value"],
-                data["conversations"][1]["value"]) for data in dataset]
-
    # Shuffle the dataset.
    random.shuffle(dataset)

    # Filter out sequences that are too long or too short
-    filtered_dataset: List[Tuple[str, int, int]] = []
-    for i in range(len(dataset)):
+    filtered_dataset: List[SampleRequest] = []
+    for data in dataset:
        if len(filtered_dataset) == num_requests:
            break

+        # Only keep the first two turns of each conversation.
+        prompt = data["conversations"][0]["value"]
+        completion = data["conversations"][1]["value"]
+
+        multi_modal_data: Optional[MultiModalDataDict] = None
+        if "image" in data:
+            multi_modal_data = multi_modal_data or {}
+            image_path = data["image"]
+            # TODO(vllm-project/vllm/issues/9778): Support multiple images.
+            assert isinstance(image_path,
+                              str), "Only support single image input"
+            try:
+                multi_modal_data["image"] = Image.open(image_path).convert(
+                    "RGB")
+            except FileNotFoundError:
+                # Ignore datapoint where asset is missing
+                continue
+            prompt = _get_prompt_for_image_model(question=prompt, model=model)
+
        # Tokenize the prompts and completions.
-        prompt = dataset[i][0]
        prompt_token_ids = tokenizer(prompt).input_ids
-        completion = dataset[i][1]
        completion_token_ids = tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
@ -59,172 +114,88 @@ def sample_requests(
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
-        filtered_dataset.append((prompt, prompt_len, output_len))
+        filtered_dataset.append(
+            SampleRequest(prompt=prompt,
+                          prompt_len=prompt_len,
+                          expected_output_len=output_len,
+                          multi_modal_data=multi_modal_data))

    return filtered_dataset


 def run_vllm(
-    requests: List[Tuple[str, int, int]],
-    model: str,
-    tokenizer: str,
-    quantization: Optional[str],
-    tensor_parallel_size: int,
-    seed: int,
+    requests: List[SampleRequest],
    n: int,
-    use_beam_search: bool,
-    trust_remote_code: bool,
-    dtype: str,
-    max_model_len: Optional[int],
-    enforce_eager: bool,
-    kv_cache_dtype: str,
-    quantization_param_path: Optional[str],
-    device: str,
-    enable_prefix_caching: bool,
-    enable_chunked_prefill: bool,
-    max_num_batched_tokens: int,
-    distributed_executor_backend: Optional[str],
-    gpu_memory_utilization: float = 0.9,
-    num_scheduler_steps: int = 1,
-    use_v2_block_manager: bool = False,
-    download_dir: Optional[str] = None,
-    load_format: str = EngineArgs.load_format,
-    disable_async_output_proc: bool = False,
-    use_new_beam_search_impl: bool = False,
+    engine_args: EngineArgs,
 ) -> float:
    from vllm import LLM, SamplingParams
-    llm = LLM(
-        model=model,
-        tokenizer=tokenizer,
-        quantization=quantization,
-        tensor_parallel_size=tensor_parallel_size,
-        seed=seed,
-        trust_remote_code=trust_remote_code,
-        dtype=dtype,
-        max_model_len=max_model_len,
-        gpu_memory_utilization=gpu_memory_utilization,
-        enforce_eager=enforce_eager,
-        kv_cache_dtype=kv_cache_dtype,
-        quantization_param_path=quantization_param_path,
-        device=device,
-        enable_prefix_caching=enable_prefix_caching,
-        download_dir=download_dir,
-        enable_chunked_prefill=enable_chunked_prefill,
-        max_num_batched_tokens=max_num_batched_tokens,
-        distributed_executor_backend=distributed_executor_backend,
-        load_format=load_format,
-        num_scheduler_steps=num_scheduler_steps,
-        use_v2_block_manager=use_v2_block_manager,
-        disable_async_output_proc=disable_async_output_proc,
-    )
+    llm = LLM(**dataclasses.asdict(engine_args))

    # Add the requests to the engine.
-    prompts: List[str] = []
+    prompts: List[TextPrompt] = []
    sampling_params: List[SamplingParams] = []
-    for prompt, _, output_len in requests:
-        prompts.append(prompt)
+    for request in requests:
+        prompts.append(
+            TextPrompt(prompt=request.prompt,
+                       multi_modal_data=request.multi_modal_data))
        sampling_params.append(
            SamplingParams(
                n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                top_p=1.0,
-                use_beam_search=use_beam_search,
                ignore_eos=True,
-                max_tokens=output_len,
+                max_tokens=request.expected_output_len,
            ))

-    if not use_new_beam_search_impl:
+    use_beam_search = False
+
+    if not use_beam_search:
        start = time.perf_counter()
        llm.generate(prompts, sampling_params, use_tqdm=True)
        end = time.perf_counter()
    else:
-        assert use_beam_search
-        prompts = [prompt for prompt, _, _ in requests]
+        prompts = [request.prompt for request in requests]
        # output_len should be the same for all requests.
        output_len = requests[0][2]
-        for prompt, input_len, _output_len in requests:
-            assert _output_len == output_len
+        for request in requests:
+            assert request.expected_output_len == output_len
        start = time.perf_counter()
-        llm.beam_search(prompts,
+        llm.beam_search(
+            prompts,
+            BeamSearchParams(
                beam_width=n,
                max_tokens=output_len,
-                        ignore_eos=True)
+                ignore_eos=True,
+            ))
        end = time.perf_counter()
    return end - start


 async def run_vllm_async(
-    requests: List[Tuple[str, int, int]],
-    model: str,
-    tokenizer: str,
-    quantization: Optional[str],
-    tensor_parallel_size: int,
-    seed: int,
+    requests: List[SampleRequest],
    n: int,
-    use_beam_search: bool,
-    trust_remote_code: bool,
-    dtype: str,
-    max_model_len: Optional[int],
-    enforce_eager: bool,
-    kv_cache_dtype: str,
-    quantization_param_path: Optional[str],
-    device: str,
-    enable_prefix_caching: bool,
-    enable_chunked_prefill: bool,
-    max_num_batched_tokens: int,
-    distributed_executor_backend: Optional[str],
-    gpu_memory_utilization: float = 0.9,
-    num_scheduler_steps: int = 1,
-    use_v2_block_manager: bool = False,
-    download_dir: Optional[str] = None,
-    load_format: str = EngineArgs.load_format,
-    disable_async_output_proc: bool = False,
+    engine_args: AsyncEngineArgs,
    disable_frontend_multiprocessing: bool = False,
 ) -> float:
    from vllm import SamplingParams
-    engine_args = AsyncEngineArgs(
-        model=model,
-        tokenizer=tokenizer,
-        quantization=quantization,
-        tensor_parallel_size=tensor_parallel_size,
-        seed=seed,
-        trust_remote_code=trust_remote_code,
-        dtype=dtype,
-        max_model_len=max_model_len,
-        gpu_memory_utilization=gpu_memory_utilization,
-        enforce_eager=enforce_eager,
-        kv_cache_dtype=kv_cache_dtype,
-        quantization_param_path=quantization_param_path,
-        device=device,
-        enable_prefix_caching=enable_prefix_caching,
-        download_dir=download_dir,
-        enable_chunked_prefill=enable_chunked_prefill,
-        max_num_batched_tokens=max_num_batched_tokens,
-        distributed_executor_backend=distributed_executor_backend,
-        load_format=load_format,
-        num_scheduler_steps=num_scheduler_steps,
-        use_v2_block_manager=use_v2_block_manager,
-        disable_async_output_proc=disable_async_output_proc,
-        worker_use_ray=False,
-        disable_log_requests=True,
-    )

    async with build_async_engine_client_from_engine_args(
            engine_args, disable_frontend_multiprocessing) as llm:

        # Add the requests to the engine.
-        prompts: List[str] = []
+        prompts: List[TextPrompt] = []
        sampling_params: List[SamplingParams] = []
-        for prompt, _, output_len in requests:
-            prompts.append(prompt)
+        for request in requests:
+            prompts.append(
+                TextPrompt(prompt=request.prompt,
+                           multi_modal_data=request.multi_modal_data))
            sampling_params.append(
                SamplingParams(
                    n=n,
-                    temperature=0.0 if use_beam_search else 1.0,
+                    temperature=1.0,
                    top_p=1.0,
-                    use_beam_search=use_beam_search,
                    ignore_eos=True,
-                    max_tokens=output_len,
+                    max_tokens=request.expected_output_len,
                ))

        generators = []
@ -240,15 +211,13 @@ async def run_vllm_async(


 def run_hf(
-    requests: List[Tuple[str, int, int]],
+    requests: List[SampleRequest],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
-    use_beam_search: bool,
    max_batch_size: int,
    trust_remote_code: bool,
 ) -> float:
-    assert not use_beam_search
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
    if llm.config.model_type == "llama":
@ -280,7 +249,7 @@ def run_hf(
                              padding=True).input_ids
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
-            do_sample=not use_beam_search,
+            do_sample=True,
            num_return_sequences=n,
            temperature=1.0,
            top_p=1.0,
@ -300,14 +269,14 @@ def run_hf(


 def run_mii(
-    requests: List[Tuple[str, int, int]],
+    requests: List[SampleRequest],
    model: str,
    tensor_parallel_size: int,
    output_len: int,
 ) -> float:
    from mii import client, serve
    llm = serve(model, tensor_parallel=tensor_parallel_size)
-    prompts = [prompt for prompt, _, _ in requests]
+    prompts = [request.prompt for request in requests]

    start = time.perf_counter()
    llm.generate(prompts, max_new_tokens=output_len)
@ -325,47 +294,74 @@ def main(args: argparse.Namespace):
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code)
    if args.dataset is None:
+        vocab_size = tokenizer.vocab_size
+        requests = []
+        for _ in range(args.num_prompts):
            # Synthesize a prompt with the given input length.
-        prompt = "hi" * (args.input_len - 1)
-        requests = [(prompt, args.input_len, args.output_len)
-                    for _ in range(args.num_prompts)]
-    else:
-        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
-                                   args.output_len)
-
-    if args.backend == "vllm":
-        run_args = [
-            requests, args.model, args.tokenizer, args.quantization,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
-            args.trust_remote_code, args.dtype, args.max_model_len,
-            args.enforce_eager, args.kv_cache_dtype,
-            args.quantization_param_path, args.device,
-            args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.distributed_executor_backend,
-            args.gpu_memory_utilization, args.num_scheduler_steps,
-            args.use_v2_block_manager, args.download_dir, args.load_format,
-            args.disable_async_output_proc
+            candidate_ids = [
+                random.randint(0, vocab_size - 1)
+                for _ in range(args.input_len)
            ]
+            # As tokenizer may add additional tokens like BOS, we need to try
+            # different lengths to get the desired input length.
+            for _ in range(5):  # Max attempts to correct
+                candidate_prompt = tokenizer.decode(candidate_ids)
+                tokenized_len = len(tokenizer.encode(candidate_prompt))

-        if args.async_engine:
-            run_args.append(args.disable_frontend_multiprocessing)
-            elapsed_time = uvloop.run(run_vllm_async(*run_args))
+                if tokenized_len == args.input_len:
+                    break
+
+                # Adjust length based on difference
+                diff = args.input_len - tokenized_len
+                if diff > 0:
+                    candidate_ids.extend([
+                        random.randint(100, vocab_size - 100)
+                        for _ in range(diff)
+                    ])
                else:
-            elapsed_time = run_vllm(*run_args, args.use_new_beam_search_impl)
+                    candidate_ids = candidate_ids[:diff]
+            requests.append(
+                SampleRequest(prompt=candidate_prompt,
+                              prompt_len=args.input_len,
+                              expected_output_len=args.output_len))
+    else:
+        requests = sample_requests(tokenizer, args)
+
+    is_multi_modal = any(request.multi_modal_data is not None
+                         for request in requests)
+    if args.backend == "vllm":
+        if args.async_engine:
+            elapsed_time = uvloop.run(
+                run_vllm_async(
+                    requests,
+                    args.n,
+                    AsyncEngineArgs.from_cli_args(args),
+                    args.disable_frontend_multiprocessing,
+                ))
+        else:
+            elapsed_time = run_vllm(requests, args.n,
+                                    EngineArgs.from_cli_args(args))
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.use_beam_search, args.hf_max_batch_size,
-                              args.trust_remote_code)
+                              args.hf_max_batch_size, args.trust_remote_code)
    elif args.backend == "mii":
        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                               args.output_len)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
-    total_num_tokens = sum(prompt_len + output_len
-                           for _, prompt_len, output_len in requests)
+    total_num_tokens = sum(request.prompt_len + request.expected_output_len
+                           for request in requests)
+    total_output_tokens = sum(request.expected_output_len
+                              for request in requests)
+    if is_multi_modal:
+        print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
+              "following metrics are not accurate because image tokens are not"
+              " counted. See vllm-project/vllm/issues/9778 for details.")
+        # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length.
    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
-          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
+          f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
+          f"{total_output_tokens / elapsed_time:.2f} output tokens/s")

    # Output JSON results if specified
    if args.output_json:
@ -389,7 +385,9 @@ if __name__ == "__main__":
    parser.add_argument("--dataset",
                        type=str,
                        default=None,
-                        help="Path to the dataset.")
+                        help="Path to the dataset. The dataset is expected to "
+                        "be a json in form of List[Dict[..., conversations: "
+                        "List[Dict[..., value: <prompt_or_response>]]]]")
    parser.add_argument("--input-len",
                        type=int,
                        default=None,
@ -399,143 +397,23 @@ if __name__ == "__main__":
                        default=None,
                        help="Output length for each request. Overrides the "
                        "output length from the dataset.")
-    parser.add_argument("--model", type=str, default="facebook/opt-125m")
-    parser.add_argument("--tokenizer", type=str, default=None)
-    parser.add_argument('--quantization',
-                        '-q',
-                        choices=[*QUANTIZATION_METHODS, None],
-                        default=None)
-    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
    parser.add_argument("--n",
                        type=int,
                        default=1,
                        help="Number of generated sequences per prompt.")
-    parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument("--use-new-beam-search-impl", action="store_true")
    parser.add_argument("--num-prompts",
                        type=int,
                        default=1000,
                        help="Number of prompts to process.")
-    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--hf-max-batch-size",
                        type=int,
                        default=None,
                        help="Maximum batch size for HF backend.")
-    parser.add_argument('--trust-remote-code',
-                        action='store_true',
-                        help='trust remote code from huggingface')
-    parser.add_argument(
-        '--max-model-len',
-        type=int,
-        default=None,
-        help='Maximum length of a sequence (including prompt and output). '
-        'If None, will be derived from the model.')
-    parser.add_argument(
-        '--dtype',
-        type=str,
-        default='auto',
-        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
-        help='data type for model weights and activations. '
-        'The "auto" option will use FP16 precision '
-        'for FP32 and FP16 models, and BF16 precision '
-        'for BF16 models.')
-    parser.add_argument('--gpu-memory-utilization',
-                        type=float,
-                        default=0.9,
-                        help='the fraction of GPU memory to be used for '
-                        'the model executor, which can range from 0 to 1.'
-                        'If unspecified, will use the default value of 0.9.')
-    parser.add_argument("--enforce-eager",
-                        action="store_true",
-                        help="enforce eager execution")
-    parser.add_argument(
-        '--kv-cache-dtype',
-        type=str,
-        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
-        default="auto",
-        help='Data type for kv cache storage. If "auto", will use model '
-        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
-        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
-    parser.add_argument(
-        '--quantization-param-path',
-        type=str,
-        default=None,
-        help='Path to the JSON file containing the KV cache scaling factors. '
-        'This should generally be supplied, when KV cache dtype is FP8. '
-        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
-        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
-        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
-        'instead supported for common inference criteria.')
-    parser.add_argument("--device",
-                        type=str,
-                        default="auto",
-                        choices=DEVICE_OPTIONS,
-                        help='device type for vLLM execution')
-    parser.add_argument(
-        "--num-scheduler-steps",
-        type=int,
-        default=1,
-        help="Maximum number of forward steps per scheduler call.")
-    parser.add_argument("--use-v2-block-manager",
-                        action='store_true',
-                        help="Enable block manager v2.")
-    parser.add_argument(
-        "--enable-prefix-caching",
-        action='store_true',
-        help="Enable automatic prefix caching for vLLM backend.")
-    parser.add_argument("--enable-chunked-prefill",
-                        action='store_true',
-                        help="enable chunked prefill for vLLM backend.")
-    parser.add_argument('--max-num-batched-tokens',
-                        type=int,
-                        default=None,
-                        help='maximum number of batched tokens per '
-                        'iteration')
-    parser.add_argument('--download-dir',
-                        type=str,
-                        default=None,
-                        help='directory to download and load the weights, '
-                        'default to the default cache dir of huggingface')
    parser.add_argument(
        '--output-json',
        type=str,
        default=None,
        help='Path to save the throughput results in JSON format.')
-    parser.add_argument(
-        '--distributed-executor-backend',
-        choices=['ray', 'mp'],
-        default=None,
-        help='Backend to use for distributed serving. When more than 1 GPU '
-        'is used, will be automatically set to "ray" if installed '
-        'or "mp" (multiprocessing) otherwise.')
-    parser.add_argument(
-        '--load-format',
-        type=str,
-        default=EngineArgs.load_format,
-        choices=[
-            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
-            'bitsandbytes'
-        ],
-        help='The format of the model weights to load.\n\n'
-        '* "auto" will try to load the weights in the safetensors format '
-        'and fall back to the pytorch bin format if safetensors format '
-        'is not available.\n'
-        '* "pt" will load the weights in the pytorch bin format.\n'
-        '* "safetensors" will load the weights in the safetensors format.\n'
-        '* "npcache" will load the weights in pytorch format and store '
-        'a numpy cache to speed up the loading.\n'
-        '* "dummy" will initialize the weights with random values, '
-        'which is mainly for profiling.\n'
-        '* "tensorizer" will load the weights using tensorizer from '
-        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
-        'section for more information.\n'
-        '* "bitsandbytes" will load the weights using bitsandbytes '
-        'quantization.\n')
-    parser.add_argument(
-        "--disable-async-output-proc",
-        action='store_true',
-        default=False,
-        help="Disable async output processor for vLLM backend.")
    parser.add_argument("--async-engine",
                        action='store_true',
                        default=False,
@ -544,6 +422,7 @@ if __name__ == "__main__":
                        action='store_true',
                        default=False,
                        help="Disable decoupled async engine frontend.")
+    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
@ -566,8 +445,6 @@ if __name__ == "__main__":
            raise ValueError("dtype must be auto for MII backend.")
        if args.n != 1:
            raise ValueError("n must be 1 for MII backend.")
-        if args.use_beam_search:
-            raise ValueError("Beam search is not supported for MII backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
        if args.hf_max_batch_size is not None:
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@ -0,0 +1,144 @@
+#!/bin/bash
+
+# benchmark the overhead of disaggregated prefill.
+# methodology:
+# - send all request to prefill vLLM instance. It will buffer KV cache.
+# - then send all request to decode instance. 
+# - The TTFT of decode instance is the overhead.
+
+set -ex
+
+kill_gpu_processes() {
+  # kill all processes on GPU.
+  pkill -f pt_main_thread
+  sleep 10
+
+  # remove vllm config file
+  rm -rf ~/.config/vllm
+
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  local port=$1
+  timeout 1200 bash -c "
+    until curl -s localhost:${port}/v1/completions > /dev/null; do
+      sleep 1
+    done" && return 0 || return 1
+}
+
+
+benchmark() {
+
+  export VLLM_LOGGING_LEVEL=DEBUG
+  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+
+  # compare chunked prefill with disaggregated prefill
+
+  results_folder="./results"
+  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
+  dataset_name="sonnet"
+  dataset_path="../sonnet_4x.txt"
+  num_prompts=10
+  qps=$1
+  prefix_len=50
+  input_len=2048
+  output_len=$2
+
+
+  CUDA_VISIBLE_DEVICES=0 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --port 8100 \
+    --max-model-len 10000 \
+    --gpu-memory-utilization 0.6 \
+    --kv-transfer-config \
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    
+
+  CUDA_VISIBLE_DEVICES=1 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --port 8200 \
+    --max-model-len 10000 \
+    --gpu-memory-utilization 0.6 \
+    --kv-transfer-config \
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+
+  wait_for_server 8100
+  wait_for_server 8200
+
+  # let the prefill instance finish prefill
+  python3 ../benchmark_serving.py \
+          --backend vllm \
+          --model $model \
+          --dataset-name $dataset_name \
+          --dataset-path $dataset_path \
+          --sonnet-input-len $input_len \
+          --sonnet-output-len "$output_len" \
+          --sonnet-prefix-len $prefix_len \
+          --num-prompts $num_prompts \
+          --port 8100 \
+          --save-result \
+          --result-dir $results_folder \
+          --result-filename disagg_prefill_2xtp4.json \
+          --request-rate "inf"
+
+
+  # send the request to decode.
+  # The TTFT of this command will be the overhead of disagg prefill impl.
+  python3 ../benchmark_serving.py \
+          --backend vllm \
+          --model $model \
+          --dataset-name $dataset_name \
+          --dataset-path $dataset_path \
+          --sonnet-input-len $input_len \
+          --sonnet-output-len "$output_len" \
+          --sonnet-prefix-len $prefix_len \
+          --num-prompts $num_prompts \
+          --port 8200 \
+          --save-result \
+          --result-dir $results_folder \
+          --result-filename disagg_prefill_2xtp4.json \
+          --request-rate "$qps"
+  kill_gpu_processes
+
+}
+
+
+main() {
+
+  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+  (which jq) || (apt-get -y install jq)
+  (which socat) || (apt-get -y install socat)
+
+  pip install quart httpx
+
+  cd "$(dirname "$0")"
+
+  cd ..
+  # create sonnet-4x.txt
+  echo "" > sonnet_4x.txt
+  for _ in {1..4}
+  do
+    cat sonnet.txt >> sonnet_4x.txt
+  done
+  cd disagg_benchmarks
+
+  rm -rf results
+  mkdir results
+
+  default_qps=1
+  default_output_len=1
+  benchmark $default_qps $default_output_len
+
+}
+
+
+main "$@"
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@ -0,0 +1,164 @@
+#!/bin/bash
+
+# Requirement: 8x H100 GPUs.
+
+
+# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV 
+# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests
+# Resource: 8x H100
+# Approaches:
+# 1. Chunked prefill: 1 vllm instance with tp=8
+# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
+# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
+# Prefilling instance: max_output_token=1
+# Decoding instance: force the input tokens be the same across requests to bypass prefilling
+
+set -ex
+
+kill_gpu_processes() {
+  # kill all processes on GPU.
+  pgrep pt_main_thread | xargs -r kill -9
+  pgrep python3 | xargs -r kill -9
+  for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
+  sleep 1
+}
+
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  local port=$1
+  timeout 1200 bash -c "
+    until curl -s localhost:${port}/v1/completions > /dev/null; do
+      sleep 1
+    done" && return 0 || return 1
+}
+
+
+launch_chunked_prefill() {
+  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
+  # disagg prefill
+  CUDA_VISIBLE_DEVICES=0 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
+    --port 8100 \
+    --max-model-len 10000 \
+    --enable-chunked-prefill \
+    --gpu-memory-utilization 0.6 &
+  CUDA_VISIBLE_DEVICES=1 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
+    --port 8200 \
+    --max-model-len 10000 \
+    --enable-chunked-prefill \
+    --gpu-memory-utilization 0.6 &
+  wait_for_server 8100
+  wait_for_server 8200
+  python3 round_robin_proxy.py &
+  sleep 1
+}
+
+
+launch_disagg_prefill() {
+  model="meta-llama/Meta-Llama-3.1-8B-Instruct" 
+  # disagg prefill
+  CUDA_VISIBLE_DEVICES=0 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
+    --port 8100 \
+    --max-model-len 10000 \
+    --gpu-memory-utilization 0.6 \
+    --kv-transfer-config \
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+
+  CUDA_VISIBLE_DEVICES=1 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
+    --port 8200 \
+    --max-model-len 10000 \
+    --gpu-memory-utilization 0.6 \
+    --kv-transfer-config \
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+
+  wait_for_server 8100
+  wait_for_server 8200
+  python3 disagg_prefill_proxy_server.py &
+  sleep 1
+}
+
+
+benchmark() {
+  results_folder="./results"
+  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
+  dataset_name="sonnet"
+  dataset_path="../sonnet_4x.txt"
+  num_prompts=100
+  qps=$1
+  prefix_len=50
+  input_len=1024
+  output_len=$2
+  tag=$3
+
+  python3 ../benchmark_serving.py \
+          --backend vllm \
+          --model $model \
+          --dataset-name $dataset_name \
+          --dataset-path $dataset_path \
+          --sonnet-input-len $input_len \
+          --sonnet-output-len "$output_len" \
+          --sonnet-prefix-len $prefix_len \
+          --num-prompts $num_prompts \
+          --port 8000 \
+          --save-result \
+          --result-dir $results_folder \
+          --result-filename "$tag"-qps-"$qps".json \
+          --request-rate "$qps"
+
+  sleep 2
+
+}
+
+
+main() {
+
+  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+  (which jq) || (apt-get -y install jq)
+  (which socat) || (apt-get -y install socat)
+
+  pip install quart httpx matplotlib aiohttp
+
+  cd "$(dirname "$0")"
+
+  cd ..
+  # create sonnet-4x.txt so that we can sample 2048 tokens for input
+  echo "" > sonnet_4x.txt
+  for _ in {1..4}
+  do
+    cat sonnet.txt >> sonnet_4x.txt
+  done
+  cd disagg_benchmarks
+
+  rm -rf results
+  mkdir results
+
+  default_output_len=6
+
+  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+
+  launch_chunked_prefill
+  for qps in 2 4 6 8; do
+  benchmark $qps $default_output_len chunked_prefill
+  done
+  kill_gpu_processes
+
+  launch_disagg_prefill
+  for qps in 2 4 6 8; do
+  benchmark $qps $default_output_len disagg_prefill
+  done
+  kill_gpu_processes
+
+  python3 visualize_benchmark_results.py
+
+}
+
+
+main "$@"
--- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
+++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
@ -0,0 +1,61 @@
+import os
+
+import aiohttp
+from quart import Quart, make_response, request
+
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+
+app = Quart(__name__)
+
+
+async def forward_request(url, data):
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
+        }
+        async with session.post(url=url, json=data,
+                                headers=headers) as response:
+            if response.status == 200:
+                # if response.headers.get('Transfer-Encoding') == 'chunked':
+                if True:
+                    async for chunk_bytes in response.content.iter_chunked(
+                            1024):
+                        yield chunk_bytes
+                else:
+                    content = await response.read()
+                    yield content
+
+
+@app.route('/v1/completions', methods=['POST'])
+async def handle_request():
+    try:
+        original_request_data = await request.get_json()
+
+        prefill_request = original_request_data.copy()
+        # change max_tokens = 1 to let it only do prefill
+        prefill_request['max_tokens'] = 1
+
+        # finish prefill
+        async for _ in forward_request('http://localhost:8100/v1/completions',
+                                       prefill_request):
+            continue
+
+        # return decode
+        generator = forward_request('http://localhost:8200/v1/completions',
+                                    original_request_data)
+        response = await make_response(generator)
+        response.timeout = None
+
+        return response
+
+    except Exception as e:
+        import sys
+        import traceback
+        exc_info = sys.exc_info()
+        print("Error occurred in disagg prefill proxy server")
+        print(e)
+        print("".join(traceback.format_exception(*exc_info)))
+
+
+if __name__ == '__main__':
+    app.run(port=8000)
--- a/benchmarks/disagg_benchmarks/round_robin_proxy.py
+++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py
@ -0,0 +1,60 @@
+import asyncio
+import itertools
+
+import aiohttp
+from aiohttp import web
+
+
+class RoundRobinProxy:
+
+    def __init__(self, target_ports):
+        self.target_ports = target_ports
+        self.port_cycle = itertools.cycle(self.target_ports)
+
+    async def handle_request(self, request):
+        target_port = next(self.port_cycle)
+        target_url = f"http://localhost:{target_port}{request.path_qs}"
+
+        async with aiohttp.ClientSession() as session:
+            try:
+                # Forward the request
+                async with session.request(
+                        method=request.method,
+                        url=target_url,
+                        headers=request.headers,
+                        data=request.content,
+                ) as response:
+                    # Start sending the response
+                    resp = web.StreamResponse(status=response.status,
+                                              headers=response.headers)
+                    await resp.prepare(request)
+
+                    # Stream the response content
+                    async for chunk in response.content.iter_any():
+                        await resp.write(chunk)
+
+                    await resp.write_eof()
+                    return resp
+
+            except Exception as e:
+                return web.Response(text=f"Error: {str(e)}", status=500)
+
+
+async def main():
+    proxy = RoundRobinProxy([8100, 8200])
+    app = web.Application()
+    app.router.add_route('*', '/{path:.*}', proxy.handle_request)
+
+    runner = web.AppRunner(app)
+    await runner.setup()
+    site = web.TCPSite(runner, 'localhost', 8000)
+    await site.start()
+
+    print("Proxy server started on http://localhost:8000")
+
+    # Keep the server running
+    await asyncio.Event().wait()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
--- a/Show More
+++ b/Show More