update

Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Remove global CUDA graph pool
2025-11-12 00:54:47 +08:00 · 2025-09-19 23:30:15 +00:00 · 2025-09-19 23:27:51 +00:00
2229 changed files with 162375 additions and 232753 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@ -5,11 +5,11 @@ import os
 import sys
 import zipfile

-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))


 def print_top_10_largest_files(zip_file):
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
-model_name: "HandH1998/QQQ-Llama-3-8b-g128"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.419
-  - name: "exact_match,flexible-extract"
-    value: 0.416
-limit: 1000
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
@ -1,12 +0,0 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
-model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-backend: "vllm-vlm"
-tasks:
- name: "chartqa"
-  metrics:
-  - name: "relaxed_accuracy,none"
-    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
-    value: 0.80
-limit: 100
-num_fewshot: 0
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@ -1,10 +0,0 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
-model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-tasks:
- name: "mmlu_pro"
-  metrics:
-  - name: "exact_match,custom-extract"
-    value: 0.80
-limit: 250 # will run on 250 * 14 subjects = 3500 samples
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@ -1,5 +1,4 @@
-# For vllm script, with -t option (tensor parallel size)
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
-
-model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
-backend: "vllm-vlm"
-tasks:
- name: "chartqa"
-  metrics:
-  - name: "relaxed_accuracy,none"
-    value: 0.855
-limit: 2500
-num_fewshot: 0
--- a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
@ -1,14 +0,0 @@
-model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
-tasks:
-  - name: "mmlu_pro"
-    metrics:
-      - name: "exact_match,custom-extract"
-        value: 0.82
-limit: 250 # will run on 250 * 14 subjects = 3500 samples
-num_fewshot: 5
-enforce_eager: false # we use false to speed up the eval process
-kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
-max_model_len: 40960
-apply_chat_template: true
-fewshot_as_multiturn: true
-gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
--- a/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
@ -1 +0,0 @@
-Qwen3-235B-A22B-Instruct-2507-FP8.yaml
--- a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
+++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
@ -1 +0,0 @@
-Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
--- a/.buildkite/lm-eval-harness/configs/models-mm-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt
@ -1 +0,0 @@
-Qwen2.5-VL-7B-Instruct.yaml
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@ -1,44 +0,0 @@
-#!/bin/bash
-# We can use this script to compute baseline accuracy on chartqa for vllm.
-#
-# Make sure you have lm-eval-harness installed:
-#   pip install lm-eval==0.4.9
-
-usage() {
-    echo``
-    echo "Runs lm eval harness on ChartQA using multimodal vllm."
-    echo "This pathway is intended to be used to create baselines for "
-    echo "our correctness tests in vllm's CI."
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo "  -m    - huggingface stub or local directory of the model"
-    echo "  -l    - limit number of samples to run"
-    echo "  -t    - tensor parallel size to run at"
-    echo
-}
-
-while getopts "m:l:t:" OPT; do
-  case ${OPT} in
-    m ) 
-        MODEL="$OPTARG"
-        ;;
-    l ) 
-        LIMIT="$OPTARG"
-        ;;
-    t ) 
-        TP_SIZE="$OPTARG"
-        ;;
-    \? ) 
-        usage
-        exit 1
-        ;;
-  esac
-done
-
-lm_eval --model vllm-vlm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
-  --tasks chartqa \
-  --batch_size auto \
-  --apply_chat_template \
-  --limit $LIMIT
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@ -1,50 +0,0 @@
-#!/bin/bash
-# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
-# We use this for fp8, which HF does not support.
-#
-# Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
-
-usage() {
-    echo``
-    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
-    echo "This pathway is intended to be used to create baselines for "
-    echo "our automated nm-test-accuracy workflow"
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo "  -m    - huggingface stub or local directory of the model"
-    echo "  -l    - limit number of samples to run"
-    echo "  -f    - number of fewshot samples to use"
-    echo "  -t    - tensor parallel size to run at"
-    echo
-}
-
-while getopts "m:b:l:f:t:" OPT; do
-  case ${OPT} in
-    m )
-        MODEL="$OPTARG"
-        ;;
-    b )
-        BATCH_SIZE="$OPTARG"
-        ;;
-    l )
-        LIMIT="$OPTARG"
-        ;;
-    f )
-        FEWSHOT="$OPTARG"
-        ;;
-    t )
-        TP_SIZE="$OPTARG"
-        ;;
-    \? )
-        usage
-        exit 1
-        ;;
-  esac
-done
-
-lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
-  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
-  --batch_size auto
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -19,35 +19,21 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)
-    batch_size = eval_config.get("batch_size", "auto")
-    backend = eval_config.get("backend", "vllm")
-    enforce_eager = eval_config.get("enforce_eager", "true")
-    kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
-        f"enforce_eager={enforce_eager},"
-        f"kv_cache_dtype={kv_cache_dtype},"
+        f"enforce_eager=true,"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len},"
+        f"max_model_len={max_model_len}"
    )
    results = lm_eval.simple_evaluate(
-        model=backend,
+        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
-        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
-        # text models. however, this is regressing measured strict-match for
-        # existing text models in CI, so only apply it for mm, or explicitly set
-        apply_chat_template=eval_config.get(
-            "apply_chat_template", backend == "vllm-vlm"
-        ),
-        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
-        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
-        gen_kwargs=eval_config.get("gen_kwargs"),
-        batch_size=batch_size,
+        batch_size="auto",
    )
    return results

--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@ -2,23 +2,40 @@

 ## Introduction

-This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance.
-vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under PyTorch CI HUD.
+This directory contains two sets of benchmark for vllm.
+
+- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
+- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
+
+See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.

 ## Performance benchmark quick overview

-**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.

 **Benchmarking Duration**: about 1hr.

 **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.

+## Nightly benchmark quick overview
+
+**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
+
+**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
+
+**Benchmarking Duration**: about 3.5hrs.
+
 ## Trigger the benchmark

-The benchmark needs to be triggered manually:
+Performance benchmark will be triggered when:
+
+- A PR being merged into vllm.
+- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
+
+Manually Trigger the benchmark

 ```bash
-bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
 ```

 Runtime environment variables:
@ -30,6 +47,10 @@ Runtime environment variables:
 - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
 - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.

+Nightly benchmark will be triggered when:
+
+- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
+
 ## Performance benchmark details

 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
@ -131,3 +152,26 @@ Here is an example using the script to compare result_a and result_b with Model,
 A comparison diagram will be generated below the table.
 Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
 <img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
+
+## Nightly test details
+
+See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
+
+### Workflow
+
+- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
+- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
+- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
+- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
+
+### Nightly tests
+
+In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
+
+### Docker containers
+
+The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
+
+WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
+
+WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@ -0,0 +1,184 @@
+steps:
+  - label: "Wait for container to be ready"
+    key: wait-for-container-image
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          containers:
+          - image: badouralix/curl-jq
+            command:
+            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+  - label: "Cleanup H100"
+    agents:
+      queue: H100
+    depends_on: ~
+    command: docker system prune -a --volumes --force
+  
+  - label: "A100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: A100
+    depends_on: wait-for-container-image
+    if: build.branch == "main"
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+            command:
+            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+
+  - label: "H200"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H200
+    depends_on: wait-for-container-image
+    if: build.branch == "main"
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: 4,5,6,7
+        volumes:
+          - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
+
+  #- block: "Run H100 Benchmark"
+    #key: block-h100
+    #depends_on: ~
+
+  - label: "H100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H100
+    depends_on: wait-for-container-image
+    if: build.branch == "main"
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
+        volumes:
+          - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
+
+  # Premerge benchmark
+  - label: "A100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: A100
+    depends_on: wait-for-container-image
+    if: build.branch != "main"
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+            command:
+            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+
+  - label: "H200"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H200
+    depends_on: wait-for-container-image
+    if: build.branch != "main"
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: 4,5,6,7
+        volumes:
+          - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
+
+  #- block: "Run H100 Benchmark"
+    #key: block-h100
+    #depends_on: ~
+
+  - label: "H100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H100
+    depends_on: wait-for-container-image
+    if: build.branch != "main"
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
+        volumes:
+          - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
--- a/.buildkite/nightly-benchmarks/nightly-annotation.md
+++ b/.buildkite/nightly-benchmarks/nightly-annotation.md
@ -0,0 +1,28 @@
+# Nightly benchmark annotation
+
+## Description
+
+This file contains the downloading link for benchmarking results.
+
+- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
+- [benchmarking results](artifact://results.zip)
+- [benchmarking code](artifact://nightly-benchmarks.zip)
+
+Please download the visualization scripts in the post
+
+## Results reproduction
+
+- Find the docker we use in `benchmarking pipeline`
+- Deploy the docker, and inside the docker:
+    - Download `nightly-benchmarks.zip`.
+    - In the same folder, run the following code:
+
+    ```bash
+    export HF_TOKEN=<your HF token>
+    apt update
+    apt install -y git
+    unzip nightly-benchmarks.zip
+    VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+    ```
+
+And the results will be inside `./benchmarks/results`.
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@ -0,0 +1,39 @@
+
+# Nightly benchmark
+
+This benchmark aims to:
+
+- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
+- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
+
+Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
+
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+
+## Setup
+
+- Docker images:
+    - vLLM: `vllm/vllm-openai:v0.6.2`
+    - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
+    - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
+    - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
+        - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
+    - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
+- Hardware
+    - 8x Nvidia A100 GPUs
+- Workload:
+    - Dataset
+        - ShareGPT dataset
+        - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
+        - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
+        - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
+    - Models: llama-3 8B, llama-3 70B.
+        - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
+    - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
+        - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
+    - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
+
+## Known issues
+
+- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
+- TGI does not support `ignore-eos` flag.
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@ -0,0 +1,196 @@
+common_pod_spec: &common_pod_spec
+  priorityClassName: perf-benchmark
+  nodeSelector:
+    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  volumes:
+    - name: devshm
+      emptyDir:
+        medium: Memory
+    - name: hf-cache
+      hostPath:
+        path: /root/.cache/huggingface
+        type: Directory
+
+common_container_settings: &common_container_settings
+  command:
+    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+  resources:
+    limits:
+      nvidia.com/gpu: 8
+  volumeMounts:
+    - name: devshm
+      mountPath: /dev/shm
+    - name: hf-cache
+      mountPath: /root/.cache/huggingface
+  env:
+    - name: VLLM_USAGE_SOURCE
+      value: ci-test
+    - name: HF_HOME
+      value: /root/.cache/huggingface
+    - name: VLLM_SOURCE_CODE_LOC
+      value: /workspace/build/buildkite/vllm/performance-benchmark
+    - name: HF_TOKEN
+      valueFrom:
+        secretKeyRef:
+          name: hf-token-secret
+          key: token
+
+steps:
+  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
+
+
+
+  - label: "A100 vllm step 10"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: vllm/vllm-openai:v0.6.2
+                <<: *common_container_settings
+
+
+
+  - label: "A100 sglang benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: lmsysorg/sglang:v0.3.2-cu121
+                <<: *common_container_settings
+
+  - label: "A100 lmdeploy benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: openmmlab/lmdeploy:v0.6.1-cu12
+                <<: *common_container_settings
+
+
+
+
+  - label: "A100 trt llama-8B"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+                <<: *common_container_settings
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                  - name: TEST_SELECTOR
+                    value: "llama8B"
+
+
+  - label: "A100 trt llama-70B"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+                <<: *common_container_settings
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                  - name: TEST_SELECTOR
+                    value: "llama70B"
+
+
+  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image 
+  # - label: "A100 trt benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           <<: *common_pod_spec
+  #           containers:
+  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+  #               <<: *common_container_settings
+
+
+  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
+  # - label: "A100 tgi benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           <<: *common_pod_spec
+  #           containers:
+  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
+  #               <<: *common_container_settings
+        
+  - wait
+
+  - label: "Collect the results"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+            - image: vllm/vllm-openai:v0.5.0.post1
+              command:
+              - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+              resources:
+                limits:
+                  nvidia.com/gpu: 8
+              volumeMounts:
+              - name: devshm
+                mountPath: /dev/shm
+              env:
+              - name: VLLM_USAGE_SOURCE
+                value: ci-test
+              - name: VLLM_SOURCE_CODE_LOC
+                value: /workspace/build/buildkite/vllm/performance-benchmark
+              - name: HF_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-token-secret
+                    key: token
+
+  - block: ":rocket: check the results!"
--- a/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@ -7,7 +7,6 @@ from importlib import util

 import pandas as pd

-pd.options.display.float_format = "{:.2f}".format
 plotly_found = util.find_spec("plotly.express") is not None


@ -110,10 +109,7 @@ def compare_data_columns(
        if len(compare_frames) >= 2:
            base = compare_frames[0]
            current = compare_frames[-1]
-            if "P99" in data_column or "Median" in data_column:
-                ratio = base / current  # for latency
-            else:
-                ratio = current / base
+            ratio = current / base
            ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
            frames.append(ratio)
@ -203,71 +199,6 @@ def split_json_by_tp_pp(
    return saved_paths


-def _add_limit_line(fig, y_value, label):
-    # Visible dashed line + annotation
-    fig.add_hline(
-        y=y_value,
-        line_dash="dash",
-        line_color="red" if "ttft" in label.lower() else "blue",
-        annotation_text=f"{label}: {y_value} ms",
-        annotation_position="top left",
-    )
-    # Optional: add a legend item (as a transparent helper trace)
-    if plot and plotly_found:
-        import plotly.graph_objects as go
-
-        fig.add_trace(
-            go.Scatter(
-                x=[None],
-                y=[None],
-                mode="lines",
-                line=dict(
-                    dash="dash", color="red" if "ttft" in label.lower() else "blue"
-                ),
-                name=f"{label}",
-            )
-        )
-
-
-def _find_concurrency_col(df: pd.DataFrame) -> str:
-    for c in [
-        "# of max concurrency.",
-        "# of max concurrency",
-        "Max Concurrency",
-        "max_concurrency",
-        "Concurrency",
-    ]:
-        if c in df.columns:
-            return c
-    # Fallback: guess an integer-like column (harmless if unused)
-    for c in df.columns:
-        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
-            return c
-    return "# of max concurrency."
-
-
-def _highlight_threshold(
-    df: pd.DataFrame, threshold: float
-) -> "pd.io.formats.style.Styler":
-    """Highlight numeric per-configuration columns with value <= threshold."""
-    conc_col = _find_concurrency_col(df)
-    key_cols = [
-        c
-        for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
-        if c in df.columns
-    ]
-    conf_cols = [
-        c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
-    ]
-    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
-    return df.style.map(
-        lambda v: "background-color:#e6ffe6;font-weight:bold;"
-        if pd.notna(v) and v <= threshold
-        else "",
-        subset=conf_cols,
-    )
-
-
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
@ -289,26 +220,6 @@ if __name__ == "__main__":
        default="# of max concurrency.",
        help="column name to use as X Axis in comparison graph",
    )
-    parser.add_argument(
-        "-l",
-        "--latency",
-        type=str,
-        default="p99",
-        help="take median|p99 for latency like TTFT/TPOT",
-    )
-    parser.add_argument(
-        "--ttft-max-ms",
-        type=float,
-        default=3000.0,
-        help="Reference limit for TTFT plots (ms)",
-    )
-    parser.add_argument(
-        "--tpot-max-ms",
-        type=float,
-        default=100.0,
-        help="Reference limit for TPOT plots (ms)",
-    )
-
    args = parser.parse_args()

    drop_column = "P99"
@ -323,22 +234,12 @@ if __name__ == "__main__":
        "# of max concurrency.",
        "qps",
    ]
-
-    if "median" in args.latency:
-        data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
-        html_msgs_for_data_cols = [
-            "Compare Output Tokens /n",
-            "Median TTFT /n",
-            "Median TPOT /n",
-        ]
-        drop_column = "P99"
-    elif "p99" in args.latency:
-        data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
-        html_msgs_for_data_cols = [
-            "Compare Output Tokens /n",
-            "P99 TTFT /n",
-            "P99 TPOT /n",
-        ]
+    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
+    html_msgs_for_data_cols = [
+        "Compare Output Tokens /n",
+        "Median TTFT /n",
+        "Median TPOT /n",
+    ]

    if len(args.file) == 1:
        files = split_json_by_tp_pp(args.file[0], output_root="splits")
@ -374,83 +275,33 @@ if __name__ == "__main__":
                    f"Expected subset: {filtered_info_cols}, "
                    f"but DataFrame has: {list(output_df.columns)}"
                )
-            # output_df_sorted = output_df.sort_values(by=existing_group_cols)
-            output_df_sorted = output_df.sort_values(by=args.xaxis)
+            output_df_sorted = output_df.sort_values(by=existing_group_cols)
            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
            for name, group in output_groups:
-                group_name = (
-                    ",".join(map(str, name)).replace(",", "_").replace("/", "-")
-                )
-                group_html_name = "perf_comparison_" + group_name + ".html"
-
-                metric_name = str(data_cols_to_compare[i]).lower()
-                if "tok/s" in metric_name:
-                    html = group.to_html()
-                elif "ttft" in metric_name:
-                    styler = _highlight_threshold(group, args.ttft_max_ms).format(
-                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
-                        na_rep="—",
-                    )
-                    html = styler.to_html(
-                        table_attributes='border="1" class="dataframe"'
-                    )
-                elif (
-                    "tpot" in metric_name
-                    or "median" in metric_name
-                    or "p99" in metric_name
-                ):
-                    styler = _highlight_threshold(group, args.tpot_max_ms).format(
-                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
-                        na_rep="—",
-                    )
-                    html = styler.to_html(
-                        table_attributes='border="1" class="dataframe"'
-                    )
-
+                html = group.to_html()
                text_file.write(html_msgs_for_data_cols[i])
                text_file.write(html)
-                with open(group_html_name, "a+") as sub_text_file:
-                    sub_text_file.write(html_msgs_for_data_cols[i])
-                    sub_text_file.write(html)

-                    if plot and plotly_found:
-                        import plotly.express as px
+                if plot and plotly_found:
+                    import plotly.express as px

-                        df = group[raw_data_cols]
-                        df_sorted = df.sort_values(by=info_cols[y_axis_index])
-                        # Melt DataFrame for plotting
-                        df_melted = df_sorted.melt(
-                            id_vars=info_cols[y_axis_index],
-                            var_name="Configuration",
-                            value_name=data_cols_to_compare[i],
-                        )
-                        title = (
-                            data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
-                        )
-                        # Create Plotly line chart
-                        fig = px.line(
-                            df_melted,
-                            x=info_cols[y_axis_index],
-                            y=data_cols_to_compare[i],
-                            color="Configuration",
-                            title=title,
-                            markers=True,
-                        )
-
-                        # ---- Add threshold lines based on metric name ----
-                        if "ttft" in metric_name:
-                            _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
-                        elif (
-                            "tpot" in metric_name
-                            or "median" in metric_name
-                            or "p99" in metric_name
-                        ):
-                            _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
-
-                        # Export to HTML
-                        text_file.write(
-                            fig.to_html(full_html=True, include_plotlyjs="cdn")
-                        )
-                        sub_text_file.write(
-                            fig.to_html(full_html=True, include_plotlyjs="cdn")
-                        )
+                    df = group[raw_data_cols]
+                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
+                    # Melt DataFrame for plotting
+                    df_melted = df_sorted.melt(
+                        id_vars=info_cols[y_axis_index],
+                        var_name="Configuration",
+                        value_name=data_cols_to_compare[i],
+                    )
+                    title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
+                    # Create Plotly line chart
+                    fig = px.line(
+                        df_melted,
+                        x=info_cols[y_axis_index],
+                        y=data_cols_to_compare[i],
+                        color="Configuration",
+                        title=title,
+                        markers=True,
+                    )
+                    # Export to HTML
+                    text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
--- a/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
@ -63,11 +63,9 @@ serving_column_mapping = {
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
-    "std_ttft_ms": "STD TTFT (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "median_tpot_ms": "Median",
    "p99_tpot_ms": "P99",
-    "std_tpot_ms": "STD TPOT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",
@ -370,7 +368,7 @@ if __name__ == "__main__":
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
-            lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
        )

    # get markdown tables
@ -392,7 +390,7 @@ if __name__ == "__main__":
    json_file = "benchmark_results.json"
    with open(results_folder / md_file, "w") as f:
        results = read_markdown(
-            "../.buildkite/performance-benchmarks/"
+            "../.buildkite/nightly-benchmarks/"
            + "performance-benchmarks-descriptions.md"
        )
        results = results.format(
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@ -0,0 +1,26 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import argparse
+
+from transformers import AutoTokenizer
+
+
+def main(model, cachedir):
+    # Load the tokenizer and save it to the specified directory
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    tokenizer.save_pretrained(cachedir)
+    print(f"Tokenizer saved to {cachedir}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Download and save Hugging Face tokenizer"
+    )
+    parser.add_argument("--model", type=str, required=True, help="Name of the model")
+    parser.add_argument(
+        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
+    )
+
+    args = parser.parse_args()
+    main(args.model, args.cachedir)
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import argparse
+import json
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from tabulate import tabulate
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description="Parse command line arguments for summary-nightly-results script."
+    )
+    parser.add_argument(
+        "--results-folder",
+        type=str,
+        required=True,
+        help="The folder where the results are stored.",
+    )
+    parser.add_argument(
+        "--description", type=str, required=True, help="Description of the results."
+    )
+
+    args = parser.parse_args()
+    return args
+
+
+def get_perf(df, method, model, metric):
+    means = []
+
+    for qps in [2, 4, 8, 16, "inf"]:
+        target = df["Test name"].str.contains(model)
+        target = target & df["Engine"].str.contains(method)
+        target = target & df["Test name"].str.contains("qps_" + str(qps))
+        filtered_df = df[target]
+
+        if filtered_df.empty:
+            means.append(0.0)
+        else:
+            means.append(filtered_df[metric].values[0])
+
+    return np.array(means)
+
+
+def get_perf_w_std(df, method, model, metric):
+    if metric in ["TTFT", "ITL"]:
+        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
+        mean = mean.tolist()
+        std = get_perf(df, method, model, "Std " + metric + " (ms)")
+        if std.mean() == 0:
+            std = None
+        success = get_perf(df, method, model, "Successful req.")
+        if std is not None:
+            std = std / np.sqrt(success)
+            std = std.tolist()
+
+    else:
+        assert metric == "Tput"
+        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
+            df, method, model, "Output Tput (tok/s)"
+        )
+        mean = mean.tolist()
+        std = None
+
+    return mean, std
+
+
+def main(args):
+    results_folder = Path(args.results_folder)
+
+    results = []
+
+    # collect results
+    for test_file in results_folder.glob("*_nightly_results.json"):
+        with open(test_file) as f:
+            results = results + json.loads(f.read())
+
+    # generate markdown table
+    df = pd.DataFrame.from_dict(results)
+
+    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
+
+    with open(args.description) as f:
+        description = f.read()
+
+    description = description.format(nightly_results_benchmarking_table=md_table)
+
+    with open("nightly_results.md", "w") as f:
+        f.write(description)
+
+
+if __name__ == "__main__":
+    args = parse_arguments()
+    main(args)
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@ -0,0 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from lmdeploy.serve.openai.api_client import APIClient
+
+api_client = APIClient("http://localhost:8000")
+model_name = api_client.available_models[0]
+
+print(model_name)
--- a/.buildkite/performance-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/performance-benchmarks/scripts/launch-server.sh
@ -181,14 +181,18 @@ launch_vllm_server() {
  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
    echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
    model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="vllm serve $model \
+    server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
        -tp $tp \
+        --model $model \
        --port $port \
        $server_args"
  else
    echo "Key 'fp8' does not exist in common params."
-    server_command="vllm serve $model \
+    server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
        -tp $tp \
+        --model $model \
        --port $port \
        $server_args"
  fi
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@ -0,0 +1,78 @@
+#!/bin/bash
+
+set -ex
+set -o pipefail
+
+
+main() {
+
+    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+    (which jq) || (apt-get update && apt-get -y install jq)
+    (which zip) || (apt-get install -y zip)
+
+    if [ ! -f /workspace/buildkite-agent ]; then
+        echo "buildkite-agent binary not found. Skip plotting the results."
+        exit 0
+    fi
+
+    # initial annotation
+    #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
+
+    # download results
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+    mkdir -p results/
+    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
+    ls
+    ls results/
+
+    # upload benchmark results
+    zip -r results.zip results/
+    /workspace/buildkite-agent artifact upload "results.zip"
+
+    # upload benchmarking scripts
+    cd "$VLLM_SOURCE_CODE_LOC/"
+    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
+    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
+
+    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
+    # upload benchmarking pipeline
+    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
+
+    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
+    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
+    
+
+
+    # The figures should be generated by a separate process outside the CI/CD pipeline
+
+    # # generate figures
+    # python3 -m pip install tabulate pandas matplotlib
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
+    #     --description $description \
+    #     --results-folder results/ 
+
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sharegpt
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sonnet_2048_128
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sonnet_128_2048
+    
+    # # upload results and figures
+    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@ -0,0 +1,464 @@
+#!/bin/bash
+
+set -o pipefail
+set -x
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
+  echo "GPU type is $gpu_type"
+}
+
+check_hf_token() {
+  # check if HF_TOKEN is available and valid
+  if [[ -z "$HF_TOKEN" ]]; then
+    echo "Error: HF_TOKEN is not set."
+    exit 1
+  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
+    echo "Error: HF_TOKEN does not start with 'hf_'."
+    exit 1
+  else
+    echo "HF_TOKEN is set and valid."
+  fi
+}
+
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+
+get_current_llm_serving_engine() {
+
+  if which lmdeploy >/dev/null; then
+    echo "Container: lmdeploy"
+    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
+    return
+  fi
+
+  if [ -e /tgi-entrypoint.sh ]; then
+    echo "Container: tgi"
+    export CURRENT_LLM_SERVING_ENGINE=tgi
+    return
+  fi
+
+  if which trtllm-build >/dev/null; then
+    echo "Container: tensorrt-llm"
+    export CURRENT_LLM_SERVING_ENGINE=trt
+    return
+  fi
+
+  if [ -e /sgl-workspace ]; then
+    echo "Container: sglang"
+    export CURRENT_LLM_SERVING_ENGINE=sglang
+    return
+  fi
+
+  if [ -e /vllm-workspace ]; then
+    echo "Container: vllm"
+    # move to a completely irrelevant directory, to avoid import vllm from current folder
+    export CURRENT_LLM_SERVING_ENGINE=vllm
+
+    return
+  fi
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+kill_gpu_processes() {
+  pkill -f '[p]ython'
+  pkill -f '[p]ython3'
+  pkill -f '[t]ritonserver'
+  pkill -f '[p]t_main_thread'
+  pkill -f '[t]ext-generation'
+  pkill -f '[l]mdeploy'
+  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+  pkill -f '[V]LLM'
+
+  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+    sleep 1
+  done
+}
+
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  timeout 1200 bash -c '
+    until curl -s localhost:8000/v1/completions > /dev/null; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+ensure_installed() {
+  # Ensure that the given command is installed by apt-get
+  local cmd=$1
+  if ! which "$cmd" >/dev/null; then
+    apt-get update && apt-get install -y "$cmd"
+  fi
+}
+
+run_serving_tests() {
+  # run serving tests using `vllm bench serve` command
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # prepend the current serving engine to the test name
+    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if [[ $reuse_server == "true" ]]; then
+      echo "Reuse previous server for test case $test_name"
+    else
+      kill_gpu_processes
+      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+        "$server_params" "$common_params"
+    fi
+
+    if wait_for_server; then
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+    else
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+      break
+    fi
+
+    # prepare tokenizer
+    # this is required for lmdeploy.
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+    rm -rf /tokenizer_cache
+    mkdir /tokenizer_cache
+    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+      --model "$model" \
+      --cachedir /tokenizer_cache
+    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+
+
+    # change model name for lmdeploy (it will not follow standard hf name)
+    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
+      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      backend=$CURRENT_LLM_SERVING_ENGINE
+
+      if [[ $backend = "trt" ]]; then
+        backend="tensorrt-llm"
+      fi
+
+      if [[ "$backend" == *"vllm"* ]]; then
+        backend="vllm"
+      fi
+
+      if [[ "$dataset_name" = "sharegpt" ]]; then
+
+        client_command="vllm bench serve \
+          --backend $backend \
+          --tokenizer /tokenizer_cache \
+          --model $model \
+          --dataset-name $dataset_name \
+          --dataset-path $dataset_path \
+          --num-prompts $num_prompts \
+          --port $port \
+          --save-result \
+          --result-dir $RESULTS_FOLDER \
+          --result-filename ${new_test_name}.json \
+          --request-rate $qps \
+          --ignore-eos \
+          $client_args"
+
+      elif [[ "$dataset_name" = "sonnet" ]]; then
+
+        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
+        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
+        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
+
+        client_command="vllm bench serve \
+          --backend $backend \
+          --tokenizer /tokenizer_cache \
+          --model $model \
+          --dataset-name $dataset_name \
+          --dataset-path $dataset_path \
+          --num-prompts $num_prompts \
+          --sonnet-input-len $sonnet_input_len \
+          --sonnet-output-len $sonnet_output_len \
+          --sonnet-prefix-len $sonnet_prefix_len \
+          --port $port \
+          --save-result \
+          --result-dir $RESULTS_FOLDER \
+          --result-filename ${new_test_name}.json \
+          --request-rate $qps \
+          --ignore-eos \
+          $client_args"
+
+      else
+
+        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
+        exit 1
+
+      fi
+
+
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      server_command="None"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+  done
+
+  kill_gpu_processes
+}
+
+run_genai_perf_tests() {
+  # run genai-perf tests
+
+  # $1: a json file specifying genai-perf test cases
+  local genai_perf_test_file
+  genai_perf_test_file=$1
+
+  # Iterate over genai-perf tests
+  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # prepend the current serving engine to the test name
+    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if [[ $reuse_server == "true" ]]; then
+      echo "Reuse previous server for test case $test_name"
+    else
+      kill_gpu_processes
+      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+        "$server_params" "$common_params"
+    fi
+
+    if wait_for_server; then
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+    else
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+      break
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps=$num_prompts
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+      backend=$CURRENT_LLM_SERVING_ENGINE
+
+      if [[ "$backend" == *"vllm"* ]]; then
+        backend="vllm"
+      fi
+      #TODO: add output dir.
+      client_command="genai-perf profile \
+        -m $model \
+        --service-kind openai \
+        --backend "$backend" \
+        --endpoint-type chat \
+        --streaming \
+        --url localhost:$port \
+        --request-rate $qps \
+        --num-prompts $num_prompts \
+      "
+
+    echo "Client command: $client_command"
+
+    eval "$client_command"
+
+    #TODO: process/record outputs
+    done
+  done
+
+  kill_gpu_processes
+
+}
+
+prepare_dataset() {
+
+  # download sharegpt dataset
+  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
+  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+  echo "" > sonnet_4x.txt
+  for _ in {1..4}
+  do
+    cat sonnet.txt >> sonnet_4x.txt
+  done
+
+}
+
+main() {
+
+  # check if the environment variable is successfully injected from yaml
+
+  check_gpus
+  check_hf_token
+  get_current_llm_serving_engine
+
+  pip install -U transformers
+
+  pip install -r requirements/dev.txt
+  which genai-perf
+
+  # check storage
+  df -h
+
+  ensure_installed wget
+  ensure_installed curl
+  ensure_installed jq
+  # genai-perf dependency
+  ensure_installed libb64-0d
+
+  prepare_dataset
+
+  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
+
+  # run the test
+  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
+
+  # run genai-perf tests
+  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
+  mv artifacts/ $RESULTS_FOLDER/
+
+  # upload benchmark results to buildkite
+  python3 -m pip install tabulate pandas
+  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
+  upload_to_buildkite
+
+}
+
+main "$@"
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@ -365,7 +365,8 @@ run_serving_tests() {
      continue
    fi

-    server_command="$server_envs vllm serve \
+    server_command="$server_envs python3 \
+      -m vllm.entrypoints.openai.api_server \
      $server_args"

    # run the server
@ -454,6 +455,11 @@ main() {
  fi
  check_hf_token

+  # Set to v1 to run v1 benchmark
+  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
+    export VLLM_USE_V1=1
+  fi
+
  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)
@ -469,12 +475,7 @@ main() {
  ensure_sharegpt_downloaded
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
-  QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/
-
-  # dump vllm info via vllm collect-env
-  env_output=$(vllm collect-env)
-
-  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
+  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@ -0,0 +1,82 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import datetime
+import json
+import os
+from pathlib import Path
+
+import pandas as pd
+from tabulate import tabulate
+
+results_folder = Path("results/")
+
+# serving results and the keys that will be printed into markdown
+serving_results = []
+serving_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    "completed": "Successful req.",
+    "request_throughput": "Tput (req/s)",
+    "mean_ttft_ms": "Mean TTFT (ms)",
+    "std_ttft_ms": "Std TTFT (ms)",
+    "median_ttft_ms": "Median TTFT (ms)",
+    "mean_itl_ms": "Mean ITL (ms)",
+    "std_itl_ms": "Std ITL (ms)",
+    "median_itl_ms": "Median ITL (ms)",
+    "mean_tpot_ms": "Mean TPOT (ms)",
+    "std_tpot_ms": "Std TPOT (ms)",
+    "median_tpot_ms": "Median TPOT (ms)",
+    "total_token_throughput": "Total Token Tput (tok/s)",
+    "output_throughput": "Output Tput (tok/s)",
+    "total_input_tokens": "Total input tokens",
+    "total_output_tokens": "Total output tokens",
+    "engine": "Engine",
+}
+
+if __name__ == "__main__":
+    # collect results
+    for test_file in results_folder.glob("*.json"):
+        with open(test_file) as f:
+            raw_result = json.loads(f.read())
+
+        # attach the benchmarking command to raw_result
+        with open(test_file.with_suffix(".commands")) as f:
+            command = json.loads(f.read())
+        raw_result.update(command)
+
+        # update the test name of this result
+        raw_result.update({"test_name": test_file.stem})
+
+        # add the result to raw_result
+        serving_results.append(raw_result)
+        continue
+
+    serving_results = pd.DataFrame.from_dict(serving_results)
+
+    if not serving_results.empty:
+        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+            columns=serving_column_mapping
+        )
+
+    serving_md_table_with_headers = tabulate(
+        serving_results, headers="keys", tablefmt="pipe", showindex=False
+    )
+    # remove the first line of header
+    serving_md_table_lines = serving_md_table_with_headers.split("\n")
+    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
+
+    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
+
+    # document benchmarking results in markdown
+    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
+        # document results with header.
+        # for those who wants to reproduce our benchmark.
+        f.write(serving_md_table_with_headers)
+        f.write("\n")
+
+    # document benchmarking results in json
+    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
+        results = serving_results.to_dict(orient="records")
+        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@ -0,0 +1,23 @@
+#!/bin/sh
+TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
+if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
+    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
+else
+    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
+fi
+
+TIMEOUT_SECONDS=10
+
+retries=0
+while [ $retries -lt 1000 ]; do
+    if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
+        exit 0
+    fi
+
+    echo "Waiting for image to be available..."
+
+    retries=$((retries + 1))
+    sleep 5
+done
+
+exit 1
--- a/.buildkite/performance-benchmarks/tests/genai-perf-tests.json
+++ b/.buildkite/performance-benchmarks/tests/genai-perf-tests.json
--- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
@ -0,0 +1,30 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    },
+    {
+        "test_name": "latency_llama8B_tp4",
+        "environment_variables": {
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/latency-tests.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests.json
--- a/.buildkite/performance-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/performance-benchmarks/tests/nightly-tests.json
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
@ -95,38 +95,6 @@
            "num_prompts": 200
        }
    },
-    {
-        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
    {
        "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
        "qps_list": ["inf"],
@ -265,41 +233,6 @@
            "num_prompts": 1000
        }
    },
-    {
-        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
    {
        "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
        "qps_list": ["inf"],
@ -432,38 +365,6 @@
            "num_prompts": 200
        }
    },
-    {
-        "test_name": "serving_llama8B_int8_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
    {
        "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
        "qps_list": ["inf"],
@ -602,41 +503,6 @@
            "num_prompts": 1000
        }
    },
-    {
-        "test_name": "serving_llama8B_int8_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
    {
        "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
        "qps_list": ["inf"],
@ -772,39 +638,6 @@
            "num_prompts": 200
        }
    },
-    {
-        "test_name": "serving_llama8B_int4_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
    {
        "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
        "qps_list": ["inf"],
@ -947,42 +780,6 @@
            "num_prompts": 1000
        }
    },
-    {
-        "test_name": "serving_llama8B_int4_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
    {
        "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
        "qps_list": ["inf"],
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@ -2,7 +2,7 @@
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -28,13 +28,13 @@
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 32
+            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp2_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -60,13 +60,13 @@
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 32
+            "num_prompts": 200
        }
    },
    {
-        "test_name": "serving_llama8B_tp1_random_128_128",
+        "test_name": "serving_llama8B_tp4_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -76,7 +76,39 @@
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp4_random_1024_128",
+        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
@ -92,16 +124,16 @@
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
-	    "random-input-len": 128,
+	    "random-input-len": 1024,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
-            "num_prompts": 32
+            "num_prompts": 100
        }
    },
    {
-        "test_name": "serving_llama8B_tp2_random_128_128",
+        "test_name": "serving_llama8B_pp6_random_1024_128",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -111,7 +143,7 @@
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
+            "pipeline_parallel_size": 6,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
@ -127,150 +159,10 @@
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
-	    "random-input-len": 128,
+	    "random-input-len": 1024,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
-            "num_prompts": 32
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp1_random_128_2048",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 2048,
-	    "ignore-eos": "",
-            "num_prompts": 32
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp2_random_128_2048",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 2048,
-	    "ignore-eos": "",
-            "num_prompts": 32
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp1_random_2048_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 2048,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 32
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp2_random_2048_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 2048,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 32
+            "num_prompts": 100
        }
    }
 ]
--- a/.buildkite/performance-benchmarks/tests/serving-tests.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests.json
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
@ -0,0 +1,32 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "environment_variables": {
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    },
+    {
+        "test_name": "throughput_llama8B_tp4",
+        "environment_variables": {
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/throughput-tests.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests.json
--- a/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
@ -1,26 +0,0 @@
-[
-    {
-        "test_name": "latency_llama8B_tp2",
-        "environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    }
-]
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
@ -1,27 +0,0 @@
-[
-    {
-        "test_name": "throughput_llama8B_tp2",
-        "environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    }
-]
--- a/.buildkite/pyproject.toml
+++ b/.buildkite/pyproject.toml
@ -0,0 +1,46 @@
+# This local pyproject file is part of the migration from yapf to ruff format.
+# It uses the same core rules as the main pyproject.toml file, but with the
+# following differences:
+# - ruff line length is overridden to 88
+# - deprecated typing ignores (UP006, UP035) have been removed
+
+[tool.ruff]
+line-length = 88
+
+[tool.ruff.lint.per-file-ignores]
+"vllm/third_party/**" = ["ALL"]
+"vllm/version.py" = ["F401"]
+"vllm/_version.py" = ["ALL"]
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+    # flake8-logging-format
+    "G",
+]
+ignore = [
+    # star imports
+    "F405", "F403",
+    # lambda expression assignment
+    "E731",
+    # Loop control variable not used within loop body
+    "B007",
+    # f-string format
+    "UP032",
+    # Can remove once 3.10+ is the minimum Python version
+    "UP007",
+]
+
+[tool.ruff.format]
+docstring-code-format = true
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,5 +1,5 @@
 steps:
-  # aarch64 + CUDA builds
+  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
  - label: "Build arm64 wheel - CUDA 12.9"
    depends_on: ~
    id: build-wheel-arm64-cuda-12-9
@ -8,28 +8,13 @@ steps:
    commands:
      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

-  # aarch64 build
-  - label: "Build arm64 CPU wheel"
-    depends_on: ~
-    id: build-wheel-arm64-cpu
-    agents:
-      queue: arm64_cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # x86 + CUDA builds
  - label: "Build wheel - CUDA 12.8"
    depends_on: ~
    id: build-wheel-cuda-12-8
@ -43,33 +28,33 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"

+  - label: "Build wheel - CUDA 12.6"
+    depends_on: ~
+    id: build-wheel-cuda-12-6
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # x86 + CUDA builds
  - label: "Build wheel - CUDA 12.9"
    depends_on: ~
    id: build-wheel-cuda-12-9
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

-  - label: "Build wheel - CUDA 13.0"
-    depends_on: ~
-    id: build-wheel-cuda-13-0
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # Build release images (12.9)
  - label: "Build release image (x86)"
    depends_on: ~
    id: build-release-image-x86
@ -77,12 +62,13 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
      # re-tag to default image tag and push, just in case arm64 build fails
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

+  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
  - label: "Build release image (arm64)"
    depends_on: ~
    id: build-release-image-arm64
@ -90,7 +76,7 @@ steps:
      queue: arm64_cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

  # Add job to create multi-arch manifest
@ -156,22 +142,6 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"

-  - block: "Build arm64 CPU release image"
-    key: block-arm64-cpu-release-image-build
-    depends_on: ~
-
-  - label: "Build and publish arm64 CPU release image"
-    depends_on: block-arm64-cpu-release-image-build
-    agents:
-      queue: arm64_cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
-    env:
-      DOCKER_BUILDKIT: "1"
-
  - label: "Build and publish nightly multi-arch image to DockerHub"
    depends_on:
      - create-multi-arch-manifest
@ -180,16 +150,11 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
-      - "docker push vllm/vllm-openai:nightly-x86_64"
-      - "docker push vllm/vllm-openai:nightly-aarch64"
-      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest push vllm/vllm-openai:nightly"
-      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker push vllm/vllm-openai:nightly"
+      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
      # Clean up old nightly builds (keep only last 14)
      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
    plugins:
@ -198,4 +163,3 @@ steps:
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"
-      DOCKERHUB_USERNAME: "vllmbot"
--- a/.buildkite/scripts/cleanup-nightly-builds.sh
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@ -8,41 +8,20 @@ set -ex
 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"

-# Get DockerHub credentials from environment
+# Get DockerHub token from environment
 if [ -z "$DOCKERHUB_TOKEN" ]; then
    echo "Error: DOCKERHUB_TOKEN environment variable is not set"
    exit 1
 fi

-if [ -z "$DOCKERHUB_USERNAME" ]; then
-    echo "Error: DOCKERHUB_USERNAME environment variable is not set"
-    exit 1
-fi
-
-# Get DockerHub bearer token
-echo "Getting DockerHub bearer token..."
-set +x
-BEARER_TOKEN=$(curl -s -X POST \
-    -H "Content-Type: application/json" \
-    -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
-    "https://hub.docker.com/v2/users/login" | jq -r '.token')
-set -x
-
-if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
-    echo "Error: Failed to get DockerHub bearer token"
-    exit 1
-fi
-
 # Function to get all tags from DockerHub
 get_all_tags() {
    local page=1
    local all_tags=""
    
    while true; do
-        set +x
-        local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
+        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
            "$REPO_API_URL?page=$page&page_size=100")
-        set -x
        
        # Get both last_updated timestamp and tag name, separated by |
        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
@ -64,9 +43,7 @@ delete_tag() {
    echo "Deleting tag: $tag_name"
    
    local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
-    set +x
-    local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
-    set -x
+    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
    
    if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
        echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -86,6 +86,10 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi

+if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
+  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
+fi
+
 if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@ -25,28 +25,25 @@ function cpu_tests() {

  # offline inference
  podman exec -it "$container_id" bash -c "
-    set -xve
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
+    set -e
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run basic model test
  podman exec -it "$container_id" bash -c "
-    set -evx
+    set -e
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator
-
-    # Note: disable Bart until supports V1
-    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
+    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
-    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
+    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
 }

 # All of CPU tests are expected to be finished less than 40 mins.

 export container_id
 export -f cpu_tests
-timeout 120m bash -c cpu_tests
+timeout 40m bash -c cpu_tests

--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -58,8 +58,11 @@ function cpu_tests() {
    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model

-    pytest -x -v -s tests/models/language/generation -m cpu_model
-    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
+    # Note: disable Bart until supports V1
+    pytest -x -v -s tests/models/language/generation -m cpu_model \
+                --ignore=tests/models/language/generation/test_bart.py
+    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
+                --ignore=tests/models/language/generation/test_bart.py

    pytest -x -v -s tests/models/language/pooling -m cpu_model
    pytest -x -v -s tests/models/multimodal/generation \
@ -70,7 +73,7 @@ function cpu_tests() {
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"

  # Note: disable it until supports V1
  # Run AWQ test
--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@ -1,191 +0,0 @@
-#!/bin/bash
-
-# This script build the Ascend NPU docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Base ubuntu image with basic ascend development libraries and python installed
-VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
-CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
-TEST_RUN_CONFIG_FILE="vllm_test.cfg"
-VLLM_ASCEND_TMP_DIR=
-# Get the test run configuration file from the vllm-ascend repository
-fetch_vllm_test_cfg() {
-    VLLM_ASCEND_TMP_DIR=$(mktemp -d)
-    # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
-    cleanup() {
-        rm -rf "${VLLM_ASCEND_TMP_DIR}"
-    }
-    trap cleanup EXIT
-
-    GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
-    if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
-        echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
-        exit 1
-    fi
-
-    # If the file already exists locally, just overwrite it
-    cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
-    echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
-
-    # Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
-    # when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
-    rm -rf "${VLLM_ASCEND_TMP_DIR}"
-    trap - EXIT
-}
-
-# Downloads test run configuration file from a remote URL.
-# Loads the configuration into the current script environment.
-get_config() {
-    if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
-        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
-        exit 1
-    fi
-    source "${TEST_RUN_CONFIG_FILE}"
-    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
-    return 0
-}
-
-# get test running configuration.
-fetch_vllm_test_cfg
-get_config
-# Check if the function call was successful. If not, exit the script.
-if [ $? -ne 0 ]; then
-  exit 1
-fi
-
-image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
-container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
-agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
-echo "agent_idx: ${agent_idx}"
-builder_name="cachebuilder${agent_idx}"
-builder_cache_dir="/mnt/docker-cache${agent_idx}"
-mkdir -p ${builder_cache_dir}
-
-# Try building the docker image
-cat <<EOF | DOCKER_BUILDKIT=1 docker build \
-    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
-    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
-                           --cache-to type=local,dest=${builder_cache_dir},mode=max \
-    --progress=plain --load -t ${image_name} -f - .
-FROM ${BASE_IMAGE_NAME}
-
-# Define environments
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
-    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
-    apt-get update -y && \
-    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
-    rm -rf /var/cache/apt/* && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install for pytest to make the docker build cache layer always valid
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install pytest>=6.0  modelscope
-
-WORKDIR /workspace/vllm
-
-# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
-COPY requirements/common.txt /workspace/vllm/requirements/common.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements/common.txt
-
-COPY . .
-
-# Install vLLM
-RUN --mount=type=cache,target=/root/.cache/pip \
-    VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
-    python3 -m pip uninstall -y triton
-
-# Install vllm-ascend
-WORKDIR /workspace
-ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
-ARG VLLM_ASCEND_TAG=main
-RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
-    git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
-
-# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r /workspace/vllm-ascend/requirements.txt
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
-    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
-    source /usr/local/Ascend/nnal/atb/set_env.sh && \
-    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
-    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
-
-ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
-ENV VLLM_USE_MODELSCOPE=True
-
-WORKDIR /workspace/vllm-ascend
-
-CMD ["/bin/bash"]
-
-EOF
-
-# Setup cleanup
-remove_docker_container() {
-  docker rm -f "${container_name}" || true;
-  docker image rm -f "${image_name}" || true;
-  docker system prune -f || true;
-}
-trap remove_docker_container EXIT
-
-# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
-# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
-#   e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
-#   returns --device /dev/davinci0 --device /dev/davinci1
-parse_and_gen_devices() {
-    local input="$1"
-    local index cards_num
-    if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
-        index="${BASH_REMATCH[1]}"
-        cards_num="${BASH_REMATCH[2]}"
-    else
-        echo "parse error" >&2
-        return 1
-    fi
-
-    local devices=""
-    local i=0
-    while (( i < cards_num )); do
-        local dev_idx=$(((index - 1)*cards_num + i ))
-        devices="$devices --device /dev/davinci${dev_idx}"
-        ((i++))
-    done
-
-    # trim leading space
-    devices="${devices#"${devices%%[![:space:]]*}"}"
-    # Output devices: assigned to the caller variable
-    printf '%s' "$devices"
-}
-
-devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
-
-# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
-# This test checks whether the OOT platform interface is functioning properly in conjunction with
-# the hardware plugin vllm-ascend.
-model_cache_dir=/mnt/modelscope${agent_idx}
-mkdir -p ${model_cache_dir}
-docker run \
-    ${devices} \
-    --device /dev/davinci_manager \
-    --device /dev/devmm_svm \
-    --device /dev/hisi_hdc \
-    -v /usr/local/dcmi:/usr/local/dcmi \
-    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-    -v /etc/ascend_install.info:/etc/ascend_install.info \
-    -v ${model_cache_dir}:/root/.cache/modelscope \
-    --entrypoint="" \
-    --name "${container_name}" \
-    "${image_name}" \
-    bash -c '
-    set -e
-    pytest -v -s tests/e2e/vllm_interface/
-'
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@ -62,11 +62,12 @@ echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
+    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
-
+export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
+echo "Using VLLM V1"

 echo "--- Hardware Information ---"
 # tpu-info
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -62,11 +62,12 @@ echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
+    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
-
+export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
+echo "Using VLLM V1"

 echo "--- Hardware Information ---"
 # tpu-info
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -20,10 +20,7 @@ trap remove_docker_container EXIT

 # Run the image and test offline inference/tensor parallel
 docker run \
-    --device /dev/dri:/dev/dri \
-    --net=host \
-    --ipc=host \
-    --privileged \
+    --device /dev/dri \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --entrypoint="" \
    -e "HF_TOKEN=${HF_TOKEN}" \
@ -38,14 +35,16 @@ docker run \
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
    cd tests
    pytest -v -s v1/core
    pytest -v -s v1/engine
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
+    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
    pytest -v -s v1/test_serial_utils.py
+    pytest -v -s v1/test_utils.py
+    pytest -v -s v1/test_metrics_reader.py
 '
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
 bench_throughput_exit_code=$?

 # run server-based benchmarks and upload the result to buildkite
-vllm serve meta-llama/Llama-2-7b-chat-hf &
+python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@ -1,59 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Setup script for Prime-RL integration tests
-# This script prepares the environment for running Prime-RL tests with nightly vLLM
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
-PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
-PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
-
-echo "Setting up Prime-RL integration test environment..."
-
-# Clean up any existing Prime-RL directory
-if [ -d "${PRIME_RL_DIR}" ]; then
-    echo "Removing existing Prime-RL directory..."
-    rm -rf "${PRIME_RL_DIR}"
-fi
-
-# Install UV if not available
-if ! command -v uv &> /dev/null; then
-    echo "Installing UV package manager..."
-    curl -LsSf https://astral.sh/uv/install.sh | sh
-    source $HOME/.local/bin/env
-fi
-
-# Clone Prime-RL repository at specific branch for reproducible tests
-PRIME_RL_BRANCH="integ-vllm-main"
-echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
-git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
-cd "${PRIME_RL_DIR}"
-
-echo "Setting up UV project environment..."
-export UV_PROJECT_ENVIRONMENT=/usr/local
-ln -s /usr/bin/python3 /usr/local/bin/python
-
-# Remove vllm pin from pyproject.toml
-echo "Removing vllm pin from pyproject.toml..."
-sed -i '/vllm==/d' pyproject.toml
-
-# Sync Prime-RL dependencies
-echo "Installing Prime-RL dependencies..."
-uv sync --inexact && uv sync --inexact --all-extras
-
-# Verify installation
-echo "Verifying installations..."
-uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
-uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
-
-echo "Prime-RL integration test environment setup complete!"
-
-echo "Running Prime-RL integration tests..."
-export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
-uv run pytest -vs tests/integration/test_rl.py -m gpu
-
-echo "Prime-RL integration tests completed!"
--- a/.buildkite/scripts/tpu/quantized_v6e_1.env
+++ b/.buildkite/scripts/tpu/quantized_v6e_1.env
@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=8.7
+EXPECTED_THROUGHPUT=10.0
 INPUT_LEN=1800
 OUTPUT_LEN=128
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@ -42,7 +42,7 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo

-vllm serve $MODEL \
+VLLM_USE_V1=1 vllm serve $MODEL \
 --seed 42 \
 --max-num-seqs $MAX_NUM_SEQS \
 --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@ -58,25 +58,33 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"

-if [[ $normal_wheel == *"cu129"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
+    # if $normal_wheel matches cu126, do not upload the index.html
+    echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+    # if $normal_wheel matches cu128, do not upload the index.html
+    echo "Skipping index files for cu128 wheels"
+else
    # only upload index.html for cu129 wheels (default wheels) as it
    # is available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
-else
-    echo "Skipping index files for non-cu129 wheels"
 fi

 # generate index for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"

-if [[ $normal_wheel == *"cu129"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
+    # if $normal_wheel matches cu126, do not upload the index.html
+    echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+    # if $normal_wheel matches cu128, do not upload the index.html
+    echo "Skipping index files for cu128 wheels"
+else
    # only upload index.html for cu129 wheels (default wheels) as it
    # is available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
-else
-    echo "Skipping index files for non-cu129 wheels"
 fi

 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -6,28 +6,24 @@
 # to generate the final pipeline yaml file.

 # Documentation
-# label(str): the name of the test. emojis allowed.
-# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
-# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
-# fast_check_only(bool): run this test on the fastcheck pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
-# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
+# label(str): the name of the test. emoji allowed.
+# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
+# fast_check_only(bool): run this test on fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
 # command(str): the single command to run for tests. incompatible with commands.
-# commands(list): the list of commands to run for the test. incompatible with command.
-# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
-# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
-# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
-# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
-#     in this case, commands must be specified. the first command runs on the first host, the second
+# commands(list): the list of commands to run for test. incompatbile with command.
+# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
+# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
+# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
+#     in this case, commands must be specified. the first command runs on first host, the second
 #     command runs on the second host.
-# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
-# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
-#     and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
-# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
-# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.

 # When adding a test
-# - If the test belongs to an existing group, add it there
+# - If the test belong to an existing group, add it there
 # - If the test is short, add to any existing step
 # - If the test takes more than 10min, then it is okay to create a new step.
 #   Note that all steps execute in parallel.
@ -38,7 +34,7 @@ steps:
 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to whitelist
-  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
+  # in /vllm/tools/generate_nightly_torch_test.py
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
@ -50,28 +46,19 @@ steps:
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
-  - tests/multimodal
-  - tests/utils_
-  commands:
-  - pytest -v -s -m 'not cpu_test' multimodal
-  - pytest -v -s utils_
-
- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
-  timeout_in_minutes: 10
-  source_file_dependencies:
-  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
+  - tests/utils_
  - tests/standalone_tests/lazy_imports.py
  - tests/transformers_utils
-  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
-  - pytest -v -s -m 'cpu_test' multimodal
-  - pytest -v -s transformers_utils
+  - pytest -v -s multimodal
+  - pytest -v -s utils_ # Utils
+  - pytest -v -s transformers_utils # transformers_utils

 - label: Python-only Installation Test # 10min
  timeout_in_minutes: 20
@ -123,7 +110,7 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Entrypoints Integration Test (API Server) # 100min
  timeout_in_minutes: 130
@ -161,6 +148,7 @@ steps:
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
+  - vllm/core/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
@ -168,36 +156,28 @@ steps:
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
-  - tests/v1/distributed
+  - tests/v1/test_async_llm_dp.py
+  - tests/v1/test_external_lb_dp.py
+  - tests/v1/test_internal_lb_dp.py
+  - tests/v1/test_hybrid_lb_dp.py
  - tests/v1/engine/test_engine_core_client.py
-  - tests/distributed/test_symm_mem_allreduce.py
  commands:
-  # https://github.com/NVIDIA/nccl/issues/1838
-  - export NCCL_CUMEM_HOST_ENABLE=0
-  # test with torchrun tp=2 and external_dp=2
+  # test with tp=2 and external_dp=2
+  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-  # test with torchrun tp=2 and pp=2
+  # test with tp=2 and pp=2
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-  # test with torchrun tp=4 and dp=1
-  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-  # test with torchrun tp=2, pp=2 and dp=1
-  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-  # test with torchrun tp=1 and dp=4 with ep
-  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-  # test with torchrun tp=2 and dp=2 with ep
-  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
-  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
@ -205,24 +185,6 @@ steps:
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd

- label: Distributed Tests (8 GPUs) # 4min
-  timeout_in_minutes: 10
-  gpu: h100
-  num_gpus: 8
-  working_dir: "/vllm-workspace/tests"
-  source_file_dependencies:
-  - examples/offline_inference/torchrun_dp_example.py
-  - vllm/config/parallel.py
-  - vllm/distributed/
-  - vllm/v1/engine/llm_engine.py
-  - vllm/v1/executor/uniproc_executor.py
-  - vllm/v1/worker/gpu_worker.py
-  commands:
-  # https://github.com/NVIDIA/nccl/issues/1838
-  - export NCCL_CUMEM_HOST_ENABLE=0
-  # test with torchrun tp=2 and dp=4 with ep
-  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
-
 - label: EPLB Algorithm Test # 5min
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
@ -316,44 +278,24 @@ steps:
    - tests/v1
  commands:
    # split the test to avoid interference
-    - pytest -v -s -m 'not cpu_test' v1/core
+    - pytest -v -s v1/core
    - pytest -v -s v1/executor
    - pytest -v -s v1/kv_offload
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
+    - pytest -v -s v1/structured_output
    - pytest -v -s v1/spec_decode
-    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
-    - pytest -v -s -m 'not cpu_test' v1/metrics
+    - pytest -v -s v1/kv_connector/unit
+    - pytest -v -s v1/metrics
+    - pytest -v -s v1/test_serial_utils.py
+    - pytest -v -s v1/test_utils.py
    - pytest -v -s v1/test_oracle.py
-    - pytest -v -s v1/test_request.py
+    - pytest -v -s v1/test_metrics_reader.py
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: V1 Test attention (H100) # 10min
-  timeout_in_minutes: 30
-  gpu: h100
-  source_file_dependencies:
-    - vllm/v1/attention
-    - tests/v1/attention
-  commands:
-    - pytest -v -s v1/attention
-
- label: V1 Test others (CPU) # 5 mins
-  source_file_dependencies:
-    - vllm/
-    - tests/v1
-  no_gpu: true
-  commands:
-    # split the test to avoid interference
-    - pytest -v -s -m 'cpu_test' v1/core
-    - pytest -v -s v1/structured_output
-    - pytest -v -s v1/test_serial_utils.py
-    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
-    - pytest -v -s -m 'cpu_test' v1/metrics
-
-
 - label: Examples Test # 30min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
@ -372,14 +314,12 @@ steps:
    - python3 offline_inference/vision_language.py --seed 0
    - python3 offline_inference/vision_language_pooling.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
-    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
-    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
-    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

 - label: Platform Tests (CUDA) # 4min
  timeout_in_minutes: 15
@ -414,12 +354,7 @@ steps:
      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
      --ignore=lora/test_chatglm3_tp.py \
      --ignore=lora/test_llama_tp.py \
-      --ignore=lora/test_llm_with_multi_loras.py \
-      --ignore=lora/test_olmoe_tp.py \
-      --ignore=lora/test_deepseekv2_tp.py \
-      --ignore=lora/test_gptoss.py \
-      --ignore=lora/test_qwen3moe_tp.py
-
+      --ignore=lora/test_llm_with_multi_loras.py
  parallelism: 4

 - label: PyTorch Compilation Unit Tests # 15min
@ -433,12 +368,12 @@ steps:
    - pytest -v -s compile/test_pass_manager.py
    - pytest -v -s compile/test_fusion.py
    - pytest -v -s compile/test_fusion_attn.py
-    - pytest -v -s compile/test_functionalization.py
    - pytest -v -s compile/test_silu_mul_quant_fusion.py
+    - pytest -v -s compile/test_sequence_parallelism.py
+    - pytest -v -s compile/test_async_tp.py
    - pytest -v -s compile/test_fusion_all_reduce.py
    - pytest -v -s compile/test_decorator.py
    - pytest -v -s compile/test_noop_elimination.py
-    - pytest -v -s compile/test_aot_compile.py

 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@ -451,8 +386,8 @@ steps:
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s compile/piecewise/

- label: PyTorch Fullgraph Test # 22min
-  timeout_in_minutes: 35
+- label: PyTorch Fullgraph Test # 20min
+  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -460,19 +395,6 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
-  - pytest -v -s compile/test_fusions_e2e.py
-
- label: Cudagraph test
-  timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - tests/v1/cudagraph
-  - vllm/v1/cudagraph_dispatcher.py
-  - vllm/config/compilation.py
-  - vllm/compilation
-  commands:
-    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
-    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py

 - label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
@ -480,9 +402,8 @@ steps:
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
-  - tests/kernels/test_top_k_per_row.py
  commands:
-    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+    - pytest -v -s kernels/core

 - label: Kernels Attention Test %N # 23min
  timeout_in_minutes: 35
@ -516,8 +437,6 @@ steps:
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
-  - vllm/envs.py
-  - vllm/config
  commands:
    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
@ -528,23 +447,33 @@ steps:
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
-  - vllm/model_executor/layers/mamba/ops
  commands:
    - pytest -v -s kernels/mamba

- label: Model Executor Test # 23min
-  timeout_in_minutes: 35
+- label: Tensorizer Test # 14min
+  timeout_in_minutes: 25
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
-  - vllm/model_executor
-  - tests/model_executor
+  - vllm/model_executor/model_loader
+  - tests/tensorizer_loader
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - pytest -v -s model_executor
+    - pytest -v -s tensorizer_loader
    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

+- label: Model Executor Test # 7min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - vllm/model_executor
+  - tests/model_executor
+  commands:
+    - apt-get update && apt-get install -y curl libsodium23
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s model_executor
+
 - label: Benchmarks # 11min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
@ -577,9 +506,8 @@ steps:
  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
-  # TODO(jerryzh168): resolve the above comment
-  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
+  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

 - label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
@ -607,17 +535,10 @@ steps:
  source_file_dependencies:
    - vllm/
    - tests/tool_use
+    - tests/mistral_tool_use
  commands:
-    - pytest -v -s -m 'not cpu_test' tool_use
-
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
-  timeout_in_minutes: 10
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  no_gpu: true
-  commands:
-    - pytest -v -s -m 'cpu_test' tool_use
+    - pytest -v -s tool_use
+    - pytest -v -s mistral_tool_use

 #####  models test  #####

@ -657,19 +578,13 @@ steps:
  - vllm/
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
-  commands:
-    - pytest -v -s models/test_transformers.py models/test_registry.py
-
- label: Basic Models Test (Other CPU) # 5min
-  timeout_in_minutes: 10
-  torch_nightly: true
-  source_file_dependencies:
-  - vllm/
  - tests/models/test_utils.py
  - tests/models/test_vision.py
-  no_gpu: true
  commands:
-    - pytest -v -s models/test_utils.py models/test_vision.py
+    - pytest -v -s models/test_transformers.py \
+                   models/test_registry.py \
+                   models/test_utils.py \
+                   models/test_vision.py

 - label: Language Models Tests (Standard)
  timeout_in_minutes: 25
@ -728,10 +643,8 @@ steps:
  - vllm/
  - tests/models/language/generation
  commands:
-    # Install fast path packages for testing against transformers
-    # Note: also needed to run plamo2 model in vLLM
-    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
-    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

 - label: Language Models Test (PPL)
@ -786,16 +699,6 @@ steps:
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

- label: Multi-Modal Accuracy Eval (Small Models) # 50min
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - vllm/multimodal/
-  - vllm/inputs/
-  - vllm/v1/core/
-  commands:
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-
 - label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
  optional: true
@ -851,16 +754,14 @@ steps:
  commands:
    - pip install --upgrade git+https://github.com/huggingface/transformers
    - pytest -v -s tests/models/test_initialization.py
-    - pytest -v -s tests/models/test_transformers.py
    - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py
    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/offline_inference/audio_language.py --model-type whisper
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
-    # Whisper needs spawn method to avoid deadlock
-    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

- label: Blackwell Test # 21 min
-  timeout_in_minutes: 30
+- label: Blackwell Test # 38 min
+  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
@ -873,6 +774,8 @@ steps:
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/fusion.py
+  - vllm/compilation/fusion_attn.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py
@ -889,38 +792,19 @@ steps:
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
-    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
-    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
-    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
-    - pytest -v -s tests/kernels/moe/test_flashinfer.py
-
- label: Blackwell Fusion Tests # 30 min
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  commands:
-    - nvidia-smi
-    - pytest -v -s tests/compile/test_fusion_attn.py
-    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
-    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
+    # Fusion
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
-    - pytest -v -s tests/compile/test_fusions_e2e.py
+    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
+    - pytest -v -s tests/kernels/moe/test_flashinfer.py
+    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py

- label: Blackwell GPT-OSS Eval
+- label: GPT-OSS Eval (Blackwell)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
-  optional: true # run on nightlies
+  optional: true # disable while debugging
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
@ -928,34 +812,7 @@ steps:
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
-    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
-
- label: Blackwell Quantized MoE Test
-  timeout_in_minutes: 60
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  source_file_dependencies:
-  - tests/quantization/test_blackwell_moe.py
-  - vllm/model_executor/models/deepseek_v2.py
-  - vllm/model_executor/models/gpt_oss.py
-  - vllm/model_executor/models/llama4.py
-  - vllm/model_executor/layers/fused_moe
-  - vllm/model_executor/layers/quantization/compressed_tensors
-  - vllm/model_executor/layers/quantization/modelopt.py
-  - vllm/model_executor/layers/quantization/mxfp4.py
-  - vllm/v1/attention/backends/flashinfer.py
-  commands:
-    - pytest -s -v tests/quantization/test_blackwell_moe.py
-
- label: Blackwell LM Eval Small Models
-  timeout_in_minutes: 120
-  gpu: b200
-  optional: true # run on nightlies
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'

 #####  1 GPU test  #####
 #####  multi gpus test  #####
@ -999,61 +856,47 @@ steps:
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code

- label: Distributed Tests (2 GPUs) # 68min
-  timeout_in_minutes: 90
+- label: Distributed Tests (2 GPUs) # 110min
+  timeout_in_minutes: 150
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
-  - vllm/compilation/
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
-  - vllm/worker/worker_base.py
-  - vllm/v1/engine/
-  - vllm/v1/worker/
-  - tests/compile/test_basic_correctness.py
-  - tests/compile/test_wrapper.py
+  - vllm/model_executor/models/
  - tests/distributed/
-  - tests/entrypoints/llm/test_collective_rpc.py
-  - tests/v1/distributed
+  - vllm/compilation
+  - vllm/worker/worker_base.py
+  - vllm/worker/worker.py
+  - vllm/worker/model_runner.py
+  - entrypoints/llm/test_collective_rpc.py
+  - tests/v1/test_async_llm_dp.py
+  - tests/v1/test_external_lb_dp.py
  - tests/v1/entrypoints/openai/test_multi_api_servers.py
-  - tests/v1/shutdown
-  - tests/v1/worker/test_worker_memory_snapshot.py
+  - vllm/v1/engine/
  commands:
-  # https://github.com/NVIDIA/nccl/issues/1838
-  - export NCCL_CUMEM_HOST_ENABLE=0
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-  - pytest -v -s distributed/test_sequence_parallel.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
-  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
-
- label: Distributed Model Tests (2 GPUs) # 37min
-  timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  source_file_dependencies:
-  - vllm/model_executor/model_loader/sharded_state_loader.py
-  - vllm/model_executor/models/
-  - tests/basic_correctness/
-  - tests/model_executor/model_loader/test_sharded_state_loader.py
-  - tests/models/
-  commands:
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
+  # test sequence parallel
+  - pytest -v -s distributed/test_sequence_parallel.py
+  # this test fails consistently.
+  # TODO: investigate and fix
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+  - pytest -v -s models/multimodal/generation/test_maverick.py

 - label: Plugin Tests (2 GPUs) # 40min
  timeout_in_minutes: 60
@ -1074,11 +917,6 @@ steps:
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
-  # begin stat_logger plugins test
-  - pip install -e ./plugins/vllm_add_dummy_stat_logger
-  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
-  - pip uninstall dummy_stat_logger -y
-  # end stat_logger plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
@ -1118,7 +956,6 @@ steps:
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
-    - pytest -v -s -x lora/test_olmoe_tp.py


 - label: Weight Loading Multiple GPU Test  # 33min
@ -1145,17 +982,6 @@ steps:
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

- label: NixlConnector PD accuracy tests (Distributed) # 30min
-  timeout_in_minutes: 30
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
-  source_file_dependencies:
-    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-    - tests/v1/kv_connector/nixl_integration/
-  commands:
-    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
-

 ##### multi gpus test #####
 ##### A100 test #####
@ -1186,30 +1012,13 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

-##### H100 test #####
- label: LM Eval Large Models (H100) # optional
-  gpu: h100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
-    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
-
 ##### H200 test #####
- label: Distributed Tests (H200) # optional
+- label: Distrubted Tests (H200) # optional
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
-    - pytest -v -s tests/compile/test_async_tp.py
-    - pytest -v -s tests/compile/test_sequence_parallelism.py
-    - pytest -v -s tests/compile/test_fusion_all_reduce.py
-    - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
    - pytest -v -s tests/distributed/test_context_parallel.py
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048

@ -1221,16 +1030,3 @@ steps:
  num_gpus: 2
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
-    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
-
-##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
-  timeout_in_minutes: 30
-  optional: true
-  num_gpus: 2
-  working_dir: "/vllm-workspace"
-  source_file_dependencies:
-  - vllm/
-  - .buildkite/scripts/run-prime-rl-test.sh
-  commands:
-    - bash .buildkite/scripts/run-prime-rl-test.sh
--- a/.coveragerc
+++ b/.coveragerc
@ -1,10 +1,5 @@
 [run]
-# Track the installed vllm package (this is what actually gets imported during tests)
-# Use wildcard pattern to match the installed location
-source =
-    vllm
-    */dist-packages/vllm
-    */site-packages/vllm
+source = vllm
 omit =
    */tests/*
    */test_*
@ -17,16 +12,6 @@ omit =
    */benchmarks/*
    */docs/*

-[paths]
-# Map all possible vllm locations to a canonical "vllm" path
-# This ensures coverage.combine properly merges data from different test runs
-source =
-    vllm
-    /vllm-workspace/src/vllm
-    /vllm-workspace/vllm
-    */site-packages/vllm
-    */dist-packages/vllm
-
 [report]
 exclude_lines =
    pragma: no cover
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@ -1,4 +0,0 @@
-# Migrate from `yapf` & `isort` to `ruff`
-d6953beb91da4e9c99be4c0a1304a2d24189535c
-# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
-8fcaaf6a165e661f63fc51be906bc05b0767332f
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -4,12 +4,19 @@
 # This lists cover the "core" components of vLLM that require careful review
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/fused_moe @mgoin
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/v1/attention @LucasWilkinson
+/vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
@ -21,17 +28,14 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
 /vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
-/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

 # vLLM V1
-/vllm/v1/attention @LucasWilkinson
-/vllm/v1/attention/backends/mla @pavanimajety
-/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
+/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
+/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
-/vllm/v1/sample @22quinn @houseroad @njhill
-/vllm/v1/spec_decode @benchislett @luccafong
-/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/offloading @ApostaC

@ -45,7 +49,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
@ -53,12 +57,12 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
-/tests/v1/kv_connector/nixl_integration @NickLucche
+/tests/v1/kv_connector/nixl_integration @NickLucche 
 /tests/v1/kv_connector @ApostaC
 /tests/v1/offloading @ApostaC

 # Transformers backend
-/vllm/model_executor/models/transformers @hmellor
+/vllm/model_executor/models/transformers.py @hmellor
 /tests/models/test_transformers.py @hmellor

 # Docs
@ -71,7 +75,6 @@ mkdocs.yaml @hmellor
 # Linting
 .markdownlint.yaml @hmellor
 .pre-commit-config.yaml @hmellor
-/tools/pre_commit @hmellor

 # CPU
 /vllm/v1/worker/cpu* @bigPYJ1151
@ -119,11 +122,3 @@ mkdocs.yaml @hmellor

 # KVConnector installation files
 /requirements/kv_connectors.txt @NickLucche
-
-# Pooling models
-/examples/*/pooling/ @noooop
-/tests/models/*/pooling* @noooop
-/tests/entrypoints/pooling @noooop
-/vllm/config/pooler.py @noooop
-/vllm/pooling_params.py @noooop
-/vllm/model_executor/layers/pooler.py @noooop
--- a/.github/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@ -43,6 +43,10 @@ body:
      Any other things you would like to mention.
  validations:
    required: false
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
 - type: checkboxes
  id: askllm
  attributes:
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -2,7 +2,6 @@ pull_request_rules:
 - name: label-documentation
  description: Automatically apply documentation label
  conditions:
-    - label != stale
    - or:
      - files~=^[^/]+\.md$
      - files~=^docs/
@ -11,13 +10,10 @@ pull_request_rules:
    label:
      add:
        - documentation
-    comment:
-      message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"

 - name: label-ci-build
  description: Automatically apply ci/build label
  conditions:
-    - label != stale
    - or:
      - files~=^\.github/
      - files~=\.buildkite/
@ -34,7 +30,6 @@ pull_request_rules:
 - name: label-deepseek
  description: Automatically apply deepseek label
  conditions:
-    - label != stale
    - or:
      - files~=^examples/.*deepseek.*\.py
      - files~=^tests/.*deepseek.*\.py
@ -51,7 +46,6 @@ pull_request_rules:
 - name: label-frontend
  description: Automatically apply frontend label
  conditions:
-    - label != stale
    - files~=^vllm/entrypoints/
  actions:
    label:
@ -61,7 +55,6 @@ pull_request_rules:
 - name: label-llama
  description: Automatically apply llama label
  conditions:
-    - label != stale
    - or:
      - files~=^examples/.*llama.*\.py
      - files~=^tests/.*llama.*\.py
@ -77,7 +70,6 @@ pull_request_rules:
 - name: label-multi-modality
  description: Automatically apply multi-modality label
  conditions:
-    - label != stale
    - or:
      - files~=^vllm/multimodal/
      - files~=^tests/multimodal/
@ -91,7 +83,6 @@ pull_request_rules:
 - name: label-new-model
  description: Automatically apply new-model label
  conditions:
-    - label != stale
    - and:
      - files~=^vllm/model_executor/models/
      - files=vllm/model_executor/models/registry.py
@ -103,12 +94,11 @@ pull_request_rules:
 - name: label-performance
  description: Automatically apply performance label
  conditions:
-    - label != stale
    - or:
      - files~=^benchmarks/
      - files~=^vllm/benchmarks/
      - files~=^tests/benchmarks/
-      - files~=^\.buildkite/performance-benchmarks/
+      - files~=^\.buildkite/nightly-benchmarks/
  actions:
    label:
      add:
@ -117,7 +107,6 @@ pull_request_rules:
 - name: label-qwen
  description: Automatically apply qwen label
  conditions:
-    - label != stale
    - or:
      - files~=^examples/.*qwen.*\.py
      - files~=^tests/.*qwen.*\.py
@ -132,7 +121,6 @@ pull_request_rules:
 - name: label-gpt-oss
  description: Automatically apply gpt-oss label
  conditions:
-    - label != stale
    - or:
      - files~=^examples/.*gpt[-_]?oss.*\.py
      - files~=^tests/.*gpt[-_]?oss.*\.py
@ -154,7 +142,6 @@ pull_request_rules:
 - name: label-rocm
  description: Automatically apply rocm label
  conditions:
-    - label != stale
    - or:
      - files~=^csrc/rocm/
      - files~=^docker/Dockerfile.rocm
@ -175,7 +162,6 @@ pull_request_rules:
 - name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
-    - label != stale
    - or:
      - files~=^benchmarks/structured_schemas/
      - files=benchmarks/benchmark_serving_structured_output.py
@ -195,7 +181,6 @@ pull_request_rules:
 - name: label-speculative-decoding
  description: Automatically apply speculative-decoding label
  conditions:
-    - label != stale
    - or:
      - files~=^vllm/v1/spec_decode/
      - files~=^tests/v1/spec_decode/
@ -211,7 +196,6 @@ pull_request_rules:
 - name: label-v1
  description: Automatically apply v1 label
  conditions:
-    - label != stale
    - or:
      - files~=^vllm/v1/
      - files~=^tests/v1/
@ -224,7 +208,6 @@ pull_request_rules:
  description: Automatically apply tpu label
  # Keep this list in sync with `label-tpu-remove` conditions
  conditions:
-    - label != stale
    - or:
      - files~=tpu.py
      - files~=_tpu
@ -240,7 +223,6 @@ pull_request_rules:
  description: Automatically remove tpu label
  # Keep this list in sync with `label-tpu` conditions
  conditions:
-    - label != stale
    - and:
      - -files~=tpu.py
      - -files~=_tpu
@ -255,9 +237,9 @@ pull_request_rules:
 - name: label-tool-calling
  description: Automatically add tool-calling label
  conditions:
-    - label != stale
    - or:
      - files~=^tests/tool_use/
+      - files~=^tests/mistral_tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
      - files~=^vllm/entrypoints/openai/tool_parsers/
@ -274,9 +256,8 @@ pull_request_rules:

 - name: ping author on conflicts and add 'needs-rebase' label
  conditions:
-    - label != stale
-    - conflict
-    - -closed
+      - conflict
+      - -closed
  actions:
    label:
      add:
@ -290,12 +271,10 @@ pull_request_rules:

 - name: assign reviewer for tensorizer changes
  conditions:
-    - label != stale
-    - or:
      - files~=^vllm/model_executor/model_loader/tensorizer.py
      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
-      - files~=^tests/model_executor/model_loader/tensorizer_loader/
+      - files~=^tests/tensorizer_loader/
  actions:
    assign:
      users:
@ -303,7 +282,6 @@ pull_request_rules:

 - name: assign reviewer for modelopt changes
  conditions:
-    - label != stale
    - or:
        - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
        - files~=^vllm/model_executor/layers/quantization/__init__\.py$
@ -318,8 +296,8 @@ pull_request_rules:

 - name: remove 'needs-rebase' label when conflict is resolved
  conditions:
-    - -conflict
-    - -closed
+      - -conflict
+      - -closed
  actions:
    label:
      remove:
@ -328,7 +306,6 @@ pull_request_rules:
 - name: label-kv-connector
  description: Automatically apply kv-connector label
  conditions:
-    - label != stale
    - or:
      - files~=^examples/online_serving/disaggregated[^/]*/.*
      - files~=^examples/offline_inference/disaggregated[^/]*/.*
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@ -13,7 +13,6 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Label issues based on keywords
-        id: label-step
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
        with:
          script: |
@ -43,6 +42,7 @@ jobs:
                    searchIn: "body"
                  },
                ],
+                
                // Substring search - matches anywhere in text (partial matches)
                substrings: [
                  {
@ -89,12 +89,14 @@ jobs:
                    term: "hip_",
                    searchIn: "both"
                  },
+                  
                  // ROCm tools and libraries
                  {
                    term: "hipify",
                    searchIn: "both"
                  },
                ],
+                
                // Regex patterns - for complex pattern matching
                regexPatterns: [
                  {
@ -105,17 +107,13 @@ jobs:
                  }
                ],
              },
-              // Add more label configurations here as needed
-              // example: {
-              //   keywords: [...],
-              //   substrings: [...],
-              //   regexPatterns: [...]
-              // },
            };
+            
            // Helper function to create regex based on search type
            function createSearchRegex(term, type) {
              // Escape special regex characters in the term
              const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+              
              switch (type) {
                case 'keyword':
                  // Word boundary search - matches whole words only
@ -127,13 +125,16 @@ jobs:
                  throw new Error(`Unknown search type: ${type}`);
              }
            }
+            
            // Helper function to find matching terms in text with line information
            function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
              const matches = [];
              const lines = text.split('\n');
+              
              for (const termConfig of searchTerms) {
                let regex;
                let term, searchIn, pattern, description, flags;
+                
                // Handle different input formats (string or object)
                if (typeof termConfig === 'string') {
                  term = termConfig;
@ -145,17 +146,21 @@ jobs:
                  description = termConfig.description;
                  flags = termConfig.flags;
                }
+                
                // Skip if this term shouldn't be searched in the current location
                if (searchIn !== 'both' && searchIn !== searchLocation) {
                  continue;
                }
+                
                // Create appropriate regex
                if (searchType === 'regex') {
                  regex = new RegExp(pattern, flags || "gi");
                } else {
                  regex = createSearchRegex(term, searchType);
                }
+                
                const termMatches = [];
+                
                // Check each line for matches
                lines.forEach((line, lineIndex) => {
                  const lineMatches = line.match(regex);
@ -170,14 +175,15 @@ jobs:
                        originalTerm: term || pattern,
                        description: description,
                        // Show context around the match in the line
-                        context: line.length > 100 ?
-                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
-                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
+                        context: line.length > 100 ? 
+                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), 
+                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' 
                          : line.trim()
                      });
                    });
                  }
                });
+                
                if (termMatches.length > 0) {
                  matches.push({
                    term: term || (description || pattern),
@ -190,48 +196,64 @@ jobs:
                  });
                }
              }
+              
              return matches;
            }
+            
            // Helper function to check if label should be added
            async function processLabel(labelName, config) {
              const body = context.payload.issue.body || "";
              const title = context.payload.issue.title || "";
+              
              core.notice(`Processing label: ${labelName}`);
              core.notice(`Issue Title: "${title}"`);
              core.notice(`Issue Body length: ${body.length} characters`);
+              
              let shouldAddLabel = false;
              let allMatches = [];
              let reason = '';
+              
              const keywords = config.keywords || [];
              const substrings = config.substrings || [];
              const regexPatterns = config.regexPatterns || [];
+              
              core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
+              
              // Search in title
              if (title.trim()) {
                core.notice(`Searching in title: "${title}"`);
+                
                const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
                const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
                const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
+                
                allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
              }
+              
              // Search in body
              if (body.trim()) {
                core.notice(`Searching in body (${body.length} characters)`);
+                
                const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
                const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
                const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
+                
                allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
              }
+              
              if (allMatches.length > 0) {
                core.notice(`Found ${allMatches.length} matching term(s):`);
+                
                for (const termMatch of allMatches) {
                  const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
                  const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
+                  
                  if (termMatch.searchType === 'regex') {
                    core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  } else {
                    core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  }
+                  
                  // Show details for each match
                  termMatch.matches.forEach((match, index) => {
                    core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@ -244,6 +266,7 @@ jobs:
                    }
                  });
                }
+                
                shouldAddLabel = true;
                const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
                const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@ -251,10 +274,13 @@ jobs:
                const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
                const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
                const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
+                
                reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
              }
+              
              core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
              core.notice(`Reason: ${reason || 'No matching terms found'}`);
+              
              if (shouldAddLabel) {
                const existingLabels = context.payload.issue.labels.map(l => l.name);
                if (!existingLabels.includes(labelName)) {
@ -270,92 +296,14 @@ jobs:
                core.notice(`Label "${labelName}" already present.`);
                return false;
              }
+              
              core.notice(`No matching terms found for label "${labelName}".`);
              return false;
            }
+            
            // Process all configured labels
-            const labelsAddedResults = await Promise.all(
-              Object.entries(labelConfig).map(([labelName, config]) => 
-                processLabel(labelName, config).then(added => ({ labelName, added }))
-              )
-            );
-            
-            const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
-            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
-            
-            // Return which labels were added for the next step
-            const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
-            core.setOutput('labels_added', JSON.stringify(addedLabels));
-            return addedLabels;
-
-      - name: CC users for labeled issues
-        if: steps.label-step.outputs.labels_added != '[]'
-        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
-        with:
-          script: |
-            // Configuration: Map labels to GitHub users to CC
-            // You can add multiple users per label, and multiple label configurations
-            const ccConfig = {
-              rocm: {
-                users: ['hongxiayang', 'tjtanaa', 'vllmellm'],  // Add more users as needed: ['user1', 'user2', 'user3']
-                message: 'CC {users} for ROCm-related issue'  // {users} will be replaced with @mentions
-              },
-              // Add more label -> user mappings here
-              // Example:
-              // cuda: {
-              //   users: ['user1', 'user2'],
-              //   message: 'CC {users} for CUDA-related issue'
-              // },
-              // performance: {
-              //   users: ['perfexpert'],
-              //   message: 'CC {users} for performance issue'
-              // },
-            };
-            
-            const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
-            core.notice(`Labels added: ${labelsAdded.join(', ')}`);
-            
-            // Get existing comments to check for already mentioned users
-            const comments = await github.rest.issues.listComments({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              issue_number: context.issue.number,
-            });
-            
-            const issueBody = context.payload.issue.body || '';
-            const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
-            
-            // Process each label that was added
-            for (const label of labelsAdded) {
-              if (ccConfig[label]) {
-                const config = ccConfig[label];
-                const usersToMention = [];
-                
-                // Check which users haven't been mentioned yet
-                for (const user of config.users) {
-                  const mentionPattern = new RegExp(`@${user}\\b`, 'i');
-                  if (!mentionPattern.test(allExistingText)) {
-                    usersToMention.push(user);
-                  } else {
-                    core.notice(`@${user} already mentioned for label "${label}", skipping`);
-                  }
-                }
-                
-                // Post comment if there are users to mention
-                if (usersToMention.length > 0) {
-                  const mentions = usersToMention.map(u => `@${u}`).join(' ');
-                  const message = config.message.replace('{users}', mentions);
-                  
-                  await github.rest.issues.createComment({
-                    owner: context.repo.owner,
-                    repo: context.repo.repo,
-                    issue_number: context.issue.number,
-                    body: message
-                  });
-                  
-                  core.notice(`CC comment added for label "${label}": ${mentions}`);
-                } else {
-                  core.notice(`All users for label "${label}" already mentioned, skipping comment`);
-                }
-              }
-            }
+            const processLabels = Object.entries(labelConfig)
+              .map(([labelName, config]) => processLabel(labelName, config));
+            const labelsAdded = await Promise.all(processLabels);
+            const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
+            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -13,7 +13,7 @@ jobs:
      actions: write
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
--- a/.gitignore
+++ b/.gitignore
@ -94,9 +94,6 @@ ipython_config.py
 # generated files
 **/generated/**

-# uv
-uv.lock
-
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
--- a/.markdownlint.yaml
+++ b/.markdownlint.yaml
@ -4,6 +4,7 @@ MD013: false
 MD024:
  siblings_only: true
 MD033: false
+MD042: false
 MD045: false
 MD046: false
 MD051: false
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -6,19 +6,30 @@ default_stages:
  - manual # Run in CI
 exclude: 'vllm/third_party/.*'
 repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.14.0
+- repo: https://github.com/google/yapf
+  rev: v0.43.0
  hooks:
-  - id: ruff-check
+  - id: yapf
+    args: [--in-place, --verbose]
+    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
+    exclude: '(.buildkite|benchmarks|build|examples)/.*'
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.11.7
+  hooks:
+  - id: ruff
    args: [--output-format, github, --fix]
  - id: ruff-format
+    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.38.1
+  rev: v1.35.5
  hooks:
  - id: typos
-    args: [--force-exclude]
+- repo: https://github.com/PyCQA/isort
+  rev: 6.0.1
+  hooks:
+  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v21.1.2
+  rev: v20.1.3
  hooks:
  - id: clang-format
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@ -35,55 +46,61 @@ repos:
  hooks:
  - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-  rev: 0.9.1
+  rev: 0.6.17
  hooks:
    - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
+      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
      files: ^requirements/test\.(in|txt)$
 - repo: local
  hooks:
  - id: format-torch-nightly-test
    name: reformat nightly_torch_test.txt to be in sync with test.in
    language: python
-    entry: python tools/pre_commit/generate_nightly_torch_test.py
+    entry: python tools/generate_nightly_torch_test.py
    files: ^requirements/test\.(in|txt)$
  - id: mypy-local
-    name: Run mypy locally for lowest supported Python version
-    entry: python tools/pre_commit/mypy.py 0 "3.10"
+    name: Run mypy for local Python installation
+    entry: tools/mypy.sh 0 "local"
+    language: python
+    types: [python]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
    stages: [pre-commit] # Don't run in CI
-    <<: &mypy_common
-      language: python
-      types_or: [python, pyi]
-      require_serial: true
-      additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
+  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.9
+    entry: tools/mypy.sh 1 "3.9"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.10
-    entry: python tools/pre_commit/mypy.py 1 "3.10"
-    <<: *mypy_common
+    entry: tools/mypy.sh 1 "3.10"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
    stages: [manual] # Only run in CI
  - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.11
-    entry: python tools/pre_commit/mypy.py 1 "3.11"
-    <<: *mypy_common
+    entry: tools/mypy.sh 1 "3.11"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
    stages: [manual] # Only run in CI
  - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.12
-    entry: python tools/pre_commit/mypy.py 1 "3.12"
-    <<: *mypy_common
-    stages: [manual] # Only run in CI
-  - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
-    name: Run mypy for Python 3.13
-    entry: python tools/pre_commit/mypy.py 1 "3.13"
-    <<: *mypy_common
+    entry: tools/mypy.sh 1 "3.12"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
    stages: [manual] # Only run in CI
  - id: shellcheck
    name: Lint shell scripts
-    entry: tools/pre_commit/shellcheck.sh
+    entry: tools/shellcheck.sh
    language: script
    types: [shell]
  - id: png-lint
    name: Lint PNG exports from excalidraw
-    entry: tools/pre_commit/png-lint.sh
+    entry: tools/png-lint.sh
    language: script
    types: [png]
  - id: signoff-commit
@ -100,12 +117,12 @@ repos:
    stages: [commit-msg]
  - id: check-spdx-header
    name: Check SPDX headers
-    entry: python tools/pre_commit/check_spdx_header.py
+    entry: python tools/check_spdx_header.py
    language: python
    types: [python]
  - id: check-root-lazy-imports
    name: Check root lazy imports
-    entry: python tools/pre_commit/check_init_lazy_imports.py
+    entry: python tools/check_init_lazy_imports.py
    language: python
    types: [python]
  - id: check-filenames
@ -119,11 +136,11 @@ repos:
    pass_filenames: false
  - id: update-dockerfile-graph
    name: Update Dockerfile dependency graph
-    entry: tools/pre_commit/update-dockerfile-graph.sh
+    entry: tools/update-dockerfile-graph.sh
    language: script
  - id: enforce-import-regex-instead-of-re
    name: Enforce import regex as re
-    entry: python tools/pre_commit/enforce_regex_import.py
+    entry: python tools/enforce_regex_import.py
    language: python
    types: [python]
    pass_filenames: false
@ -131,20 +148,21 @@ repos:
  # forbid directly import triton
  - id: forbid-direct-triton-import
    name: "Forbid direct 'import triton'"
-    entry: python tools/pre_commit/check_triton_import.py
+    entry: python tools/check_triton_import.py
    language: python
    types: [python]
    pass_filenames: false
    additional_dependencies: [regex]
  - id: check-pickle-imports
    name: Prevent new pickle/cloudpickle imports
-    entry: python tools/pre_commit/check_pickle_imports.py
+    entry: python tools/check_pickle_imports.py
    language: python
    types: [python]
-    additional_dependencies: [regex]
+    pass_filenames: false
+    additional_dependencies: [pathspec, regex]
  - id: validate-config
    name: Validate configuration has default values and that each field has a docstring
-    entry: python tools/pre_commit/validate_config.py
+    entry: python tools/validate_config.py
    language: python
    additional_dependencies: [regex]
  # Keep `suggestion` last
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@ -13,7 +13,6 @@ build:

 mkdocs:
  configuration: mkdocs.yaml
-  fail_on_warning: true

 # Optionally declare the Python requirements required to build your docs
 python:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -34,10 +34,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")

 #
 # Supported/expected torch versions for CUDA/ROCm.
@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")

 #
 # Try to find python package with an executable that exactly matches
@ -86,9 +86,6 @@ find_package(Torch REQUIRED)
 # Supported NVIDIA architectures.
 # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
-   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
-  set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
-elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 else()
@ -178,15 +175,6 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

-#
-# Set compression mode for CUDA >=13.x.
-#
-if(VLLM_GPU_LANG STREQUAL "CUDA" AND
-   DEFINED CMAKE_CUDA_COMPILER_VERSION AND
-   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
-  list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
-endif()
-
 #
 # Set CUDA include flags for CXX compiler.
 #
@ -269,8 +257,8 @@ set(VLLM_EXT_SRC
  "csrc/sampler.cu"
  "csrc/cuda_view.cu"
  "csrc/quantization/gptq/q_gemm.cu"
-  "csrc/quantization/w8a8/int8/scaled_quant.cu"
-  "csrc/quantization/w8a8/fp8/common.cu"
+  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
+  "csrc/quantization/fp8/common.cu"
  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/quantization/activation_kernels.cu"
@ -282,7 +270,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")

  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -314,13 +302,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_EXT_SRC
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/permute_cols.cu"
-    "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
+    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
    "csrc/cutlass_extensions/common.cpp"
-    "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
-    "csrc/quantization/w8a8/int8/per_token_group_quant.cu")
+    "csrc/quantization/fp8/per_token_group_quant.cu")

  set_gencode_flags_for_srcs(
    SRCS "${VLLM_EXT_SRC}"
@ -424,11 +412,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
    set(SRCS
-       "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
-       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
-       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
-       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
-       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
+       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
+       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
+       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
+       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -452,16 +440,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
  # CUDA 12.8 or later
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
-      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -486,16 +470,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
  # require CUDA 12.8 or later
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
-      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -526,7 +506,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
  if (SCALED_MM_2X_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@ -570,11 +550,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
  # CUDA 12.8 or later
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@ -593,11 +569,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()

  # FP4 Archs and flags
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@ -619,11 +591,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()

  # CUTLASS MLA Archs and flags
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
    set(SRCS
      "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
@ -649,7 +617,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # if it's possible to compile MoE kernels that use its output.
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -667,13 +635,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -692,13 +656,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()

  # moe_data.cu is used by all CUTLASS MoE kernels.
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@ -715,13 +675,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -883,7 +839,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/moe_align_sum_kernels.cu"
-  "csrc/moe/moe_lora_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
@ -1008,7 +963,6 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
    include(cmake/external_projects/flashmla.cmake)
-    include(cmake/external_projects/qutlass.cmake)

    # vllm-flash-attn should be last as it overwrites some CMake functions
    include(cmake/external_projects/vllm_flash_attn.cmake)
--- a/README.md
+++ b/README.md
@ -21,8 +21,6 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio

 *Latest News* 🔥

- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
 - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
@ -150,7 +148,6 @@ Compute Resources:
 - Trainy
 - UC Berkeley
 - UC San Diego
- Volcengine

 Slack Sponsor: Anyscale

--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@ -74,7 +74,7 @@ start_server() {
    local vllm_log=$4
    local profile_dir=$5

-    pkill -if "vllm serve" || true
+    pkill -if vllm

    # Define the common arguments as a bash array.
    # Each argument and its value are separate elements.
@ -96,22 +96,17 @@ start_server() {
    # This correctly passes each element as a separate argument.
    if [[ -n "$profile_dir" ]]; then
        # Start server with profiling enabled
-        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    else
        # Start server without profiling
-        VLLM_SERVER_DEV_MODE=1 \
+        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    fi
-    local server_pid=$!

    # wait for 10 minutes...
    server_started=0
    for i in {1..60}; do
-        # This line checks whether the server is still alive or not,
-        # since that we should always have permission to send signal to the server process.
-        kill -0 $server_pid 2> /dev/null || break
-
        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
        if [[ "$STATUS_CODE" -eq 200 ]]; then
@ -123,7 +118,7 @@ start_server() {
    done

    if (( ! server_started )); then
-        echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
+        echo "server did not start within 10 minutes. Please check server log at $vllm_log".
        return 1
    else
        return 0
@ -139,7 +134,7 @@ run_benchmark() {
    echo "vllm_log: $vllm_log"
    echo
    rm -f $vllm_log
-    pkill -if "vllm serve" || true
+    pkill -if vllm

    echo "starting server..."
    # Call start_server without a profile_dir to avoid profiling overhead
@ -232,7 +227,7 @@ run_benchmark() {

    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

-    pkill -if "vllm serve" || true
+    pkill -if vllm
    sleep 10
    echo "===================="
    return 0
@ -308,6 +303,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
 else
    echo "No configuration met the latency requirements. Skipping final profiling run."
 fi
-pkill -if "vllm serve" || true
+pkill -if vllm
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -8,6 +8,7 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
+from typing import Optional, Union

 import aiohttp
 import huggingface_hub.constants
@ -27,13 +28,13 @@ class RequestFuncInput:
    prompt_len: int
    output_len: int
    model: str
-    model_name: str | None = None
-    logprobs: int | None = None
-    extra_body: dict | None = None
-    multi_modal_content: dict | list[dict] | None = None
+    model_name: Optional[str] = None
+    logprobs: Optional[int] = None
+    extra_body: Optional[dict] = None
+    multi_modal_content: Optional[dict | list[dict]] = None
    ignore_eos: bool = False
-    language: str | None = None
-    request_id: str | None = None
+    language: Optional[str] = None
+    request_id: Optional[str] = None


@dataclass
@ -51,7 +52,7 @@ class RequestFuncOutput:

 async def async_request_tgi(
    request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")
@ -132,7 +133,7 @@ async def async_request_tgi(

 async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")
@ -203,7 +204,7 @@ async def async_request_trt_llm(

 async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(("completions", "profile")), (
@ -266,7 +267,7 @@ async def async_request_deepspeed_mii(

 async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(("completions", "profile")), (
@ -366,7 +367,7 @@ async def async_request_openai_completions(

 async def async_request_openai_chat_completions(
    request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(("chat/completions", "profile")), (
@ -475,7 +476,7 @@ async def async_request_openai_chat_completions(

 async def async_request_openai_audio(
    request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    # Lazy import without PlaceholderModule to avoid vllm dep.
    import soundfile
@ -609,7 +610,7 @@ def get_tokenizer(
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    **kwargs,
-) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    if pretrained_model_name_or_path is not None and not os.path.exists(
        pretrained_model_name_or_path
    ):
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@ -2,10 +2,10 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc

-from benchmark_utils import TimeCollector
 from tabulate import tabulate

-from vllm.utils.argparse_utils import FlexibleArgumentParser
+from benchmark_utils import TimeCollector
+from vllm.utils import FlexibleArgumentParser
 from vllm.v1.core.block_pool import BlockPool


--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@ -46,7 +46,7 @@ import time

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser


 def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@ -1,31 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
-import time
-from unittest import mock

 import numpy as np
-from benchmark_utils import TimeCollector
 from tabulate import tabulate

-from vllm.config import (
-    CacheConfig,
-    DeviceConfig,
-    LoadConfig,
-    ModelConfig,
-    ParallelConfig,
-    SchedulerConfig,
-    SpeculativeConfig,
-    VllmConfig,
-)
-from vllm.platforms import current_platform
-from vllm.utils.argparse_utils import FlexibleArgumentParser
+from benchmark_utils import TimeCollector
+from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
+from vllm.utils import FlexibleArgumentParser
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
-from vllm.v1.worker.gpu_input_batch import InputBatch
-from vllm.v1.worker.gpu_model_runner import GPUModelRunner


-def benchmark_propose(args):
+def main(args):
    rows = []
    for max_ngram in args.max_ngram:
        collector = TimeCollector(TimeCollector.US)
@ -83,88 +69,10 @@ def benchmark_propose(args):
    )


-def benchmark_batched_propose(args):
-    NUM_SPECULATIVE_TOKENS_NGRAM = 10
-    PROMPT_LOOKUP_MIN = 5
-    PROMPT_LOOKUP_MAX = 15
-    MAX_MODEL_LEN = int(1e7)
-    DEVICE = current_platform.device_type
-
-    model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
-
-    speculative_config = SpeculativeConfig(
-        target_model_config=model_config,
-        target_parallel_config=ParallelConfig(),
-        method="ngram",
-        num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
-        prompt_lookup_max=PROMPT_LOOKUP_MAX,
-        prompt_lookup_min=PROMPT_LOOKUP_MIN,
-    )
-
-    vllm_config = VllmConfig(
-        model_config=model_config,
-        cache_config=CacheConfig(),
-        speculative_config=speculative_config,
-        device_config=DeviceConfig(device=current_platform.device_type),
-        parallel_config=ParallelConfig(),
-        load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig(),
-    )
-
-    # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
-    mock_pp_group = mock.MagicMock()
-    mock_pp_group.world_size = 1
-    with mock.patch(
-        "vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
-    ):
-        runner = GPUModelRunner(vllm_config, DEVICE)
-
-        # hack max model len
-        runner.max_model_len = MAX_MODEL_LEN
-        runner.drafter.max_model_len = MAX_MODEL_LEN
-
-        dummy_input_batch = InputBatch(
-            max_num_reqs=args.num_req,
-            max_model_len=MAX_MODEL_LEN,
-            max_num_batched_tokens=args.num_req * args.num_token,
-            device=DEVICE,
-            pin_memory=False,
-            vocab_size=256000,
-            block_sizes=[16],
-        )
-        dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
-        dummy_input_batch.spec_decode_unsupported_reqs = ()
-        dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
-        dummy_input_batch.token_ids_cpu = np.random.randint(
-            0, 20, (args.num_req, args.num_token)
-        )
-
-        runner.input_batch = dummy_input_batch
-
-        sampled_token_ids = [[0]] * args.num_req
-
-        print("Starting benchmark")
-        # first run is warmup so ignore it
-        for _ in range(args.num_iteration):
-            start = time.time()
-            runner.drafter.propose(
-                sampled_token_ids,
-                dummy_input_batch.req_ids,
-                dummy_input_batch.num_tokens_no_spec,
-                dummy_input_batch.token_ids_cpu,
-                dummy_input_batch.spec_decode_unsupported_reqs,
-            )
-            end = time.time()
-            print(f"Iteration time (s): {end - start}")
-
-
 def invoke_main() -> None:
    parser = FlexibleArgumentParser(
        description="Benchmark the performance of N-gram speculative decode drafting"
    )
-    parser.add_argument(
-        "--batched", action="store_true", help="consider time to prepare batch"
-    )
    parser.add_argument(
        "--num-iteration",
        type=int,
@ -197,17 +105,8 @@ def invoke_main() -> None:
        help="Number of speculative tokens to generate",
    )
    args = parser.parse_args()
-
-    if not args.batched:
-        benchmark_propose(args)
-    else:
-        benchmark_batched_propose(args)
+    main(args)


-"""
-# Example command lines:
-# time python3 benchmarks/benchmark_ngram_proposer.py
-# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
-"""  # noqa: E501
 if __name__ == "__main__":
    invoke_main()  # pragma: no cover
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@ -32,12 +32,13 @@ import dataclasses
 import json
 import random
 import time
+from typing import Optional

 from transformers import PreTrainedTokenizerBase

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser

 try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
@ -79,7 +80,7 @@ def sample_requests_from_dataset(
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    input_length_range: tuple[int, int],
-    fixed_output_len: int | None,
+    fixed_output_len: Optional[int],
 ) -> list[Request]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
@ -127,7 +128,7 @@ def sample_requests_from_random(
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    input_length_range: tuple[int, int],
-    fixed_output_len: int | None,
+    fixed_output_len: Optional[int],
    prefix_len: int,
 ) -> list[Request]:
    requests = []
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@ -7,11 +7,12 @@ import dataclasses
 import json
 import random
 import time
+from typing import Optional

 from transformers import AutoTokenizer, PreTrainedTokenizerBase

 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser


 # Select a equi-probable random priority
@ -23,7 +24,7 @@ def sample_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: int | None,
+    fixed_output_len: Optional[int],
 ) -> list[tuple[str, int, int, int]]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -31,19 +31,20 @@ import time
 import uuid
 import warnings
 from collections.abc import AsyncGenerator
-from contextlib import nullcontext
 from dataclasses import dataclass
+from typing import Optional

 import datasets
 import numpy as np
 import pandas as pd
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
 from backend_request_func import (
    ASYNC_REQUEST_FUNCS,
    RequestFuncInput,
    RequestFuncOutput,
 )
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase

 try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
@ -51,7 +52,7 @@ except ImportError:
    from backend_request_func import get_tokenizer

 try:
-    from vllm.utils.argparse_utils import FlexibleArgumentParser
+    from vllm.utils import FlexibleArgumentParser
 except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser

@ -316,7 +317,7 @@ def calculate_metrics(
    tokenizer: PreTrainedTokenizerBase,
    selected_percentile_metrics: list[str],
    selected_percentiles: list[float],
-    goodput_config_dict: dict[str, float] | None = None,
+    goodput_config_dict: Optional[dict[str, float]] = None,
 ) -> tuple[BenchmarkMetrics, list[int]]:
    actual_output_lens: list[int] = []
    total_input = 0
@ -436,9 +437,9 @@ async def benchmark(
    selected_percentile_metrics: list[str],
    selected_percentiles: list[str],
    ignore_eos: bool,
-    max_concurrency: int | None,
+    max_concurrency: Optional[int],
    structured_output_ratio: float,
-    goodput_config_dict: dict[str, float] | None = None,
+    goodput_config_dict: Optional[dict[str, float]] = None,
 ):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
@ -448,8 +449,7 @@ async def benchmark(
    def prepare_extra_body(request) -> dict:
        extra_body = {}
        # Add the schema to the extra_body
-        extra_body["structured_outputs"] = {}
-        extra_body["structured_outputs"][request.structure_type] = request.schema
+        extra_body[request.structure_type] = request.schema
        return extra_body

    print("Starting initial single prompt test run...")
@ -502,9 +502,15 @@ async def benchmark(

    pbar = None if disable_tqdm else tqdm(total=len(input_requests))

-    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
+    # This can be used once the minimum Python version is 3.10 or higher,
+    # and it will simplify the code in limited_request_func.
+    #    semaphore = (asyncio.Semaphore(max_concurrency)
+    #                 if max_concurrency else contextlib.nullcontext())
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

    async def limited_request_func(request_func_input, pbar):
+        if semaphore is None:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
        async with semaphore:
            return await request_func(request_func_input=request_func_input, pbar=pbar)

@ -903,13 +909,13 @@ def create_argument_parser():
    parser.add_argument(
        "--tokenizer",
        type=str,
-        help="Name or path of the tokenizer, if not using the default tokenizer.",
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--tokenizer-mode",
        type=str,
        default="auto",
-        help="Name or path of the tokenizer, if not using the default tokenizer.",
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--num-prompts",
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@ -6,7 +6,7 @@ import math
 import os
 import time
 from types import TracebackType
-from typing import Any
+from typing import Any, Optional, Union


 def convert_to_pytorch_benchmark_format(
@ -92,7 +92,7 @@ class TimeCollector:
    def __init__(self, scale: int) -> None:
        self.cnt: int = 0
        self._sum: int = 0
-        self._max: int | None = None
+        self._max: Optional[int] = None
        self.scale = scale
        self.start_time: int = time.monotonic_ns()

@ -104,13 +104,13 @@ class TimeCollector:
        else:
            self._max = max(self._max, v)

-    def avg(self) -> float | str:
+    def avg(self) -> Union[float, str]:
        return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"

-    def max(self) -> float | str:
+    def max(self) -> Union[float, str]:
        return self._max / self.scale if self._max else "N/A"

-    def dump_avg_max(self) -> list[float | str]:
+    def dump_avg_max(self) -> list[Union[float, str]]:
        return [self.avg(), self.max()]

    def __enter__(self) -> None:
@ -118,8 +118,8 @@ class TimeCollector:

    def __exit__(
        self,
-        exc_type: type[BaseException] | None,
-        exc_value: BaseException | None,
-        exc_traceback: TracebackType | None,
+        exc_type: Optional[type[BaseException]],
+        exc_value: Optional[BaseException],
+        exc_traceback: Optional[TracebackType],
    ) -> None:
        self.collect(time.monotonic_ns() - self.start_time)
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@ -6,7 +6,8 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
+from typing import Callable

 import torch
 import torch.utils.benchmark as TBenchmark
@ -15,7 +16,7 @@ from utils import make_rand_sparse_tensors
 from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
-from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@ -6,7 +6,8 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
+from typing import Callable, Optional

 import torch
 import torch.utils.benchmark as TBenchmark
@ -16,10 +17,9 @@ from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_triton_block_scaled_mm,
+    w8a8_block_fp8_matmul,
 )
-from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.math_utils import cdiv
+from vllm.utils import FlexibleArgumentParser, cdiv

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@ -53,7 +53,7 @@ def bench_int8(
    n: int,
    label: str,
    sub_label: str,
-    bench_kernels: list[str] | None = None,
+    bench_kernels: Optional[list[str]] = None,
 ) -> Iterable[TMeasurement]:
    """Benchmark INT8-based kernels."""
    assert dtype == torch.int8
@ -108,7 +108,7 @@ def bench_fp8(
    n: int,
    label: str,
    sub_label: str,
-    bench_kernels: list[str] | None = None,
+    bench_kernels: Optional[list[str]] = None,
 ) -> Iterable[TMeasurement]:
    """Benchmark FP8-based kernels."""
    assert dtype == torch.float8_e4m3fn
@ -158,7 +158,7 @@ def bench_fp8(
        "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
        ),
-        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
+        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
            a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
        ),
        "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
@ -183,7 +183,7 @@ def bench(
    n: int,
    label: str,
    sub_label: str,
-    bench_kernels: list[str] | None = None,
+    bench_kernels: Optional[list[str]] = None,
 ) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@ -201,7 +201,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(
    dtype: torch.dtype,
    MKNs: Iterable[tuple[int, int, int]],
-    bench_kernels: list[str] | None = None,
+    bench_kernels: Optional[list[str]] = None,
 ) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@ -55,7 +55,9 @@ benchmark() {
  output_len=$2


-  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=0 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8100 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
@ -63,7 +65,9 @@ benchmark() {
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


-  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=1 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@ -38,12 +38,16 @@ wait_for_server() {
 launch_chunked_prefill() {
  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  # disagg prefill
-  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=0 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8100 \
    --max-model-len 10000 \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.6 &
-  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=1 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8200 \
    --max-model-len 10000 \
    --enable-chunked-prefill \
@ -58,14 +62,18 @@ launch_chunked_prefill() {
 launch_disagg_prefill() {
  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  # disagg prefill
-  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=0 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8100 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

-  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=1 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@ -3,9 +3,10 @@

 import pickle as pkl
 import time
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
 from dataclasses import dataclass
 from itertools import product
+from typing import Callable, Optional

 import torch
 import torch.utils.benchmark as TBenchmark
@ -50,7 +51,7 @@ def get_bench_params() -> list[bench_params_t]:
 def unfused_int8_impl(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
-    residual: torch.Tensor | None,
+    residual: Optional[torch.Tensor],
    quant_dtype: torch.dtype,
 ):
    # Norm
@ -67,7 +68,7 @@ def unfused_int8_impl(
 def unfused_fp8_impl(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
-    residual: torch.Tensor | None,
+    residual: Optional[torch.Tensor],
    quant_dtype: torch.dtype,
 ):
    # Norm
@ -84,7 +85,7 @@ def unfused_fp8_impl(
 def fused_impl(
    rms_norm_layer: RMSNorm,  # this stores the weights
    x: torch.Tensor,
-    residual: torch.Tensor | None,
+    residual: Optional[torch.Tensor],
    quant_dtype: torch.dtype,
 ):
    out, _ = ops.rms_norm_dynamic_per_token_quant(
--- a/benchmarks/kernels/bench_mxfp4_qutlass.py
+++ b/benchmarks/kernels/bench_mxfp4_qutlass.py
@ -1,191 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-#
-# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import argparse
-import copy
-import itertools
-
-import torch
-from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
-from weight_shapes import WEIGHT_SHAPES
-
-from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
-from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
-from vllm.triton_utils import triton
-
-PROVIDER_CFGS = {
-    "torch-bf16": dict(enabled=True),
-    "mxfp4": dict(no_a_quant=False, enabled=True),
-    "mxfp4-noquant": dict(no_a_quant=True, enabled=True),
-}
-
-_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
-
-
-def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
-    return (
-        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
-        * group_size**-0.5
-    )
-
-
-def _quant_weight_mxfp4(
-    b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
-):
-    weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
-        b, forward_hadamard_matrix, method="abs_max"
-    )
-    weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
-    return weight_hf_e2m1, weight_hf_scale_block
-
-
-def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
-    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
-        b, forward_hadamard_matrix, device
-    )
-    alpha = torch.tensor([1.0], device="cuda")
-
-    if cfg["no_a_quant"]:
-        # Pre-quantize activation
-        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
-            a, forward_hadamard_matrix, method="abs_max"
-        )
-        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
-
-        def run():
-            return matmul_mxf4_bf16_tn(
-                input_hf_e2m1,
-                weight_hf_e2m1,
-                input_hf_scale_block,
-                weight_hf_scale_block,
-                alpha,
-            )
-
-        return run
-
-    # Quantize activation on-the-fly
-    def run():
-        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
-            a, forward_hadamard_matrix, method="abs_max"
-        )
-        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
-        return matmul_mxf4_bf16_tn(
-            input_hf_e2m1,
-            weight_hf_e2m1,
-            input_hf_scale_block,
-            weight_hf_scale_block,
-            alpha,
-        )
-
-    return run
-
-
-@triton.testing.perf_report(
-    triton.testing.Benchmark(
-        x_names=["batch_size"],
-        x_vals=[
-            1,
-            4,
-            8,
-            16,
-            32,
-            64,
-            128,
-            256,
-            512,
-            1024,
-            2048,
-            4096,
-            8192,
-            16384,
-            24576,
-            32768,
-        ],
-        x_log=False,
-        line_arg="provider",
-        line_vals=_enabled,
-        line_names=_enabled,
-        ylabel="TFLOP/s (larger is better)",
-        plot_name="BF16 vs MXFP4 GEMMs",
-        args={},
-    )
-)
-def benchmark(batch_size, provider, N, K, had_size):
-    M = batch_size
-    device = "cuda"
-    dtype = torch.bfloat16
-
-    a = torch.randn((M, K), device=device, dtype=dtype)
-    b = torch.randn((N, K), device=device, dtype=dtype)
-    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
-
-    quantiles = [0.5, 0.2, 0.8]
-
-    if provider == "torch-bf16":
-        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-            lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
-        )
-    else:
-        cfg = PROVIDER_CFGS[provider]
-        run_quant = build_mxfp4_runner(
-            cfg, a, b, forward_hadamard_matrix, dtype, device
-        )
-        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-            lambda: run_quant(), rep=200, quantiles=quantiles
-        )
-
-    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
-    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
-
-
-def prepare_shapes(args):
-    out = []
-    for model, tp_size in itertools.product(args.models, args.tp_sizes):
-        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
-            KN[tp_dim] //= tp_size
-            KN.append(model)
-            out.append(KN)
-    return out
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--models",
-        nargs="+",
-        type=str,
-        default=["meta-llama/Llama-3.3-70B-Instruct"],
-        choices=list(WEIGHT_SHAPES.keys()),
-    )
-    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
-    args = parser.parse_args()
-
-    for K, N, model in prepare_shapes(args):
-        for had_size in [32, 64, 128]:
-            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
-            benchmark.run(
-                print_data=True,
-                show_plots=True,
-                save_path=f"bench_mxfp4_res_n{N}_k{K}",
-                N=N,
-                K=K,
-                had_size=had_size,
-            )
-
-    print("Benchmark finished!")
--- a/benchmarks/kernels/bench_nvfp4_gemm.py
+++ b/benchmarks/kernels/bench_nvfp4_gemm.py
@ -3,7 +3,6 @@
 import argparse
 import copy
 import itertools
-import os

 import torch
 from weight_shapes import WEIGHT_SHAPES
@ -24,45 +23,21 @@ PROVIDER_CFGS = {
    "torch-bf16": dict(enabled=True),
    "nvfp4": dict(no_a_quant=False, enabled=True),
    "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
-    "fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
-    "fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
 }

-_needs_fbgemm = any(
-    v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
-)
-if _needs_fbgemm:
-    try:
-        from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
-            triton_scale_nvfp4_quant,
-        )
-    except ImportError:
-        print(
-            "WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
-            "These providers will be skipped. Please install fbgemm_gpu with: "
-            "'pip install fbgemm-gpu-genai' to run them."
-        )
-        # Disable FBGEMM providers so the benchmark can run.
-        for cfg in PROVIDER_CFGS.values():
-            if cfg.get("fbgemm"):
-                cfg["enabled"] = False
-
 _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]


-def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
+def _quant_weight_nvfp4(b: torch.Tensor, device: str):
    # Compute global scale for weight
    b_amax = torch.abs(b).max().to(torch.float32)
    b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
-    if "fbgemm" in cfg and cfg["fbgemm"]:
-        b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
-    else:
-        b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
+    b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
    return b_fp4, scale_b_fp4, b_global_scale


 def build_nvfp4_runner(cfg, a, b, dtype, device):
-    b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)
+    b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)

    # Compute global scale for activation
    # NOTE: This is generally provided ahead-of-time by the model checkpoint.
@ -71,35 +46,6 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):

    # Alpha for the GEMM operation
    alpha = 1.0 / (a_global_scale * b_global_scale)
-    if "fbgemm" in cfg and cfg["fbgemm"]:
-        if cfg["no_a_quant"]:
-            a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
-
-            def run():
-                return torch.ops.fbgemm.f4f4bf16(
-                    a_fp4,
-                    b_fp4,
-                    scale_a_fp4,
-                    scale_b_fp4,
-                    global_scale=alpha,
-                    use_mx=False,
-                )
-
-            return run
-        else:
-
-            def run():
-                a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
-                return torch.ops.fbgemm.f4f4bf16(
-                    a_fp4,
-                    b_fp4,
-                    scale_a_fp4,
-                    scale_b_fp4,
-                    global_scale=alpha,
-                    use_mx=False,
-                )
-
-            return run

    if cfg["no_a_quant"]:
        # Pre-quantize activation
@ -184,13 +130,10 @@ if __name__ == "__main__":

    for K, N, model in prepare_shapes(args):
        print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
-        save_dir = f"bench_nvfp4_res_n{N}_k{K}"
-        os.makedirs(save_dir, exist_ok=True)
-
        benchmark.run(
            print_data=True,
            show_plots=True,
-            save_path=save_dir,
+            save_path=f"bench_nvfp4_res_n{N}_k{K}",
            N=N,
            K=K,
        )
--- a/benchmarks/kernels/bench_nvfp4_qutlass.py
+++ b/benchmarks/kernels/bench_nvfp4_qutlass.py
@ -1,207 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-#
-# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import argparse
-import copy
-import itertools
-
-import torch
-from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
-from weight_shapes import WEIGHT_SHAPES
-
-from vllm import _custom_ops as ops  # use existing nvfp4 gemm in vllm
-from vllm._custom_ops import fusedQuantizeNv
-from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
-from vllm.triton_utils import triton
-
-PROVIDER_CFGS = {
-    "torch-bf16": dict(enabled=True),
-    "nvfp4": dict(no_a_quant=False, enabled=True),
-    "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
-}
-
-_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
-
-
-def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
-    return (
-        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
-        * group_size**-0.5
-    )
-
-
-def _quant_weight_nvfp4(
-    b: torch.Tensor,
-    forward_hadamard_matrix: torch.Tensor,
-    global_scale: torch.Tensor,
-    device: str,
-    M: int,
-    N: int,
-    K: int,
-):
-    weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
-        b, forward_hadamard_matrix, global_scale
-    )
-    weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
-        -1, K // 16
-    )
-    return weight_hf_e2m1, weight_hf_scale_block
-
-
-def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
-    alpha = torch.tensor([1.0], device="cuda")
-    global_scale = torch.tensor([1.0], device="cuda")
-    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
-        b, forward_hadamard_matrix, global_scale, device, M, N, K
-    )
-
-    if cfg["no_a_quant"]:
-        # Pre-quantize activation
-        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
-            a, forward_hadamard_matrix, global_scale
-        )
-        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
-            -1, K // 16
-        )
-
-        def run():
-            return ops.cutlass_scaled_fp4_mm(
-                input_hf_e2m1,
-                weight_hf_e2m1,
-                input_hf_scale_block,
-                weight_hf_scale_block,
-                alpha,
-                torch.bfloat16,
-            )
-
-        return run
-
-    # Quantize activation on-the-fly
-    def run():
-        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
-            a, forward_hadamard_matrix, global_scale
-        )
-        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
-            -1, K // 16
-        )
-        return ops.cutlass_scaled_fp4_mm(
-            input_hf_e2m1,
-            weight_hf_e2m1,
-            input_hf_scale_block,
-            weight_hf_scale_block,
-            alpha,
-            torch.bfloat16,
-        )
-
-    return run
-
-
-@triton.testing.perf_report(
-    triton.testing.Benchmark(
-        x_names=["batch_size"],
-        x_vals=[
-            1,
-            4,
-            8,
-            16,
-            32,
-            64,
-            128,
-            256,
-            512,
-            1024,
-            2048,
-            4096,
-            8192,
-            16384,
-            24576,
-            32768,
-        ],
-        x_log=False,
-        line_arg="provider",
-        line_vals=_enabled,
-        line_names=_enabled,
-        ylabel="TFLOP/s (larger is better)",
-        plot_name="BF16 vs NVFP4 GEMMs",
-        args={},
-    )
-)
-def benchmark(batch_size, provider, N, K, had_size):
-    M = batch_size
-    device = "cuda"
-    dtype = torch.bfloat16
-
-    a = torch.randn((M, K), device=device, dtype=dtype)
-    b = torch.randn((N, K), device=device, dtype=dtype)
-    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
-
-    quantiles = [0.5, 0.2, 0.8]
-
-    if provider == "torch-bf16":
-        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-            lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
-        )
-    else:
-        cfg = PROVIDER_CFGS[provider]
-        run_quant = build_nvfp4_runner(
-            cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K
-        )
-        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-            lambda: run_quant(), rep=200, quantiles=quantiles
-        )
-
-    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
-    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
-
-
-def prepare_shapes(args):
-    out = []
-    for model, tp_size in itertools.product(args.models, args.tp_sizes):
-        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
-            KN[tp_dim] //= tp_size
-            KN.append(model)
-            out.append(KN)
-    return out
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--models",
-        nargs="+",
-        type=str,
-        default=["meta-llama/Llama-3.3-70B-Instruct"],
-        choices=list(WEIGHT_SHAPES.keys()),
-    )
-    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
-    args = parser.parse_args()
-
-    for K, N, model in prepare_shapes(args):
-        for had_size in [16, 32, 64, 128]:
-            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
-            benchmark.run(
-                print_data=True,
-                show_plots=True,
-                save_path=f"bench_nvfp4_res_n{N}_k{K}",
-                N=N,
-                K=K,
-                had_size=had_size,
-            )
-
-    print("Benchmark finished!")
--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from collections.abc import Callable
+from typing import Callable
 from unittest.mock import patch

 import pandas as pd
@ -10,8 +10,7 @@ import torch
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
-from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser


 def with_triton_mode(fn):
@ -52,7 +51,7 @@ def calculate_diff(
 ):
    """Calculate the difference between Inductor and CUDA implementations."""
    device = torch.device("cuda")
-    x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)
+    x = torch.rand((batch_size * hidden_size, 4096), dtype=dtype, device=device)

    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)

@ -60,25 +59,23 @@ def calculate_diff(
    torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
    cuda_out, cuda_scale = quant_fp8.forward_cuda(x)

-    try:
-        torch.testing.assert_close(
-            cuda_out.to(torch.float32),
-            torch_out.to(torch.float32),
-            rtol=1e-3,
-            atol=1e-5,
-        )
-        torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
-        torch.testing.assert_close(
-            cuda_out.to(torch.float32),
-            torch_eager_out.to(torch.float32),
-            rtol=1e-3,
-            atol=1e-5,
-        )
-        torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
+    out_allclose = lambda o1, o2: torch.allclose(
+        o1.to(torch.float32),
+        o2.to(torch.float32),
+        rtol=1e-3,
+        atol=1e-5,
+    )
+    scale_allclose = lambda s1, s2: torch.allclose(s1, s2, rtol=1e-3, atol=1e-5)
+
+    if (
+        out_allclose(cuda_out, torch_out)
+        and scale_allclose(cuda_scale, torch_scale)
+        and out_allclose(cuda_out, torch_eager_out)
+        and scale_allclose(cuda_scale, torch_eager_scale)
+    ):
        print("✅ All implementations match")
-    except AssertionError as e:
+    else:
        print("❌ Implementations differ")
-        print(e)


 configs = []
@ -94,7 +91,7 @@ def benchmark_quantization(
 ):
    device = torch.device("cuda")

-    x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)
+    x = torch.randn(batch_size * hidden_size, 4096, device=device, dtype=dtype)

    quantiles = [0.5, 0.2, 0.8]
    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
@ -160,21 +157,21 @@ if __name__ == "__main__":
    )
    parser.add_argument("-c", "--check", action="store_true")
    parser.add_argument(
-        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
    )
    parser.add_argument(
        "--hidden-sizes",
        type=int,
        nargs="+",
-        default=[896, 1024, 2048, 4096, 7168],
-        help="Hidden sizes to benchmark",
+        default=None,
+        help="Hidden sizes to benchmark (default: 1,16,64,128,256,512,1024,2048,4096)",
    )
    parser.add_argument(
        "--batch-sizes",
        type=int,
        nargs="+",
-        default=[1, 16, 128, 512, 1024],
-        help="Batch sizes to benchmark",
+        default=None,
+        help="Batch sizes to benchmark (default: 1,16,32,64,128)",
    )
    parser.add_argument(
        "--group-sizes",
@ -195,8 +192,8 @@ if __name__ == "__main__":

    dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]

-    hidden_sizes = args.hidden_sizes
-    batch_sizes = args.batch_sizes
+    hidden_sizes = args.hidden_sizes or [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+    batch_sizes = args.batch_sizes or [1, 16, 32, 64, 128]

    if args.group_sizes is not None:
        group_shapes = []
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@ -10,8 +10,7 @@ import vllm.model_executor.layers.activation  # noqa F401
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser

 batch_size_range = [1, 16, 32, 64, 128]
 seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
--- a/benchmarks/kernels/benchmark_bitblas.py
+++ b/benchmarks/kernels/benchmark_bitblas.py
@ -28,7 +28,7 @@ except ImportError as e:

 from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target

-from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser

 parser = FlexibleArgumentParser(
    description="Benchmark BitBLAS int4 on a specific target."
--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import (
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.scalar_type import scalar_types
-from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser

 WEIGHT_SHAPES_MOE = {
    "nvidia/DeepSeek-R1-FP4": [
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@ -1,406 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
-kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
-but use different quantization strategies and backends.
-"""
-
-import nvtx
-import torch
-
-from vllm import _custom_ops as ops
-from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
-from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
-from vllm.platforms import current_platform
-from vllm.utils.argparse_utils import FlexibleArgumentParser
-
-# Weight shapes for different models: [num_experts, topk, hidden_size,
-# intermediate_size]
-WEIGHT_SHAPES_MOE = {
-    "mixtral-8x7b": [
-        [8, 2, 4096, 14336],
-    ],
-    "deepseek-v2": [
-        [160, 6, 5120, 12288],
-    ],
-    "custom-small": [
-        [8, 2, 2048, 7168],
-    ],
-    "glm45-fp8": [
-        [128, 8, 4096, 1408],
-    ],
-    "Llama-4-Maverick-17B-128E-Instruct-FP8": [
-        [128, 1, 5120, 8192],
-    ],
-}
-
-DEFAULT_MODELS = [
-    "mixtral-8x7b",
-]
-
-DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
-DEFAULT_TP_SIZES = [1]
-
-PER_ACT_TOKEN_OPTS = [False, True]
-PER_OUT_CH_OPTS = [False, True]
-
-FP8_DTYPE = current_platform.fp8_dtype()
-
-
-def bench_run(
-    results: list,
-    model: str,
-    num_experts: int,
-    topk: int,
-    per_act_token: bool,
-    per_out_ch: bool,
-    mkn: tuple[int, int, int],
-):
-    (m, k, n) = mkn
-
-    dtype = torch.half
-    device = "cuda"
-
-    # Create input activations
-    a = torch.randn((m, k), device=device, dtype=dtype) / 10
-
-    # Create weights
-    w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
-    w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10
-
-    # Create FP8 quantized weights and scales for both kernels
-    w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
-    w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)
-
-    # Create scales based on quantization strategy
-    if per_out_ch:
-        # Per-channel quantization
-        w1_scale = torch.empty(
-            (num_experts, 2 * n, 1), device=device, dtype=torch.float32
-        )
-        w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
-    else:
-        # Per-tensor quantization
-        w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
-        w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
-
-    # Quantize weights
-    for expert in range(num_experts):
-        if per_out_ch:
-            # Per-channel quantization - not yet implemented properly
-            # For now, fall back to per-tensor quantization
-            w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
-            w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
-            # Expand scalar scales to the expected per-channel shape
-            w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
-            w2_scale[expert] = w2_scale_temp.expand(k, 1)
-        else:
-            # Per-tensor quantization
-            w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
-            w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
-            # Store scalar scales in [1, 1] tensors
-            w1_scale[expert, 0, 0] = w1_scale_temp
-            w2_scale[expert, 0, 0] = w2_scale_temp
-
-    # Prepare weights for CUTLASS (no transpose needed)
-    w1_fp8q_cutlass = w1_fp8q  # Keep original [E, 2N, K]
-    w2_fp8q_cutlass = w2_fp8q  # Keep original [E, K, N]
-
-    # Create router scores and get topk
-    score = torch.randn((m, num_experts), device=device, dtype=dtype)
-    topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
-
-    # WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization
-    # Force per-tensor quantization for all cases to match working e2e setup
-    a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
-    a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
-
-    # Force per-tensor quantization for all cases
-    per_act_token = False
-
-    # Create stride tensors for CUTLASS
-    ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
-    ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
-    c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
-    c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
-
-    def run_triton_moe(
-        a: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        w1_scale: torch.Tensor,
-        w2_scale: torch.Tensor,
-        a1_scale: torch.Tensor,
-        a2_scale: torch.Tensor,
-        num_repeats: int,
-    ):
-        quant_config = fp8_w8a8_moe_quant_config(
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            a1_scale=a1_scale,
-            a2_scale=a2_scale,
-            per_act_token_quant=per_act_token,
-            per_out_ch_quant=per_out_ch,
-        )
-
-        for _ in range(num_repeats):
-            fused_experts(
-                a,
-                w1,
-                w2,
-                topk_weights,
-                topk_ids,
-                quant_config=quant_config,
-            )
-
-    def run_cutlass_moe_fp8(
-        a: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        ab_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides1: torch.Tensor,
-        c_strides2: torch.Tensor,
-        w1_scale: torch.Tensor,
-        w2_scale: torch.Tensor,
-        a1_scale: torch.Tensor,
-        a2_scale: torch.Tensor,
-        num_repeats: int,
-    ):
-        quant_config = fp8_w8a8_moe_quant_config(
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            a1_scale=a1_scale,
-            a2_scale=a2_scale,
-            per_act_token_quant=per_act_token,
-            per_out_ch_quant=per_out_ch,
-        )
-
-        for _ in range(num_repeats):
-            with nvtx.annotate("cutlass_moe_fp8", color="blue"):
-                cutlass_moe_fp8(
-                    a=a,
-                    w1_q=w1,
-                    w2_q=w2,
-                    topk_weights=topk_weights,
-                    topk_ids=topk_ids,
-                    ab_strides1=ab_strides1,
-                    ab_strides2=ab_strides2,
-                    c_strides1=c_strides1,
-                    c_strides2=c_strides2,
-                    quant_config=quant_config,
-                    activation="silu",
-                    global_num_experts=num_experts,
-                )
-
-    # Pre-create quantization config to avoid creating it inside CUDA graph
-    quant_config = fp8_w8a8_moe_quant_config(
-        w1_scale=w1_scale,
-        w2_scale=w2_scale,
-        a1_scale=a1_scale,
-        a2_scale=a2_scale,
-        per_act_token_quant=per_act_token,
-        per_out_ch_quant=per_out_ch,
-    )
-
-    # Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
-    cutlass_stream = torch.cuda.Stream()
-    cutlass_graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
-        # Capture 10 invocations like benchmark_moe.py
-        for _ in range(10):
-            cutlass_moe_fp8(
-                a=a,
-                w1_q=w1_fp8q_cutlass,
-                w2_q=w2_fp8q_cutlass,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                ab_strides1=ab_strides1,
-                ab_strides2=ab_strides2,
-                c_strides1=c_strides1,
-                c_strides2=c_strides2,
-                quant_config=quant_config,
-                activation="silu",
-                global_num_experts=num_experts,
-            )
-    torch.cuda.synchronize()
-
-    # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
-    triton_stream = torch.cuda.Stream()
-    triton_graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(triton_graph, stream=triton_stream):
-        # Capture 10 invocations like benchmark_moe.py
-        for _ in range(10):
-            fused_experts(
-                a,
-                w1_fp8q,
-                w2_fp8q,
-                topk_weights,
-                topk_ids,
-                quant_config=quant_config,
-            )
-    torch.cuda.synchronize()
-
-    def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
-        """Benchmark CUDA graph using events like benchmark_moe.py"""
-        # Warmup
-        for _ in range(num_warmup):
-            graph.replay()
-        torch.cuda.synchronize()
-
-        # Timing
-        start_event = torch.cuda.Event(enable_timing=True)
-        end_event = torch.cuda.Event(enable_timing=True)
-
-        latencies = []
-        for _ in range(num_iters):
-            torch.cuda.synchronize()
-            start_event.record()
-            graph.replay()
-            end_event.record()
-            end_event.synchronize()
-            latencies.append(start_event.elapsed_time(end_event))
-
-        # Divide by 10 since graph contains 10 calls
-        return sum(latencies) / (num_iters * 10)
-
-    # Benchmark parameters
-    num_warmup = 5
-    num_iters = 100
-
-    # Benchmark only CUDA graphs (more reliable and faster)
-    # Benchmark Triton MoE with CUDA graphs
-    triton_graph_time = bench_cuda_graph(
-        triton_graph, num_warmup=num_warmup, num_iters=num_iters
-    )
-
-    # Benchmark CUTLASS MoE with CUDA graphs
-    cutlass_graph_time = bench_cuda_graph(
-        cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
-    )
-
-    # Convert ms to us and return results
-    triton_time_us = triton_graph_time * 1000
-    cutlass_time_us = cutlass_graph_time * 1000
-
-    return {
-        "batch_size": m,
-        "triton_time_us": triton_time_us,
-        "cutlass_time_us": cutlass_time_us,
-    }
-
-
-def main(args):
-    print("Benchmarking models:")
-    for i, model in enumerate(args.models):
-        print(f"[{i}]  {model}")
-
-    all_results = []
-
-    for model in args.models:
-        for tp in args.tp_sizes:
-            for layer in WEIGHT_SHAPES_MOE[model]:
-                num_experts = layer[0]
-                topk = layer[1]
-                size_k = layer[2]
-                size_n = layer[3] // tp
-
-                if len(args.limit_k) > 0 and size_k not in args.limit_k:
-                    continue
-
-                if len(args.limit_n) > 0 and size_n not in args.limit_n:
-                    continue
-
-                for per_act_token in args.per_act_token_opts:
-                    for per_out_ch in args.per_out_ch_opts:
-                        print(
-                            f"\n=== {model}, experts={num_experts}, topk={topk},"
-                            f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
-                        )
-
-                        config_results = []
-                        for size_m in args.batch_sizes:
-                            mkn = (size_m, size_k, size_n)
-                            result = bench_run(
-                                [],  # Not used anymore
-                                model,
-                                num_experts,
-                                topk,
-                                per_act_token,
-                                per_out_ch,
-                                mkn,
-                            )
-                            if result:
-                                config_results.append(result)
-
-                        # Print results table for this configuration
-                        if config_results:
-                            print(
-                                f"\n{'Batch Size':<12}"
-                                f"{'Triton (us)':<15}"
-                                f"{'CUTLASS (us)':<15}"
-                            )
-                            print("-" * 45)
-                            for result in config_results:
-                                print(
-                                    f"{result['batch_size']:<12}"
-                                    f"{result['triton_time_us']:<15.2f}"
-                                    f"{result['cutlass_time_us']:<15.2f}"
-                                )
-
-                            all_results.extend(config_results)
-
-    print(f"\nTotal benchmarks completed: {len(all_results)}")
-
-
-if __name__ == "__main__":
-    parser = FlexibleArgumentParser(
-        description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
-         across specified models/shapes/batches
-
-        Example usage:
-        python benchmark_cutlass_moe_fp8.py  \
-            --model "Llama-4-Maverick-17B-128E-Instruct-FP8"  \
-            --tp-sizes 8 \
-            --batch-size 2 4 8  \
-            --per-act-token-opts false \
-            --per-out-ch-opts false
-
-        """
-    )
-    parser.add_argument(
-        "--models",
-        nargs="+",
-        type=str,
-        default=DEFAULT_MODELS,
-        choices=WEIGHT_SHAPES_MOE.keys(),
-    )
-    parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
-    parser.add_argument(
-        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
-    )
-    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
-    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
-    parser.add_argument(
-        "--per-act-token-opts",
-        nargs="+",
-        type=lambda x: x.lower() == "true",
-        default=[False, True],
-        help="Per-activation token quantization options (true/false)",
-    )
-    parser.add_argument(
-        "--per-out-ch-opts",
-        nargs="+",
-        type=lambda x: x.lower() == "true",
-        default=[False, True],
-        help="Per-output channel quantization options (true/false)",
-    )
-
-    args = parser.parse_args()
-    main(args)
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@ -7,10 +7,6 @@ Benchmark script for device communicators:
 CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
 and SymmMemCommunicator (multimem, two-shot).

-for NCCL symmetric memory you need to set the environment variables
-NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1, otherwise NCCL does
-not use fast NVLS implementation for all reduce.
-
 Usage:
    torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]

@ -22,24 +18,18 @@ Example:
 import json
 import os
 import time
-from collections.abc import Callable
 from contextlib import nullcontext
+from typing import Callable, Optional

 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup

 from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
-from vllm.distributed.device_communicators.pynccl import (
-    PyNcclCommunicator,
-    register_nccl_symmetric_ops,
-)
-from vllm.distributed.device_communicators.pynccl_allocator import (
-    set_graph_pool_id,
-)
+from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 from vllm.logger import init_logger
-from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser

 logger = init_logger(__name__)

@ -108,7 +98,6 @@ class CommunicatorBenchmark:
            )
            if not self.pynccl_comm.disabled:
                logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
-                register_nccl_symmetric_ops(self.pynccl_comm)
            else:
                logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
                self.pynccl_comm = None
@ -205,15 +194,6 @@ class CommunicatorBenchmark:
                    None,  # no env variable needed
                )
            )
-            communicators.append(
-                (
-                    "pynccl-symm",
-                    lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
-                    lambda t: True,  # Always available if initialized
-                    nullcontext(),
-                    None,  # no env variable needed
-                )
-            )

        if self.symm_mem_comm_multimem is not None:
            comm = self.symm_mem_comm_multimem
@ -264,12 +244,12 @@ class CommunicatorBenchmark:
    def benchmark_allreduce_single(
        self,
        sequence_length: int,
-        allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
+        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
        should_use_fn: Callable[[torch.Tensor], bool],
        context,
        num_warmup: int,
        num_trials: int,
-    ) -> float | None:
+    ) -> Optional[float]:
        """Benchmark method with CUDA graph optimization."""
        try:
            # Create test tensor (2D: sequence_length x hidden_size)
@ -291,9 +271,7 @@ class CommunicatorBenchmark:
                # Capture the graph using context manager
                with context:
                    graph = torch.cuda.CUDAGraph()
-                    graph_pool = torch.cuda.graph_pool_handle()
-                    set_graph_pool_id(graph_pool)
-                    with torch.cuda.graph(graph, pool=graph_pool):
+                    with torch.cuda.graph(graph):
                        for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
                            allreduce_fn(graph_input)

--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
    fused_experts,
    fused_topk,
 )
-from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser

 DEFAULT_MODELS = [
    "nm-testing/Mixtral-8x7B-Instruct-v0.1",
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@ -7,8 +7,7 @@ import torch

 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.platforms import current_platform
-from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser


@torch.inference_mode()
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@ -6,12 +6,11 @@ import copy
 import json
 import pickle
 import time
-from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum, auto
 from itertools import product
 from pathlib import Path
-from typing import Any
+from typing import Any, Callable, Optional

 import torch
 import torch.utils.benchmark as TBenchmark
@ -25,7 +24,7 @@ if HAS_TRITON:
    from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
    from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT

-from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_TP_SIZES = [1]
@ -80,9 +79,9 @@ def make_rand_lora_weight_tensor(


 def make_rand_tensors(
-    a_shape: tuple[int, ...],
-    b_shape: tuple[int, ...],
-    c_shape: tuple[int, ...],
+    a_shape: tuple[int],
+    b_shape: tuple[int],
+    c_shape: tuple[int],
    a_dtype: torch.dtype,
    b_dtype: torch.dtype,
    c_dtype: torch.dtype,
@ -159,7 +158,7 @@ def ref_group_gemm(
    seq_lens_cpu: torch.Tensor,
    prompt_lora_mapping_cpu: torch.Tensor,
    scaling: float,
-    add_inputs: bool | None,
+    add_inputs: Optional[bool],
 ):
    """
    Torch group gemm reference implementation to test correctness of
@ -244,7 +243,7 @@ class OpType(Enum):
        lora_rank: int,
        num_loras: int,
        num_slices: int,
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+    ) -> tuple[tuple[int], tuple[int], tuple[int]]:
        """
        Given num_slices, return the shapes of the A, B, and C matrices
        in A x B = C, for the op_type
@ -317,8 +316,8 @@ class BenchmarkContext:
    lora_rank: int
    sort_by_lora_id: bool
    dtype: torch.dtype
-    seq_length: int | None = None
-    num_slices: int | None = None  # num_slices for slice based ops
+    seq_length: Optional[int] = None
+    num_slices: Optional[int] = None  # num_slices for slice based ops

    def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
        ctx = copy.copy(self)
@ -562,7 +561,7 @@ class BenchmarkTensors:
        }

    def bench_fn_kwargs(
-        self, op_type: OpType, add_inputs: bool | None = None
+        self, op_type: OpType, add_inputs: Optional[bool] = None
    ) -> dict[str, Any]:
        if op_type.is_shrink_fn():
            assert add_inputs is None
@ -576,7 +575,7 @@ class BenchmarkTensors:
        raise ValueError(f"Unrecognized optype {self}")

    def test_correctness(
-        self, op_type: OpType, expand_fn_add_inputs: bool | None
+        self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
    ) -> bool:
        """
        Test correctness of op_type implementation against a grouped gemm
@ -612,8 +611,8 @@ def bench_optype(
    ctx: BenchmarkContext,
    arg_pool_size: int,
    op_type: OpType,
-    cuda_graph_nops: int | None = None,
-    expand_fn_add_inputs: bool | None = None,
+    cuda_graph_nops: Optional[int] = None,
+    expand_fn_add_inputs: Optional[bool] = None,
    test_correctness: bool = False,
 ) -> TMeasurement:
    assert arg_pool_size >= 1
@ -680,7 +679,7 @@ def bench_torch_mm(
    ctx: BenchmarkContext,
    arg_pool_size: int,
    op_type: OpType,
-    cuda_graph_nops: int | None = None,
+    cuda_graph_nops: Optional[int] = None,
 ) -> TMeasurement:
    """
    Benchmark basic torch.mm as a roofline.
@ -745,7 +744,7 @@ def use_cuda_graph_recommendation() -> str:
            """


-def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
+def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
    compare = TBenchmark.Compare(timers)
    compare.print()

--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@ -8,9 +8,10 @@ import math
 import os
 import pickle as pkl
 import time
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
 from dataclasses import dataclass
 from itertools import product
+from typing import Callable, Optional

 import pandas as pd
 import torch
@ -33,7 +34,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
    quantize_weights,
 )
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser

 DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
@ -62,23 +63,23 @@ class BenchmarkTensors:
    a: torch.Tensor

    w_q: torch.Tensor
-    group_size: int | None
+    group_size: Optional[int]
    wtype: ScalarType
    w_g_s: torch.Tensor
-    w_g_zp: torch.Tensor | None
-    w_ch_s: torch.Tensor | None
-    w_tok_s: torch.Tensor | None
+    w_g_zp: Optional[torch.Tensor]
+    w_ch_s: Optional[torch.Tensor]
+    w_tok_s: Optional[torch.Tensor]


@dataclass
 class TypeConfig:
    act_type: torch.dtype
    weight_type: ScalarType
-    output_type: torch.dtype | None
-    group_scale_type: torch.dtype | None
-    group_zero_type: torch.dtype | None
-    channel_scale_type: torch.dtype | None
-    token_scale_type: torch.dtype | None
+    output_type: Optional[torch.dtype]
+    group_scale_type: Optional[torch.dtype]
+    group_zero_type: Optional[torch.dtype]
+    channel_scale_type: Optional[torch.dtype]
+    token_scale_type: Optional[torch.dtype]


 def rand_data(shape, dtype=torch.float16, scale=1):
@ -92,8 +93,8 @@ def quantize_and_pack(
    atype: torch.dtype,
    w: torch.Tensor,
    wtype: ScalarType,
-    stype: torch.dtype | None,
-    group_size: int | None,
+    stype: Optional[torch.dtype],
+    group_size: Optional[int],
    zero_points: bool = False,
 ):
    assert wtype.is_integer(), "TODO: support floating point weights"
@ -112,7 +113,7 @@ def quantize_and_pack(


 def create_bench_tensors(
-    shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
+    shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
 ) -> list[BenchmarkTensors]:
    m, n, k = shape

@ -330,8 +331,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
    return res


-_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
-_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
+_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
+_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None


 def bench(
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Woosuk Kwon	936da0f740	update Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>	2025-09-19 23:30:15 +00:00
Woosuk Kwon	20098c10d9	Remove global CUDA graph pool Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>	2025-09-19 23:27:51 +00:00
				`@ -1 +0,0 @@`
				`Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml`