Fix typo in ValueError message: use kv_role instead of kv_disagg_role (#27166 )

Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
[Bugfix] Fix error with penalties when speculative decoding and structural output are enabled (#26586 )
2025-10-20 14:53:52 +08:00 · 2025-10-19 19:47:19 +00:00 · 2025-10-19 19:24:46 +00:00 · 2025-10-19 05:20:55 -07:00 · 2025-10-19 03:06:32 -07:00 · 2025-10-19 08:10:07 +00:00
2091 changed files with 209913 additions and 157772 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@ -5,11 +5,11 @@ import os
 import sys
 import zipfile

-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))


 def print_top_10_largest_files(zip_file):
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.419
+  - name: "exact_match,flexible-extract"
+    value: 0.416
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
@ -0,0 +1,12 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
+    value: 0.80
+limit: 100
+num_fewshot: 0
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@ -0,0 +1,10 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.80
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
+
+model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    value: 0.855
+limit: 2500
+num_fewshot: 0
--- a/.buildkite/lm-eval-harness/configs/models-large-h100.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-h100.txt
@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
--- a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
+++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
--- a/.buildkite/lm-eval-harness/configs/models-mm-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt
@ -0,0 +1 @@
+Qwen2.5-VL-7B-Instruct.yaml
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@ -0,0 +1,44 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on chartqa for vllm.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install lm-eval==0.4.9
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on ChartQA using multimodal vllm."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our correctness tests in vllm's CI."
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:l:t:" OPT; do
+  case ${OPT} in
+    m ) 
+        MODEL="$OPTARG"
+        ;;
+    l ) 
+        LIMIT="$OPTARG"
+        ;;
+    t ) 
+        TP_SIZE="$OPTARG"
+        ;;
+    \? ) 
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm-vlm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
+  --tasks chartqa \
+  --batch_size auto \
+  --apply_chat_template \
+  --limit $LIMIT
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@ -0,0 +1,50 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-test-accuracy workflow"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    b )
+        BATCH_SIZE="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    f )
+        FEWSHOT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+  --batch_size auto
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -19,21 +19,27 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)
+    batch_size = eval_config.get("batch_size", "auto")
+    backend = eval_config.get("backend", "vllm")
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager=true,"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
+        f"max_model_len={max_model_len},"
    )
    results = lm_eval.simple_evaluate(
-        model="vllm",
+        model=backend,
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
-        batch_size="auto",
+        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+        # text models. however, this is regressing measured strict-match for
+        # existing text models in CI, so only apply it for mm.
+        apply_chat_template=backend == "vllm-vlm",
+        batch_size=batch_size,
    )
    return results

--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -368,7 +368,7 @@ if __name__ == "__main__":
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+            lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
        )

    # get markdown tables
--- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
@ -181,18 +181,14 @@ launch_vllm_server() {
  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
    echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
    model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
        -tp $tp \
-        --model $model \
        --port $port \
        $server_args"
  else
    echo "Key 'fp8' does not exist in common params."
-    server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
        -tp $tp \
-        --model $model \
        --port $port \
        $server_args"
  fi
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -365,8 +365,7 @@ run_serving_tests() {
      continue
    fi

-    server_command="$server_envs python3 \
-      -m vllm.entrypoints.openai.api_server \
+    server_command="$server_envs vllm serve \
      $server_args"

    # run the server
@ -455,11 +454,6 @@ main() {
  fi
  check_hf_token

-  # Set to v1 to run v1 benchmark
-  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-    export VLLM_USE_V1=1
-  fi
-
  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)
--- a/.buildkite/pyproject.toml
+++ b/.buildkite/pyproject.toml
@ -1,46 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
-    # pycodestyle
-    "E",
-    # Pyflakes
-    "F",
-    # pyupgrade
-    "UP",
-    # flake8-bugbear
-    "B",
-    # flake8-simplify
-    "SIM",
-    # isort
-    "I",
-    # flake8-logging-format
-    "G",
-]
-ignore = [
-    # star imports
-    "F405", "F403",
-    # lambda expression assignment
-    "E731",
-    # Loop control variable not used within loop body
-    "B007",
-    # f-string format
-    "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
-]
-
-[tool.ruff.format]
-docstring-code-format = true
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -8,7 +8,7 @@ steps:
    commands:
      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
@ -48,7 +48,7 @@ steps:
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
@ -76,7 +76,7 @@ steps:
      queue: arm64_cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

  # Add job to create multi-arch manifest
@ -150,11 +150,16 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
-      - "docker push vllm/vllm-openai:nightly"
-      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
+      - "docker push vllm/vllm-openai:nightly-x86_64"
+      - "docker push vllm/vllm-openai:nightly-aarch64"
+      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest push vllm/vllm-openai:nightly"
+      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
      # Clean up old nightly builds (keep only last 14)
      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
    plugins:
@ -163,3 +168,4 @@ steps:
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"
+      DOCKERHUB_USERNAME: "vllmbot"
--- a/.buildkite/scripts/cleanup-nightly-builds.sh
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@ -8,20 +8,41 @@ set -ex
 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"

-# Get DockerHub token from environment
+# Get DockerHub credentials from environment
 if [ -z "$DOCKERHUB_TOKEN" ]; then
    echo "Error: DOCKERHUB_TOKEN environment variable is not set"
    exit 1
 fi

+if [ -z "$DOCKERHUB_USERNAME" ]; then
+    echo "Error: DOCKERHUB_USERNAME environment variable is not set"
+    exit 1
+fi
+
+# Get DockerHub bearer token
+echo "Getting DockerHub bearer token..."
+set +x
+BEARER_TOKEN=$(curl -s -X POST \
+    -H "Content-Type: application/json" \
+    -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
+    "https://hub.docker.com/v2/users/login" | jq -r '.token')
+set -x
+
+if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
+    echo "Error: Failed to get DockerHub bearer token"
+    exit 1
+fi
+
 # Function to get all tags from DockerHub
 get_all_tags() {
    local page=1
    local all_tags=""
    
    while true; do
-        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
+        set +x
+        local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
            "$REPO_API_URL?page=$page&page_size=100")
+        set -x
        
        # Get both last_updated timestamp and tag name, separated by |
        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
@ -43,7 +64,9 @@ delete_tag() {
    echo "Deleting tag: $tag_name"
    
    local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
-    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
+    set +x
+    local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
+    set -x
    
    if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
        echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi

-if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
-  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
-fi
-
 if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@ -25,25 +25,28 @@ function cpu_tests() {

  # offline inference
  podman exec -it "$container_id" bash -c "
-    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+    set -xve
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log

  # Run basic model test
  podman exec -it "$container_id" bash -c "
-    set -e
+    set -evx
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator
-    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
+
+    # Note: disable Bart until supports V1
+    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
+    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
 }

 # All of CPU tests are expected to be finished less than 40 mins.

 export container_id
 export -f cpu_tests
-timeout 40m bash -c cpu_tests
+timeout 120m bash -c cpu_tests

--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -58,11 +58,8 @@ function cpu_tests() {
    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model

-    # Note: disable Bart until supports V1
-    pytest -x -v -s tests/models/language/generation -m cpu_model \
-                --ignore=tests/models/language/generation/test_bart.py
-    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
-                --ignore=tests/models/language/generation/test_bart.py
+    pytest -x -v -s tests/models/language/generation -m cpu_model
+    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model

    pytest -x -v -s tests/models/language/pooling -m cpu_model
    pytest -x -v -s tests/models/multimodal/generation \
@ -73,7 +70,7 @@ function cpu_tests() {
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"

  # Note: disable it until supports V1
  # Run AWQ test
--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@ -0,0 +1,191 @@
+#!/bin/bash
+
+# This script build the Ascend NPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Base ubuntu image with basic ascend development libraries and python installed
+VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
+CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
+TEST_RUN_CONFIG_FILE="vllm_test.cfg"
+VLLM_ASCEND_TMP_DIR=
+# Get the test run configuration file from the vllm-ascend repository
+fetch_vllm_test_cfg() {
+    VLLM_ASCEND_TMP_DIR=$(mktemp -d)
+    # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
+    cleanup() {
+        rm -rf "${VLLM_ASCEND_TMP_DIR}"
+    }
+    trap cleanup EXIT
+
+    GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
+    if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
+        echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
+        exit 1
+    fi
+
+    # If the file already exists locally, just overwrite it
+    cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
+    echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
+
+    # Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
+    # when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
+    rm -rf "${VLLM_ASCEND_TMP_DIR}"
+    trap - EXIT
+}
+
+# Downloads test run configuration file from a remote URL.
+# Loads the configuration into the current script environment.
+get_config() {
+    if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
+        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
+        exit 1
+    fi
+    source "${TEST_RUN_CONFIG_FILE}"
+    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
+    return 0
+}
+
+# get test running configuration.
+fetch_vllm_test_cfg
+get_config
+# Check if the function call was successful. If not, exit the script.
+if [ $? -ne 0 ]; then
+  exit 1
+fi
+
+image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
+container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
+agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
+echo "agent_idx: ${agent_idx}"
+builder_name="cachebuilder${agent_idx}"
+builder_cache_dir="/mnt/docker-cache${agent_idx}"
+mkdir -p ${builder_cache_dir}
+
+# Try building the docker image
+cat <<EOF | DOCKER_BUILDKIT=1 docker build \
+    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
+    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
+                           --cache-to type=local,dest=${builder_cache_dir},mode=max \
+    --progress=plain --load -t ${image_name} -f - .
+FROM ${BASE_IMAGE_NAME}
+
+# Define environments
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
+    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
+    apt-get update -y && \
+    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
+    rm -rf /var/cache/apt/* && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install for pytest to make the docker build cache layer always valid
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install pytest>=6.0  modelscope
+
+WORKDIR /workspace/vllm
+
+# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
+COPY requirements/common.txt /workspace/vllm/requirements/common.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements/common.txt
+
+COPY . .
+
+# Install vLLM
+RUN --mount=type=cache,target=/root/.cache/pip \
+    VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+    python3 -m pip uninstall -y triton
+
+# Install vllm-ascend
+WORKDIR /workspace
+ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
+ARG VLLM_ASCEND_TAG=main
+RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
+    git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
+
+# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r /workspace/vllm-ascend/requirements.txt
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    source /usr/local/Ascend/nnal/atb/set_env.sh && \
+    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
+
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+ENV VLLM_USE_MODELSCOPE=True
+
+WORKDIR /workspace/vllm-ascend
+
+CMD ["/bin/bash"]
+
+EOF
+
+# Setup cleanup
+remove_docker_container() {
+  docker rm -f "${container_name}" || true;
+  docker image rm -f "${image_name}" || true;
+  docker system prune -f || true;
+}
+trap remove_docker_container EXIT
+
+# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
+# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
+#   e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
+#   returns --device /dev/davinci0 --device /dev/davinci1
+parse_and_gen_devices() {
+    local input="$1"
+    local index cards_num
+    if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
+        index="${BASH_REMATCH[1]}"
+        cards_num="${BASH_REMATCH[2]}"
+    else
+        echo "parse error" >&2
+        return 1
+    fi
+
+    local devices=""
+    local i=0
+    while (( i < cards_num )); do
+        local dev_idx=$(((index - 1)*cards_num + i ))
+        devices="$devices --device /dev/davinci${dev_idx}"
+        ((i++))
+    done
+
+    # trim leading space
+    devices="${devices#"${devices%%[![:space:]]*}"}"
+    # Output devices: assigned to the caller variable
+    printf '%s' "$devices"
+}
+
+devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
+
+# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
+# This test checks whether the OOT platform interface is functioning properly in conjunction with
+# the hardware plugin vllm-ascend.
+model_cache_dir=/mnt/modelscope${agent_idx}
+mkdir -p ${model_cache_dir}
+docker run \
+    ${devices} \
+    --device /dev/davinci_manager \
+    --device /dev/devmm_svm \
+    --device /dev/hisi_hdc \
+    -v /usr/local/dcmi:/usr/local/dcmi \
+    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+    -v /etc/ascend_install.info:/etc/ascend_install.info \
+    -v ${model_cache_dir}:/root/.cache/modelscope \
+    --entrypoint="" \
+    --name "${container_name}" \
+    "${image_name}" \
+    bash -c '
+    set -e
+    pytest -v -s tests/e2e/vllm_interface/
+'
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@ -62,12 +62,11 @@ echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-    && python3 -m pip install --progress-bar off hf-transfer
+    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"

 echo "--- Hardware Information ---"
 # tpu-info
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -62,12 +62,11 @@ echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-    && python3 -m pip install --progress-bar off hf-transfer
+    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"

 echo "--- Hardware Information ---"
 # tpu-info
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -35,16 +35,14 @@ docker run \
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
    cd tests
    pytest -v -s v1/core
    pytest -v -s v1/engine
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
+    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
    pytest -v -s v1/test_serial_utils.py
-    pytest -v -s v1/test_utils.py
-    pytest -v -s v1/test_metrics_reader.py
 '
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
 bench_throughput_exit_code=$?

 # run server-based benchmarks and upload the result to buildkite
-python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
+vllm serve meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@ -0,0 +1,59 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Setup script for Prime-RL integration tests
+# This script prepares the environment for running Prime-RL tests with nightly vLLM
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
+PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
+
+echo "Setting up Prime-RL integration test environment..."
+
+# Clean up any existing Prime-RL directory
+if [ -d "${PRIME_RL_DIR}" ]; then
+    echo "Removing existing Prime-RL directory..."
+    rm -rf "${PRIME_RL_DIR}"
+fi
+
+# Install UV if not available
+if ! command -v uv &> /dev/null; then
+    echo "Installing UV package manager..."
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    source $HOME/.local/bin/env
+fi
+
+# Clone Prime-RL repository at specific branch for reproducible tests
+PRIME_RL_BRANCH="integ-vllm-main"
+echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
+git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
+cd "${PRIME_RL_DIR}"
+
+echo "Setting up UV project environment..."
+export UV_PROJECT_ENVIRONMENT=/usr/local
+ln -s /usr/bin/python3 /usr/local/bin/python
+
+# Remove vllm pin from pyproject.toml
+echo "Removing vllm pin from pyproject.toml..."
+sed -i '/vllm==/d' pyproject.toml
+
+# Sync Prime-RL dependencies
+echo "Installing Prime-RL dependencies..."
+uv sync --inexact && uv sync --inexact --all-extras
+
+# Verify installation
+echo "Verifying installations..."
+uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
+uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
+
+echo "Prime-RL integration test environment setup complete!"
+
+echo "Running Prime-RL integration tests..."
+export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
+uv run pytest -vs tests/integration/test_rl.py -m gpu
+
+echo "Prime-RL integration tests completed!"
--- a/.buildkite/scripts/tpu/quantized_v6e_1.env
+++ b/.buildkite/scripts/tpu/quantized_v6e_1.env
@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=10.0
+EXPECTED_THROUGHPUT=8.7
 INPUT_LEN=1800
 OUTPUT_LEN=128
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@ -42,7 +42,7 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo

-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
 --seed 42 \
 --max-num-seqs $MAX_NUM_SEQS \
 --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -6,24 +6,28 @@
 # to generate the final pipeline yaml file.

 # Documentation
-# label(str): the name of the test. emoji allowed.
-# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
-# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
-# fast_check_only(bool): run this test on fastcheck pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
+# label(str): the name of the test. emojis allowed.
+# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
+# fast_check_only(bool): run this test on the fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
+# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
 # command(str): the single command to run for tests. incompatible with commands.
-# commands(list): the list of commands to run for test. incompatbile with command.
-# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
-# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
-# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
-# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
-#     in this case, commands must be specified. the first command runs on first host, the second
+# commands(list): the list of commands to run for the test. incompatible with command.
+# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
+# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
+# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
+#     in this case, commands must be specified. the first command runs on the first host, the second
 #     command runs on the second host.
-# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
-# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
+# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
+# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
+#     and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
+# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.

 # When adding a test
-# - If the test belong to an existing group, add it there
+# - If the test belongs to an existing group, add it there
 # - If the test is short, add to any existing step
 # - If the test takes more than 10min, then it is okay to create a new step.
 #   Note that all steps execute in parallel.
@ -46,19 +50,28 @@ steps:
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
+  - tests/multimodal
+  - tests/utils_
+  commands:
+  - pytest -v -s -m 'not cpu_test' multimodal
+  - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+  timeout_in_minutes: 10
+  source_file_dependencies:
+  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
-  - tests/utils_
  - tests/standalone_tests/lazy_imports.py
  - tests/transformers_utils
+  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
-  - pytest -v -s multimodal
-  - pytest -v -s utils_ # Utils
-  - pytest -v -s transformers_utils # transformers_utils
+  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s transformers_utils

 - label: Python-only Installation Test # 10min
  timeout_in_minutes: 20
@ -110,7 +123,7 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Entrypoints Integration Test (API Server) # 100min
  timeout_in_minutes: 130
@ -148,7 +161,6 @@ steps:
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
-  - vllm/core/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
@ -156,28 +168,34 @@ steps:
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
-  - tests/v1/test_async_llm_dp.py
-  - tests/v1/test_external_lb_dp.py
-  - tests/v1/test_internal_lb_dp.py
-  - tests/v1/test_hybrid_lb_dp.py
+  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
+  - tests/distributed/test_symm_mem_allreduce.py
  commands:
-  # test with tp=2 and external_dp=2
-  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=2 and external_dp=2
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-  # test with tp=2 and pp=2
+  # test with torchrun tp=2 and pp=2
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=4 and dp=1
+  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2, pp=2 and dp=1
+  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=1 and dp=4 with ep
+  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2 and dp=2 with ep
+  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
+  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
@ -278,24 +296,35 @@ steps:
    - tests/v1
  commands:
    # split the test to avoid interference
-    - pytest -v -s v1/core
+    - pytest -v -s -m 'not cpu_test' v1/core
    - pytest -v -s v1/executor
    - pytest -v -s v1/kv_offload
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
-    - pytest -v -s v1/structured_output
    - pytest -v -s v1/spec_decode
-    - pytest -v -s v1/kv_connector/unit
-    - pytest -v -s v1/metrics
-    - pytest -v -s v1/test_serial_utils.py
-    - pytest -v -s v1/test_utils.py
+    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'not cpu_test' v1/metrics
    - pytest -v -s v1/test_oracle.py
-    - pytest -v -s v1/test_metrics_reader.py
+    - pytest -v -s v1/test_request.py
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

+- label: V1 Test others (CPU) # 5 mins
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  no_gpu: true
+  commands:
+    # split the test to avoid interference
+    - pytest -v -s -m 'cpu_test' v1/core
+    - pytest -v -s v1/structured_output
+    - pytest -v -s v1/test_serial_utils.py
+    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'cpu_test' v1/metrics
+
+
 - label: Examples Test # 30min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
@ -314,12 +343,13 @@ steps:
    - python3 offline_inference/vision_language.py --seed 0
    - python3 offline_inference/vision_language_pooling.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
-    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
-    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048

 - label: Platform Tests (CUDA) # 4min
  timeout_in_minutes: 15
@ -368,12 +398,12 @@ steps:
    - pytest -v -s compile/test_pass_manager.py
    - pytest -v -s compile/test_fusion.py
    - pytest -v -s compile/test_fusion_attn.py
+    - pytest -v -s compile/test_functionalization.py
    - pytest -v -s compile/test_silu_mul_quant_fusion.py
-    - pytest -v -s compile/test_sequence_parallelism.py
-    - pytest -v -s compile/test_async_tp.py
    - pytest -v -s compile/test_fusion_all_reduce.py
    - pytest -v -s compile/test_decorator.py
    - pytest -v -s compile/test_noop_elimination.py
+    - pytest -v -s compile/test_aot_compile.py

 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@ -386,8 +416,8 @@ steps:
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s compile/piecewise/

- label: PyTorch Fullgraph Test # 20min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 22min
+  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -395,6 +425,7 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
+  - pytest -v -s compile/test_fusions_e2e.py

 - label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
@ -402,8 +433,9 @@ steps:
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
+  - tests/kernels/test_top_k_per_row.py
  commands:
-    - pytest -v -s kernels/core
+    - pytest -v -s kernels/core kernels/test_top_k_per_row.py

 - label: Kernels Attention Test %N # 23min
  timeout_in_minutes: 35
@ -447,32 +479,22 @@ steps:
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
+  - vllm/model_executor/layers/mamba/ops
  commands:
    - pytest -v -s kernels/mamba

- label: Tensorizer Test # 14min
-  timeout_in_minutes: 25
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/model_executor/model_loader
-  - tests/tensorizer_loader
-  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
-  commands:
-    - apt-get update && apt-get install -y curl libsodium23
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - pytest -v -s tensorizer_loader
-    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
-
- label: Model Executor Test # 7min
-  timeout_in_minutes: 20
+- label: Model Executor Test # 23min
+  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor
  - tests/model_executor
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s model_executor
+    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

 - label: Benchmarks # 11min
  timeout_in_minutes: 20
@ -506,8 +528,9 @@ steps:
  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
-  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+  # TODO(jerryzh168): resolve the above comment
+  - uv pip install --system torchao==0.13.0
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
@ -535,10 +558,17 @@ steps:
  source_file_dependencies:
    - vllm/
    - tests/tool_use
-    - tests/mistral_tool_use
  commands:
-    - pytest -v -s tool_use
-    - pytest -v -s mistral_tool_use
+    - pytest -v -s -m 'not cpu_test' tool_use
+
+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+  timeout_in_minutes: 10
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  no_gpu: true
+  commands:
+    - pytest -v -s -m 'cpu_test' tool_use

 #####  models test  #####

@ -578,13 +608,19 @@ steps:
  - vllm/
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
+  commands:
+    - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+  timeout_in_minutes: 10
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
  - tests/models/test_utils.py
  - tests/models/test_vision.py
+  no_gpu: true
  commands:
-    - pytest -v -s models/test_transformers.py \
-                   models/test_registry.py \
-                   models/test_utils.py \
-                   models/test_vision.py
+    - pytest -v -s models/test_utils.py models/test_vision.py

 - label: Language Models Tests (Standard)
  timeout_in_minutes: 25
@ -699,6 +735,16 @@ steps:
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
 - label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
  optional: true
@ -754,14 +800,16 @@ steps:
  commands:
    - pip install --upgrade git+https://github.com/huggingface/transformers
    - pytest -v -s tests/models/test_initialization.py
+    - pytest -v -s tests/models/test_transformers.py
    - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py
    - python3 examples/offline_inference/basic/chat.py
-    - python3 examples/offline_inference/audio_language.py --model-type whisper
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    # Whisper needs spawn method to avoid deadlock
+    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

- label: Blackwell Test # 38 min
-  timeout_in_minutes: 60
+- label: Blackwell Test # 21 min
+  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
@ -774,8 +822,6 @@ steps:
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/fusion.py
-  - vllm/compilation/fusion_attn.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py
@ -792,19 +838,38 @@ steps:
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
-    - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
-    # Fusion
-    - pytest -v -s tests/compile/test_fusion_all_reduce.py
-    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
+    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
-    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py

- label: GPT-OSS Eval (Blackwell)
+- label: Blackwell Fusion Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  commands:
+    - nvidia-smi
+    - pytest -v -s tests/compile/test_fusion_attn.py
+    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    - pytest -v -s tests/compile/test_fusion_all_reduce.py
+    - pytest -v -s tests/compile/test_fusions_e2e.py
+
+- label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
-  optional: true # disable while debugging
+  optional: true # run on nightlies
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
@ -812,7 +877,34 @@ steps:
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
-    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Blackwell Quantized MoE Test
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - tests/quantization/test_blackwell_moe.py
+  - vllm/model_executor/models/deepseek_v2.py
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/models/llama4.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization/compressed_tensors
+  - vllm/model_executor/layers/quantization/modelopt.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Blackwell LM Eval Small Models
+  timeout_in_minutes: 120
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1

 #####  1 GPU test  #####
 #####  multi gpus test  #####
@ -856,47 +948,58 @@ steps:
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code

- label: Distributed Tests (2 GPUs) # 110min
-  timeout_in_minutes: 150
+- label: Distributed Tests (2 GPUs) # 68min
+  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
+  - vllm/compilation/
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
-  - vllm/model_executor/models/
-  - tests/distributed/
-  - vllm/compilation
  - vllm/worker/worker_base.py
-  - vllm/worker/worker.py
-  - vllm/worker/model_runner.py
-  - entrypoints/llm/test_collective_rpc.py
-  - tests/v1/test_async_llm_dp.py
-  - tests/v1/test_external_lb_dp.py
-  - tests/v1/entrypoints/openai/test_multi_api_servers.py
  - vllm/v1/engine/
+  - vllm/v1/worker/
+  - tests/compile/test_basic_correctness.py
+  - tests/compile/test_wrapper.py
+  - tests/distributed/
+  - tests/entrypoints/llm/test_collective_rpc.py
+  - tests/v1/distributed
+  - tests/v1/entrypoints/openai/test_multi_api_servers.py
+  - tests/v1/shutdown
+  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - pytest -v -s distributed/test_sequence_parallel.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Model Tests (2 GPUs) # 37min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/model_executor/model_loader/sharded_state_loader.py
+  - vllm/model_executor/models/
+  - tests/basic_correctness/
+  - tests/model_executor/model_loader/test_sharded_state_loader.py
+  - tests/models/
+  commands:
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
-  # test sequence parallel
-  - pytest -v -s distributed/test_sequence_parallel.py
-  # this test fails consistently.
-  # TODO: investigate and fix
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
-  - pytest -v -s models/multimodal/generation/test_maverick.py

 - label: Plugin Tests (2 GPUs) # 40min
  timeout_in_minutes: 60
@ -917,6 +1020,11 @@ steps:
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
@ -981,6 +1089,17 @@ steps:
  - tests/weight_loading
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+  
+- label: NixlConnector PD accuracy tests (Distributed) # 30min
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh


 ##### multi gpus test #####
@ -1013,12 +1132,16 @@ steps:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

 ##### H200 test #####
- label: Distrubted Tests (H200) # optional
+- label: Distributed Tests (H200) # optional
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
+    - pytest -v -s tests/compile/test_async_tp.py
+    - pytest -v -s tests/compile/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/test_fusion_all_reduce.py
+    - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
    - pytest -v -s tests/distributed/test_context_parallel.py
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048

@ -1030,3 +1153,16 @@ steps:
  num_gpus: 2
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
+    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+
+##### RL Integration Tests #####
+- label: Prime-RL Integration Test # 15min
+  timeout_in_minutes: 30
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  source_file_dependencies:
+  - vllm/
+  - .buildkite/scripts/run-prime-rl-test.sh
+  commands:
+    - bash .buildkite/scripts/run-prime-rl-test.sh
--- a/.coveragerc
+++ b/.coveragerc
@ -1,5 +1,10 @@
 [run]
-source = vllm
+# Track the installed vllm package (this is what actually gets imported during tests)
+# Use wildcard pattern to match the installed location
+source =
+    vllm
+    */dist-packages/vllm
+    */site-packages/vllm
 omit =
    */tests/*
    */test_*
@ -12,6 +17,16 @@ omit =
    */benchmarks/*
    */docs/*

+[paths]
+# Map all possible vllm locations to a canonical "vllm" path
+# This ensures coverage.combine properly merges data from different test runs
+source =
+    vllm
+    /vllm-workspace/src/vllm
+    /vllm-workspace/vllm
+    */site-packages/vllm
+    */dist-packages/vllm
+
 [report]
 exclude_lines =
    pragma: no cover
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@ -0,0 +1,4 @@
+# Migrate from `yapf` & `isort` to `ruff`
+d6953beb91da4e9c99be4c0a1304a2d24189535c
+# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
+8fcaaf6a165e661f63fc51be906bc05b0767332f
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -4,19 +4,12 @@
 # This lists cover the "core" components of vLLM that require careful review
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/fused_moe @mgoin
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
-/vllm/v1/attention @LucasWilkinson
-/vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
@ -28,14 +21,16 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
 /vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
-/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/attention @LucasWilkinson
 /vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/sample @22quinn @houseroad @njhill
+/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/offloading @ApostaC

@ -57,27 +52,35 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
-/tests/v1/kv_connector/nixl_integration @NickLucche 
+/tests/v1/kv_connector/nixl_integration @NickLucche
 /tests/v1/kv_connector @ApostaC
 /tests/v1/offloading @ApostaC

 # Transformers backend
-/vllm/model_executor/models/transformers.py @hmellor
+/vllm/model_executor/models/transformers @hmellor
 /tests/models/test_transformers.py @hmellor

 # Docs
-/docs @hmellor
+/docs/mkdocs @hmellor
+/docs/**/*.yml @hmellor
+/requirements/docs.txt @hmellor
+.readthedocs.yaml @hmellor
 mkdocs.yaml @hmellor

+# Linting
+.markdownlint.yaml @hmellor
+.pre-commit-config.yaml @hmellor
+/tools/pre_commit @hmellor
+
 # CPU
-/vllm/v1/worker/^cpu @bigPYJ1151
+/vllm/v1/worker/cpu* @bigPYJ1151
 /csrc/cpu @bigPYJ1151
 /vllm/platforms/cpu.py @bigPYJ1151
 /cmake/cpu_extension.cmake @bigPYJ1151
 /docker/Dockerfile.cpu @bigPYJ1151

 # Intel GPU
-/vllm/v1/worker/^xpu @jikunshang
+/vllm/v1/worker/xpu* @jikunshang
 /vllm/platforms/xpu.py @jikunshang
 /docker/Dockerfile.xpu @jikunshang

@ -115,3 +118,11 @@ mkdocs.yaml @hmellor

 # KVConnector installation files
 /requirements/kv_connectors.txt @NickLucche
+
+# Pooling models
+/examples/*/pooling/ @noooop
+/tests/models/*/pooling* @noooop
+/tests/entrypoints/pooling @noooop
+/vllm/config/pooler.py @noooop
+/vllm/pooling_params.py @noooop
+/vllm/model_executor/layers/pooler.py @noooop
--- a/.github/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@ -43,10 +43,6 @@ body:
      Any other things you would like to mention.
  validations:
    required: false
- type: markdown
-  attributes:
-    value: >
-      Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
 - type: checkboxes
  id: askllm
  attributes:
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -2,6 +2,7 @@ pull_request_rules:
 - name: label-documentation
  description: Automatically apply documentation label
  conditions:
+    - label != stale
    - or:
      - files~=^[^/]+\.md$
      - files~=^docs/
@ -10,10 +11,13 @@ pull_request_rules:
    label:
      add:
        - documentation
+    comment:
+      message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"

 - name: label-ci-build
  description: Automatically apply ci/build label
  conditions:
+    - label != stale
    - or:
      - files~=^\.github/
      - files~=\.buildkite/
@ -30,6 +34,7 @@ pull_request_rules:
 - name: label-deepseek
  description: Automatically apply deepseek label
  conditions:
+    - label != stale
    - or:
      - files~=^examples/.*deepseek.*\.py
      - files~=^tests/.*deepseek.*\.py
@ -46,6 +51,7 @@ pull_request_rules:
 - name: label-frontend
  description: Automatically apply frontend label
  conditions:
+    - label != stale
    - files~=^vllm/entrypoints/
  actions:
    label:
@ -55,6 +61,7 @@ pull_request_rules:
 - name: label-llama
  description: Automatically apply llama label
  conditions:
+    - label != stale
    - or:
      - files~=^examples/.*llama.*\.py
      - files~=^tests/.*llama.*\.py
@ -70,6 +77,7 @@ pull_request_rules:
 - name: label-multi-modality
  description: Automatically apply multi-modality label
  conditions:
+    - label != stale
    - or:
      - files~=^vllm/multimodal/
      - files~=^tests/multimodal/
@ -83,6 +91,7 @@ pull_request_rules:
 - name: label-new-model
  description: Automatically apply new-model label
  conditions:
+    - label != stale
    - and:
      - files~=^vllm/model_executor/models/
      - files=vllm/model_executor/models/registry.py
@ -94,6 +103,7 @@ pull_request_rules:
 - name: label-performance
  description: Automatically apply performance label
  conditions:
+    - label != stale
    - or:
      - files~=^benchmarks/
      - files~=^vllm/benchmarks/
@ -107,6 +117,7 @@ pull_request_rules:
 - name: label-qwen
  description: Automatically apply qwen label
  conditions:
+    - label != stale
    - or:
      - files~=^examples/.*qwen.*\.py
      - files~=^tests/.*qwen.*\.py
@ -121,6 +132,7 @@ pull_request_rules:
 - name: label-gpt-oss
  description: Automatically apply gpt-oss label
  conditions:
+    - label != stale
    - or:
      - files~=^examples/.*gpt[-_]?oss.*\.py
      - files~=^tests/.*gpt[-_]?oss.*\.py
@ -142,6 +154,7 @@ pull_request_rules:
 - name: label-rocm
  description: Automatically apply rocm label
  conditions:
+    - label != stale
    - or:
      - files~=^csrc/rocm/
      - files~=^docker/Dockerfile.rocm
@ -162,6 +175,7 @@ pull_request_rules:
 - name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
+    - label != stale
    - or:
      - files~=^benchmarks/structured_schemas/
      - files=benchmarks/benchmark_serving_structured_output.py
@ -181,6 +195,7 @@ pull_request_rules:
 - name: label-speculative-decoding
  description: Automatically apply speculative-decoding label
  conditions:
+    - label != stale
    - or:
      - files~=^vllm/v1/spec_decode/
      - files~=^tests/v1/spec_decode/
@ -196,6 +211,7 @@ pull_request_rules:
 - name: label-v1
  description: Automatically apply v1 label
  conditions:
+    - label != stale
    - or:
      - files~=^vllm/v1/
      - files~=^tests/v1/
@ -208,6 +224,7 @@ pull_request_rules:
  description: Automatically apply tpu label
  # Keep this list in sync with `label-tpu-remove` conditions
  conditions:
+    - label != stale
    - or:
      - files~=tpu.py
      - files~=_tpu
@ -223,6 +240,7 @@ pull_request_rules:
  description: Automatically remove tpu label
  # Keep this list in sync with `label-tpu` conditions
  conditions:
+    - label != stale
    - and:
      - -files~=tpu.py
      - -files~=_tpu
@ -237,9 +255,9 @@ pull_request_rules:
 - name: label-tool-calling
  description: Automatically add tool-calling label
  conditions:
+    - label != stale
    - or:
      - files~=^tests/tool_use/
-      - files~=^tests/mistral_tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
      - files~=^vllm/entrypoints/openai/tool_parsers/
@ -256,8 +274,9 @@ pull_request_rules:

 - name: ping author on conflicts and add 'needs-rebase' label
  conditions:
-      - conflict
-      - -closed
+    - label != stale
+    - conflict
+    - -closed
  actions:
    label:
      add:
@ -271,10 +290,12 @@ pull_request_rules:

 - name: assign reviewer for tensorizer changes
  conditions:
+    - label != stale
+    - or:
      - files~=^vllm/model_executor/model_loader/tensorizer.py
      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
-      - files~=^tests/tensorizer_loader/
+      - files~=^tests/model_executor/model_loader/tensorizer_loader/
  actions:
    assign:
      users:
@ -282,6 +303,7 @@ pull_request_rules:

 - name: assign reviewer for modelopt changes
  conditions:
+    - label != stale
    - or:
        - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
        - files~=^vllm/model_executor/layers/quantization/__init__\.py$
@ -296,8 +318,8 @@ pull_request_rules:

 - name: remove 'needs-rebase' label when conflict is resolved
  conditions:
-      - -conflict
-      - -closed
+    - -conflict
+    - -closed
  actions:
    label:
      remove:
@ -306,6 +328,7 @@ pull_request_rules:
 - name: label-kv-connector
  description: Automatically apply kv-connector label
  conditions:
+    - label != stale
    - or:
      - files~=^examples/online_serving/disaggregated[^/]*/.*
      - files~=^examples/offline_inference/disaggregated[^/]*/.*
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@ -13,6 +13,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Label issues based on keywords
+        id: label-step
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
        with:
          script: |
@ -42,7 +43,6 @@ jobs:
                    searchIn: "body"
                  },
                ],
-                
                // Substring search - matches anywhere in text (partial matches)
                substrings: [
                  {
@ -89,14 +89,12 @@ jobs:
                    term: "hip_",
                    searchIn: "both"
                  },
-                  
                  // ROCm tools and libraries
                  {
                    term: "hipify",
                    searchIn: "both"
                  },
                ],
-                
                // Regex patterns - for complex pattern matching
                regexPatterns: [
                  {
@ -107,13 +105,17 @@ jobs:
                  }
                ],
              },
+              // Add more label configurations here as needed
+              // example: {
+              //   keywords: [...],
+              //   substrings: [...],
+              //   regexPatterns: [...]
+              // },
            };
-            
            // Helper function to create regex based on search type
            function createSearchRegex(term, type) {
              // Escape special regex characters in the term
              const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
-              
              switch (type) {
                case 'keyword':
                  // Word boundary search - matches whole words only
@ -125,16 +127,13 @@ jobs:
                  throw new Error(`Unknown search type: ${type}`);
              }
            }
-            
            // Helper function to find matching terms in text with line information
            function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
              const matches = [];
              const lines = text.split('\n');
-              
              for (const termConfig of searchTerms) {
                let regex;
                let term, searchIn, pattern, description, flags;
-                
                // Handle different input formats (string or object)
                if (typeof termConfig === 'string') {
                  term = termConfig;
@ -146,21 +145,17 @@ jobs:
                  description = termConfig.description;
                  flags = termConfig.flags;
                }
-                
                // Skip if this term shouldn't be searched in the current location
                if (searchIn !== 'both' && searchIn !== searchLocation) {
                  continue;
                }
-                
                // Create appropriate regex
                if (searchType === 'regex') {
                  regex = new RegExp(pattern, flags || "gi");
                } else {
                  regex = createSearchRegex(term, searchType);
                }
-                
                const termMatches = [];
-                
                // Check each line for matches
                lines.forEach((line, lineIndex) => {
                  const lineMatches = line.match(regex);
@ -175,15 +170,14 @@ jobs:
                        originalTerm: term || pattern,
                        description: description,
                        // Show context around the match in the line
-                        context: line.length > 100 ? 
-                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), 
-                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' 
+                        context: line.length > 100 ?
+                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
+                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
                          : line.trim()
                      });
                    });
                  }
                });
-                
                if (termMatches.length > 0) {
                  matches.push({
                    term: term || (description || pattern),
@ -196,64 +190,48 @@ jobs:
                  });
                }
              }
-              
              return matches;
            }
-            
            // Helper function to check if label should be added
            async function processLabel(labelName, config) {
              const body = context.payload.issue.body || "";
              const title = context.payload.issue.title || "";
-              
              core.notice(`Processing label: ${labelName}`);
              core.notice(`Issue Title: "${title}"`);
              core.notice(`Issue Body length: ${body.length} characters`);
-              
              let shouldAddLabel = false;
              let allMatches = [];
              let reason = '';
-              
              const keywords = config.keywords || [];
              const substrings = config.substrings || [];
              const regexPatterns = config.regexPatterns || [];
-              
              core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
-              
              // Search in title
              if (title.trim()) {
                core.notice(`Searching in title: "${title}"`);
-                
                const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
                const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
                const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
-                
                allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
              }
-              
              // Search in body
              if (body.trim()) {
                core.notice(`Searching in body (${body.length} characters)`);
-                
                const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
                const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
                const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
-                
                allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
              }
-              
              if (allMatches.length > 0) {
                core.notice(`Found ${allMatches.length} matching term(s):`);
-                
                for (const termMatch of allMatches) {
                  const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
                  const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
-                  
                  if (termMatch.searchType === 'regex') {
                    core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  } else {
                    core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  }
-                  
                  // Show details for each match
                  termMatch.matches.forEach((match, index) => {
                    core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@ -266,7 +244,6 @@ jobs:
                    }
                  });
                }
-                
                shouldAddLabel = true;
                const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
                const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@ -274,13 +251,10 @@ jobs:
                const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
                const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
                const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
-                
                reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
              }
-              
              core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
              core.notice(`Reason: ${reason || 'No matching terms found'}`);
-              
              if (shouldAddLabel) {
                const existingLabels = context.payload.issue.labels.map(l => l.name);
                if (!existingLabels.includes(labelName)) {
@ -296,14 +270,92 @@ jobs:
                core.notice(`Label "${labelName}" already present.`);
                return false;
              }
-              
              core.notice(`No matching terms found for label "${labelName}".`);
              return false;
            }
-            
            // Process all configured labels
-            const processLabels = Object.entries(labelConfig)
-              .map(([labelName, config]) => processLabel(labelName, config));
-            const labelsAdded = await Promise.all(processLabels);
-            const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
-            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
+            const labelsAddedResults = await Promise.all(
+              Object.entries(labelConfig).map(([labelName, config]) => 
+                processLabel(labelName, config).then(added => ({ labelName, added }))
+              )
+            );
+            
+            const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
+            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
+            
+            // Return which labels were added for the next step
+            const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
+            core.setOutput('labels_added', JSON.stringify(addedLabels));
+            return addedLabels;
+
+      - name: CC users for labeled issues
+        if: steps.label-step.outputs.labels_added != '[]'
+        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
+        with:
+          script: |
+            // Configuration: Map labels to GitHub users to CC
+            // You can add multiple users per label, and multiple label configurations
+            const ccConfig = {
+              rocm: {
+                users: ['hongxiayang', 'tjtanaa', 'vllmellm'],  // Add more users as needed: ['user1', 'user2', 'user3']
+                message: 'CC {users} for ROCm-related issue'  // {users} will be replaced with @mentions
+              },
+              // Add more label -> user mappings here
+              // Example:
+              // cuda: {
+              //   users: ['user1', 'user2'],
+              //   message: 'CC {users} for CUDA-related issue'
+              // },
+              // performance: {
+              //   users: ['perfexpert'],
+              //   message: 'CC {users} for performance issue'
+              // },
+            };
+            
+            const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
+            core.notice(`Labels added: ${labelsAdded.join(', ')}`);
+            
+            // Get existing comments to check for already mentioned users
+            const comments = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+            });
+            
+            const issueBody = context.payload.issue.body || '';
+            const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
+            
+            // Process each label that was added
+            for (const label of labelsAdded) {
+              if (ccConfig[label]) {
+                const config = ccConfig[label];
+                const usersToMention = [];
+                
+                // Check which users haven't been mentioned yet
+                for (const user of config.users) {
+                  const mentionPattern = new RegExp(`@${user}\\b`, 'i');
+                  if (!mentionPattern.test(allExistingText)) {
+                    usersToMention.push(user);
+                  } else {
+                    core.notice(`@${user} already mentioned for label "${label}", skipping`);
+                  }
+                }
+                
+                // Post comment if there are users to mention
+                if (usersToMention.length > 0) {
+                  const mentions = usersToMention.map(u => `@${u}`).join(' ');
+                  const message = config.message.replace('{users}', mentions);
+                  
+                  await github.rest.issues.createComment({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    issue_number: context.issue.number,
+                    body: message
+                  });
+                  
+                  core.notice(`CC comment added for label "${label}": ${mentions}`);
+                } else {
+                  core.notice(`All users for label "${label}" already mentioned, skipping comment`);
+                }
+              }
+            }
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -13,7 +13,7 @@ jobs:
      actions: write
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
+      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
--- a/.markdownlint.yaml
+++ b/.markdownlint.yaml
@ -4,7 +4,6 @@ MD013: false
 MD024:
  siblings_only: true
 MD033: false
-MD042: false
 MD045: false
 MD046: false
 MD051: false
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -6,30 +6,19 @@ default_stages:
  - manual # Run in CI
 exclude: 'vllm/third_party/.*'
 repos:
- repo: https://github.com/google/yapf
-  rev: v0.43.0
-  hooks:
-  - id: yapf
-    args: [--in-place, --verbose]
-    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
-    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.11.7
+  rev: v0.14.0
  hooks:
-  - id: ruff
+  - id: ruff-check
    args: [--output-format, github, --fix]
  - id: ruff-format
-    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.35.5
+  rev: v1.38.1
  hooks:
  - id: typos
- repo: https://github.com/PyCQA/isort
-  rev: 6.0.1
-  hooks:
-  - id: isort
+    args: [--force-exclude]
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v20.1.3
+  rev: v21.1.2
  hooks:
  - id: clang-format
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@ -46,10 +35,10 @@ repos:
  hooks:
  - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-  rev: 0.6.17
+  rev: 0.9.1
  hooks:
    - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
+      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
      files: ^requirements/test\.(in|txt)$
 - repo: local
  hooks:
@ -60,38 +49,32 @@ repos:
    files: ^requirements/test\.(in|txt)$
  - id: mypy-local
    name: Run mypy for local Python installation
-    entry: tools/mypy.sh 0 "local"
-    language: python
-    types: [python]
-    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
+    entry: python tools/pre_commit/mypy.py 0 "local"
    stages: [pre-commit] # Don't run in CI
-  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
-    name: Run mypy for Python 3.9
-    entry: tools/mypy.sh 1 "3.9"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
-    stages: [manual] # Only run in CI
+    <<: &mypy_common
+      language: python
+      types_or: [python, pyi]
+      require_serial: true
+      additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.10
-    entry: tools/mypy.sh 1 "3.10"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
+    entry: python tools/pre_commit/mypy.py 1 "3.10"
+    <<: *mypy_common
    stages: [manual] # Only run in CI
  - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.11
-    entry: tools/mypy.sh 1 "3.11"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
+    entry: python tools/pre_commit/mypy.py 1 "3.11"
+    <<: *mypy_common
    stages: [manual] # Only run in CI
  - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.12
-    entry: tools/mypy.sh 1 "3.12"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
+    entry: python tools/pre_commit/mypy.py 1 "3.12"
+    <<: *mypy_common
+    stages: [manual] # Only run in CI
+  - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.13
+    entry: python tools/pre_commit/mypy.py 1 "3.13"
+    <<: *mypy_common
    stages: [manual] # Only run in CI
  - id: shellcheck
    name: Lint shell scripts
@ -155,11 +138,10 @@ repos:
    additional_dependencies: [regex]
  - id: check-pickle-imports
    name: Prevent new pickle/cloudpickle imports
-    entry: python tools/check_pickle_imports.py
+    entry: python tools/pre_commit/check_pickle_imports.py
    language: python
    types: [python]
-    pass_filenames: false
-    additional_dependencies: [pathspec, regex]
+    additional_dependencies: [regex]
  - id: validate-config
    name: Validate configuration has default values and that each field has a docstring
    entry: python tools/validate_config.py
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@ -13,6 +13,7 @@ build:

 mkdocs:
  configuration: mkdocs.yaml
+  fail_on_warning: true

 # Optionally declare the Python requirements required to build your docs
 python:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -34,10 +34,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")

 #
 # Supported/expected torch versions for CUDA/ROCm.
@ -86,6 +86,9 @@ find_package(Torch REQUIRED)
 # Supported NVIDIA architectures.
 # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
+elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 else()
@ -175,6 +178,15 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

+#
+# Set compression mode for CUDA >=13.x.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA" AND
+   DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
+endif()
+
 #
 # Set CUDA include flags for CXX compiler.
 #
@ -257,8 +269,8 @@ set(VLLM_EXT_SRC
  "csrc/sampler.cu"
  "csrc/cuda_view.cu"
  "csrc/quantization/gptq/q_gemm.cu"
-  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
-  "csrc/quantization/fp8/common.cu"
+  "csrc/quantization/w8a8/int8/scaled_quant.cu"
+  "csrc/quantization/w8a8/fp8/common.cu"
  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/quantization/activation_kernels.cu"
@ -270,7 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")

  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -302,13 +314,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_EXT_SRC
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/permute_cols.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
-    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
    "csrc/cutlass_extensions/common.cpp"
-    "csrc/quantization/fp8/per_token_group_quant.cu")
+    "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
+    "csrc/quantization/w8a8/int8/per_token_group_quant.cu")

  set_gencode_flags_for_srcs(
    SRCS "${VLLM_EXT_SRC}"
@ -412,11 +424,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
    set(SRCS
-       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+       "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -440,12 +452,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
  # CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -470,12 +486,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
  # require CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -506,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
  if (SCALED_MM_2X_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@ -550,7 +570,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
  # CUDA 12.8 or later
-  cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@ -569,7 +593,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()

  # FP4 Archs and flags
-  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@ -591,7 +619,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()

  # CUTLASS MLA Archs and flags
-  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
    set(SRCS
      "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
@ -617,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # if it's possible to compile MoE kernels that use its output.
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -635,9 +667,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -656,9 +692,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()

  # moe_data.cu is used by all CUTLASS MoE kernels.
-  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@ -675,9 +715,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -963,6 +1007,7 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
    include(cmake/external_projects/flashmla.cmake)
+    include(cmake/external_projects/qutlass.cmake)

    # vllm-flash-attn should be last as it overwrites some CMake functions
    include(cmake/external_projects/vllm_flash_attn.cmake)
--- a/README.md
+++ b/README.md
@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio

 *Latest News* 🔥

+- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
 - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
@ -148,6 +149,7 @@ Compute Resources:
 - Trainy
 - UC Berkeley
 - UC San Diego
+- Volcengine

 Slack Sponsor: Anyscale

--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@ -74,7 +74,7 @@ start_server() {
    local vllm_log=$4
    local profile_dir=$5

-    pkill -if vllm
+    pkill -if "vllm serve" || true

    # Define the common arguments as a bash array.
    # Each argument and its value are separate elements.
@ -96,17 +96,22 @@ start_server() {
    # This correctly passes each element as a separate argument.
    if [[ -n "$profile_dir" ]]; then
        # Start server with profiling enabled
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    else
        # Start server without profiling
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
+        VLLM_SERVER_DEV_MODE=1 \
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    fi
+    local server_pid=$!

    # wait for 10 minutes...
    server_started=0
    for i in {1..60}; do
+        # This line checks whether the server is still alive or not,
+        # since that we should always have permission to send signal to the server process.
+        kill -0 $server_pid 2> /dev/null || break
+
        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
        if [[ "$STATUS_CODE" -eq 200 ]]; then
@ -118,7 +123,7 @@ start_server() {
    done

    if (( ! server_started )); then
-        echo "server did not start within 10 minutes. Please check server log at $vllm_log".
+        echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
        return 1
    else
        return 0
@ -134,7 +139,7 @@ run_benchmark() {
    echo "vllm_log: $vllm_log"
    echo
    rm -f $vllm_log
-    pkill -if vllm
+    pkill -if "vllm serve" || true

    echo "starting server..."
    # Call start_server without a profile_dir to avoid profiling overhead
@ -227,7 +232,7 @@ run_benchmark() {

    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

-    pkill -if vllm
+    pkill -if "vllm serve" || true
    sleep 10
    echo "===================="
    return 0
@ -303,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
 else
    echo "No configuration met the latency requirements. Skipping final profiling run."
 fi
-pkill -if vllm
+pkill -if "vllm serve" || true
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -8,7 +8,6 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import Optional, Union

 import aiohttp
 import huggingface_hub.constants
@ -28,13 +27,13 @@ class RequestFuncInput:
    prompt_len: int
    output_len: int
    model: str
-    model_name: Optional[str] = None
-    logprobs: Optional[int] = None
-    extra_body: Optional[dict] = None
-    multi_modal_content: Optional[dict | list[dict]] = None
+    model_name: str | None = None
+    logprobs: int | None = None
+    extra_body: dict | None = None
+    multi_modal_content: dict | list[dict] | None = None
    ignore_eos: bool = False
-    language: Optional[str] = None
-    request_id: Optional[str] = None
+    language: str | None = None
+    request_id: str | None = None


@dataclass
@ -52,7 +51,7 @@ class RequestFuncOutput:

 async def async_request_tgi(
    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")
@ -133,7 +132,7 @@ async def async_request_tgi(

 async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")
@ -204,7 +203,7 @@ async def async_request_trt_llm(

 async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(("completions", "profile")), (
@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(

 async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(("completions", "profile")), (
@ -367,7 +366,7 @@ async def async_request_openai_completions(

 async def async_request_openai_chat_completions(
    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(("chat/completions", "profile")), (
@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(

 async def async_request_openai_audio(
    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
    # Lazy import without PlaceholderModule to avoid vllm dep.
    import soundfile
@ -610,7 +609,7 @@ def get_tokenizer(
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
    if pretrained_model_name_or_path is not None and not os.path.exists(
        pretrained_model_name_or_path
    ):
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc

+from benchmark_utils import TimeCollector
 from tabulate import tabulate

-from benchmark_utils import TimeCollector
 from vllm.utils import FlexibleArgumentParser
 from vllm.v1.core.block_pool import BlockPool

--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@ -1,17 +1,31 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
+import time
+from unittest import mock

 import numpy as np
+from benchmark_utils import TimeCollector
 from tabulate import tabulate

-from benchmark_utils import TimeCollector
-from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
+from vllm.config import (
+    CacheConfig,
+    DeviceConfig,
+    LoadConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    SpeculativeConfig,
+    VllmConfig,
+)
+from vllm.platforms import current_platform
 from vllm.utils import FlexibleArgumentParser
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
+from vllm.v1.worker.gpu_input_batch import InputBatch
+from vllm.v1.worker.gpu_model_runner import GPUModelRunner


-def main(args):
+def benchmark_propose(args):
    rows = []
    for max_ngram in args.max_ngram:
        collector = TimeCollector(TimeCollector.US)
@ -69,10 +83,88 @@ def main(args):
    )


+def benchmark_batched_propose(args):
+    NUM_SPECULATIVE_TOKENS_NGRAM = 10
+    PROMPT_LOOKUP_MIN = 5
+    PROMPT_LOOKUP_MAX = 15
+    MAX_MODEL_LEN = int(1e7)
+    DEVICE = current_platform.device_type
+
+    model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
+
+    speculative_config = SpeculativeConfig(
+        target_model_config=model_config,
+        target_parallel_config=ParallelConfig(),
+        method="ngram",
+        num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
+        prompt_lookup_max=PROMPT_LOOKUP_MAX,
+        prompt_lookup_min=PROMPT_LOOKUP_MIN,
+    )
+
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        cache_config=CacheConfig(),
+        speculative_config=speculative_config,
+        device_config=DeviceConfig(device=current_platform.device_type),
+        parallel_config=ParallelConfig(),
+        load_config=LoadConfig(),
+        scheduler_config=SchedulerConfig(),
+    )
+
+    # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
+    mock_pp_group = mock.MagicMock()
+    mock_pp_group.world_size = 1
+    with mock.patch(
+        "vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
+    ):
+        runner = GPUModelRunner(vllm_config, DEVICE)
+
+        # hack max model len
+        runner.max_model_len = MAX_MODEL_LEN
+        runner.drafter.max_model_len = MAX_MODEL_LEN
+
+        dummy_input_batch = InputBatch(
+            max_num_reqs=args.num_req,
+            max_model_len=MAX_MODEL_LEN,
+            max_num_batched_tokens=args.num_req * args.num_token,
+            device=DEVICE,
+            pin_memory=False,
+            vocab_size=256000,
+            block_sizes=[16],
+        )
+        dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
+        dummy_input_batch.spec_decode_unsupported_reqs = ()
+        dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
+        dummy_input_batch.token_ids_cpu = np.random.randint(
+            0, 20, (args.num_req, args.num_token)
+        )
+
+        runner.input_batch = dummy_input_batch
+
+        sampled_token_ids = [[0]] * args.num_req
+
+        print("Starting benchmark")
+        # first run is warmup so ignore it
+        for _ in range(args.num_iteration):
+            start = time.time()
+            runner.drafter.propose(
+                sampled_token_ids,
+                dummy_input_batch.req_ids,
+                dummy_input_batch.num_tokens_no_spec,
+                dummy_input_batch.token_ids_cpu,
+                dummy_input_batch.spec_decode_unsupported_reqs,
+            )
+            end = time.time()
+            print(f"Iteration time (s): {end - start}")
+
+
 def invoke_main() -> None:
    parser = FlexibleArgumentParser(
        description="Benchmark the performance of N-gram speculative decode drafting"
    )
+    parser.add_argument(
+        "--batched", action="store_true", help="consider time to prepare batch"
+    )
    parser.add_argument(
        "--num-iteration",
        type=int,
@ -105,8 +197,17 @@ def invoke_main() -> None:
        help="Number of speculative tokens to generate",
    )
    args = parser.parse_args()
-    main(args)
+
+    if not args.batched:
+        benchmark_propose(args)
+    else:
+        benchmark_batched_propose(args)


+"""
+# Example command lines:
+# time python3 benchmarks/benchmark_ngram_proposer.py
+# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
+"""  # noqa: E501
 if __name__ == "__main__":
    invoke_main()  # pragma: no cover
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@ -32,7 +32,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional

 from transformers import PreTrainedTokenizerBase

@ -80,7 +79,7 @@ def sample_requests_from_dataset(
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[Request]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
@ -128,7 +127,7 @@ def sample_requests_from_random(
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
    prefix_len: int,
 ) -> list[Request]:
    requests = []
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@ -7,7 +7,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional

 from transformers import AutoTokenizer, PreTrainedTokenizerBase

@ -24,7 +23,7 @@ def sample_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[tuple[str, int, int, int]]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -31,20 +31,19 @@ import time
 import uuid
 import warnings
 from collections.abc import AsyncGenerator
+from contextlib import nullcontext
 from dataclasses import dataclass
-from typing import Optional

 import datasets
 import numpy as np
 import pandas as pd
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase
-
 from backend_request_func import (
    ASYNC_REQUEST_FUNCS,
    RequestFuncInput,
    RequestFuncOutput,
 )
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase

 try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
@ -317,7 +316,7 @@ def calculate_metrics(
    tokenizer: PreTrainedTokenizerBase,
    selected_percentile_metrics: list[str],
    selected_percentiles: list[float],
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ) -> tuple[BenchmarkMetrics, list[int]]:
    actual_output_lens: list[int] = []
    total_input = 0
@ -437,9 +436,9 @@ async def benchmark(
    selected_percentile_metrics: list[str],
    selected_percentiles: list[str],
    ignore_eos: bool,
-    max_concurrency: Optional[int],
+    max_concurrency: int | None,
    structured_output_ratio: float,
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
@ -449,7 +448,8 @@ async def benchmark(
    def prepare_extra_body(request) -> dict:
        extra_body = {}
        # Add the schema to the extra_body
-        extra_body[request.structure_type] = request.schema
+        extra_body["structured_outputs"] = {}
+        extra_body["structured_outputs"][request.structure_type] = request.schema
        return extra_body

    print("Starting initial single prompt test run...")
@ -502,15 +502,9 @@ async def benchmark(

    pbar = None if disable_tqdm else tqdm(total=len(input_requests))

-    # This can be used once the minimum Python version is 3.10 or higher,
-    # and it will simplify the code in limited_request_func.
-    #    semaphore = (asyncio.Semaphore(max_concurrency)
-    #                 if max_concurrency else contextlib.nullcontext())
-    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

    async def limited_request_func(request_func_input, pbar):
-        if semaphore is None:
-            return await request_func(request_func_input=request_func_input, pbar=pbar)
        async with semaphore:
            return await request_func(request_func_input=request_func_input, pbar=pbar)

@ -909,13 +903,13 @@ def create_argument_parser():
    parser.add_argument(
        "--tokenizer",
        type=str,
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
    )
    parser.add_argument(
        "--tokenizer-mode",
        type=str,
        default="auto",
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
    )
    parser.add_argument(
        "--num-prompts",
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@ -6,7 +6,7 @@ import math
 import os
 import time
 from types import TracebackType
-from typing import Any, Optional, Union
+from typing import Any


 def convert_to_pytorch_benchmark_format(
@ -92,7 +92,7 @@ class TimeCollector:
    def __init__(self, scale: int) -> None:
        self.cnt: int = 0
        self._sum: int = 0
-        self._max: Optional[int] = None
+        self._max: int | None = None
        self.scale = scale
        self.start_time: int = time.monotonic_ns()

@ -104,13 +104,13 @@ class TimeCollector:
        else:
            self._max = max(self._max, v)

-    def avg(self) -> Union[float, str]:
+    def avg(self) -> float | str:
        return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"

-    def max(self) -> Union[float, str]:
+    def max(self) -> float | str:
        return self._max / self.scale if self._max else "N/A"

-    def dump_avg_max(self) -> list[Union[float, str]]:
+    def dump_avg_max(self) -> list[float | str]:
        return [self.avg(), self.max()]

    def __enter__(self) -> None:
@ -118,8 +118,8 @@ class TimeCollector:

    def __exit__(
        self,
-        exc_type: Optional[type[BaseException]],
-        exc_value: Optional[BaseException],
-        exc_traceback: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
    ) -> None:
        self.collect(time.monotonic_ns() - self.start_time)
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable
+from collections.abc import Callable, Iterable

 import torch
 import torch.utils.benchmark as TBenchmark
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable, Optional
+from collections.abc import Callable, Iterable

 import torch
 import torch.utils.benchmark as TBenchmark
@ -17,7 +16,7 @@ from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_block_fp8_matmul,
+    w8a8_triton_block_scaled_mm,
 )
 from vllm.utils import FlexibleArgumentParser, cdiv

@ -53,7 +52,7 @@ def bench_int8(
    n: int,
    label: str,
    sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
    """Benchmark INT8-based kernels."""
    assert dtype == torch.int8
@ -108,7 +107,7 @@ def bench_fp8(
    n: int,
    label: str,
    sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
    """Benchmark FP8-based kernels."""
    assert dtype == torch.float8_e4m3fn
@ -158,7 +157,7 @@ def bench_fp8(
        "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
        ),
-        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
+        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
            a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
        ),
        "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
@ -183,7 +182,7 @@ def bench(
    n: int,
    label: str,
    sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(
    dtype: torch.dtype,
    MKNs: Iterable[tuple[int, int, int]],
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@ -55,9 +55,7 @@ benchmark() {
  output_len=$2


-  CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
    --port 8100 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
@ -65,9 +63,7 @@ benchmark() {
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


-  CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
    --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@ -38,16 +38,12 @@ wait_for_server() {
 launch_chunked_prefill() {
  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  # disagg prefill
-  CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
    --port 8100 \
    --max-model-len 10000 \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.6 &
-  CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
    --port 8200 \
    --max-model-len 10000 \
    --enable-chunked-prefill \
@ -62,18 +58,14 @@ launch_chunked_prefill() {
 launch_disagg_prefill() {
  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  # disagg prefill
-  CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
    --port 8100 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

-  CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
    --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@ -3,10 +3,9 @@

 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional

 import torch
 import torch.utils.benchmark as TBenchmark
@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
 def unfused_int8_impl(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
 ):
    # Norm
@ -68,7 +67,7 @@ def unfused_int8_impl(
 def unfused_fp8_impl(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
 ):
    # Norm
@ -85,7 +84,7 @@ def unfused_fp8_impl(
 def fused_impl(
    rms_norm_layer: RMSNorm,  # this stores the weights
    x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
 ):
    out, _ = ops.rms_norm_dynamic_per_token_quant(
--- a/benchmarks/kernels/bench_mxfp4_qutlass.py
+++ b/benchmarks/kernels/bench_mxfp4_qutlass.py
@ -0,0 +1,191 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import copy
+import itertools
+
+import torch
+from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.triton_utils import triton
+
+PROVIDER_CFGS = {
+    "torch-bf16": dict(enabled=True),
+    "mxfp4": dict(no_a_quant=False, enabled=True),
+    "mxfp4-noquant": dict(no_a_quant=True, enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
+    return (
+        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
+        * group_size**-0.5
+    )
+
+
+def _quant_weight_mxfp4(
+    b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
+):
+    weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
+        b, forward_hadamard_matrix, method="abs_max"
+    )
+    weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
+    return weight_hf_e2m1, weight_hf_scale_block
+
+
+def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
+    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
+        b, forward_hadamard_matrix, device
+    )
+    alpha = torch.tensor([1.0], device="cuda")
+
+    if cfg["no_a_quant"]:
+        # Pre-quantize activation
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
+            a, forward_hadamard_matrix, method="abs_max"
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
+
+        def run():
+            return matmul_mxf4_bf16_tn(
+                input_hf_e2m1,
+                weight_hf_e2m1,
+                input_hf_scale_block,
+                weight_hf_scale_block,
+                alpha,
+            )
+
+        return run
+
+    # Quantize activation on-the-fly
+    def run():
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
+            a, forward_hadamard_matrix, method="abs_max"
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
+        return matmul_mxf4_bf16_tn(
+            input_hf_e2m1,
+            weight_hf_e2m1,
+            input_hf_scale_block,
+            weight_hf_scale_block,
+            alpha,
+        )
+
+    return run
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[
+            1,
+            4,
+            8,
+            16,
+            32,
+            64,
+            128,
+            256,
+            512,
+            1024,
+            2048,
+            4096,
+            8192,
+            16384,
+            24576,
+            32768,
+        ],
+        x_log=False,
+        line_arg="provider",
+        line_vals=_enabled,
+        line_names=_enabled,
+        ylabel="TFLOP/s (larger is better)",
+        plot_name="BF16 vs MXFP4 GEMMs",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K, had_size):
+    M = batch_size
+    device = "cuda"
+    dtype = torch.bfloat16
+
+    a = torch.randn((M, K), device=device, dtype=dtype)
+    b = torch.randn((N, K), device=device, dtype=dtype)
+    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "torch-bf16":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
+        )
+    else:
+        cfg = PROVIDER_CFGS[provider]
+        run_quant = build_mxfp4_runner(
+            cfg, a, b, forward_hadamard_matrix, dtype, device
+        )
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: run_quant(), rep=200, quantiles=quantiles
+        )
+
+    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+def prepare_shapes(args):
+    out = []
+    for model, tp_size in itertools.product(args.models, args.tp_sizes):
+        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_dim] //= tp_size
+            KN.append(model)
+            out.append(KN)
+    return out
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.3-70B-Instruct"],
+        choices=list(WEIGHT_SHAPES.keys()),
+    )
+    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+    args = parser.parse_args()
+
+    for K, N, model in prepare_shapes(args):
+        for had_size in [32, 64, 128]:
+            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
+            benchmark.run(
+                print_data=True,
+                show_plots=True,
+                save_path=f"bench_mxfp4_res_n{N}_k{K}",
+                N=N,
+                K=K,
+                had_size=had_size,
+            )
+
+    print("Benchmark finished!")
--- a/benchmarks/kernels/bench_nvfp4_gemm.py
+++ b/benchmarks/kernels/bench_nvfp4_gemm.py
@ -3,6 +3,7 @@
 import argparse
 import copy
 import itertools
+import os

 import torch
 from weight_shapes import WEIGHT_SHAPES
@ -23,21 +24,45 @@ PROVIDER_CFGS = {
    "torch-bf16": dict(enabled=True),
    "nvfp4": dict(no_a_quant=False, enabled=True),
    "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
+    "fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
+    "fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
 }

+_needs_fbgemm = any(
+    v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
+)
+if _needs_fbgemm:
+    try:
+        from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
+            triton_scale_nvfp4_quant,
+        )
+    except ImportError:
+        print(
+            "WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
+            "These providers will be skipped. Please install fbgemm_gpu with: "
+            "'pip install fbgemm-gpu-genai' to run them."
+        )
+        # Disable FBGEMM providers so the benchmark can run.
+        for cfg in PROVIDER_CFGS.values():
+            if cfg.get("fbgemm"):
+                cfg["enabled"] = False
+
 _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]


-def _quant_weight_nvfp4(b: torch.Tensor, device: str):
+def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
    # Compute global scale for weight
    b_amax = torch.abs(b).max().to(torch.float32)
    b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
-    b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
+    if "fbgemm" in cfg and cfg["fbgemm"]:
+        b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
+    else:
+        b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
    return b_fp4, scale_b_fp4, b_global_scale


 def build_nvfp4_runner(cfg, a, b, dtype, device):
-    b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
+    b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)

    # Compute global scale for activation
    # NOTE: This is generally provided ahead-of-time by the model checkpoint.
@ -46,6 +71,35 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):

    # Alpha for the GEMM operation
    alpha = 1.0 / (a_global_scale * b_global_scale)
+    if "fbgemm" in cfg and cfg["fbgemm"]:
+        if cfg["no_a_quant"]:
+            a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
+
+            def run():
+                return torch.ops.fbgemm.f4f4bf16(
+                    a_fp4,
+                    b_fp4,
+                    scale_a_fp4,
+                    scale_b_fp4,
+                    global_scale=alpha,
+                    use_mx=False,
+                )
+
+            return run
+        else:
+
+            def run():
+                a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
+                return torch.ops.fbgemm.f4f4bf16(
+                    a_fp4,
+                    b_fp4,
+                    scale_a_fp4,
+                    scale_b_fp4,
+                    global_scale=alpha,
+                    use_mx=False,
+                )
+
+            return run

    if cfg["no_a_quant"]:
        # Pre-quantize activation
@ -130,10 +184,13 @@ if __name__ == "__main__":

    for K, N, model in prepare_shapes(args):
        print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
+        save_dir = f"bench_nvfp4_res_n{N}_k{K}"
+        os.makedirs(save_dir, exist_ok=True)
+
        benchmark.run(
            print_data=True,
            show_plots=True,
-            save_path=f"bench_nvfp4_res_n{N}_k{K}",
+            save_path=save_dir,
            N=N,
            K=K,
        )
--- a/benchmarks/kernels/bench_nvfp4_qutlass.py
+++ b/benchmarks/kernels/bench_nvfp4_qutlass.py
@ -0,0 +1,207 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import copy
+import itertools
+
+import torch
+from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops  # use existing nvfp4 gemm in vllm
+from vllm._custom_ops import fusedQuantizeNv
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.triton_utils import triton
+
+PROVIDER_CFGS = {
+    "torch-bf16": dict(enabled=True),
+    "nvfp4": dict(no_a_quant=False, enabled=True),
+    "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
+    return (
+        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
+        * group_size**-0.5
+    )
+
+
+def _quant_weight_nvfp4(
+    b: torch.Tensor,
+    forward_hadamard_matrix: torch.Tensor,
+    global_scale: torch.Tensor,
+    device: str,
+    M: int,
+    N: int,
+    K: int,
+):
+    weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
+        b, forward_hadamard_matrix, global_scale
+    )
+    weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
+        -1, K // 16
+    )
+    return weight_hf_e2m1, weight_hf_scale_block
+
+
+def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
+    alpha = torch.tensor([1.0], device="cuda")
+    global_scale = torch.tensor([1.0], device="cuda")
+    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
+        b, forward_hadamard_matrix, global_scale, device, M, N, K
+    )
+
+    if cfg["no_a_quant"]:
+        # Pre-quantize activation
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
+            a, forward_hadamard_matrix, global_scale
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
+            -1, K // 16
+        )
+
+        def run():
+            return ops.cutlass_scaled_fp4_mm(
+                input_hf_e2m1,
+                weight_hf_e2m1,
+                input_hf_scale_block,
+                weight_hf_scale_block,
+                alpha,
+                torch.bfloat16,
+            )
+
+        return run
+
+    # Quantize activation on-the-fly
+    def run():
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
+            a, forward_hadamard_matrix, global_scale
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
+            -1, K // 16
+        )
+        return ops.cutlass_scaled_fp4_mm(
+            input_hf_e2m1,
+            weight_hf_e2m1,
+            input_hf_scale_block,
+            weight_hf_scale_block,
+            alpha,
+            torch.bfloat16,
+        )
+
+    return run
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[
+            1,
+            4,
+            8,
+            16,
+            32,
+            64,
+            128,
+            256,
+            512,
+            1024,
+            2048,
+            4096,
+            8192,
+            16384,
+            24576,
+            32768,
+        ],
+        x_log=False,
+        line_arg="provider",
+        line_vals=_enabled,
+        line_names=_enabled,
+        ylabel="TFLOP/s (larger is better)",
+        plot_name="BF16 vs NVFP4 GEMMs",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K, had_size):
+    M = batch_size
+    device = "cuda"
+    dtype = torch.bfloat16
+
+    a = torch.randn((M, K), device=device, dtype=dtype)
+    b = torch.randn((N, K), device=device, dtype=dtype)
+    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "torch-bf16":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
+        )
+    else:
+        cfg = PROVIDER_CFGS[provider]
+        run_quant = build_nvfp4_runner(
+            cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K
+        )
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: run_quant(), rep=200, quantiles=quantiles
+        )
+
+    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+def prepare_shapes(args):
+    out = []
+    for model, tp_size in itertools.product(args.models, args.tp_sizes):
+        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_dim] //= tp_size
+            KN.append(model)
+            out.append(KN)
+    return out
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.3-70B-Instruct"],
+        choices=list(WEIGHT_SHAPES.keys()),
+    )
+    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+    args = parser.parse_args()
+
+    for K, N, model in prepare_shapes(args):
+        for had_size in [16, 32, 64, 128]:
+            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
+            benchmark.run(
+                print_data=True,
+                show_plots=True,
+                save_path=f"bench_nvfp4_res_n{N}_k{K}",
+                N=N,
+                K=K,
+                had_size=had_size,
+            )
+
+    print("Benchmark finished!")
--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from typing import Callable
+from collections.abc import Callable
 from unittest.mock import patch

 import pandas as pd
@ -10,7 +10,8 @@ import torch
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


 def with_triton_mode(fn):
@ -51,7 +52,7 @@ def calculate_diff(
 ):
    """Calculate the difference between Inductor and CUDA implementations."""
    device = torch.device("cuda")
-    x = torch.rand((batch_size * hidden_size, 4096), dtype=dtype, device=device)
+    x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)

    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)

@ -59,23 +60,25 @@ def calculate_diff(
    torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
    cuda_out, cuda_scale = quant_fp8.forward_cuda(x)

-    out_allclose = lambda o1, o2: torch.allclose(
-        o1.to(torch.float32),
-        o2.to(torch.float32),
-        rtol=1e-3,
-        atol=1e-5,
-    )
-    scale_allclose = lambda s1, s2: torch.allclose(s1, s2, rtol=1e-3, atol=1e-5)
-
-    if (
-        out_allclose(cuda_out, torch_out)
-        and scale_allclose(cuda_scale, torch_scale)
-        and out_allclose(cuda_out, torch_eager_out)
-        and scale_allclose(cuda_scale, torch_eager_scale)
-    ):
+    try:
+        torch.testing.assert_close(
+            cuda_out.to(torch.float32),
+            torch_out.to(torch.float32),
+            rtol=1e-3,
+            atol=1e-5,
+        )
+        torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
+        torch.testing.assert_close(
+            cuda_out.to(torch.float32),
+            torch_eager_out.to(torch.float32),
+            rtol=1e-3,
+            atol=1e-5,
+        )
+        torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
        print("✅ All implementations match")
-    else:
+    except AssertionError as e:
        print("❌ Implementations differ")
+        print(e)


 configs = []
@ -91,7 +94,7 @@ def benchmark_quantization(
 ):
    device = torch.device("cuda")

-    x = torch.randn(batch_size * hidden_size, 4096, device=device, dtype=dtype)
+    x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)

    quantiles = [0.5, 0.2, 0.8]
    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
@ -157,21 +160,21 @@ if __name__ == "__main__":
    )
    parser.add_argument("-c", "--check", action="store_true")
    parser.add_argument(
-        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
+        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
    )
    parser.add_argument(
        "--hidden-sizes",
        type=int,
        nargs="+",
-        default=None,
-        help="Hidden sizes to benchmark (default: 1,16,64,128,256,512,1024,2048,4096)",
+        default=[896, 1024, 2048, 4096, 7168],
+        help="Hidden sizes to benchmark",
    )
    parser.add_argument(
        "--batch-sizes",
        type=int,
        nargs="+",
-        default=None,
-        help="Batch sizes to benchmark (default: 1,16,32,64,128)",
+        default=[1, 16, 128, 512, 1024],
+        help="Batch sizes to benchmark",
    )
    parser.add_argument(
        "--group-sizes",
@ -192,8 +195,8 @@ if __name__ == "__main__":

    dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]

-    hidden_sizes = args.hidden_sizes or [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
-    batch_sizes = args.batch_sizes or [1, 16, 32, 64, 128]
+    hidden_sizes = args.hidden_sizes
+    batch_sizes = args.batch_sizes

    if args.group_sizes is not None:
        group_shapes = []
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation  # noqa F401
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

 batch_size_range = [1, 16, 32, 64, 128]
 seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@ -0,0 +1,406 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
+kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
+but use different quantization strategies and backends.
+"""
+
+import nvtx
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
+from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
+
+# Weight shapes for different models: [num_experts, topk, hidden_size,
+# intermediate_size]
+WEIGHT_SHAPES_MOE = {
+    "mixtral-8x7b": [
+        [8, 2, 4096, 14336],
+    ],
+    "deepseek-v2": [
+        [160, 6, 5120, 12288],
+    ],
+    "custom-small": [
+        [8, 2, 2048, 7168],
+    ],
+    "glm45-fp8": [
+        [128, 8, 4096, 1408],
+    ],
+    "Llama-4-Maverick-17B-128E-Instruct-FP8": [
+        [128, 1, 5120, 8192],
+    ],
+}
+
+DEFAULT_MODELS = [
+    "mixtral-8x7b",
+]
+
+DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+DEFAULT_TP_SIZES = [1]
+
+PER_ACT_TOKEN_OPTS = [False, True]
+PER_OUT_CH_OPTS = [False, True]
+
+FP8_DTYPE = current_platform.fp8_dtype()
+
+
+def bench_run(
+    results: list,
+    model: str,
+    num_experts: int,
+    topk: int,
+    per_act_token: bool,
+    per_out_ch: bool,
+    mkn: tuple[int, int, int],
+):
+    (m, k, n) = mkn
+
+    dtype = torch.half
+    device = "cuda"
+
+    # Create input activations
+    a = torch.randn((m, k), device=device, dtype=dtype) / 10
+
+    # Create weights
+    w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
+    w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10
+
+    # Create FP8 quantized weights and scales for both kernels
+    w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
+    w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)
+
+    # Create scales based on quantization strategy
+    if per_out_ch:
+        # Per-channel quantization
+        w1_scale = torch.empty(
+            (num_experts, 2 * n, 1), device=device, dtype=torch.float32
+        )
+        w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
+    else:
+        # Per-tensor quantization
+        w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
+        w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
+
+    # Quantize weights
+    for expert in range(num_experts):
+        if per_out_ch:
+            # Per-channel quantization - not yet implemented properly
+            # For now, fall back to per-tensor quantization
+            w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
+            w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
+            # Expand scalar scales to the expected per-channel shape
+            w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
+            w2_scale[expert] = w2_scale_temp.expand(k, 1)
+        else:
+            # Per-tensor quantization
+            w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
+            w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
+            # Store scalar scales in [1, 1] tensors
+            w1_scale[expert, 0, 0] = w1_scale_temp
+            w2_scale[expert, 0, 0] = w2_scale_temp
+
+    # Prepare weights for CUTLASS (no transpose needed)
+    w1_fp8q_cutlass = w1_fp8q  # Keep original [E, 2N, K]
+    w2_fp8q_cutlass = w2_fp8q  # Keep original [E, K, N]
+
+    # Create router scores and get topk
+    score = torch.randn((m, num_experts), device=device, dtype=dtype)
+    topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
+
+    # WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization
+    # Force per-tensor quantization for all cases to match working e2e setup
+    a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
+    a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
+
+    # Force per-tensor quantization for all cases
+    per_act_token = False
+
+    # Create stride tensors for CUTLASS
+    ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
+    ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
+    c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
+    c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
+
+    def run_triton_moe(
+        a: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        w1_scale: torch.Tensor,
+        w2_scale: torch.Tensor,
+        a1_scale: torch.Tensor,
+        a2_scale: torch.Tensor,
+        num_repeats: int,
+    ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            per_act_token_quant=per_act_token,
+            per_out_ch_quant=per_out_ch,
+        )
+
+        for _ in range(num_repeats):
+            fused_experts(
+                a,
+                w1,
+                w2,
+                topk_weights,
+                topk_ids,
+                quant_config=quant_config,
+            )
+
+    def run_cutlass_moe_fp8(
+        a: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides1: torch.Tensor,
+        c_strides2: torch.Tensor,
+        w1_scale: torch.Tensor,
+        w2_scale: torch.Tensor,
+        a1_scale: torch.Tensor,
+        a2_scale: torch.Tensor,
+        num_repeats: int,
+    ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            per_act_token_quant=per_act_token,
+            per_out_ch_quant=per_out_ch,
+        )
+
+        for _ in range(num_repeats):
+            with nvtx.annotate("cutlass_moe_fp8", color="blue"):
+                cutlass_moe_fp8(
+                    a=a,
+                    w1_q=w1,
+                    w2_q=w2,
+                    topk_weights=topk_weights,
+                    topk_ids=topk_ids,
+                    ab_strides1=ab_strides1,
+                    ab_strides2=ab_strides2,
+                    c_strides1=c_strides1,
+                    c_strides2=c_strides2,
+                    quant_config=quant_config,
+                    activation="silu",
+                    global_num_experts=num_experts,
+                )
+
+    # Pre-create quantization config to avoid creating it inside CUDA graph
+    quant_config = fp8_w8a8_moe_quant_config(
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        a1_scale=a1_scale,
+        a2_scale=a2_scale,
+        per_act_token_quant=per_act_token,
+        per_out_ch_quant=per_out_ch,
+    )
+
+    # Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
+    cutlass_stream = torch.cuda.Stream()
+    cutlass_graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
+        # Capture 10 invocations like benchmark_moe.py
+        for _ in range(10):
+            cutlass_moe_fp8(
+                a=a,
+                w1_q=w1_fp8q_cutlass,
+                w2_q=w2_fp8q_cutlass,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                ab_strides1=ab_strides1,
+                ab_strides2=ab_strides2,
+                c_strides1=c_strides1,
+                c_strides2=c_strides2,
+                quant_config=quant_config,
+                activation="silu",
+                global_num_experts=num_experts,
+            )
+    torch.cuda.synchronize()
+
+    # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
+    triton_stream = torch.cuda.Stream()
+    triton_graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(triton_graph, stream=triton_stream):
+        # Capture 10 invocations like benchmark_moe.py
+        for _ in range(10):
+            fused_experts(
+                a,
+                w1_fp8q,
+                w2_fp8q,
+                topk_weights,
+                topk_ids,
+                quant_config=quant_config,
+            )
+    torch.cuda.synchronize()
+
+    def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
+        """Benchmark CUDA graph using events like benchmark_moe.py"""
+        # Warmup
+        for _ in range(num_warmup):
+            graph.replay()
+        torch.cuda.synchronize()
+
+        # Timing
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+
+        latencies = []
+        for _ in range(num_iters):
+            torch.cuda.synchronize()
+            start_event.record()
+            graph.replay()
+            end_event.record()
+            end_event.synchronize()
+            latencies.append(start_event.elapsed_time(end_event))
+
+        # Divide by 10 since graph contains 10 calls
+        return sum(latencies) / (num_iters * 10)
+
+    # Benchmark parameters
+    num_warmup = 5
+    num_iters = 100
+
+    # Benchmark only CUDA graphs (more reliable and faster)
+    # Benchmark Triton MoE with CUDA graphs
+    triton_graph_time = bench_cuda_graph(
+        triton_graph, num_warmup=num_warmup, num_iters=num_iters
+    )
+
+    # Benchmark CUTLASS MoE with CUDA graphs
+    cutlass_graph_time = bench_cuda_graph(
+        cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
+    )
+
+    # Convert ms to us and return results
+    triton_time_us = triton_graph_time * 1000
+    cutlass_time_us = cutlass_graph_time * 1000
+
+    return {
+        "batch_size": m,
+        "triton_time_us": triton_time_us,
+        "cutlass_time_us": cutlass_time_us,
+    }
+
+
+def main(args):
+    print("Benchmarking models:")
+    for i, model in enumerate(args.models):
+        print(f"[{i}]  {model}")
+
+    all_results = []
+
+    for model in args.models:
+        for tp in args.tp_sizes:
+            for layer in WEIGHT_SHAPES_MOE[model]:
+                num_experts = layer[0]
+                topk = layer[1]
+                size_k = layer[2]
+                size_n = layer[3] // tp
+
+                if len(args.limit_k) > 0 and size_k not in args.limit_k:
+                    continue
+
+                if len(args.limit_n) > 0 and size_n not in args.limit_n:
+                    continue
+
+                for per_act_token in args.per_act_token_opts:
+                    for per_out_ch in args.per_out_ch_opts:
+                        print(
+                            f"\n=== {model}, experts={num_experts}, topk={topk},"
+                            f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
+                        )
+
+                        config_results = []
+                        for size_m in args.batch_sizes:
+                            mkn = (size_m, size_k, size_n)
+                            result = bench_run(
+                                [],  # Not used anymore
+                                model,
+                                num_experts,
+                                topk,
+                                per_act_token,
+                                per_out_ch,
+                                mkn,
+                            )
+                            if result:
+                                config_results.append(result)
+
+                        # Print results table for this configuration
+                        if config_results:
+                            print(
+                                f"\n{'Batch Size':<12}"
+                                f"{'Triton (us)':<15}"
+                                f"{'CUTLASS (us)':<15}"
+                            )
+                            print("-" * 45)
+                            for result in config_results:
+                                print(
+                                    f"{result['batch_size']:<12}"
+                                    f"{result['triton_time_us']:<15.2f}"
+                                    f"{result['cutlass_time_us']:<15.2f}"
+                                )
+
+                            all_results.extend(config_results)
+
+    print(f"\nTotal benchmarks completed: {len(all_results)}")
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
+         across specified models/shapes/batches
+
+        Example usage:
+        python benchmark_cutlass_moe_fp8.py  \
+            --model "Llama-4-Maverick-17B-128E-Instruct-FP8"  \
+            --tp-sizes 8 \
+            --batch-size 2 4 8  \
+            --per-act-token-opts false \
+            --per-out-ch-opts false
+
+        """
+    )
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=DEFAULT_MODELS,
+        choices=WEIGHT_SHAPES_MOE.keys(),
+    )
+    parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
+    parser.add_argument(
+        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
+    )
+    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
+    parser.add_argument(
+        "--per-act-token-opts",
+        nargs="+",
+        type=lambda x: x.lower() == "true",
+        default=[False, True],
+        help="Per-activation token quantization options (true/false)",
+    )
+    parser.add_argument(
+        "--per-out-ch-opts",
+        nargs="+",
+        type=lambda x: x.lower() == "true",
+        default=[False, True],
+        help="Per-output channel quantization options (true/false)",
+    )
+
+    args = parser.parse_args()
+    main(args)
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@ -7,6 +7,10 @@ Benchmark script for device communicators:
 CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
 and SymmMemCommunicator (multimem, two-shot).

+for NCCL symmetric memory you need to set the environment variables
+NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1, otherwise NCCL does
+not use fast NVLS implementation for all reduce.
+
 Usage:
    torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]

@ -18,15 +22,21 @@ Example:
 import json
 import os
 import time
+from collections.abc import Callable
 from contextlib import nullcontext
-from typing import Callable, Optional

 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup

 from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
-from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+from vllm.distributed.device_communicators.pynccl import (
+    PyNcclCommunicator,
+    register_nccl_symmetric_ops,
+)
+from vllm.distributed.device_communicators.pynccl_allocator import (
+    set_graph_pool_id,
+)
 from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 from vllm.logger import init_logger
 from vllm.utils import FlexibleArgumentParser
@ -98,6 +108,7 @@ class CommunicatorBenchmark:
            )
            if not self.pynccl_comm.disabled:
                logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
+                register_nccl_symmetric_ops(self.pynccl_comm)
            else:
                logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
                self.pynccl_comm = None
@ -194,6 +205,15 @@ class CommunicatorBenchmark:
                    None,  # no env variable needed
                )
            )
+            communicators.append(
+                (
+                    "pynccl-symm",
+                    lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
+                    lambda t: True,  # Always available if initialized
+                    nullcontext(),
+                    None,  # no env variable needed
+                )
+            )

        if self.symm_mem_comm_multimem is not None:
            comm = self.symm_mem_comm_multimem
@ -244,12 +264,12 @@ class CommunicatorBenchmark:
    def benchmark_allreduce_single(
        self,
        sequence_length: int,
-        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+        allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
        should_use_fn: Callable[[torch.Tensor], bool],
        context,
        num_warmup: int,
        num_trials: int,
-    ) -> Optional[float]:
+    ) -> float | None:
        """Benchmark method with CUDA graph optimization."""
        try:
            # Create test tensor (2D: sequence_length x hidden_size)
@ -271,7 +291,9 @@ class CommunicatorBenchmark:
                # Capture the graph using context manager
                with context:
                    graph = torch.cuda.CUDAGraph()
-                    with torch.cuda.graph(graph):
+                    graph_pool = torch.cuda.graph_pool_handle()
+                    set_graph_pool_id(graph_pool)
+                    with torch.cuda.graph(graph, pool=graph_pool):
                        for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
                            allreduce_fn(graph_input)

--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@ -7,7 +7,8 @@ import torch

 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


@torch.inference_mode()
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@ -6,11 +6,12 @@ import copy
 import json
 import pickle
 import time
+from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum, auto
 from itertools import product
 from pathlib import Path
-from typing import Any, Callable, Optional
+from typing import Any

 import torch
 import torch.utils.benchmark as TBenchmark
@ -79,9 +80,9 @@ def make_rand_lora_weight_tensor(


 def make_rand_tensors(
-    a_shape: tuple[int],
-    b_shape: tuple[int],
-    c_shape: tuple[int],
+    a_shape: tuple[int, ...],
+    b_shape: tuple[int, ...],
+    c_shape: tuple[int, ...],
    a_dtype: torch.dtype,
    b_dtype: torch.dtype,
    c_dtype: torch.dtype,
@ -158,7 +159,7 @@ def ref_group_gemm(
    seq_lens_cpu: torch.Tensor,
    prompt_lora_mapping_cpu: torch.Tensor,
    scaling: float,
-    add_inputs: Optional[bool],
+    add_inputs: bool | None,
 ):
    """
    Torch group gemm reference implementation to test correctness of
@ -243,7 +244,7 @@ class OpType(Enum):
        lora_rank: int,
        num_loras: int,
        num_slices: int,
-    ) -> tuple[tuple[int], tuple[int], tuple[int]]:
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """
        Given num_slices, return the shapes of the A, B, and C matrices
        in A x B = C, for the op_type
@ -316,8 +317,8 @@ class BenchmarkContext:
    lora_rank: int
    sort_by_lora_id: bool
    dtype: torch.dtype
-    seq_length: Optional[int] = None
-    num_slices: Optional[int] = None  # num_slices for slice based ops
+    seq_length: int | None = None
+    num_slices: int | None = None  # num_slices for slice based ops

    def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
        ctx = copy.copy(self)
@ -561,7 +562,7 @@ class BenchmarkTensors:
        }

    def bench_fn_kwargs(
-        self, op_type: OpType, add_inputs: Optional[bool] = None
+        self, op_type: OpType, add_inputs: bool | None = None
    ) -> dict[str, Any]:
        if op_type.is_shrink_fn():
            assert add_inputs is None
@ -575,7 +576,7 @@ class BenchmarkTensors:
        raise ValueError(f"Unrecognized optype {self}")

    def test_correctness(
-        self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
+        self, op_type: OpType, expand_fn_add_inputs: bool | None
    ) -> bool:
        """
        Test correctness of op_type implementation against a grouped gemm
@ -611,8 +612,8 @@ def bench_optype(
    ctx: BenchmarkContext,
    arg_pool_size: int,
    op_type: OpType,
-    cuda_graph_nops: Optional[int] = None,
-    expand_fn_add_inputs: Optional[bool] = None,
+    cuda_graph_nops: int | None = None,
+    expand_fn_add_inputs: bool | None = None,
    test_correctness: bool = False,
 ) -> TMeasurement:
    assert arg_pool_size >= 1
@ -679,7 +680,7 @@ def bench_torch_mm(
    ctx: BenchmarkContext,
    arg_pool_size: int,
    op_type: OpType,
-    cuda_graph_nops: Optional[int] = None,
+    cuda_graph_nops: int | None = None,
 ) -> TMeasurement:
    """
    Benchmark basic torch.mm as a roofline.
@ -744,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
            """


-def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
+def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
    compare = TBenchmark.Compare(timers)
    compare.print()

--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@ -8,10 +8,9 @@ import math
 import os
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional

 import pandas as pd
 import torch
@ -63,23 +62,23 @@ class BenchmarkTensors:
    a: torch.Tensor

    w_q: torch.Tensor
-    group_size: Optional[int]
+    group_size: int | None
    wtype: ScalarType
    w_g_s: torch.Tensor
-    w_g_zp: Optional[torch.Tensor]
-    w_ch_s: Optional[torch.Tensor]
-    w_tok_s: Optional[torch.Tensor]
+    w_g_zp: torch.Tensor | None
+    w_ch_s: torch.Tensor | None
+    w_tok_s: torch.Tensor | None


@dataclass
 class TypeConfig:
    act_type: torch.dtype
    weight_type: ScalarType
-    output_type: Optional[torch.dtype]
-    group_scale_type: Optional[torch.dtype]
-    group_zero_type: Optional[torch.dtype]
-    channel_scale_type: Optional[torch.dtype]
-    token_scale_type: Optional[torch.dtype]
+    output_type: torch.dtype | None
+    group_scale_type: torch.dtype | None
+    group_zero_type: torch.dtype | None
+    channel_scale_type: torch.dtype | None
+    token_scale_type: torch.dtype | None


 def rand_data(shape, dtype=torch.float16, scale=1):
@ -93,8 +92,8 @@ def quantize_and_pack(
    atype: torch.dtype,
    w: torch.Tensor,
    wtype: ScalarType,
-    stype: Optional[torch.dtype],
-    group_size: Optional[int],
+    stype: torch.dtype | None,
+    group_size: int | None,
    zero_points: bool = False,
 ):
    assert wtype.is_integer(), "TODO: support floating point weights"
@ -113,7 +112,7 @@ def quantize_and_pack(


 def create_bench_tensors(
-    shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
+    shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
 ) -> list[BenchmarkTensors]:
    m, n, k = shape

@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
    return res


-_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
-_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
+_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None


 def bench(
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -579,18 +579,22 @@ def main(args: argparse.Namespace):
        E = config.ffn_config.moe_num_experts
        topk = config.ffn_config.moe_top_k
        intermediate_size = config.ffn_config.ffn_hidden_size
+        hidden_size = config.hidden_size
    elif config.architectures[0] == "JambaForCausalLM":
        E = config.num_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
+        hidden_size = config.hidden_size
    elif config.architectures[0] in (
-        "DeepseekV3ForCausalLM",
        "DeepseekV2ForCausalLM",
+        "DeepseekV3ForCausalLM",
+        "DeepseekV32ForCausalLM",
        "Glm4MoeForCausalLM",
    ):
        E = config.n_routed_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
+        hidden_size = config.hidden_size
    elif config.architectures[0] in (
        "Qwen2MoeForCausalLM",
        "Qwen3MoeForCausalLM",
@ -599,10 +603,18 @@ def main(args: argparse.Namespace):
        E = config.num_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
+        hidden_size = config.hidden_size
+    elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
+        text_config = config.get_text_config()
+        E = text_config.num_experts
+        topk = text_config.num_experts_per_tok
+        intermediate_size = text_config.moe_intermediate_size
+        hidden_size = text_config.hidden_size
    elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
        E = config.num_experts
        topk = config.moe_topk[0]
        intermediate_size = config.moe_intermediate_size[0]
+        hidden_size = config.hidden_size
    else:
        # Support for llama4
        config = config.get_text_config()
@ -610,6 +622,7 @@ def main(args: argparse.Namespace):
        E = config.num_local_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
+        hidden_size = config.hidden_size
    enable_ep = bool(args.enable_expert_parallel)
    if enable_ep:
        ensure_divisibility(E, args.tp_size, "Number of experts")
@ -618,8 +631,7 @@ def main(args: argparse.Namespace):
    else:
        ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    block_quant_shape = get_weight_block_size_safety(config)
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok

    hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    use_customized_permute = args.use_customized_permute
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@ -3,16 +3,15 @@

 import random
 import time
-from typing import Optional

 import torch

 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
    create_kv_caches_with_random,
 )

@ -37,7 +36,7 @@ def main(
    seed: int,
    do_profile: bool,
    device: str = "cuda",
-    kv_cache_dtype: Optional[str] = None,
+    kv_cache_dtype: str | None = None,
 ) -> None:
    current_platform.seed_everything(seed)

--- a/benchmarks/kernels/benchmark_per_token_group_quant.py
+++ b/benchmarks/kernels/benchmark_per_token_group_quant.py
@ -3,8 +3,8 @@

 import argparse
 import math
+from collections.abc import Callable
 from contextlib import contextmanager
-from typing import Callable
 from unittest.mock import patch

 import torch
--- a/benchmarks/kernels/benchmark_polynorm.py
+++ b/benchmarks/kernels/benchmark_polynorm.py
@ -1,155 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import itertools
-
-import torch
-
-from vllm import _custom_ops as vllm_ops
-from vllm.triton_utils import triton
-
-
-def polynorm_naive(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    def norm(x, eps: float):
-        return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
-
-    x = x.float()
-    return (
-        (
-            weight[0] * norm(x**3, eps)
-            + weight[1] * norm(x**2, eps)
-            + weight[2] * norm(x, eps)
-            + bias
-        )
-        .to(weight.dtype)
-        .view(orig_shape)
-    )
-
-
-def polynorm_vllm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    out = torch.empty_like(x)
-    vllm_ops.poly_norm(out, x, weight, bias, eps)
-    output = out
-
-    output = output.view(orig_shape)
-    return output
-
-
-def calculate_diff(batch_size, seq_len, hidden_dim):
-    dtype = torch.bfloat16
-    x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-    weight = torch.ones(3, dtype=dtype, device="cuda")
-    bias = torch.ones(1, dtype=dtype, device="cuda")
-
-    output_naive = polynorm_naive(x, weight, bias)
-    output_vllm = polynorm_vllm(x, weight, bias)
-
-    if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
-        print("✅ All implementations match")
-    else:
-        print("❌ Implementations differ")
-
-
-batch_size_range = [2**i for i in range(0, 7, 2)]
-seq_length_range = [2**i for i in range(6, 11, 1)]
-dim_range = [2048, 4096]
-configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
-
-
-def get_benchmark():
-    @triton.testing.perf_report(
-        triton.testing.Benchmark(
-            x_names=["dim", "batch_size", "seq_len"],
-            x_vals=[list(_) for _ in configs],
-            line_arg="provider",
-            line_vals=["naive", "vllm"],
-            line_names=["Naive", "vLLM"],
-            styles=[("blue", "-"), ("red", "-")],
-            ylabel="us",
-            plot_name="polynorm-perf",
-            args={},
-        )
-    )
-    def benchmark(dim, batch_size, seq_len, provider):
-        dtype = torch.bfloat16
-        hidden_dim = dim * 4
-
-        x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-        weight = torch.ones(3, dtype=dtype, device="cuda")
-        bias = torch.ones(1, dtype=dtype, device="cuda")
-
-        quantiles = [0.5, 0.2, 0.8]
-
-        if provider == "naive":
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_naive(x, weight, bias),
-                quantiles=quantiles,
-            )
-        else:
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_vllm(x, weight, bias),
-                quantiles=quantiles,
-            )
-
-        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
-
-    return benchmark
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=4,
-        help="Batch size",
-    )
-    parser.add_argument(
-        "--seq-len",
-        type=int,
-        default=128,
-        help="Sequence length",
-    )
-    parser.add_argument(
-        "--hidden-dim",
-        type=int,
-        default=8192,
-        help="Intermediate size of MLP",
-    )
-    parser.add_argument(
-        "--save-path",
-        type=str,
-        default="./configs/polnorm/",
-        help="Path to save polnorm benchmark results",
-    )
-
-    args = parser.parse_args()
-
-    # Run correctness test
-    calculate_diff(
-        batch_size=args.batch_size,
-        seq_len=args.seq_len,
-        hidden_dim=args.hidden_dim,
-    )
-
-    benchmark = get_benchmark()
-    # Run performance benchmark
-    benchmark.run(print_data=True, save_path=args.save_path)
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@ -7,7 +7,8 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


@torch.inference_mode()
--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+import time
+
+import torch
+from tabulate import tabulate
+
+from vllm import _custom_ops as ops
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
+    STR_DTYPE_TO_TORCH_DTYPE,
+    create_kv_caches_with_random,
+)
+
+logger = init_logger(__name__)
+
+
+@torch.inference_mode()
+def run_benchmark(
+    num_tokens: int,
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    num_iters: int,
+    benchmark_mode: str,
+    device: str = "cuda",
+) -> float:
+    """Return latency (seconds) for given num_tokens."""
+
+    if kv_cache_dtype == "fp8" and head_size % 16:
+        raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
+
+    current_platform.seed_everything(42)
+    torch.set_default_device(device)
+
+    # create random key / value tensors [T, H, D].
+    key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
+    value = torch.randn_like(key)
+
+    # prepare the slot mapping.
+    # each token is assigned a unique slot in the KV-cache.
+    num_slots = block_size * num_blocks
+    if num_tokens > num_slots:
+        raise ValueError("num_tokens cannot exceed the total number of cache slots")
+    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
+
+    key_caches, value_caches = create_kv_caches_with_random(
+        num_blocks,
+        block_size,
+        1,  # num_layers
+        num_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+        device=device,
+    )
+    key_cache, value_cache = key_caches[0], value_caches[0]
+    # to free unused memory
+    del key_caches, value_caches
+
+    # compute per-kernel scaling factors for fp8 conversion (if used).
+    k_scale = (key.amax() / 64.0).to(torch.float32)
+    v_scale = (value.amax() / 64.0).to(torch.float32)
+
+    function_under_test = lambda: ops.reshape_and_cache(
+        key,  # noqa: F821
+        value,  # noqa: F821
+        key_cache,  # noqa: F821
+        value_cache,  # noqa: F821
+        slot_mapping,  # noqa: F821
+        kv_cache_dtype,
+        k_scale,
+        v_scale,
+    )
+
+    if benchmark_mode == "cudagraph":
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            function_under_test()
+        torch.cuda.synchronize()
+        function_under_test = lambda: g.replay()
+
+    def run_cuda_benchmark(n_iters: int) -> float:
+        nonlocal key, value, key_cache, value_cache, slot_mapping
+        torch.cuda.synchronize()
+        start = time.perf_counter()
+        for _ in range(n_iters):
+            function_under_test()
+            torch.cuda.synchronize()
+        end = time.perf_counter()
+        return (end - start) / n_iters
+
+    # warm-up
+    run_cuda_benchmark(3)
+
+    lat = run_cuda_benchmark(num_iters)
+
+    # free tensors to mitigate OOM when sweeping
+    del key, value, key_cache, value_cache, slot_mapping
+    torch.cuda.empty_cache()
+
+    return lat
+
+
+def main(args):
+    rows = []
+    for exp in range(1, 17):
+        n_tok = 2**exp
+        lat = run_benchmark(
+            num_tokens=n_tok,
+            num_heads=args.num_heads,
+            head_size=args.head_size,
+            block_size=args.block_size,
+            num_blocks=args.num_blocks,
+            dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+            kv_cache_dtype=args.kv_cache_dtype,
+            num_iters=args.iters,
+            benchmark_mode=args.mode,
+            device="cuda",
+        )
+        rows.append([n_tok, lat * 1e6])  # convert to microseconds
+
+    print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
+    print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+
+    parser.add_argument("--num-heads", type=int, default=128)
+    parser.add_argument(
+        "--head-size",
+        type=int,
+        choices=[64, 80, 96, 112, 120, 128, 192, 256],
+        default=128,
+    )
+    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
+    parser.add_argument("--num-blocks", type=int, default=128 * 128)
+
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=["half", "bfloat16", "float"],
+        default="bfloat16",
+    )
+
+    parser.add_argument(
+        "--kv-cache-dtype",
+        type=str,
+        choices=["auto", "fp8"],
+        default="auto",
+    )
+
+    parser.add_argument("--iters", type=int, default=200)
+
+    parser.add_argument(
+        "--mode",
+        type=str,
+        choices=["cudagraph", "no_graph"],
+        default="cudagraph",
+    )
+
+    args = parser.parse_args()
+
+    main(args)
--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time

@ -9,11 +7,14 @@ import torch
 from tabulate import tabulate

 from vllm import _custom_ops as ops
+from vllm.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash,
+)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
    create_kv_caches_with_random_flash,
 )

@ -31,6 +32,8 @@ def run_benchmark(
    kv_cache_dtype: str,
    kv_cache_layout: str,
    num_iters: int,
+    implementation: str,
+    benchmark_mode: str,
    device: str = "cuda",
 ) -> float:
    """Return latency (seconds) for given num_tokens."""
@ -38,6 +41,14 @@ def run_benchmark(
    if kv_cache_dtype == "fp8" and head_size % 16:
        raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")

+    if implementation not in ("cuda", "triton"):
+        raise ValueError(
+            f"Unsupported implementation: {implementation}. "
+            "Only 'cuda' and 'triton' are supported."
+        )
+    if implementation == "triton" and kv_cache_layout == "HND":
+        return float("nan")  # Triton does not support HND layout yet.
+
    current_platform.seed_everything(42)
    torch.set_default_device(device)

@ -65,27 +76,49 @@ def run_benchmark(
        cache_layout=kv_cache_layout,
    )
    key_cache, value_cache = key_caches[0], value_caches[0]
+    # to free unused memory
+    del key_caches, value_caches

    # compute per-kernel scaling factors for fp8 conversion (if used).
    k_scale = (key.amax() / 64.0).to(torch.float32)
    v_scale = (value.amax() / 64.0).to(torch.float32)

+    if implementation == "cuda":
+        function_under_test = lambda: ops.reshape_and_cache_flash(
+            key,  # noqa: F821
+            value,  # noqa: F821
+            key_cache,  # noqa: F821
+            value_cache,  # noqa: F821
+            slot_mapping,  # noqa: F821
+            kv_cache_dtype,
+            k_scale,
+            v_scale,
+        )
+    else:
+        function_under_test = lambda: triton_reshape_and_cache_flash(
+            key,  # noqa: F821
+            value,  # noqa: F821
+            key_cache,  # noqa: F821
+            value_cache,  # noqa: F821
+            slot_mapping,  # noqa: F821
+            kv_cache_dtype,
+            k_scale,
+            v_scale,
+        )
+    if benchmark_mode == "cudagraph":
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            function_under_test()
+        torch.cuda.synchronize()
+        function_under_test = lambda: g.replay()
+
    def run_cuda_benchmark(n_iters: int) -> float:
        nonlocal key, value, key_cache, value_cache, slot_mapping
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(n_iters):
-            ops.reshape_and_cache_flash(
-                key,
-                value,
-                key_cache,
-                value_cache,
-                slot_mapping,
-                kv_cache_dtype,
-                k_scale,
-                v_scale,
-            )
-        torch.cuda.synchronize()
+            function_under_test()
+            torch.cuda.synchronize()
        end = time.perf_counter()
        return (end - start) / n_iters

@ -116,10 +149,16 @@ def main(args):
                kv_cache_dtype=args.kv_cache_dtype,
                kv_cache_layout=layout,
                num_iters=args.iters,
+                implementation=args.implementation,
+                benchmark_mode=args.mode,
                device="cuda",
            )
            rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])

+    print(
+        f"Benchmark results for implementation {args.implementation}"
+        f" (measuring with {args.mode}):"
+    )
    print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))


@ -151,6 +190,21 @@ if __name__ == "__main__":
    )

    parser.add_argument("--iters", type=int, default=100)
+
+    parser.add_argument(
+        "--implementation",
+        type=str,
+        choices=["cuda", "triton"],
+        default="cuda",
+    )
+
+    parser.add_argument(
+        "--mode",
+        type=str,
+        choices=["cudagraph", "no_graph"],
+        default="cudagraph",
+    )
+
    args = parser.parse_args()

    main(args)
--- a/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import itertools
-from typing import Optional, Union

 import torch
 from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
    def forward(
        self,
        x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        orig_dtype = x.dtype
        x = x.to(torch.float32)
        if residual is not None:
@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module):
 def rmsnorm_naive(
    x: torch.Tensor,
    weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
    eps: float = 1e-6,
 ):
    naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
@ -65,7 +64,7 @@ def rmsnorm_naive(
 def rmsnorm_flashinfer(
    x: torch.Tensor,
    weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
    eps: float = 1e-6,
 ):
    orig_shape = x.shape
@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
 def rmsnorm_vllm(
    x: torch.Tensor,
    weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
    eps: float = 1e-6,
 ):
    orig_shape = x.shape
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from itertools import accumulate
-from typing import Optional

 import nvtx
 import torch
@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
    seq_len: int,
    num_heads: int,
    head_size: int,
-    rotary_dim: Optional[int],
+    rotary_dim: int | None,
    dtype: torch.dtype,
    seed: int,
    device: str,
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@ -1,5 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Comprehensive 3-way SiLU Benchmark Suite
+
+This benchmark compares three SiLU implementations:
+1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation
+2. Triton Kernel - Triton-based implementation
+
+The suite generates detailed performance comparisons including:
+- Memory bandwidth utilization
+- Speedup ratios (baseline vs optimized implementations)
+- Performance across different expert configurations and token distributions
+"""
+
 from collections.abc import Callable

 import matplotlib.pyplot as plt
@ -7,7 +21,7 @@ import numpy as np
 import torch

 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    silu_mul_fp8_quant_deep_gemm_cuda,
+    persistent_masked_m_silu_mul_quant,
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
@ -94,6 +108,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(
    num_parallel_tokens,
    group_size: int = 128,
    eps: float = 1e-10,
+    expert_offsets: torch.Tensor = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales

@ -174,7 +189,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(


 # Parse generation strategies
-strategies = ["uniform", "max_t", "first_t"]
+strategies = ["random_imbalanced", "uniform", "max_t"]


 def benchmark(
@ -195,15 +210,27 @@ def benchmark(
        current_platform.seed_everything(42 + seed_offset)
        y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()

-        if gen_strategy == "uniform":
-            r = torch.rand(size=(E,), device="cuda")
+        if gen_strategy == "random_imbalanced":
+
+            def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"):
+                mean = total_tokens // n_e
+                min_max = mean // ratio
+                e = torch.ones(size=(E,), dtype=torch.int64, device=device) * mean
+                e[0] = min_max
+                r = torch.rand(size=(E - 1,))
+                r /= r.sum()
+                r *= total_tokens - min_max
+                r = r.round().long()
+                e[1:] = r.to(device=device)
+                return e
+
+            tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda")
+        elif gen_strategy == "uniform":
+            r = torch.rand(size=(E,))
            r /= r.sum()
            r *= total_tokens
-            tokens_per_expert = r.int()
-            tokens_per_expert = torch.minimum(
-                tokens_per_expert,
-                torch.ones((E,), device=r.device, dtype=torch.int) * T,
-            )
+            r = r.round().long()
+            tokens_per_expert = r
        elif gen_strategy == "max_t":
            tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
            tokens_per_expert.fill_(total_tokens / E)
@ -281,40 +308,34 @@ def benchmark(


 def create_comparison_plot(
-    ratio, cuda_times, baseline_times, config_labels, strategy_name, id
+    ratios, silu_v2_times, triton_times, config_labels, strategy_name, id
 ):
-    """Create a comparison plot for a specific generation strategy"""
-    fig, ax = plt.subplots(1, 1, figsize=(16, 6))
+    fig, ax = plt.subplots(1, 1, figsize=(18, 6))

    # Configure x-axis positions
    x = np.arange(len(config_labels))
-    width = 0.35
+    width = 0.25

    # Execution Time plot (lower is better)
+    ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue")
    ax.bar(
-        x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
-    )
-    ax.bar(
-        x + width / 2,
-        baseline_times,
-        width,
-        label="Baseline",
-        alpha=0.8,
-        color="orange",
+        x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green"
    )

-    # Add speedup labels over each bar pair
+    # Add speedup labels over each bar trio
    for i in range(len(x)):
-        speedup = ratio[i]
-        max_height = max(cuda_times[i], baseline_times[i])
+        triton_v2_speedup = ratios[i][1]  # triton/v2
+        max_height = max(silu_v2_times[i], triton_times[i])
+
+        # Triton/V2 speedup
        ax.text(
-            x[i],
+            x[i] + width / 2,
            max_height + max_height * 0.02,
-            f"{speedup:.2f}x",
+            f"{triton_v2_speedup:.2f}x",
            ha="center",
            va="bottom",
            fontweight="bold",
-            fontsize=9,
+            fontsize=8,
        )

    ax.set_xlabel("Configuration")
@ -332,56 +353,75 @@ def create_comparison_plot(


 def create_combined_plot(all_results):
-    """Create a combined plot with all strategies in one PNG"""
    num_strategies = len(all_results)
-    fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
+    fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies))

    if num_strategies == 1:
        axes = [axes]

    for idx, (
        strategy_name,
-        ratio,
-        cuda_times,
-        baseline_times,
+        all_ratios,
+        all_silu_v2_results,
+        all_triton_results,
        config_labels,
+        config_x_axis,
    ) in enumerate(all_results):
        ax = axes[idx]

+        # Flatten the nested results to get bandwidth percentages for plotting
+        silu_v2_bandwidths = []
+        triton_bandwidths = []
+        flat_ratios = []
+
+        for config_results in all_silu_v2_results:
+            for result in config_results:
+                silu_v2_bandwidths.append(result[3])  # bandwidth percentage
+
+        for config_results in all_triton_results:
+            for result in config_results:
+                triton_bandwidths.append(result[3])  # bandwidth percentage
+
+        for config_ratios in all_ratios:
+            for ratio in config_ratios:
+                flat_ratios.append(ratio)
+
        # Configure x-axis positions
        x = np.arange(len(config_labels))
-        width = 0.35
+        width = 0.25

-        # Execution Time plot (lower is better)
+        # Bandwidth utilization plot (higher is better)
        ax.bar(
-            x - width / 2,
-            cuda_times,
+            x,
+            silu_v2_bandwidths,
            width,
-            label="CUDA Kernel",
+            label="SiLU V2 (CUDA)",
            alpha=0.8,
            color="blue",
        )
        ax.bar(
-            x + width / 2,
-            baseline_times,
+            x + width,
+            triton_bandwidths,
            width,
-            label="Baseline",
+            label="Triton Kernel",
            alpha=0.8,
-            color="orange",
+            color="green",
        )

-        # Add speedup labels over each bar pair
+        # Add speedup labels over each bar trio
        for i in range(len(x)):
-            speedup = ratio[i]
-            max_height = max(cuda_times[i], baseline_times[i])
+            triton_v2_speedup = flat_ratios[i]  # triton/v2
+            max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i])
+
+            # Triton/V2 speedup
            ax.text(
-                x[i],
+                x[i] + width / 2,
                max_height + max_height * 0.02,
-                f"{speedup:.2f}x",
+                f"{triton_v2_speedup:.2f}x",
                ha="center",
                va="bottom",
                fontweight="bold",
-                fontsize=9,
+                fontsize=8,
            )

        ax.set_xlabel("Configuration")
@ -395,7 +435,7 @@ def create_combined_plot(all_results):
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
-    filename = "../../silu_bench/silu_benchmark_combined.png"
+    filename = "silu_benchmark_combined_3way.png"
    plt.savefig(filename, dpi=300, bbox_inches="tight")
    plt.show()

@ -405,7 +445,9 @@ def create_combined_plot(all_results):
 outer_dim = 7168
 configs = [
    # DeepSeekV3 Configs
+    # (1, 56, 7168),
    (8, 1024, 7168),
+    # (32, 56, 7168),
    # DeepSeekV3 Configs
    (32, 1024, 7168),
    # DeepSeekV3 Configs
@ -417,6 +459,7 @@ num_warmups = 20

 strategy_descriptions = {
    "uniform": "Uniform Random",
+    "random_imbalanced": "Imbalanced Random",
    "max_t": "Even Assignment",
    "first_t": "experts[0] = T, experts[1:] = 0",
 }
@ -433,28 +476,31 @@ for id, strategy in enumerate(strategies):
    print(f"Testing strategy: {strategy_descriptions[strategy]}")
    print(f"{'=' * 60}")

-    # Collect benchmark data for both algorithms
+    # Collect benchmark data for all three algorithms
    config_labels = []
    config_x_axis = []
-    all_cuda_results = []
-    all_baseline_results = []
+    all_silu_v2_results = []
+    all_triton_results = []
    all_ratios = []

    for E, T, H in configs:
-        total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
+        total_tokens_config = []
+        for i in [8, 16, 32, 64, 128, 256, 512]:
+            if i <= T:
+                total_tokens_config.append(i * E)
        config_x_axis.append(total_tokens_config)

-        cuda_results = []
-        baseline_results = []
+        silu_v2_results = []
+        triton_results = []
        ratios = []

        for total_tokens in total_tokens_config:
            config_label = f"E={E},T={T},H={H},TT={total_tokens}"
            config_labels.append(config_label)

-            # CUDA kernel results
-            time_ms_cuda, gflops, gbps, perc = benchmark(
-                silu_mul_fp8_quant_deep_gemm_cuda,
+            # SiLU V2 (CUDA kernel) results
+            time_ms_silu_v2, gflops, gbps, perc = benchmark(
+                persistent_masked_m_silu_mul_quant,
                E,
                T,
                H,
@ -463,9 +509,9 @@ for id, strategy in enumerate(strategies):
                num_warmups=num_warmups,
                gen_strategy=strategy,
            )
-            cuda_results.append((time_ms_cuda, gflops, gbps, perc))
+            silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc))

-            # Baseline results
+            # Triton kernel results
            time_ms_triton, gflops, gbps, perc = benchmark(
                silu_mul_fp8_quant_deep_gemm_triton,
                E,
@ -476,12 +522,20 @@ for id, strategy in enumerate(strategies):
                num_warmups=num_warmups,
                gen_strategy=strategy,
            )
-            baseline_results.append((time_ms_triton, gflops, gbps, perc))
-            ratios.append(time_ms_triton / time_ms_cuda)
+            triton_results.append((time_ms_triton, gflops, gbps, perc))

-            print(f"Completed: {config_label}")
-        all_cuda_results.append(cuda_results)
-        all_baseline_results.append(baseline_results)
+            # Calculate speedup ratios (triton baseline / implementation)
+            triton_v2_ratio = time_ms_triton / time_ms_silu_v2
+            ratios.append(triton_v2_ratio)
+
+            print(
+                f"Completed: {config_label}:"
+                f" V2: {time_ms_silu_v2:.3f}ms,"
+                f" Triton: {time_ms_triton:.3f}ms"
+            )
+
+        all_silu_v2_results.append(silu_v2_results)
+        all_triton_results.append(triton_results)
        all_ratios.append(ratios)

    # Store results for combined plotting
@ -489,8 +543,8 @@ for id, strategy in enumerate(strategies):
        (
            strategy_descriptions[strategy],
            all_ratios,
-            all_cuda_results,
-            all_baseline_results,
+            all_silu_v2_results,
+            all_triton_results,
            config_labels,
            config_x_axis,
        )
@ -498,15 +552,18 @@ for id, strategy in enumerate(strategies):

    # Print summary table for this strategy
    print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
-    print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
-    print("-" * 60)
+    print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}")
+    print("-" * 90)

    for i, (E, T, H) in enumerate(configs):
-        speedup = baseline_results[i][0] / cuda_results[i][0]
+        # Get the first result for each config (simplifying for summary)
+        v2_time = silu_v2_results[i][0]
+        triton_time = triton_results[i][0]
+        triton_v2_speedup = triton_time / v2_time
        config_label = f"E={E:3d},T={T:4d},H={H:4d}"
        print(
-            f"{config_label:<20} {cuda_results[i][0]:8.5f} "
-            f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
+            f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} "
+            f"{triton_v2_speedup:8.2f}x"
        )


@ -514,15 +571,14 @@ def create_total_tokens_plot(all_results):
    num_strategies = len(all_results)
    num_configs = len(configs)

-    # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
    fig, axs = plt.subplots(
-        num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
+        num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies)
    )

    # Add main title to the entire figure
    fig.suptitle(
-        "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
-        fontsize=16,
+        "Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)",
+        fontsize=18,
        fontweight="bold",
        y=0.98,
    )
@ -539,8 +595,8 @@ def create_total_tokens_plot(all_results):
        (
            strategy_name,
            all_ratios,
-            all_cuda_results,
-            all_baseline_results,
+            all_silu_v2_results,
+            all_triton_results,
            config_labels,
            config_x_axis,
        ) = result
@ -555,42 +611,54 @@ def create_total_tokens_plot(all_results):
            ratios = all_ratios[config_idx]
            total_tokens_values = config_x_axis[config_idx]

-            # Extract CUDA and Triton bandwidth percentages
-            cuda_bandwidth_percentages = [
-                result[3] for result in all_cuda_results[config_idx]
+            # Extract speedup ratios
+            triton_v2_ratios = [ratio for ratio in ratios]
+
+            # Extract bandwidth percentages for all implementations
+            v2_bandwidth_percentages = [
+                result[3] for result in all_silu_v2_results[config_idx]
            ]
            triton_bandwidth_percentages = [
-                result[3] for result in all_baseline_results[config_idx]
+                result[3] for result in all_triton_results[config_idx]
            ]

            # Plot speedup ratios vs total tokens (left plot)
            ax_speedup.plot(
-                total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
+                total_tokens_values,
+                triton_v2_ratios,
+                "go-",
+                linewidth=3,
+                markersize=8,
+                label="Triton/V2 Speedup",
            )
            ax_speedup.set_title(
-                f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
+                f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}",
                fontsize=12,
                fontweight="bold",
            )
            ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
            ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
+            ax_speedup.legend(prop={"weight": "bold"})
            ax_speedup.grid(True, alpha=0.3)

+            # Plot bandwidth utilization (right plot)
            ax_bandwidth.plot(
                total_tokens_values,
-                cuda_bandwidth_percentages,
-                "ro-",
+                v2_bandwidth_percentages,
+                "o-",
                linewidth=3,
                markersize=8,
-                label="CUDA",
+                label="SiLU V2",
+                color="blue",
            )
            ax_bandwidth.plot(
                total_tokens_values,
                triton_bandwidth_percentages,
-                "go-",
+                "o-",
                linewidth=3,
                markersize=8,
                label="Triton",
+                color="green",
            )
            ax_bandwidth.set_title(
                f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
@ -618,38 +686,12 @@ def create_total_tokens_plot(all_results):
                for label in ax.get_xticklabels() + ax.get_yticklabels():
                    label.set_fontweight("bold")

-            # Add value labels on speedup points
-            for x, y in zip(total_tokens_values, ratios):
+            # Add value labels on Triton/V2 speedup points
+            for x, y in zip(total_tokens_values, triton_v2_ratios):
                ax_speedup.annotate(
                    f"{y:.2f}x",
                    (x, y),
                    textcoords="offset points",
-                    xytext=(0, 12),
-                    ha="center",
-                    fontsize=10,
-                    fontweight="bold",
-                    bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
-                )
-
-            # Add value labels on CUDA bandwidth points
-            for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
-                ax_bandwidth.annotate(
-                    f"{y:.1f}%",
-                    (x, y),
-                    textcoords="offset points",
-                    xytext=(0, 12),
-                    ha="center",
-                    fontsize=9,
-                    fontweight="bold",
-                    bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
-                )
-
-            # Add value labels on Triton bandwidth points
-            for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
-                ax_bandwidth.annotate(
-                    f"{y:.1f}%",
-                    (x, y),
-                    textcoords="offset points",
                    xytext=(0, -15),
                    ha="center",
                    fontsize=9,
@ -659,17 +701,20 @@ def create_total_tokens_plot(all_results):

    plt.tight_layout()
    plt.subplots_adjust(top=0.93)  # Make room for main title
-    filename = "silu_benchmark_total_tokens.png"
+    filename = "silu_benchmark_total_tokens_3way.png"
    plt.savefig(filename, dpi=300, bbox_inches="tight")
    plt.show()

    return filename


-# Create combined plot with all strategies
-combined_plot_filename = create_total_tokens_plot(all_results)
+# Create comprehensive 3-way comparison plots
+combined_plot_filename = create_combined_plot(all_results)
+total_tokens_plot_filename = create_total_tokens_plot(all_results)

-print(f"\n{'=' * 60}")
-print("Benchmark Complete!")
-print(f"Generated combined plot: {combined_plot_filename}")
-print(f"{'=' * 60}")
+print(f"\n{'=' * 80}")
+print("3-Way Benchmark Suite Complete!")
+print(f"Generated combined comparison plot: {combined_plot_filename}")
+print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}")
+print("Compared: SiLU V2 (CUDA), and Triton implementations")
+print(f"{'=' * 80}")
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@ -4,7 +4,6 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional

 import flashinfer
 import torch
@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad()
 def benchmark_decode(
    dtype: torch.dtype,
-    quant_dtypes: tuple[
-        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-    ],
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
    batch_size: int,
    max_seq_len: int,
    num_heads: tuple[int, int] = (64, 8),
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@ -4,7 +4,6 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional

 import flashinfer
 import torch
@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad()
 def benchmark_prefill(
    dtype: torch.dtype,
-    quant_dtypes: tuple[
-        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-    ],
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
    batch_size: int,
    max_seq_len: int,
    num_heads: tuple[int, int] = (64, 8),
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@ -11,13 +11,13 @@ from datetime import datetime
 from typing import Any

 import torch
-import triton
 from tqdm import tqdm

 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    _w8a8_block_fp8_matmul,
+    _w8a8_triton_block_scaled_mm,
 )
 from vllm.platforms import current_platform
+from vllm.triton_utils import triton
 from vllm.utils import FlexibleArgumentParser

 mp.set_start_method("spawn", force=True)
@ -83,7 +83,7 @@ def w8a8_block_matmul(
        )

    if A.dtype == torch.float8_e4m3fn:
-        kernel = _w8a8_block_fp8_matmul
+        kernel = _w8a8_triton_block_scaled_mm
    else:
        raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")

--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# fmt: off
 # ruff: noqa: E501
 import time

@ -8,27 +7,33 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    get_col_major_tma_aligned_tensor,
    per_token_group_quant_fp8,
-    w8a8_block_fp8_matmul,
+    w8a8_triton_block_scaled_mm,
 )
 from vllm.triton_utils import triton
-from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
+from vllm.utils.deep_gemm import (
+    calc_diff,
+    fp8_gemm_nt,
+    get_col_major_tma_aligned_tensor,
+    per_block_cast_to_fp8,
+)


-def benchmark_shape(m: int,
-                    n: int,
-                    k: int,
-                    warmup: int = 100,
-                    repeat: int = 10000,
-                    verbose: bool = False) -> dict:
+def benchmark_shape(
+    m: int,
+    n: int,
+    k: int,
+    warmup: int = 100,
+    repeat: int = 10000,
+    verbose: bool = False,
+) -> dict:
    """Benchmark all implementations for a specific (m, n, k) shape."""
    if verbose:
        print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")

    # Create test tensors
-    A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
-    B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
+    A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
+    B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)

    # Reference result in BF16
    torch.cuda.synchronize()
@ -45,34 +50,39 @@ def benchmark_shape(m: int,
    # Pre-quantize A for all implementations
    A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
    A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
-    C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
+    C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
    A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
    A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
-        A, block_size[1], column_major_scales=True)
+        A, block_size[1], column_major_scales=True
+    )

    # === DeepGEMM Implementation ===
    def deepgemm_gemm():
-        fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
-                                       (B_deepgemm, B_scale_deepgemm),
-                                       C_deepgemm)
+        fp8_gemm_nt(
+            (A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
+        )
        return C_deepgemm

    # === vLLM Triton Implementation ===
    def vllm_triton_gemm():
-        return w8a8_block_fp8_matmul(A_vllm,
-                                     B_vllm,
-                                     A_scale_vllm,
-                                     B_scale_vllm,
-                                     block_size,
-                                     output_dtype=torch.bfloat16)
+        return w8a8_triton_block_scaled_mm(
+            A_vllm,
+            B_vllm,
+            A_scale_vllm,
+            B_scale_vllm,
+            block_size,
+            output_dtype=torch.bfloat16,
+        )

    # === vLLM CUTLASS Implementation ===
    def vllm_cutlass_gemm():
-        return ops.cutlass_scaled_mm(A_vllm_cutlass,
-                                     B_vllm.T,
-                                     scale_a=A_scale_vllm_cutlass,
-                                     scale_b=B_scale_vllm.T,
-                                     out_dtype=torch.bfloat16)
+        return ops.cutlass_scaled_mm(
+            A_vllm_cutlass,
+            B_vllm.T,
+            scale_a=A_scale_vllm_cutlass,
+            scale_b=B_scale_vllm.T,
+            out_dtype=torch.bfloat16,
+        )

    # Run correctness check first
    if verbose:
@ -89,26 +99,23 @@ def benchmark_shape(m: int,
        print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
        print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
        print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
-        print("vLLM Triton vs DeepGEMM difference: "
-              f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
-        print("vLLM CUTLASS vs DeepGEMM difference: "
-              f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
+        print(
+            "vLLM Triton vs DeepGEMM difference: "
+            f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
+        )
+        print(
+            "vLLM CUTLASS vs DeepGEMM difference: "
+            f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
+        )

    # Benchmark implementations
    implementations = {
        "DeepGEMM": deepgemm_gemm,
        "vLLM Triton": vllm_triton_gemm,
-        "vLLM CUTLASS": vllm_cutlass_gemm
+        "vLLM CUTLASS": vllm_cutlass_gemm,
    }

-    benchmark_results = {
-        "shape": {
-            "m": m,
-            "n": n,
-            "k": k
-        },
-        "implementations": {}
-    }
+    benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}

    for name, func in implementations.items():
        # Warmup
@ -136,38 +143,36 @@ def benchmark_shape(m: int,
            "tflops": tflops,
            "gb_s": gb_s,
            "diff": {
-                "DeepGEMM":
-                0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
-                "Reference":
-                deepgemm_diff if name == "DeepGEMM" else
-                (vllm_triton_diff
-                 if name == "vLLM Triton" else vllm_cutlass_diff)
-            }
+                "DeepGEMM": 0.0
+                if name == "DeepGEMM"
+                else calc_diff(func(), C_deepgemm),
+                "Reference": deepgemm_diff
+                if name == "DeepGEMM"
+                else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
+            },
        }

        if verbose:
-            print(
-                f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
-            )
+            print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")

    # Calculate speedups
    baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
    for name, data in benchmark_results["implementations"].items():
        if name != "DeepGEMM":
            speedup = baseline / data["time_ms"]
-            benchmark_results["implementations"][name][
-                "speedup_vs_deepgemm"] = speedup
+            benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
            if verbose:
-                print(f"DeepGEMM is {1/speedup:.2f}x "
-                      f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
+                print(
+                    f"DeepGEMM is {1 / speedup:.2f}x "
+                    f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
+                )

-    vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
-        "time_ms"]
-    vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
-        "time_ms"]
+    vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
+    vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
    cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
-    benchmark_results["implementations"]["vLLM CUTLASS"][
-        "speedup_vs_triton"] = cutlass_vs_triton
+    benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
+        cutlass_vs_triton
+    )
    if verbose:
        print(
            f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
@ -179,8 +184,7 @@ def benchmark_shape(m: int,

 def format_table_row(values, widths):
    """Format a row with specified column widths."""
-    return "| " + " | ".join(f"{val:{w}}"
-                             for val, w in zip(values, widths)) + " |"
+    return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"


 def print_table(headers, rows, title=None):
@ -288,38 +292,50 @@ def run_benchmarks(verbose: bool = False):
    for result in all_results:
        shape = result["shape"]
        impl_data = result["implementations"]["DeepGEMM"]
-        deepgemm_rows.append([
-            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
-            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
-        ])
+        deepgemm_rows.append(
+            [
+                shape["m"],
+                shape["n"],
+                shape["k"],
+                f"{impl_data['time_us']:.1f}",
+                f"{impl_data['tflops']:.1f}",
+                f"{impl_data['gb_s']:.1f}",
+            ]
+        )

-    print_table(deepgemm_headers,
-                deepgemm_rows,
-                title="DeepGEMM Implementation:")
+    print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")

    # Print vLLM Triton table
-    triton_headers = [
-        "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
-    ]
+    triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
    triton_rows = []
    for result in all_results:
        shape = result["shape"]
        impl_data = result["implementations"]["vLLM Triton"]
        speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
-        triton_rows.append([
-            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
-            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
-            format_speedup(speedup)
-        ])
+        triton_rows.append(
+            [
+                shape["m"],
+                shape["n"],
+                shape["k"],
+                f"{impl_data['time_us']:.1f}",
+                f"{impl_data['tflops']:.1f}",
+                f"{impl_data['gb_s']:.1f}",
+                format_speedup(speedup),
+            ]
+        )

-    print_table(triton_headers,
-                triton_rows,
-                title="vLLM Triton Implementation:")
+    print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")

    # Print vLLM CUTLASS table
    cutlass_headers = [
-        "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
-        "vs Triton"
+        "m",
+        "n",
+        "k",
+        "Time (μs)",
+        "TFLOPS",
+        "GB/s",
+        "vs DeepGEMM",
+        "vs Triton",
    ]
    cutlass_rows = []
    for result in all_results:
@ -327,28 +343,27 @@ def run_benchmarks(verbose: bool = False):
        impl_data = result["implementations"]["vLLM CUTLASS"]
        vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
        vs_triton = impl_data.get("speedup_vs_triton", 1.0)
-        cutlass_rows.append([
-            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
-            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
-            format_speedup(vs_deepgemm),
-            format_speedup(vs_triton)
-        ])
+        cutlass_rows.append(
+            [
+                shape["m"],
+                shape["n"],
+                shape["k"],
+                f"{impl_data['time_us']:.1f}",
+                f"{impl_data['tflops']:.1f}",
+                f"{impl_data['gb_s']:.1f}",
+                format_speedup(vs_deepgemm),
+                format_speedup(vs_triton),
+            ]
+        )

-    print_table(cutlass_headers,
-                cutlass_rows,
-                title="vLLM CUTLASS Implementation:")
+    print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")

    # Calculate and print averages
    print("\n===== AVERAGE PERFORMANCE =====")

    implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
    avg_metrics = {
-        impl: {
-            "tflops": 0,
-            "gb_s": 0,
-            "time_ms": 0
-        }
-        for impl in implementations
+        impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
    }

    for result in all_results:
@ -366,9 +381,9 @@ def run_benchmarks(verbose: bool = False):
        avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
        avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
        avg_time = avg_metrics[impl]["time_ms"] / num_shapes
-        avg_rows.append([
-            impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
-        ])
+        avg_rows.append(
+            [impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
+        )

    print_table(avg_headers, avg_rows)

@ -376,21 +391,19 @@ def run_benchmarks(verbose: bool = False):
    avg_speedups = {
        "DeepGEMM vs vLLM Triton": 0,
        "DeepGEMM vs vLLM CUTLASS": 0,
-        "vLLM CUTLASS vs vLLM Triton": 0
+        "vLLM CUTLASS vs vLLM Triton": 0,
    }

    for result in all_results:
        deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
        vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
-        vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
-            "time_ms"]
+        vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]

-        avg_speedups[
-            "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
-        avg_speedups[
-            "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
-        avg_speedups[
-            "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
+        avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
+        avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
+        avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
+            vllm_triton_time / vllm_cutlass_time
+        )

    print("\n===== AVERAGE SPEEDUPS =====")
    speedup_headers = ["Comparison", "Speedup"]
@ -408,8 +421,7 @@ def run_benchmarks(verbose: bool = False):

    for result in all_results:
        for impl in implementations:
-            avg_diff[impl] += result["implementations"][impl]["diff"][
-                "Reference"]
+            avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]

    diff_headers = ["Implementation", "Avg Diff vs Reference"]
    diff_rows = []
--- a/benchmarks/kernels/utils.py
+++ b/benchmarks/kernels/utils.py
@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import dataclasses
-from collections.abc import Iterable
-from typing import Any, Callable, Optional
+from collections.abc import Callable, Iterable
+from typing import Any

 import torch
 import torch.utils.benchmark as TBenchmark
@ -55,7 +55,7 @@ class Bench:

    def __init__(
        self,
-        cuda_graph_params: Optional[CudaGraphBenchParams],
+        cuda_graph_params: CudaGraphBenchParams | None,
        label: str,
        sub_label: str,
        description: str,
--- a/benchmarks/multi_turn/bench_dataset.py
+++ b/benchmarks/multi_turn/bench_dataset.py
@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from statistics import mean
-from typing import Any, NamedTuple, Optional, Union
+from typing import Any, NamedTuple

 import numpy as np  # type: ignore
 import pandas as pd  # type: ignore
@ -35,8 +35,8 @@ class Distribution(ABC):
 class UniformDistribution(Distribution):
    def __init__(
        self,
-        min_val: Union[int, float],
-        max_val: Union[int, float],
+        min_val: int | float,
+        max_val: int | float,
        is_integer: bool = True,
    ) -> None:
        self.min_val = min_val
@ -56,7 +56,7 @@ class UniformDistribution(Distribution):


 class ConstantDistribution(Distribution):
-    def __init__(self, value: Union[int, float]) -> None:
+    def __init__(self, value: int | float) -> None:
        self.value = value
        self.max_val = value

@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):


 class ZipfDistribution(Distribution):
-    def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+    def __init__(self, alpha: float, max_val: int | None = None) -> None:
        self.alpha = alpha
        self.max_val = max_val

@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):


 class PoissonDistribution(Distribution):
-    def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+    def __init__(self, alpha: float, max_val: int | None = None) -> None:
        self.alpha = alpha
        self.max_val = max_val

@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
 class LognormalDistribution(Distribution):
    def __init__(
        self,
-        mean: Optional[float] = None,
-        sigma: Optional[float] = None,
-        average: Optional[int] = None,
-        median_ratio: Optional[float] = None,
-        max_val: Optional[int] = None,
+        mean: float | None = None,
+        sigma: float | None = None,
+        average: int | None = None,
+        median_ratio: float | None = None,
+        max_val: int | None = None,
    ) -> None:
        self.average = average
        self.median_ratio = median_ratio
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@ -13,7 +13,7 @@ from datetime import datetime
 from enum import Enum
 from http import HTTPStatus
 from statistics import mean
-from typing import NamedTuple, Optional, Union
+from typing import NamedTuple

 import aiohttp  # type: ignore
 import numpy as np  # type: ignore
@ -46,9 +46,9 @@ class ConversationSampling(str, Enum):

 class ClientArgs(NamedTuple):
    seed: int
-    max_num_requests: Optional[int]
+    max_num_requests: int | None
    skip_first_turn: bool
-    max_turns: Optional[int]
+    max_turns: int | None
    max_active_conversations: int
    verbose: bool
    print_content: bool
@ -109,9 +109,9 @@ class RequestStats(NamedTuple):

 class MetricStats:
    def __init__(self) -> None:
-        self.min: Optional[float] = None
-        self.max: Optional[float] = None
-        self.avg: Optional[float] = None
+        self.min: float | None = None
+        self.max: float | None = None
+        self.avg: float | None = None
        self.sum = 0.0
        self.count = 0

@ -143,7 +143,7 @@ class MovingAverage:
        self.index = 0
        self.sum = 0.0
        self.count = 0
-        self.avg: Optional[float] = None
+        self.avg: float | None = None

    def update(self, new_value: float) -> None:
        if self.count < self.window_size:
@ -169,7 +169,7 @@ class MovingAverage:
 class DebugStats:
    def __init__(self, logger: logging.Logger, window_size: int) -> None:
        self.logger = logger
-        self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
+        self.metrics: dict[str, MovingAverage | MetricStats] = {
            "moving_avg_ttft_ms": MovingAverage(window_size),
            "moving_avg_tpot_ms": MovingAverage(window_size),
            "ttft_ms": MetricStats(),
@ -198,14 +198,6 @@ class DebugStats:
        self.logger.info("-" * 50)


-# Must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
-    if text.startswith(prefix):
-        return text[len(prefix) :]
-    return text
-
-
 def nanosec_to_millisec(value: float) -> float:
    return value / 1000000.0

@ -220,8 +212,8 @@ async def send_request(
    chat_url: str,
    model: str,
    stream: bool = True,
-    min_tokens: Optional[int] = None,
-    max_tokens: Optional[int] = None,
+    min_tokens: int | None = None,
+    max_tokens: int | None = None,
 ) -> ServerResponse:
    payload = {
        "model": model,
@ -250,9 +242,9 @@ async def send_request(
    timeout = aiohttp.ClientTimeout(total=timeout_sec)

    valid_response = True
-    ttft: Optional[float] = None
+    ttft: float | None = None
    chunk_delay: list[int] = []
-    latency: Optional[float] = None
+    latency: float | None = None
    first_chunk = ""
    generated_text = ""

@ -269,7 +261,7 @@ async def send_request(
                if not chunk_bytes:
                    continue

-                chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
                if chunk == "[DONE]":
                    # End of stream
                    latency = time.perf_counter_ns() - start_time
@ -364,7 +356,7 @@ async def send_turn(
    req_args: RequestArgs,
    verbose: bool,
    verify_output: bool,
-) -> Optional[RequestStats]:
+) -> RequestStats | None:
    assert messages_to_use > 0
    assert messages_to_use <= len(conversation_messages)

@ -644,7 +636,7 @@ async def client_main(

            if args.verbose:
                curr_time_sec: float = time.perf_counter()
-                time_since_last_turn: Union[str, float] = "N/A"
+                time_since_last_turn: str | float = "N/A"
                if conv_id in time_of_last_turn:
                    time_since_last_turn = round(
                        curr_time_sec - time_of_last_turn[conv_id], 3
@ -769,7 +761,7 @@ def get_client_config(
            "Number of conversations must be equal or larger than the number of clients"
        )

-    max_req_per_client: Optional[int] = None
+    max_req_per_client: int | None = None
    if args.max_num_requests is not None:
        # Max number of requests per client
        req_per_client = args.max_num_requests // args.num_clients
@ -936,13 +928,13 @@ async def main_mp(
                    f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}"  # noqa: E501
                )

-                rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
+                rps: str | float = round(len(client_metrics) / runtime_sec, 3)
                if len(client_metrics) < (5 * bench_args.num_clients):
                    # Do not estimate the RPS if the number of samples is very low
                    # (threshold can be tuned if needed)
                    rps = "N/A"

-                runtime_left_sec: Union[str, float] = round(
+                runtime_left_sec: str | float = round(
                    (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
                )
                if percent < 0.05:
@ -1032,7 +1024,7 @@ def process_statistics(
    warmup_percentages: list[float],
    test_params: dict,
    verbose: bool,
-    gen_conv_args: Optional[GenConvArgs] = None,
+    gen_conv_args: GenConvArgs | None = None,
    excel_output: bool = False,
 ) -> None:
    if len(client_metrics) == 0:
@ -1259,7 +1251,7 @@ async def main() -> None:
        default=None,
        help="The model name used in the API. "
        "If not specified, the model name will be the "
-        "same as the ``--model`` argument. ",
+        "same as the `--model` argument. ",
    )

    parser.add_argument(
--- a/benchmarks/multi_turn/convert_sharegpt_to_openai.py
+++ b/benchmarks/multi_turn/convert_sharegpt_to_openai.py
@ -13,7 +13,7 @@ import argparse
 import json
 import random
 from statistics import mean
-from typing import Any, Optional
+from typing import Any

 import pandas as pd  # type: ignore
 import tqdm  # type: ignore
@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:


 def content_is_valid(
-    content: str, min_content_len: Optional[int], max_content_len: Optional[int]
+    content: str, min_content_len: int | None, max_content_len: int | None
 ) -> bool:
    if min_content_len and len(content) < min_content_len:
        return False
@ -37,7 +37,7 @@ def content_is_valid(


 def print_stats(
-    conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
+    conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
 ) -> None:
    # Collect statistics
    stats = []
@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
    seed: int,
    input_file: str,
    output_file: str,
-    max_items: Optional[int],
-    min_content_len: Optional[int] = None,
-    max_content_len: Optional[int] = None,
-    min_turns: Optional[int] = None,
-    max_turns: Optional[int] = None,
-    model: Optional[str] = None,
+    max_items: int | None,
+    min_content_len: int | None = None,
+    max_content_len: int | None = None,
+    min_turns: int | None = None,
+    max_turns: int | None = None,
+    model: str | None = None,
 ) -> None:
    if min_turns and max_turns:
        assert min_turns <= max_turns
--- a/benchmarks/pyproject.toml
+++ b/benchmarks/pyproject.toml
@ -1,49 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
-    # pycodestyle
-    "E",
-    # Pyflakes
-    "F",
-    # pyupgrade
-    "UP",
-    # flake8-bugbear
-    "B",
-    # flake8-simplify
-    "SIM",
-    # isort
-    "I",
-    # flake8-logging-format
-    "G",
-]
-ignore = [
-    # star imports
-    "F405", "F403",
-    # lambda expression assignment
-    "E731",
-    # Loop control variable not used within loop body
-    "B007",
-    # f-string format
-    "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
-]
-
-[tool.ruff.lint.isort]
-known-first-party = ["vllm"]
-
-[tool.ruff.format]
-docstring-code-format = true
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -101,6 +101,7 @@ else()
    find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
    find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
    find_isa(${CPUINFO} "S390" S390_FOUND)
+    find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
 endif()

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
@ -177,8 +178,14 @@ elseif (S390_FOUND)
        "-mzvector"
        "-march=native"
        "-mtune=native")
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
+    if(RVV_FOUND)
+	    message(FAIL_ERROR "Can't support rvv now.")
+    else()
+        list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
+    endif()
 else()
-    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
+    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
 endif()

 #
@ -191,13 +198,24 @@ else()
 endif()

 if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
-    FetchContent_Declare(
-        oneDNN
-        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-        GIT_TAG v3.9
-        GIT_PROGRESS TRUE
-        GIT_SHALLOW TRUE
-    )
+    set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
+
+    if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
+        message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
+        FetchContent_Declare(
+            oneDNN
+            SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
+        )
+    else()
+        message(STATUS "Downloading oneDNN from GitHub")
+        FetchContent_Declare(
+            oneDNN
+            GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+            GIT_TAG v3.9
+            GIT_PROGRESS TRUE
+            GIT_SHALLOW TRUE
+        )
+    endif()

    if(USE_ACL)
        find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
@ -206,6 +224,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
        endif()
        set(ONEDNN_AARCH64_USE_ACL "ON")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
+        add_compile_definitions(VLLM_USE_ACL)
    endif()

    set(ONEDNN_LIBRARY_TYPE "STATIC")
@ -258,7 +277,8 @@ set(VLLM_EXT_SRC
    "csrc/cpu/layernorm.cpp"
    "csrc/cpu/mla_decode.cpp"
    "csrc/cpu/pos_encoding.cpp"
-    "csrc/cpu/torch_bindings.cpp")
+    "csrc/cpu/torch_bindings.cpp"
+    "csrc/moe/dynamic_4bit_int_moe_cpu.cpp")

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
    set(VLLM_EXT_SRC
@ -300,4 +320,4 @@ define_gpu_extension_target(
    WITH_SOABI
 )

-message(STATUS "Enabling C extension.")
+message(STATUS "Enabling C extension.")
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@ -18,8 +18,8 @@ if(FLASH_MLA_SRC_DIR)
 else()
  FetchContent_Declare(
        flashmla
-        GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
-        GIT_TAG a757314c04eedd166e329e846c820eb1bdd702de
+        GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
+        GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
        GIT_PROGRESS TRUE
        CONFIGURE_COMMAND ""
        BUILD_COMMAND ""
@ -33,23 +33,64 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
 # The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
 # Only build FlashMLA kernels if we are building for something compatible with 
 # sm90a
-cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
+
+set(SUPPORT_ARCHS)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
+    list(APPEND SUPPORT_ARCHS 9.0a)
+endif()
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
+    list(APPEND SUPPORT_ARCHS 10.0a)
+endif()
+
+
+cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
+if(FLASH_MLA_ARCHS)
+    set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
+    list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
+
    set(FlashMLA_SOURCES
-        ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
-        ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
-        ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
-        ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
-        ${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu)
+        ${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
+        ${flashmla_SOURCE_DIR}/csrc/pybind.cpp
+        ${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
+        ${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
+    )
+
+    set(FlashMLA_Extension_SOURCES
+        ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
+        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
+        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
+    )

    set(FlashMLA_INCLUDES
+        ${flashmla_SOURCE_DIR}/csrc
+        ${flashmla_SOURCE_DIR}/csrc/sm90
        ${flashmla_SOURCE_DIR}/csrc/cutlass/include
-        ${flashmla_SOURCE_DIR}/csrc)
+        ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
+    )
+
+    set(FlashMLA_Extension_INCLUDES
+        ${flashmla_SOURCE_DIR}/csrc
+        ${flashmla_SOURCE_DIR}/csrc/sm90
+        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
+        ${flashmla_SOURCE_DIR}/csrc/cutlass/include
+        ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
+    )

    set_gencode_flags_for_srcs(
        SRCS "${FlashMLA_SOURCES}"
        CUDA_ARCHS "${FLASH_MLA_ARCHS}")

+    set_gencode_flags_for_srcs(
+        SRCS "${FlashMLA_Extension_SOURCES}"
+        CUDA_ARCHS "${FLASH_MLA_ARCHS}")
+
    define_gpu_extension_target(
        _flashmla_C
        DESTINATION vllm
@ -60,8 +101,32 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
        INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
        USE_SABI 3
        WITH_SOABI)
+
+    # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
+    # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
+    target_compile_options(_flashmla_C PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
+        $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
+
+    define_gpu_extension_target(
+        _flashmla_extension_C
+        DESTINATION vllm
+        LANGUAGE ${VLLM_GPU_LANG}
+        SOURCES ${FlashMLA_Extension_SOURCES}
+        COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS}
+        ARCHITECTURES ${VLLM_GPU_ARCHES}
+        INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES}
+        USE_SABI 3
+        WITH_SOABI)
+
+    # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
+    # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
+    target_compile_options(_flashmla_extension_C PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
+        $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
 else()
-    # Create an empty target for setup.py when not targeting sm90a systems
+    # Create empty targets for setup.py when not targeting sm90a systems
    add_custom_target(_flashmla_C)
+    add_custom_target(_flashmla_extension_C)
 endif()

--- a/cmake/external_projects/qutlass.cmake
+++ b/cmake/external_projects/qutlass.cmake
@ -0,0 +1,97 @@
+include(FetchContent)
+
+set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory")
+
+if(DEFINED ENV{QUTLASS_SRC_DIR})
+  set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR})
+endif()
+
+if(QUTLASS_SRC_DIR)
+  FetchContent_Declare(
+    qutlass
+    SOURCE_DIR ${QUTLASS_SRC_DIR}
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+  )
+else()
+  FetchContent_Declare(
+    qutlass
+    GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git
+    GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65
+    GIT_PROGRESS TRUE
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+  )
+endif()
+
+FetchContent_Populate(qutlass)
+
+if(NOT qutlass_SOURCE_DIR)
+  message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
+endif()
+message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
+
+cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
+
+  if(QUTLASS_ARCHS MATCHES "10\\.0a")
+    set(QUTLASS_TARGET_CC 100)
+  elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
+    set(QUTLASS_TARGET_CC 120)
+  else()
+    message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.")
+  endif()
+
+  set(QUTLASS_SOURCES
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu
+  )
+
+  set(QUTLASS_INCLUDES
+    ${qutlass_SOURCE_DIR}
+    ${qutlass_SOURCE_DIR}/qutlass
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/include
+    ${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions
+  )
+
+  if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h")
+    list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}")
+  elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h")
+    list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include")
+    message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).")
+  else()
+    message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. "
+                        "Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include")
+  endif()
+
+  set_gencode_flags_for_srcs(
+    SRCS "${QUTLASS_SOURCES}"
+    CUDA_ARCHS "${QUTLASS_ARCHS}"
+  )
+
+  target_sources(_C PRIVATE ${QUTLASS_SOURCES})
+  target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES})
+  target_compile_definitions(_C PRIVATE
+    QUTLASS_DISABLE_PYBIND=1
+    TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC}
+  )
+
+  set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS
+    $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --use_fast_math -O3>
+  )
+
+else()
+  if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8")
+    message(STATUS
+      "[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).")
+  else()
+    message(STATUS
+      "[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a) found in "
+      "CUDA_ARCHS='${CUDA_ARCHS}'.")
+  endif()
+endif()
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
+          GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/cmake/hipify.py
+++ b/cmake/hipify.py
@ -16,7 +16,7 @@ import shutil

 from torch.utils.hipify.hipify_python import hipify

-if __name__ == '__main__':
+if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Project directory where all the source + include files live.
@ -34,15 +34,14 @@ if __name__ == '__main__':
    )

    # Source files to convert.
-    parser.add_argument("sources",
-                        help="Source files to hipify.",
-                        nargs="*",
-                        default=[])
+    parser.add_argument(
+        "sources", help="Source files to hipify.", nargs="*", default=[]
+    )

    args = parser.parse_args()

    # Limit include scope to project_dir only
-    includes = [os.path.join(args.project_dir, '*')]
+    includes = [os.path.join(args.project_dir, "*")]

    # Get absolute path for all source files.
    extra_files = [os.path.abspath(s) for s in args.sources]
@ -51,25 +50,31 @@ if __name__ == '__main__':
    # The directory might already exist to hold object files so we ignore that.
    shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)

-    hipify_result = hipify(project_directory=args.project_dir,
-                           output_directory=args.output_dir,
-                           header_include_dirs=[],
-                           includes=includes,
-                           extra_files=extra_files,
-                           show_detailed=True,
-                           is_pytorch_extension=True,
-                           hipify_extra_files_only=True)
+    hipify_result = hipify(
+        project_directory=args.project_dir,
+        output_directory=args.output_dir,
+        header_include_dirs=[],
+        includes=includes,
+        extra_files=extra_files,
+        show_detailed=True,
+        is_pytorch_extension=True,
+        hipify_extra_files_only=True,
+    )

    hipified_sources = []
    for source in args.sources:
        s_abs = os.path.abspath(source)
-        hipified_s_abs = (hipify_result[s_abs].hipified_path if
-                          (s_abs in hipify_result
-                           and hipify_result[s_abs].hipified_path is not None)
-                          else s_abs)
+        hipified_s_abs = (
+            hipify_result[s_abs].hipified_path
+            if (
+                s_abs in hipify_result
+                and hipify_result[s_abs].hipified_path is not None
+            )
+            else s_abs
+        )
        hipified_sources.append(hipified_s_abs)

-    assert (len(hipified_sources) == len(args.sources))
+    assert len(hipified_sources) == len(args.sources)

    # Print hipified source files.
    print("\n".join(hipified_sources))
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -310,13 +310,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
  list(REMOVE_DUPLICATES _PTX_ARCHS)
  list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)

-  # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
-  # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
+  # If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
+  # remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS
  set(_CUDA_ARCHS)
  foreach(_arch ${_SRC_CUDA_ARCHS})
-    if(_arch MATCHES "\\a$")
+    if(_arch MATCHES "[af]$")
      list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
-      string(REPLACE "a" "" _base "${_arch}")
+      string(REGEX REPLACE "[af]$" "" _base "${_arch}")
      if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
        list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
        list(APPEND _CUDA_ARCHS "${_arch}")
--- a/codecov.yml
+++ b/codecov.yml
@ -0,0 +1,12 @@
+codecov:
+  require_ci_to_pass: false
+
+fixes:
+  # Map source code paths to repository root paths
+  # Wildcards match any Python version (python3.*)
+  - "/vllm-workspace/src/vllm/::vllm/"
+  - "/vllm-workspace/vllm/::vllm/"
+  - "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
+  - "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
+  - "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
+  - "/usr/lib/python3.*/site-packages/vllm/::vllm/"
--- a/csrc/attention/attention_kernels.cuh
+++ b/csrc/attention/attention_kernels.cuh
@ -28,10 +28,10 @@

 #ifdef USE_ROCM
  #include <hip/hip_bf16.h>
-  #include "../quantization/fp8/amd/quant_utils.cuh"
+  #include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
 typedef __hip_bfloat16 __nv_bfloat16;
 #else
-  #include "../quantization/fp8/nvidia/quant_utils.cuh"
+  #include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh"
 #endif

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
--- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
@ -125,32 +125,37 @@ public:
  }

  static void set_split_kv (KernelArguments& args) {
-    // printf("set_split_kv start");
    if (args.split_kv >= 1) return;
    auto [H, K, D, B] = args.problem_shape;
-    // std::cout << H << " " << K << " " << D << " " << B << "\n";      
    int sm_count = args.hw_info.sm_count;
-    // printf("    sm_count = %d\n", sm_count);
-    int max_splits = ceil_div(K, 128);
-    max_splits = min(16, max_splits);
+    float seq_length_k = static_cast<float>(K) / 1024.0f;
+    int max_splits = 1;

-    // TODO: This avoids a hang when the batch size larger than 1 and 
-    // there is more than 4 kv_splits. 
-    // Discuss with NVIDIA how this can be fixed.
-    if (B > 1) {
-      max_splits = min(2, max_splits);
+    if (B <= 4 && seq_length_k >= 16) {
+      max_splits = 16;
    }
-    
-    // printf("    max_splits = %d\n", max_splits);
+    else if (B <= 8 && seq_length_k >= 4) {
+      max_splits = 8;
+    }
+    else if ((B <= 16 && seq_length_k >= 8) ||
+             (B == 48 && seq_length_k >= 32)) {
+      max_splits = 4;
+    }
+    else if ((B <= 32 && seq_length_k >= 16) ||
+             (B == 96 && seq_length_k >= 16)) {
+      max_splits = 2;
+    }
+    else {
+      max_splits = 1;
+    }
+
+    // Wave-aware scheduling: ensure integer number of waves in K dimension
    int sms_per_batch = max(1, sm_count / B);
-    // printf("    sms_per_batch = %d\n", sms_per_batch);
    int split_heur = min(max_splits, sms_per_batch);
    int waves = ceil_div(B * split_heur, sm_count);
    int k_waves = ceil_div(max_splits, split_heur);
    int split_wave_aware = ceil_div(max_splits, k_waves);
    args.split_kv = split_wave_aware;
-    // printf("    args.split_kv = %d\n", args.split_kv);
-
  }

  /// Determines whether the GEMM can execute the given problem.
--- a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
@ -580,22 +580,22 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
      for (; tile_scheduler.is_valid(); ++tile_scheduler) {
        auto blk_coord = tile_scheduler.get_block_coord();
        auto problem_shape = params.problem_shape;
-	auto local_split_kv = params.split_kv;
+        auto local_split_kv = params.split_kv;
        if (params.mainloop.ptr_seq != nullptr) {
          get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
-	  if (params.ptr_split_kv != nullptr) {
+          if (params.ptr_split_kv != nullptr) {
            local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
          }
        }
-	if (local_split_kv <= get<3>(blk_coord))
-	  continue;
+        if (local_split_kv <= get<3>(blk_coord))
+          continue;
        load_page_table(
          blk_coord,
          problem_shape,
          params.mainloop,
          shared_storage.tensors,
          pipeline_page_table, pipeline_pt_producer_state,
-	  local_split_kv
+          local_split_kv
        );
      }
    }
@ -604,15 +604,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
        CUTLASS_PRAGMA_NO_UNROLL
        for (; tile_scheduler.is_valid(); ++tile_scheduler) {
          auto blk_coord = tile_scheduler.get_block_coord();
-	  auto problem_shape = params.problem_shape;
-	  auto local_split_kv = params.split_kv;
+          auto problem_shape = params.problem_shape;
+          auto local_split_kv = params.split_kv;
          if (params.mainloop.ptr_seq != nullptr) {
            get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
-	    if (params.ptr_split_kv != nullptr) {
+            if (params.ptr_split_kv != nullptr) {
              local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
            }
          }
-	  if (local_split_kv <= get<3>(blk_coord))
+          if (local_split_kv <= get<3>(blk_coord))
            continue;
          load_cpasync(
            blk_coord,
@ -621,7 +621,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
            params.mainloop_params,
            shared_storage.tensors,
            pipeline_load_qk, pipeline_load_qk_producer_state,
-	    local_split_kv,
+            local_split_kv,
            /* must be shared pipe */
            pipeline_page_table, pipeline_pt_consumer_state
          );
@ -633,15 +633,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
          CUTLASS_PRAGMA_NO_UNROLL
          for (; tile_scheduler.is_valid(); ++tile_scheduler) {
            auto blk_coord = tile_scheduler.get_block_coord();
-	    auto problem_shape = params.problem_shape;
-	    auto local_split_kv = params.split_kv;
+            auto problem_shape = params.problem_shape;
+            auto local_split_kv = params.split_kv;
            if (params.mainloop.ptr_seq != nullptr) {
              get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
-	      if (params.ptr_split_kv != nullptr) {
-	        local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
-	      }
+              if (params.ptr_split_kv != nullptr) {
+                local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
+              }
            }
-	    if (local_split_kv <= get<3>(blk_coord))
+            if (local_split_kv <= get<3>(blk_coord))
              continue;
            load_tma</* paged= */ true>(
              blk_coord,
@ -651,7 +651,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
              shared_storage.tensors,
              pipeline_load_qk, pipeline_load_qk_producer_state,
              pipeline_load_qk, pipeline_load_qk_producer_state,
-	      local_split_kv
+              local_split_kv
            );
            cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
          }
@ -660,15 +660,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
          CUTLASS_PRAGMA_NO_UNROLL
          for (; tile_scheduler.is_valid(); ++tile_scheduler) {
            auto blk_coord = tile_scheduler.get_block_coord();
-	    auto problem_shape = params.problem_shape;
-	    auto local_split_kv = params.split_kv;
+            auto problem_shape = params.problem_shape;
+            auto local_split_kv = params.split_kv;
            if (params.mainloop.ptr_seq != nullptr) {
              get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
-	      if (params.ptr_split_kv != nullptr) {
+              if (params.ptr_split_kv != nullptr) {
                local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
-	      }
+              }
            }
-	    if (local_split_kv <= get<3>(blk_coord))
+            if (local_split_kv <= get<3>(blk_coord))
              continue;
            load_tma<false>(
              blk_coord,
@ -678,7 +678,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
              shared_storage.tensors,
              pipeline_load_qk, pipeline_load_qk_producer_state,
              pipeline_load_qk, pipeline_load_qk_producer_state,
-	      local_split_kv
+              local_split_kv
            );
            cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
          }
@ -694,14 +694,14 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
        for (; tile_scheduler.is_valid(); ++tile_scheduler) {
          auto blk_coord = tile_scheduler.get_block_coord();
          auto problem_shape = params.problem_shape;
-	  auto local_split_kv = params.split_kv;
+          auto local_split_kv = params.split_kv;
          if (params.mainloop.ptr_seq != nullptr) {
            get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
            if (params.ptr_split_kv != nullptr) {
                local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
            }
          }
-	  if (local_split_kv <= get<3>(blk_coord))
+          if (local_split_kv <= get<3>(blk_coord))
            continue;
          mma(blk_coord,
            problem_shape,
@ -711,7 +711,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
            pipeline_mma_s, pipeline_mma_s_producer_state,
            pipeline_p_mma, pipeline_p_mma_consumer_state,
            pipeline_mma_o, pipeline_mma_o_producer_state,
-	    local_split_kv
+            local_split_kv
          );
        }
      }
@ -726,15 +726,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
      for (; tile_scheduler.is_valid(); ++tile_scheduler) {
        auto blk_coord = tile_scheduler.get_block_coord();
        auto problem_shape = params.problem_shape;
-	auto split_kv = params.split_kv;
-	auto local_split_kv = split_kv;
+        auto split_kv = params.split_kv;
+        auto local_split_kv = split_kv;
        if (params.mainloop.ptr_seq != nullptr) {
          get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
-	  if (params.ptr_split_kv != nullptr) {
+          if (params.ptr_split_kv != nullptr) {
            local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
          }
        }
-	if (local_split_kv <= get<3>(blk_coord))
+        if (local_split_kv <= get<3>(blk_coord))
          continue;
        compute(
          blk_coord,
@ -745,7 +745,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
          pipeline_mma_s, pipeline_mma_s_consumer_state,
          pipeline_p_mma, pipeline_p_mma_producer_state,
          pipeline_mma_o, pipeline_mma_o_consumer_state,
-	  local_split_kv
+          local_split_kv
        );
      }

@ -1900,7 +1900,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
      cutlass::arch::NamedBarrier(
          (kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp,
          kNamedBarrierEpilogue
-      ).arrive();
+      ).arrive_and_wait();

      return;
    }
--- a/csrc/cache.h
+++ b/csrc/cache.h
@ -56,3 +56,19 @@ void cp_gather_cache(
    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
    int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
+
+// Indexer K quantization and cache function
+void indexer_k_quant_and_cache(
+    torch::Tensor& k,             // [num_tokens, head_dim]
+    torch::Tensor& kv_cache,      // [num_blocks, block_size, cache_stride]
+    torch::Tensor& slot_mapping,  // [num_tokens]
+    int64_t quant_block_size,     // quantization block size
+    const std::string& scale_fmt);
+
+// Extract function to gather quantized K cache
+void cp_gather_indexer_k_quant_cache(
+    const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride]
+    torch::Tensor& dst_k,           // [num_tokens, head_dim]
+    torch::Tensor& dst_scale,  // [num_tokens, head_dim / quant_block_size * 4]
+    const torch::Tensor& block_table,   // [batch_size, num_blocks]
+    const torch::Tensor& cu_seq_lens);  // [batch_size + 1]
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@ -9,15 +9,14 @@
 #include "quantization/vectorization_utils.cuh"

 #ifdef USE_ROCM
-  #include "quantization/fp8/amd/quant_utils.cuh"
+  #include "quantization/w8a8/fp8/amd/quant_utils.cuh"
 #else
-  #include "quantization/fp8/nvidia/quant_utils.cuh"
+  #include "quantization/w8a8/fp8/nvidia/quant_utils.cuh"
 #endif

 #include <algorithm>
 #include <cassert>
-#include <map>
-#include <vector>
+#include <cfloat>

 #ifdef USE_ROCM
  #include <hip/hip_bf16.h>
@ -209,6 +208,20 @@ void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,

 namespace vllm {

+// Used to copy/convert one element
+template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
+struct CopyWithScaleOp {
+  float scale;
+
+  __device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
+    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      dst = static_cast<OutT>(src);
+    } else {
+      dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
+    }
+  }
+};
+
 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
 __global__ void reshape_and_cache_kernel(
    const scalar_t* __restrict__ key,    // [num_tokens, num_heads, head_size]
@ -224,59 +237,51 @@ __global__ void reshape_and_cache_kernel(
  const int64_t token_idx = blockIdx.x;
  const int64_t slot_idx = slot_mapping[token_idx];
  if (slot_idx < 0) {
-    // Padding token that should be ignored.
    return;
  }

  const int64_t block_idx = slot_idx / block_size;
  const int64_t block_offset = slot_idx % block_size;
+  const int h_block_count = head_size / x;  // head_size//x

-  const int n = num_heads * head_size;
-  for (int i = threadIdx.x; i < n; i += blockDim.x) {
-    const int64_t src_key_idx = token_idx * key_stride + i;
-    const int64_t src_value_idx = token_idx * value_stride + i;
+  const int h_block_idx = threadIdx.x;
+  if (h_block_idx >= num_heads * h_block_count) {
+    return;
+  }

-    const int head_idx = i / head_size;
-    const int head_offset = i % head_size;
-    const int x_idx = head_offset / x;
-    const int x_offset = head_offset % x;
+  const int head_idx = h_block_idx / h_block_count;
+  const int h_block = h_block_idx % h_block_count;

-    const int64_t tgt_key_idx =
-        block_idx * num_heads * (head_size / x) * block_size * x +
-        head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
-        block_offset * x + x_offset;
-    const int64_t tgt_value_idx =
-        block_idx * num_heads * head_size * block_size +
-        head_idx * head_size * block_size + head_offset * block_size +
-        block_offset;
-    scalar_t tgt_key = key[src_key_idx];
-    scalar_t tgt_value = value[src_value_idx];
-    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-      key_cache[tgt_key_idx] = tgt_key;
-      value_cache[tgt_value_idx] = tgt_value;
-    } else {
-      key_cache[tgt_key_idx] =
-          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
-      value_cache[tgt_value_idx] =
-          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
-    }
+  const scalar_t* __restrict__ key_src =
+      key + token_idx * key_stride + head_idx * head_size + h_block * x;
+  const int64_t src_value_start =
+      token_idx * value_stride + head_idx * head_size + h_block * x;
+
+  cache_t* __restrict__ key_dst =
+      key_cache + block_idx * num_heads * h_block_count * block_size * x +
+      head_idx * h_block_count * block_size * x + h_block * block_size * x +
+      block_offset * x;
+  const int64_t tgt_value_start =
+      block_idx * num_heads * h_block_count * x * block_size +
+      head_idx * h_block_count * x * block_size + h_block * x * block_size +
+      block_offset;
+
+  constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
+  float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
+  CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
+  float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
+  CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
+
+  vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op);
+
+  const scalar_t* __restrict__ value_src = value + src_value_start;
+  cache_t* __restrict__ value_dst = value_cache + tgt_value_start;
+#pragma unroll
+  for (int i = 0; i < x; i++) {
+    v_op(value_dst[i * block_size], value_src[i]);
  }
 }

-// Used by vectorization_utils to copy/convert one element
-template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
-struct CopyWithScaleOp {
-  float scale;
-
-  __device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
-    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-      dst = static_cast<OutT>(src);
-    } else {
-      dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
-    }
-  }
-};
-
 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
 __global__ void reshape_and_cache_flash_kernel(
    const scalar_t* __restrict__ key,    // [num_tokens, num_heads, head_size]
@ -396,6 +401,241 @@ __global__ void concat_and_cache_mla_kernel(
  copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
 }

+template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
+__global__ void concat_and_cache_ds_mla_kernel(
+    const scalar_t* __restrict__ kv_c,  // [num_tokens, kv_lora_rank]
+    const scalar_t* __restrict__ k_pe,  // [num_tokens, pe_dim]
+    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank
+                                     // + pe_dim)]
+    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
+    const int block_stride,                    //
+    const int entry_stride,                    //
+    const int kv_c_stride,                     //
+    const int k_pe_stride,                     //
+    const int kv_lora_rank,                    //
+    const int pe_dim,                          //
+    const int block_size,                      //
+    const float* scale                         //
+) {
+  const int64_t token_idx = blockIdx.x;
+  const int64_t slot_idx = slot_mapping[token_idx];
+  // NOTE: slot_idx can be -1 if the token is padded
+  if (slot_idx < 0) {
+    return;
+  }
+  const int64_t block_idx = slot_idx / block_size;
+  const int64_t block_offset = slot_idx % block_size;
+  const int64_t dst_idx_start =
+      block_idx * block_stride + block_offset * entry_stride;
+
+  // For the NoPE part, each tile of 128 elements is handled by half of one warp
+  // (16 threads). There are 4 total tiles, so 2 warps (64 threads).
+  // Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
+  // The RoPE part (last 64 elements) is handled by another 1 warp (32 threads).
+  // So in total, we use 3 warps (96 threads) per block.
+
+  // Cast kv_cache to 16_bit for RoPE values
+  scalar_t* kv_cache_16bit =
+      reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);
+
+  // The last warp handles the RoPE part
+  if (threadIdx.x >= 64) {
+    // Each thread handles two elements of RoPE
+    const int8_t pe_idx_start = (threadIdx.x - 64) * 2;
+    const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start;
+    // Vectorized load of two 16-bit values, performed as one 32-bit load
+    const int32_t vals = *reinterpret_cast<const int32_t*>(&k_pe[src_idx]);
+    // RoPE values start after the packed 8-bit NoPE values and the
+    // 32-bit scales
+    const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start;
+    // Vectorized store of two 16-bit values, performed as one 32-bit store
+    *reinterpret_cast<int32_t*>(&kv_cache_16bit[dst_idx]) = vals;
+    return;
+  }
+
+  // The first two warps handle the NoPE part
+  const int8_t warp_idx = threadIdx.x >> 5;
+  const int8_t lane_idx = threadIdx.x & 31;
+  const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4);
+
+  // Each thread handles 8 elements of NoPE
+  // Load the NoPE elements for this thread into registers
+  const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8);
+  // Vectorized load of eight 16-bit values, performed as an int4 load
+  const int4 vals_i4 = *reinterpret_cast<const int4*>(&kv_c[src_idx_start]);
+  const scalar_t* vals = reinterpret_cast<const scalar_t*>(&vals_i4);
+
+  // Max absolute value of this thread's elements
+  float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])),
+                              fmaxf(fabsf(vals[2]), fabsf(vals[3]))),
+                        fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])),
+                              fmaxf(fabsf(vals[6]), fabsf(vals[7]))));
+
+  // Warp-level reduction to find the max absolute value in each half-warp
+#pragma unroll
+  for (int offset = 8; offset > 0; offset /= 2) {
+    max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16));
+  }
+
+  // Compute the scale for the tile
+  float tile_scale = max_abs / 448.f;
+  tile_scale = fmaxf(tile_scale, FLT_MIN);
+
+  // The first lane of each half-warp writes the scale to kv_cache
+  if ((lane_idx == 0) || (lane_idx == 16)) {
+    float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
+    const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
+    kv_cache_32bit[dst_idx] = tile_scale;
+  }
+
+  // Now all threads in the block scale and write their elements
+  // NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes)
+  const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8);
+
+  uint8_t result[8];
+#pragma unroll
+  for (int i = 0; i < 8; i++) {
+    result[i] =
+        fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
+            vals[i], tile_scale);
+  }
+
+  // Store as aligned 64-bit writes
+  *reinterpret_cast<uint64_t*>(&kv_cache[dst_idx_base]) =
+      *reinterpret_cast<const uint64_t*>(result);
+}
+
+template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
+__global__ void indexer_k_quant_and_cache_kernel(
+    const scalar_t* __restrict__ k,  // [num_tokens, head_dim]
+    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, cache_stride]
+    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
+    const int head_dim,                        // dimension of each head
+    const int quant_block_size,                // quantization block size
+    const int cache_block_size,                // cache block size
+    const int cache_stride,  // stride for each token in kv_cache
+    const bool use_ue8m0     // use ue8m0 scale format
+) {
+  constexpr int VEC_SIZE = 4;
+  const int64_t token_idx = blockIdx.x;
+  const int64_t head_dim_idx = (blockIdx.y * blockDim.y * blockDim.x +
+                                threadIdx.y * blockDim.x + threadIdx.x) *
+                               VEC_SIZE;
+  const int64_t slot_idx = slot_mapping[token_idx];
+  const int64_t block_idx = slot_idx / cache_block_size;
+  const int64_t block_offset = slot_idx % cache_block_size;
+
+  // NOTE: slot_idx can be -1 if the token is padded
+  if (slot_idx < 0 || (head_dim_idx >= head_dim)) {
+    return;
+  }
+
+  float2 k_val = (reinterpret_cast<const float2*>(
+      k))[(token_idx * head_dim + head_dim_idx) / VEC_SIZE];
+  scalar_t* k_val_ptr = reinterpret_cast<scalar_t*>(&k_val);
+  float amax = 0.0f;
+  for (int i = 0; i < VEC_SIZE; i++) {
+    amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
+  }
+#ifndef USE_ROCM
+  __syncwarp();
+#endif
+
+  // Reduced amax
+  for (int mask = 16; mask > 0; mask /= 2) {
+#ifdef USE_ROCM
+    amax = fmaxf(amax, __shfl_xor_sync(uint64_t(-1), amax, mask));
+#else
+    amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
+#endif
+  }
+#ifndef USE_ROCM
+  __syncwarp();
+#endif
+  float scale = fmaxf(amax, 1e-4) / 448.0f;
+  if (use_ue8m0) {
+    scale = exp2f(ceilf(log2f(scale)));
+  }
+
+  const int64_t dst_offset = block_idx * cache_block_size * cache_stride +
+                             block_offset * head_dim + head_dim_idx;
+  for (int i = 0; i < VEC_SIZE; i++) {
+    kv_cache[dst_offset + i] =
+        fp8::scaled_convert<cache_t, scalar_t, kv_dt>(k_val_ptr[i], scale);
+  }
+  if (threadIdx.x == 0) {
+    const int64_t dst_scale_idx =
+        block_idx * cache_block_size * cache_stride +
+        cache_block_size * head_dim +
+        (block_offset * head_dim + head_dim_idx) * 4 / quant_block_size;
+    reinterpret_cast<float*>(kv_cache)[dst_scale_idx / 4] = scale;
+  }
+}
+
+template <int BLOCK_Y_SIZE>
+__global__ void cp_gather_indexer_k_quant_cache_kernel(
+    const char* __restrict__ kv_cache,  // [num_blocks, block_size,
+                                        // cache_stride]
+    char* __restrict__ dst_k,           // [num_tokens, head_dim]
+    char* __restrict__ dst_scale,  // [num_tokens, head_dim / quant_block_size *
+                                   // 4]
+    const int* __restrict__ block_table,  // [batch_size, num_blocks]
+    const int* __restrict__ cu_seq_lens,  // [batch_size + 1]
+    const int batch_size,                 // batch size
+    const int64_t token_stride,           // stride for each token in dst_k
+    const int64_t head_dim,               // dimension of each head
+    const int64_t block_stride,           // stride for each block in kv_cache
+    const int64_t cache_token_stride,     // stride for each token in kv_cache
+    const int64_t cache_block_size,  // num_tokens for each block in kv_cache
+    const int num_blocks,            // number of blocks
+    const int num_tokens,            // number of tokens
+    const int quant_block_size       // quantization block size
+) {
+  constexpr int VEC_SIZE = sizeof(float4) / sizeof(char);
+  const int token_idx = blockIdx.x * blockDim.y + threadIdx.y;
+  const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE;
+  // Find batch index within a block
+  __shared__ int batch_idx[BLOCK_Y_SIZE];
+  for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x));
+       iter++) {
+    int tid = iter * blockDim.x + threadIdx.x;
+    if (tid < batch_size) {
+      const int seq_start = cu_seq_lens[tid];
+      const int seq_end = cu_seq_lens[tid + 1];
+      if (token_idx >= seq_start && token_idx < seq_end) {
+        batch_idx[threadIdx.y] = tid;
+      }
+    }
+  }
+
+#ifndef USE_ROCM
+  __syncwarp();
+#endif
+
+  if (head_idx >= head_dim || token_idx >= num_tokens) {
+    return;
+  }
+  const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]];
+  const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks +
+                                    inbatch_seq_idx / cache_block_size];
+  const int64_t src_block_offset = block_idx * block_stride;
+  const int64_t cache_inblock_offset =
+      (inbatch_seq_idx % cache_block_size) * head_dim + head_idx;
+  const int64_t src_inblock_offset = src_block_offset + cache_inblock_offset;
+  const int64_t dst_inblock_offset = token_idx * token_stride + head_idx;
+
+  reinterpret_cast<float4*>(dst_k)[dst_inblock_offset / VEC_SIZE] =
+      reinterpret_cast<const float4*>(kv_cache)[src_inblock_offset / VEC_SIZE];
+  ;
+  if (threadIdx.x == 0) {
+    const int64_t src_scale_offset =
+        src_block_offset + cache_block_size * head_dim +
+        cache_inblock_offset * 4 / quant_block_size;
+    reinterpret_cast<float*>(dst_scale)[dst_inblock_offset / quant_block_size] =
+        reinterpret_cast<const float*>(kv_cache)[src_scale_offset / 4];
+  }
+}
+
 }  // namespace vllm

 // KV_T is the data type of key and value tensors.
@ -431,14 +671,15 @@ void reshape_and_cache(

  int key_stride = key.stride(0);
  int value_stride = value.stride(0);
+  int head_div_x = head_size / x;

  dim3 grid(num_tokens);
-  dim3 block(std::min(num_heads * head_size, 512));
+  dim3 block(std::min(num_heads * head_div_x, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype,
-                             CALL_RESHAPE_AND_CACHE)
+                             CALL_RESHAPE_AND_CACHE);
 }

 // KV_T is the data type of key and value tensors.
@ -509,6 +750,18 @@ void reshape_and_cache_flash(
          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \
          reinterpret_cast<const float*>(scale.data_ptr()));

+// KV_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
+#define CALL_CONCAT_AND_CACHE_DS_MLA(KV_T, CACHE_T, KV_DTYPE)           \
+  vllm::concat_and_cache_ds_mla_kernel<KV_T, CACHE_T, KV_DTYPE>         \
+      <<<grid, block, 0, stream>>>(                                     \
+          reinterpret_cast<KV_T*>(kv_c.data_ptr()),                     \
+          reinterpret_cast<KV_T*>(k_pe.data_ptr()),                     \
+          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),              \
+          slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
+          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \
+          reinterpret_cast<const float*>(scale.data_ptr()));
+
 void concat_and_cache_mla(
    torch::Tensor& kv_c,          // [num_tokens, kv_lora_rank]
    torch::Tensor& k_pe,          // [num_tokens, pe_dim]
@ -531,20 +784,43 @@ void concat_and_cache_mla(
  int pe_dim = k_pe.size(1);
  int block_size = kv_cache.size(1);

-  TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
+  if (kv_cache_dtype == "fp8_ds_mla") {
+    TORCH_CHECK(kv_lora_rank == 512, "kv_lora_rank must be 512 for fp8_ds_mla");
+    TORCH_CHECK(pe_dim == 64, "pe_dim must be 64 for fp8_ds_mla");
+    TORCH_CHECK(kv_cache.size(2) == 656 / kv_cache.itemsize(),
+                "kv_cache.size(2) must be 656 bytes for fp8_ds_mla");
+    TORCH_CHECK(kv_c.itemsize() == 2,
+                "kv_c.itemsize() must be 2 for fp8_ds_mla");
+    TORCH_CHECK(k_pe.itemsize() == 2,
+                "k_pe.itemsize() must be 2 for fp8_ds_mla");
+  } else {
+    TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
+  }

  int kv_c_stride = kv_c.stride(0);
  int k_pe_stride = k_pe.stride(0);
  int block_stride = kv_cache.stride(0);
  int entry_stride = kv_cache.stride(1);

-  dim3 grid(num_tokens);
-  dim3 block(std::min(kv_lora_rank, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

-  DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
-                             CALL_CONCAT_AND_CACHE_MLA);
+  if (kv_cache_dtype == "fp8_ds_mla") {
+    dim3 grid(num_tokens);
+    // For the NoPE part, each tile of 128 elements is handled by half of one
+    // warp (16 threads). There are 4 total tiles, so 2 warps (64 threads).
+    // Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
+    // The RoPE part (last 64 elements) is handled by another 1 warp (32
+    // threads). So in total, we use 3 warps (96 threads) per block.
+    dim3 block(96);
+    DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
+                               CALL_CONCAT_AND_CACHE_DS_MLA);
+  } else {
+    dim3 grid(num_tokens);
+    dim3 block(std::min(kv_lora_rank, 512));
+    DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
+                               CALL_CONCAT_AND_CACHE_MLA);
+  }
 }

 namespace vllm {
@ -922,3 +1198,98 @@ void cp_gather_cache(
    TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
  }
 }
+
+// Macro to dispatch the kernel based on the data type.
+#define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE)         \
+  vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE>       \
+      <<<grid, block, 0, stream>>>(                                     \
+          reinterpret_cast<KV_T*>(k.data_ptr()),                        \
+          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),              \
+          slot_mapping.data_ptr<int64_t>(), head_dim, quant_block_size, \
+          cache_block_size, cache_stride, use_ue8m0);
+
+void indexer_k_quant_and_cache(
+    torch::Tensor& k,             // [num_tokens, head_dim]
+    torch::Tensor& kv_cache,      // [num_blocks, block_size, cache_stride]
+    torch::Tensor& slot_mapping,  // [num_tokens]
+    int64_t quant_block_size,     // quantization block size
+    const std::string& scale_fmt) {
+  int num_tokens = k.size(0);
+  int head_dim = k.size(1);
+  int cache_block_size = kv_cache.size(1);
+  int cache_stride = kv_cache.size(2);
+  bool use_ue8m0 = scale_fmt == "ue8m0";
+
+  TORCH_CHECK(k.device() == kv_cache.device(),
+              "k and kv_cache must be on the same device");
+  TORCH_CHECK(k.device() == slot_mapping.device(),
+              "k and slot_mapping must be on the same device");
+  TORCH_CHECK(head_dim % quant_block_size == 0,
+              "head_dim must be divisible by quant_block_size");
+
+  constexpr int vec_size = 4;
+  dim3 grid(num_tokens, (head_dim + quant_block_size * vec_size - 1) /
+                            (quant_block_size * vec_size));
+  dim3 block(32, vec_size);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(k));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
+                             CALL_INDEXER_K_QUANT_AND_CACHE);
+}
+
+// Macro to dispatch the kernel based on the data amount.
+#define CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(BLOCK_Y_SIZE)                  \
+  vllm::cp_gather_indexer_k_quant_cache_kernel<BLOCK_Y_SIZE>                \
+      <<<dim3((num_tokens + BLOCK_Y_SIZE - 1) / BLOCK_Y_SIZE,               \
+              (head_dim + 8 * vec_size - 1) / (8 * vec_size)),              \
+         dim3(8, BLOCK_Y_SIZE), 0, stream>>>(                               \
+          reinterpret_cast<char*>(kv_cache.data_ptr()),                     \
+          reinterpret_cast<char*>(dst_k.data_ptr()),                        \
+          reinterpret_cast<char*>(dst_scale.data_ptr()),                    \
+          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
+          batch_size, dst_k.stride(0), dst_k.size(1), kv_cache.stride(0),   \
+          kv_cache.stride(1), kv_cache.size(1), block_table.size(1),        \
+          num_tokens, quant_block_size);
+
+void cp_gather_indexer_k_quant_cache(
+    const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride]
+    torch::Tensor& dst_k,           // [num_tokens, head_dim]
+    torch::Tensor& dst_scale,  // [num_tokens, head_dim / quant_block_size * 4]
+    const torch::Tensor& block_table,  // [batch_size, num_blocks]
+    const torch::Tensor& cu_seq_lens   // [batch_size + 1]
+) {
+  int batch_size = block_table.size(0);
+  int num_tokens = dst_k.size(0);
+  int head_dim = dst_k.size(1);
+  int quant_block_size = head_dim * 4 / dst_scale.size(1);
+
+  TORCH_CHECK(kv_cache.device() == dst_k.device(),
+              "kv_cache and dst_k must be on the same device");
+  TORCH_CHECK(kv_cache.device() == dst_scale.device(),
+              "kv_cache and dst_scale must be on the same device");
+  TORCH_CHECK(kv_cache.device() == block_table.device(),
+              "kv_cache and block_table must be on the same device");
+  TORCH_CHECK(kv_cache.device() == cu_seq_lens.device(),
+              "kv_cache and cu_seq_lens must be on the same device");
+  TORCH_CHECK(head_dim % quant_block_size == 0,
+              "head_dim must be divisible by quant_block_size");
+
+  constexpr int vec_size = 16;
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_cache));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  if (num_tokens < 32) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(1);
+  } else if (num_tokens < 64) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(2);
+  } else if (num_tokens < 128) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(4);
+  } else if (num_tokens < 256) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(8);
+  } else if (num_tokens < 512) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(16);
+  } else {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
+  }
+}
--- a/csrc/core/batch_invariant.hpp
+++ b/csrc/core/batch_invariant.hpp
@ -0,0 +1,19 @@
+#pragma once
+#include <cstdlib>
+#include <string>
+#include <cctype>
+
+namespace vllm {
+
+// vllm_is_batch_invariant(); returns true
+// if env VLLM_BATCH_INVARIANT=1
+inline bool vllm_is_batch_invariant() {
+  static bool cached = []() {
+    std::string env_key = "VLLM_BATCH_INVARIANT";
+    const char* val = std::getenv(env_key.c_str());
+    return (val && std::atoi(val) != 0) ? 1 : 0;
+  }();
+  return cached;
+}
+
+}  // namespace vllm
--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml`