[ci] fix wheel names for arm wheels (#24898 )

Signed-off-by: simon-mo <simon.mo@hey.com>
[CI Failure] Fix test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe (#24750 )
2025-10-20 23:03:52 +08:00 · 2025-10-07 13:40:13 -07:00 · 2025-09-13 09:30:00 -07:00 · 2025-09-12 23:03:56 -07:00 · 2025-09-12 23:02:15 -07:00 · 2025-09-12 23:01:49 -07:00
1967 changed files with 165244 additions and 185016 deletions
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@ -8,7 +8,7 @@ This benchmark aims to:

 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.

-Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)

 ## Setup

--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -368,7 +368,7 @@ if __name__ == "__main__":
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
        )

    # get markdown tables
--- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
@ -181,14 +181,18 @@ launch_vllm_server() {
  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
    echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
    model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="vllm serve $model \
+    server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
        -tp $tp \
+        --model $model \
        --port $port \
        $server_args"
  else
    echo "Key 'fp8' does not exist in common params."
-    server_command="vllm serve $model \
+    server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
        -tp $tp \
+        --model $model \
        --port $port \
        $server_args"
  fi
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -365,7 +365,8 @@ run_serving_tests() {
      continue
    fi

-    server_command="$server_envs vllm serve \
+    server_command="$server_envs python3 \
+      -m vllm.entrypoints.openai.api_server \
      $server_args"

    # run the server
--- a/.buildkite/pyproject.toml
+++ b/.buildkite/pyproject.toml
@ -0,0 +1,46 @@
+# This local pyproject file is part of the migration from yapf to ruff format.
+# It uses the same core rules as the main pyproject.toml file, but with the
+# following differences:
+# - ruff line length is overridden to 88
+# - deprecated typing ignores (UP006, UP035) have been removed
+
+[tool.ruff]
+line-length = 88
+
+[tool.ruff.lint.per-file-ignores]
+"vllm/third_party/**" = ["ALL"]
+"vllm/version.py" = ["F401"]
+"vllm/_version.py" = ["ALL"]
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+    # flake8-logging-format
+    "G",
+]
+ignore = [
+    # star imports
+    "F405", "F403",
+    # lambda expression assignment
+    "E731",
+    # Loop control variable not used within loop body
+    "B007",
+    # f-string format
+    "UP032",
+    # Can remove once 3.10+ is the minimum Python version
+    "UP007",
+]
+
+[tool.ruff.format]
+docstring-code-format = true
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -76,7 +76,7 @@ steps:
      queue: arm64_cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

  # Add job to create multi-arch manifest
@ -150,16 +150,11 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
-      - "docker push vllm/vllm-openai:nightly-x86_64"
-      - "docker push vllm/vllm-openai:nightly-aarch64"
-      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest push vllm/vllm-openai:nightly"
-      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker push vllm/vllm-openai:nightly"
+      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
      # Clean up old nightly builds (keep only last 14)
      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
    plugins:
@ -168,4 +163,3 @@ steps:
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"
-      DOCKERHUB_USERNAME: "vllmbot"
--- a/.buildkite/scripts/cleanup-nightly-builds.sh
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@ -8,41 +8,20 @@ set -ex
 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"

-# Get DockerHub credentials from environment
+# Get DockerHub token from environment
 if [ -z "$DOCKERHUB_TOKEN" ]; then
    echo "Error: DOCKERHUB_TOKEN environment variable is not set"
    exit 1
 fi

-if [ -z "$DOCKERHUB_USERNAME" ]; then
-    echo "Error: DOCKERHUB_USERNAME environment variable is not set"
-    exit 1
-fi
-
-# Get DockerHub bearer token
-echo "Getting DockerHub bearer token..."
-set +x
-BEARER_TOKEN=$(curl -s -X POST \
-    -H "Content-Type: application/json" \
-    -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
-    "https://hub.docker.com/v2/users/login" | jq -r '.token')
-set -x
-
-if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
-    echo "Error: Failed to get DockerHub bearer token"
-    exit 1
-fi
-
 # Function to get all tags from DockerHub
 get_all_tags() {
    local page=1
    local all_tags=""
    
    while true; do
-        set +x
-        local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
+        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
            "$REPO_API_URL?page=$page&page_size=100")
-        set -x
        
        # Get both last_updated timestamp and tag name, separated by |
        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
@ -64,9 +43,7 @@ delete_tag() {
    echo "Deleting tag: $tag_name"
    
    local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
-    set +x
-    local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
-    set -x
+    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
    
    if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
        echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -86,6 +86,10 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi

+if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
+  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
+fi
+
 if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
@ -163,6 +167,12 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi

+#Obsolete currently
+##ignore certain Entrypoints/llm tests
+#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
+#fi
+
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -58,11 +58,15 @@ function cpu_tests() {
    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model

-    pytest -x -v -s tests/models/language/generation -m cpu_model
-    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
+    # Note: disable Bart until supports V1
+    pytest -x -v -s tests/models/language/generation -m cpu_model \
+                --ignore=tests/models/language/generation/test_bart.py
+    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
+                --ignore=tests/models/language/generation/test_bart.py

    pytest -x -v -s tests/models/language/pooling -m cpu_model
    pytest -x -v -s tests/models/multimodal/generation \
+                --ignore=tests/models/multimodal/generation/test_mllama.py \
                --ignore=tests/models/multimodal/generation/test_pixtral.py \
                -m cpu_model"

--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@ -1,191 +0,0 @@
-#!/bin/bash
-
-# This script build the Ascend NPU docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Base ubuntu image with basic ascend development libraries and python installed
-VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
-CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
-TEST_RUN_CONFIG_FILE="vllm_test.cfg"
-VLLM_ASCEND_TMP_DIR=
-# Get the test run configuration file from the vllm-ascend repository
-fetch_vllm_test_cfg() {
-    VLLM_ASCEND_TMP_DIR=$(mktemp -d)
-    # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
-    cleanup() {
-        rm -rf "${VLLM_ASCEND_TMP_DIR}"
-    }
-    trap cleanup EXIT
-
-    GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
-    if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
-        echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
-        exit 1
-    fi
-
-    # If the file already exists locally, just overwrite it
-    cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
-    echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
-
-    # Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
-    # when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
-    rm -rf "${VLLM_ASCEND_TMP_DIR}"
-    trap - EXIT
-}
-
-# Downloads test run configuration file from a remote URL.
-# Loads the configuration into the current script environment.
-get_config() {
-    if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
-        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
-        exit 1
-    fi
-    source "${TEST_RUN_CONFIG_FILE}"
-    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
-    return 0
-}
-
-# get test running configuration.
-fetch_vllm_test_cfg
-get_config
-# Check if the function call was successful. If not, exit the script.
-if [ $? -ne 0 ]; then
-  exit 1
-fi
-
-image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
-container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
-agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
-echo "agent_idx: ${agent_idx}"
-builder_name="cachebuilder${agent_idx}"
-builder_cache_dir="/mnt/docker-cache${agent_idx}"
-mkdir -p ${builder_cache_dir}
-
-# Try building the docker image
-cat <<EOF | DOCKER_BUILDKIT=1 docker build \
-    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
-    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
-                           --cache-to type=local,dest=${builder_cache_dir},mode=max \
-    --progress=plain --load -t ${image_name} -f - .
-FROM ${BASE_IMAGE_NAME}
-
-# Define environments
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
-    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
-    apt-get update -y && \
-    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
-    rm -rf /var/cache/apt/* && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install for pytest to make the docker build cache layer always valid
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install pytest>=6.0  modelscope
-
-WORKDIR /workspace/vllm
-
-# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
-COPY requirements/common.txt /workspace/vllm/requirements/common.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements/common.txt
-
-COPY . .
-
-# Install vLLM
-RUN --mount=type=cache,target=/root/.cache/pip \
-    VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
-    python3 -m pip uninstall -y triton
-
-# Install vllm-ascend
-WORKDIR /workspace
-ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
-ARG VLLM_ASCEND_TAG=main
-RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
-    git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
-
-# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r /workspace/vllm-ascend/requirements.txt
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
-    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
-    source /usr/local/Ascend/nnal/atb/set_env.sh && \
-    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
-    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
-
-ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
-ENV VLLM_USE_MODELSCOPE=True
-
-WORKDIR /workspace/vllm-ascend
-
-CMD ["/bin/bash"]
-
-EOF
-
-# Setup cleanup
-remove_docker_container() {
-  docker rm -f "${container_name}" || true;
-  docker image rm -f "${image_name}" || true;
-  docker system prune -f || true;
-}
-trap remove_docker_container EXIT
-
-# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
-# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
-#   e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
-#   returns --device /dev/davinci0 --device /dev/davinci1
-parse_and_gen_devices() {
-    local input="$1"
-    local index cards_num
-    if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
-        index="${BASH_REMATCH[1]}"
-        cards_num="${BASH_REMATCH[2]}"
-    else
-        echo "parse error" >&2
-        return 1
-    fi
-
-    local devices=""
-    local i=0
-    while (( i < cards_num )); do
-        local dev_idx=$(((index - 1)*cards_num + i ))
-        devices="$devices --device /dev/davinci${dev_idx}"
-        ((i++))
-    done
-
-    # trim leading space
-    devices="${devices#"${devices%%[![:space:]]*}"}"
-    # Output devices: assigned to the caller variable
-    printf '%s' "$devices"
-}
-
-devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
-
-# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
-# This test checks whether the OOT platform interface is functioning properly in conjunction with
-# the hardware plugin vllm-ascend.
-model_cache_dir=/mnt/modelscope${agent_idx}
-mkdir -p ${model_cache_dir}
-docker run \
-    ${devices} \
-    --device /dev/davinci_manager \
-    --device /dev/devmm_svm \
-    --device /dev/hisi_hdc \
-    -v /usr/local/dcmi:/usr/local/dcmi \
-    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-    -v /etc/ascend_install.info:/etc/ascend_install.info \
-    -v ${model_cache_dir}:/root/.cache/modelscope \
-    --entrypoint="" \
-    --name "${container_name}" \
-    "${image_name}" \
-    bash -c '
-    set -e
-    pytest -v -s tests/e2e/vllm_interface/
-'
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
+    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
+    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -35,15 +35,16 @@ docker run \
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
    cd tests
    pytest -v -s v1/core
    pytest -v -s v1/engine
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
+    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
-    pytest -v -s v1/test_metrics
    pytest -v -s v1/test_serial_utils.py
+    pytest -v -s v1/test_utils.py
+    pytest -v -s v1/test_metrics_reader.py
 '
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
 bench_throughput_exit_code=$?

 # run server-based benchmarks and upload the result to buildkite
-vllm serve meta-llama/Llama-2-7b-chat-hf &
+python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@ -1,59 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Setup script for Prime-RL integration tests
-# This script prepares the environment for running Prime-RL tests with nightly vLLM
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
-PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
-PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
-
-echo "Setting up Prime-RL integration test environment..."
-
-# Clean up any existing Prime-RL directory
-if [ -d "${PRIME_RL_DIR}" ]; then
-    echo "Removing existing Prime-RL directory..."
-    rm -rf "${PRIME_RL_DIR}"
-fi
-
-# Install UV if not available
-if ! command -v uv &> /dev/null; then
-    echo "Installing UV package manager..."
-    curl -LsSf https://astral.sh/uv/install.sh | sh
-    source $HOME/.local/bin/env
-fi
-
-# Clone Prime-RL repository at specific branch for reproducible tests
-PRIME_RL_BRANCH="integ-vllm-main"
-echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
-git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
-cd "${PRIME_RL_DIR}"
-
-echo "Setting up UV project environment..."
-export UV_PROJECT_ENVIRONMENT=/usr/local
-ln -s /usr/bin/python3 /usr/local/bin/python
-
-# Remove vllm pin from pyproject.toml
-echo "Removing vllm pin from pyproject.toml..."
-sed -i '/vllm==/d' pyproject.toml
-
-# Sync Prime-RL dependencies
-echo "Installing Prime-RL dependencies..."
-uv sync --inexact && uv sync --inexact --all-extras
-
-# Verify installation
-echo "Verifying installations..."
-uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
-uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
-
-echo "Prime-RL integration test environment setup complete!"
-
-echo "Running Prime-RL integration tests..."
-export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
-uv run pytest -vs tests/integration/test_rl.py -m gpu
-
-echo "Prime-RL integration tests completed!"
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -6,28 +6,24 @@
 # to generate the final pipeline yaml file.

 # Documentation
-# label(str): the name of the test. emojis allowed.
-# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
-# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
-# fast_check_only(bool): run this test on the fastcheck pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
-# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
+# label(str): the name of the test. emoji allowed.
+# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
+# fast_check_only(bool): run this test on fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
 # command(str): the single command to run for tests. incompatible with commands.
-# commands(list): the list of commands to run for the test. incompatible with command.
-# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
-# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
-# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
-# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
-#     in this case, commands must be specified. the first command runs on the first host, the second
+# commands(list): the list of commands to run for test. incompatbile with command.
+# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
+# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
+# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
+#     in this case, commands must be specified. the first command runs on first host, the second
 #     command runs on the second host.
-# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
-# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
-#     and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
-# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
-# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.

 # When adding a test
-# - If the test belongs to an existing group, add it there
+# - If the test belong to an existing group, add it there
 # - If the test is short, add to any existing step
 # - If the test takes more than 10min, then it is okay to create a new step.
 #   Note that all steps execute in parallel.
@ -50,28 +46,25 @@ steps:
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
-  - tests/multimodal
-  - tests/utils_
-  commands:
-  - pytest -v -s -m 'not cpu_test' multimodal
-  - pytest -v -s utils_
-
- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
-  timeout_in_minutes: 10
-  source_file_dependencies:
-  - vllm/
+  - tests/mq_llm_engine
+  - tests/async_engine
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
+  - tests/utils_
+  - tests/worker
  - tests/standalone_tests/lazy_imports.py
  - tests/transformers_utils
-  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
+  - pytest -v -s mq_llm_engine # MQLLMEngine
+  - pytest -v -s async_engine # AsyncLLMEngine
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
-  - pytest -v -s -m 'cpu_test' multimodal
-  - pytest -v -s transformers_utils
+  - pytest -v -s multimodal
+  - pytest -v -s utils_ # Utils
+  - pytest -v -s worker # Worker
+  - pytest -v -s transformers_utils # transformers_utils

 - label: Python-only Installation Test # 10min
  timeout_in_minutes: 20
@ -91,12 +84,25 @@ steps:
  - vllm/
  - tests/basic_correctness/test_basic_correctness
  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_preemption
  - tests/basic_correctness/test_cumem.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
+  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+
+- label: Core Test # 22min
+  timeout_in_minutes: 35
+  mirror_hardwares: [amdexperimental]
+  fast_check: true
+  source_file_dependencies:
+  - vllm/core
+  - vllm/distributed
+  - tests/core
+  commands:
+  - pytest -v -s core

 - label: Entrypoints Unit Tests # 5min
  timeout_in_minutes: 10
@ -121,9 +127,10 @@ steps:
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Entrypoints Integration Test (API Server) # 100min
  timeout_in_minutes: 130
@ -161,6 +168,7 @@ steps:
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
+  - vllm/core/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
@ -168,34 +176,28 @@ steps:
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
-  - tests/v1/distributed
+  - tests/v1/test_async_llm_dp.py
+  - tests/v1/test_external_lb_dp.py
+  - tests/v1/test_internal_lb_dp.py
+  - tests/v1/test_hybrid_lb_dp.py
  - tests/v1/engine/test_engine_core_client.py
-  - tests/distributed/test_symm_mem_allreduce.py
  commands:
-  # test with torchrun tp=2 and external_dp=2
+  # test with tp=2 and external_dp=2
+  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-  # test with torchrun tp=2 and pp=2
+  # test with tp=2 and pp=2
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-  # test with torchrun tp=4 and dp=1
-  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-  # test with torchrun tp=2, pp=2 and dp=1
-  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-  # test with torchrun tp=1 and dp=4 with ep
-  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-  # test with torchrun tp=2 and dp=2 with ep
-  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
-  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
@ -228,14 +230,16 @@ steps:
  num_gpus: 2
  source_file_dependencies:
  - vllm/
+  - tests/metrics
  - tests/v1/tracing
  commands:
+  - pytest -v -s metrics
  - "pip install \
      'opentelemetry-sdk>=1.26.0' \
      'opentelemetry-api>=1.26.0' \
      'opentelemetry-exporter-otlp>=1.26.0' \
      'opentelemetry-semantic-conventions-ai>=0.4.1'"
-  - pytest -v -s v1/tracing
+  - pytest -v -s tracing

 ##### fast check tests  #####
 #####  1 GPU test  #####
@ -296,34 +300,23 @@ steps:
    - tests/v1
  commands:
    # split the test to avoid interference
+    - pytest -v -s v1/core
    - pytest -v -s v1/executor
-    - pytest -v -s v1/kv_offload
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
+    - pytest -v -s v1/structured_output
    - pytest -v -s v1/spec_decode
-    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
-    - pytest -v -s -m 'not cpu_test' v1/metrics
+    - pytest -v -s v1/kv_connector/unit
+    - pytest -v -s v1/metrics
+    - pytest -v -s v1/test_serial_utils.py
+    - pytest -v -s v1/test_utils.py
    - pytest -v -s v1/test_oracle.py
-    - pytest -v -s v1/test_request.py
+    - pytest -v -s v1/test_metrics_reader.py
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: V1 Test others (CPU) # 5 mins
-  source_file_dependencies:
-    - vllm/
-    - tests/v1
-  no_gpu: true
-  commands:
-    # split the test to avoid interference
-    - pytest -v -s v1/core
-    - pytest -v -s v1/structured_output
-    - pytest -v -s v1/test_serial_utils.py
-    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
-    - pytest -v -s -m 'cpu_test' v1/metrics
-
-
 - label: Examples Test # 30min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
@ -342,13 +335,12 @@ steps:
    - python3 offline_inference/vision_language.py --seed 0
    - python3 offline_inference/vision_language_pooling.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
-    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
-    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

 - label: Platform Tests (CUDA) # 4min
  timeout_in_minutes: 15
@ -397,13 +389,11 @@ steps:
    - pytest -v -s compile/test_pass_manager.py
    - pytest -v -s compile/test_fusion.py
    - pytest -v -s compile/test_fusion_attn.py
-    - pytest -v -s compile/test_functionalization.py
    - pytest -v -s compile/test_silu_mul_quant_fusion.py
    - pytest -v -s compile/test_sequence_parallelism.py
    - pytest -v -s compile/test_async_tp.py
    - pytest -v -s compile/test_fusion_all_reduce.py
    - pytest -v -s compile/test_decorator.py
-    - pytest -v -s compile/test_noop_elimination.py

 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@ -477,23 +467,33 @@ steps:
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
-  - vllm/model_executor/layers/mamba/ops
  commands:
    - pytest -v -s kernels/mamba

- label: Model Executor Test # 23min
-  timeout_in_minutes: 35
+- label: Tensorizer Test # 14min
+  timeout_in_minutes: 25
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
-  - vllm/model_executor
-  - tests/model_executor
+  - vllm/model_executor/model_loader
+  - tests/tensorizer_loader
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - pytest -v -s model_executor
+    - pytest -v -s tensorizer_loader
    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

+- label: Model Executor Test # 7min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - vllm/model_executor
+  - tests/model_executor
+  commands:
+    - apt-get update && apt-get install -y curl libsodium23
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s model_executor
+
 - label: Benchmarks # 11min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
@ -527,7 +527,7 @@ steps:
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

 - label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
@ -548,6 +548,15 @@ steps:
  commands: # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/

+- label: Encoder Decoder tests # 12min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - vllm/
+  - tests/encoder_decoder
+  commands:
+    - pytest -v -s encoder_decoder
+
 - label: OpenAI-Compatible Tool Use # 23 min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
@ -555,105 +564,43 @@ steps:
  source_file_dependencies:
    - vllm/
    - tests/tool_use
+    - tests/mistral_tool_use
  commands:
-    - pytest -v -s -m 'not cpu_test' tool_use
-
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
-  timeout_in_minutes: 10
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  no_gpu: true
-  commands:
-    - pytest -v -s -m 'cpu_test' tool_use
+    - pytest -v -s tool_use
+    - pytest -v -s mistral_tool_use

 #####  models test  #####

- label: Basic Models Tests (Initialization)
-  timeout_in_minutes: 45
+- label: Basic Models Test # 57min
+  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
-  - tests/models/test_initialization.py
+  - tests/models
  commands:
-    # Run a subset of model initialization tests
-    - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
+    - pytest -v -s models/test_transformers.py
+    - pytest -v -s models/test_registry.py
+    - pytest -v -s models/test_utils.py
+    - pytest -v -s models/test_vision.py
+    - pytest -v -s models/test_initialization.py

- label: Basic Models Tests (Extra Initialization) %N
+- label: Language Models Test (Standard) # 35min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
-  - vllm/model_executor/models/
-  - tests/models/test_initialization.py
-  commands:
-    # Only when vLLM model source is modified - test initialization of a large
-    # subset of supported models (the complement of the small subset in the above
-    # test.) Also run if model initialization test file is modified
-    - pytest -v -s models/test_initialization.py \
-             -k 'not test_can_initialize_small_subset' \
-             --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
-             --shard-id=$$BUILDKITE_PARALLEL_JOB
-  parallelism: 2
-
- label: Basic Models Tests (Other)
-  timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental]
-  torch_nightly: true
-  source_file_dependencies:
-  - vllm/
-  - tests/models/test_transformers.py
-  - tests/models/test_registry.py
-  commands:
-    - pytest -v -s models/test_transformers.py models/test_registry.py
-
- label: Basic Models Test (Other CPU) # 5min
-  timeout_in_minutes: 10
-  torch_nightly: true
-  source_file_dependencies:
-  - vllm/
-  - tests/models/test_utils.py
-  - tests/models/test_vision.py
-  no_gpu: true
-  commands:
-    - pytest -v -s models/test_utils.py models/test_vision.py
-
- label: Language Models Tests (Standard)
-  timeout_in_minutes: 25
-  mirror_hardwares: [amdexperimental]
-  torch_nightly: true
-  source_file_dependencies:
  - vllm/
  - tests/models/language
  commands:
-    # Test standard language models, excluding a subset of slow tests
    - pip freeze | grep -E 'torch'
-    - pytest -v -s models/language -m 'core_model and (not slow_test)'
+    - pytest -v -s models/language -m core_model

- label: Language Models Tests (Extra Standard) %N
+- label: Language Models Test (Hybrid) # 35 min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
-  - vllm/model_executor/models/
-  - tests/models/language/pooling/test_embedding.py
-  - tests/models/language/generation/test_common.py
-  - tests/models/language/pooling/test_classification.py
-  commands:
-    # Shard slow subset of standard language models tests. Only run when model
-    # source is modified, or when specified test files are modified
-    - pip freeze | grep -E 'torch'
-    - pytest -v -s models/language -m 'core_model and slow_test' \
-             --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
-             --shard-id=$$BUILDKITE_PARALLEL_JOB
-  parallelism: 2
-
- label: Language Models Tests (Hybrid) %N
-  timeout_in_minutes: 75
-  mirror_hardwares: [amdexperimental]
-  torch_nightly: true
-  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
@ -661,12 +608,7 @@ steps:
    # Note: also needed to run plamo2 model in vLLM
    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
-    # Shard hybrid language model tests
-    - pytest -v -s models/language/generation \
-                   -m hybrid_model \
-                   --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
-                   --shard-id=$$BUILDKITE_PARALLEL_JOB
-  parallelism: 2
+    - pytest -v -s models/language/generation -m hybrid_model

 - label: Language Models Test (Extended Generation) # 80min
  timeout_in_minutes: 110
@ -787,13 +729,11 @@ steps:
  commands:
    - pip install --upgrade git+https://github.com/huggingface/transformers
    - pytest -v -s tests/models/test_initialization.py
-    - pytest -v -s tests/models/test_transformers.py
    - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py
    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/offline_inference/audio_language.py --model-type whisper
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
-    # Whisper needs spawn method to avoid deadlock
-    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

 - label: Blackwell Test # 38 min
  timeout_in_minutes: 60
@ -823,7 +763,7 @@ steps:
    # Quantization
    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
-    - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+    - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
@ -835,47 +775,6 @@ steps:
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py

- label: Blackwell GPT-OSS Eval
-  timeout_in_minutes: 60
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  optional: true # run on nightlies
-  source_file_dependencies:
-  - tests/evals/gpt_oss
-  - vllm/model_executor/models/gpt_oss.py
-  - vllm/model_executor/layers/quantization/mxfp4.py
-  - vllm/v1/attention/backends/flashinfer.py
-  commands:
-    - uv pip install --system 'gpt-oss[eval]==0.0.5'
-    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
-
- label: Blackwell Quantized MoE Test
-  timeout_in_minutes: 60
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  source_file_dependencies:
-  - tests/quantization/test_blackwell_moe.py
-  - vllm/model_executor/models/deepseek_v2.py
-  - vllm/model_executor/models/gpt_oss.py
-  - vllm/model_executor/models/llama4.py
-  - vllm/model_executor/layers/fused_moe
-  - vllm/model_executor/layers/quantization/compressed_tensors
-  - vllm/model_executor/layers/quantization/modelopt.py
-  - vllm/model_executor/layers/quantization/mxfp4.py
-  - vllm/v1/attention/backends/flashinfer.py
-  commands:
-    - pytest -s -v tests/quantization/test_blackwell_moe.py
-
- label: Blackwell LM Eval Small Models
-  timeout_in_minutes: 75
-  gpu: b200
-  optional: true # run on nightlies
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
-
 #####  1 GPU test  #####
 #####  multi gpus test  #####

@ -890,8 +789,6 @@ steps:
  commands:
  - pytest -v -s distributed/test_comm_ops.py
  - pytest -v -s distributed/test_shm_broadcast.py
-  - pytest -v -s distributed/test_shm_buffer.py
-  - pytest -v -s distributed/test_shm_storage.py

 - label: 2 Node Tests (4 GPUs in total) # 16min
  timeout_in_minutes: 30
@ -918,58 +815,47 @@ steps:
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code

- label: Distributed Tests (2 GPUs) # 68min
-  timeout_in_minutes: 90
+- label: Distributed Tests (2 GPUs) # 110min
+  timeout_in_minutes: 150
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
-  - vllm/compilation/
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
-  - vllm/worker/worker_base.py
-  - vllm/v1/engine/
-  - vllm/v1/worker/
-  - tests/compile/test_basic_correctness.py
-  - tests/compile/test_wrapper.py
+  - vllm/model_executor/models/
  - tests/distributed/
-  - tests/entrypoints/llm/test_collective_rpc.py
-  - tests/v1/distributed
+  - vllm/compilation
+  - vllm/worker/worker_base.py
+  - vllm/worker/worker.py
+  - vllm/worker/model_runner.py
+  - entrypoints/llm/test_collective_rpc.py
+  - tests/v1/test_async_llm_dp.py
+  - tests/v1/test_external_lb_dp.py
  - tests/v1/entrypoints/openai/test_multi_api_servers.py
-  - tests/v1/shutdown
-  - tests/v1/worker/test_worker_memory_snapshot.py
+  - vllm/v1/engine/
  commands:
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-  - pytest -v -s distributed/test_sequence_parallel.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
-  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
-
- label: Distributed Model Tests (2 GPUs) # 37min
-  timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  source_file_dependencies:
-  - vllm/model_executor/model_loader/sharded_state_loader.py
-  - vllm/model_executor/models/
-  - tests/basic_correctness/
-  - tests/model_executor/model_loader/test_sharded_state_loader.py
-  - tests/models/
-  commands:
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
+  # test sequence parallel
+  - pytest -v -s distributed/test_sequence_parallel.py
+  # this test fails consistently.
+  # TODO: investigate and fix
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+  - pytest -v -s models/multimodal/generation/test_maverick.py

 - label: Plugin Tests (2 GPUs) # 40min
  timeout_in_minutes: 60
@ -1012,6 +898,7 @@ steps:
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py
+  # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support

 - label: LoRA TP Test (Distributed) # 17 min
  timeout_in_minutes: 30
@ -1085,34 +972,9 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

-##### H200 test #####
- label: Distrubted Tests (H200) # optional
+- label: Qwen MoE EP Test # optional
  gpu: h200
  optional: true
-  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
-    - pytest -v -s tests/distributed/test_context_parallel.py
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
-
-##### B200 test #####
- label: Distributed Tests (B200) # optional
-  gpu: b200
-  optional: true
-  working_dir: "/vllm-workspace/"
-  num_gpus: 2
-  commands:
-    - pytest -v -s tests/distributed/test_context_parallel.py
-    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
-
-##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
-  timeout_in_minutes: 30
-  optional: true
-  num_gpus: 2
-  working_dir: "/vllm-workspace"
-  source_file_dependencies:
-  - vllm/
-  - .buildkite/scripts/run-prime-rl-test.sh
-  commands:
-    - bash .buildkite/scripts/run-prime-rl-test.sh
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
--- a/.coveragerc
+++ b/.coveragerc
@ -1,32 +0,0 @@
-[run]
-source = vllm
-omit =
-    */tests/*
-    */test_*
-    */__pycache__/*
-    */build/*
-    */dist/*
-    */vllm.egg-info/*
-    */third_party/*
-    */examples/*
-    */benchmarks/*
-    */docs/*
-
-[report]
-exclude_lines =
-    pragma: no cover
-    def __repr__
-    if self.debug:
-    if settings.DEBUG
-    raise AssertionError
-    raise NotImplementedError
-    if 0:
-    if __name__ == .__main__.:
-    class .*\bProtocol\):
-    @(abc\.)?abstractmethod
-
-[html]
-directory = htmlcov
-
-[xml]
-output = coverage.xml
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -2,88 +2,72 @@
 # for more info about CODEOWNERS file

 # This lists cover the "core" components of vLLM that require careful review
-/vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/model_executor/layers/fused_moe @mgoin
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
 /vllm/entrypoints @aarnphm @chaunceyjiang
 /vllm/compilation @zou3519 @youkaichao @ProExpertProg
-/vllm/distributed/kv_transfer @NickLucche @ApostaC
+/vllm/distributed/kv_transfer @NickLucche
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
 /vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
-/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/attention @LucasWilkinson
-/vllm/v1/attention/backends/flashinfer.py @mgoin
-/vllm/v1/attention/backends/triton_attn.py @tdoublep
-/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
-/vllm/v1/sample @22quinn @houseroad @njhill
-/vllm/v1/spec_decode @benchislett @luccafong
 /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
+/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/attention/backends/triton_attn.py @tdoublep
+/vllm/v1/core @heheda12345
 /vllm/v1/kv_cache_interface.py @heheda12345
-/vllm/v1/offloading @ApostaC

 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
+/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
 /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
-/tests/evals @mgoin
-/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
+/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
+/tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/tests/v1/core @heheda12345
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
 /tests/v1/kv_connector/nixl_integration @NickLucche
-/tests/v1/kv_connector @ApostaC
-/tests/v1/offloading @ApostaC
-
-# Transformers backend
-/vllm/model_executor/models/transformers.py @hmellor
-/tests/models/test_transformers.py @hmellor

 # Docs
-/docs/mkdocs @hmellor
-/docs/**/*.yml @hmellor
-/requirements/docs.txt @hmellor
-.readthedocs.yaml @hmellor
+/docs @hmellor
 mkdocs.yaml @hmellor

-# Linting
-.markdownlint.yaml @hmellor
-.pre-commit-config.yaml @hmellor
-/tools/pre_commit @hmellor
-
 # CPU
-/vllm/v1/worker/cpu* @bigPYJ1151
+/vllm/v1/worker/^cpu @bigPYJ1151
 /csrc/cpu @bigPYJ1151
 /vllm/platforms/cpu.py @bigPYJ1151
 /cmake/cpu_extension.cmake @bigPYJ1151
 /docker/Dockerfile.cpu @bigPYJ1151

 # Intel GPU
-/vllm/v1/worker/xpu* @jikunshang
+/vllm/v1/worker/^xpu @jikunshang
 /vllm/platforms/xpu.py @jikunshang
 /docker/Dockerfile.xpu @jikunshang

@ -117,7 +101,4 @@ mkdocs.yaml @hmellor
 /vllm/v1/worker/tpu* @NickLucche
 /vllm/platforms/tpu.py @NickLucche
 /vllm/v1/sample/tpu @NickLucche
-/vllm/tests/v1/tpu @NickLucche
-
-# KVConnector installation files
-/requirements/kv_connectors.txt @NickLucche
+/vllm/tests/v1/tpu @NickLucche
--- a/.github/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@ -43,6 +43,10 @@ body:
      Any other things you would like to mention.
  validations:
    required: false
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
 - type: checkboxes
  id: askllm
  attributes:
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -2,7 +2,6 @@ pull_request_rules:
 - name: label-documentation
  description: Automatically apply documentation label
  conditions:
-    - label != stale
    - or:
      - files~=^[^/]+\.md$
      - files~=^docs/
@ -15,7 +14,6 @@ pull_request_rules:
 - name: label-ci-build
  description: Automatically apply ci/build label
  conditions:
-    - label != stale
    - or:
      - files~=^\.github/
      - files~=\.buildkite/
@ -32,7 +30,6 @@ pull_request_rules:
 - name: label-deepseek
  description: Automatically apply deepseek label
  conditions:
-    - label != stale
    - or:
      - files~=^examples/.*deepseek.*\.py
      - files~=^tests/.*deepseek.*\.py
@ -49,7 +46,6 @@ pull_request_rules:
 - name: label-frontend
  description: Automatically apply frontend label
  conditions:
-    - label != stale
    - files~=^vllm/entrypoints/
  actions:
    label:
@ -59,7 +55,6 @@ pull_request_rules:
 - name: label-llama
  description: Automatically apply llama label
  conditions:
-    - label != stale
    - or:
      - files~=^examples/.*llama.*\.py
      - files~=^tests/.*llama.*\.py
@ -75,7 +70,6 @@ pull_request_rules:
 - name: label-multi-modality
  description: Automatically apply multi-modality label
  conditions:
-    - label != stale
    - or:
      - files~=^vllm/multimodal/
      - files~=^tests/multimodal/
@ -89,7 +83,6 @@ pull_request_rules:
 - name: label-new-model
  description: Automatically apply new-model label
  conditions:
-    - label != stale
    - and:
      - files~=^vllm/model_executor/models/
      - files=vllm/model_executor/models/registry.py
@ -101,7 +94,6 @@ pull_request_rules:
 - name: label-performance
  description: Automatically apply performance label
  conditions:
-    - label != stale
    - or:
      - files~=^benchmarks/
      - files~=^vllm/benchmarks/
@ -115,7 +107,6 @@ pull_request_rules:
 - name: label-qwen
  description: Automatically apply qwen label
  conditions:
-    - label != stale
    - or:
      - files~=^examples/.*qwen.*\.py
      - files~=^tests/.*qwen.*\.py
@ -130,20 +121,12 @@ pull_request_rules:
 - name: label-gpt-oss
  description: Automatically apply gpt-oss label
  conditions:
-    - label != stale
    - or:
      - files~=^examples/.*gpt[-_]?oss.*\.py
      - files~=^tests/.*gpt[-_]?oss.*\.py
-      - files~=^tests/entrypoints/openai/test_response_api_with_harmony.py
-      - files~=^tests/entrypoints/test_context.py
      - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
      - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
-      - files~=^vllm/entrypoints/harmony_utils.py
-      - files~=^vllm/entrypoints/tool_server.py
-      - files~=^vllm/entrypoints/tool.py
-      - files~=^vllm/entrypoints/context.py
      - title~=(?i)gpt[-_]?oss
-      - title~=(?i)harmony
  actions:
    label:
      add:
@ -152,7 +135,6 @@ pull_request_rules:
 - name: label-rocm
  description: Automatically apply rocm label
  conditions:
-    - label != stale
    - or:
      - files~=^csrc/rocm/
      - files~=^docker/Dockerfile.rocm
@ -173,7 +155,6 @@ pull_request_rules:
 - name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
-    - label != stale
    - or:
      - files~=^benchmarks/structured_schemas/
      - files=benchmarks/benchmark_serving_structured_output.py
@ -183,7 +164,7 @@ pull_request_rules:
      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
      - files~=^tests/v1/structured_output/
-      - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
+      - files=tests/v1/entrypoints/llm/test_guided_generate.py
      - files~=^vllm/v1/structured_output/
  actions:
    label:
@ -193,7 +174,6 @@ pull_request_rules:
 - name: label-speculative-decoding
  description: Automatically apply speculative-decoding label
  conditions:
-    - label != stale
    - or:
      - files~=^vllm/v1/spec_decode/
      - files~=^tests/v1/spec_decode/
@ -209,7 +189,6 @@ pull_request_rules:
 - name: label-v1
  description: Automatically apply v1 label
  conditions:
-    - label != stale
    - or:
      - files~=^vllm/v1/
      - files~=^tests/v1/
@ -222,7 +201,6 @@ pull_request_rules:
  description: Automatically apply tpu label
  # Keep this list in sync with `label-tpu-remove` conditions
  conditions:
-    - label != stale
    - or:
      - files~=tpu.py
      - files~=_tpu
@ -238,7 +216,6 @@ pull_request_rules:
  description: Automatically remove tpu label
  # Keep this list in sync with `label-tpu` conditions
  conditions:
-    - label != stale
    - and:
      - -files~=tpu.py
      - -files~=_tpu
@ -253,9 +230,9 @@ pull_request_rules:
 - name: label-tool-calling
  description: Automatically add tool-calling label
  conditions:
-    - label != stale
    - or:
      - files~=^tests/tool_use/
+      - files~=^tests/mistral_tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
      - files~=^vllm/entrypoints/openai/tool_parsers/
@ -272,9 +249,8 @@ pull_request_rules:

 - name: ping author on conflicts and add 'needs-rebase' label
  conditions:
-    - label != stale
-    - conflict
-    - -closed
+      - conflict
+      - -closed
  actions:
    label:
      add:
@ -288,12 +264,10 @@ pull_request_rules:

 - name: assign reviewer for tensorizer changes
  conditions:
-    - label != stale
-    - or:
      - files~=^vllm/model_executor/model_loader/tensorizer.py
      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
-      - files~=^tests/model_executor/model_loader/tensorizer_loader/
+      - files~=^tests/tensorizer_loader/
  actions:
    assign:
      users:
@ -301,7 +275,6 @@ pull_request_rules:

 - name: assign reviewer for modelopt changes
  conditions:
-    - label != stale
    - or:
        - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
        - files~=^vllm/model_executor/layers/quantization/__init__\.py$
@ -316,27 +289,9 @@ pull_request_rules:

 - name: remove 'needs-rebase' label when conflict is resolved
  conditions:
-    - -conflict
-    - -closed
+      - -conflict
+      - -closed
  actions:
    label:
      remove:
        - needs-rebase
-
- name: label-kv-connector
-  description: Automatically apply kv-connector label
-  conditions:
-    - label != stale
-    - or:
-      - files~=^examples/online_serving/disaggregated[^/]*/.*
-      - files~=^examples/offline_inference/disaggregated[^/]*/.*
-      - files~=^examples/others/lmcache/
-      - files~=^tests/v1/kv_connector/
-      - files~=^vllm/distributed/kv_transfer/
-      - title~=(?i)\bP/?D\b
-      - title~=(?i)NIXL
-      - title~=(?i)LMCache
-  actions:
-    label:
-      add:
-        - kv-connector
--- a/.github/workflows/bc-lint.yml
+++ b/.github/workflows/bc-lint.yml
@ -6,8 +6,6 @@ on:
      - opened
      - synchronize
      - reopened
-      - labeled
-      - unlabeled

 jobs:
  bc_lint:
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -13,7 +13,7 @@ jobs:
      actions: write
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -6,16 +6,28 @@ default_stages:
  - manual # Run in CI
 exclude: 'vllm/third_party/.*'
 repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.13.3
+- repo: https://github.com/google/yapf
+  rev: v0.43.0
  hooks:
-  - id: ruff-check
+  - id: yapf
+    args: [--in-place, --verbose]
+    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
+    exclude: '(.buildkite|benchmarks|build|examples)/.*'
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.11.7
+  hooks:
+  - id: ruff
    args: [--output-format, github, --fix]
  - id: ruff-format
+    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
  rev: v1.35.5
  hooks:
  - id: typos
+- repo: https://github.com/PyCQA/isort
+  rev: 6.0.1
+  hooks:
+  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
  rev: v20.1.3
  hooks:
@ -37,7 +49,7 @@ repos:
  rev: 0.6.17
  hooks:
    - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
+      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
      files: ^requirements/test\.(in|txt)$
 - repo: local
  hooks:
@ -48,32 +60,38 @@ repos:
    files: ^requirements/test\.(in|txt)$
  - id: mypy-local
    name: Run mypy for local Python installation
-    entry: python tools/pre_commit/mypy.py 0 "local"
+    entry: tools/mypy.sh 0 "local"
+    language: python
+    types: [python]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
    stages: [pre-commit] # Don't run in CI
-    <<: &mypy_common
-      language: python
-      types_or: [python, pyi]
-      require_serial: true
-      additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.9
-    entry: python tools/pre_commit/mypy.py 1 "3.9"
-    <<: *mypy_common
+    entry: tools/mypy.sh 1 "3.9"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
    stages: [manual] # Only run in CI
  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.10
-    entry: python tools/pre_commit/mypy.py 1 "3.10"
-    <<: *mypy_common
+    entry: tools/mypy.sh 1 "3.10"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
    stages: [manual] # Only run in CI
  - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.11
-    entry: python tools/pre_commit/mypy.py 1 "3.11"
-    <<: *mypy_common
+    entry: tools/mypy.sh 1 "3.11"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
    stages: [manual] # Only run in CI
  - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.12
-    entry: python tools/pre_commit/mypy.py 1 "3.12"
-    <<: *mypy_common
+    entry: tools/mypy.sh 1 "3.12"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
    stages: [manual] # Only run in CI
  - id: shellcheck
    name: Lint shell scripts
@ -137,15 +155,18 @@ repos:
    additional_dependencies: [regex]
  - id: check-pickle-imports
    name: Prevent new pickle/cloudpickle imports
-    entry: python tools/pre_commit/check_pickle_imports.py
+    entry: python tools/check_pickle_imports.py
    language: python
    types: [python]
-    additional_dependencies: [regex]
+    pass_filenames: false
+    additional_dependencies: [pathspec, regex]
  - id: validate-config
    name: Validate configuration has default values and that each field has a docstring
    entry: python tools/validate_config.py
    language: python
-    additional_dependencies: [regex]
+    types: [python]
+    pass_filenames: true
+    files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@ -13,7 +13,6 @@ build:

 mkdocs:
  configuration: mkdocs.yaml
-  fail_on_warning: true

 # Optionally declare the Python requirements required to build your docs
 python:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -13,10 +13,6 @@ cmake_minimum_required(VERSION 3.26)
 # cmake --install . --component _C
 project(vllm_extensions LANGUAGES CXX)

-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
@ -37,7 +33,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")

 #
 # Supported/expected torch versions for CUDA/ROCm.
@ -86,9 +82,6 @@ find_package(Torch REQUIRED)
 # Supported NVIDIA architectures.
 # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
-   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
-  set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
-elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 else()
@ -178,25 +171,6 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

-#
-# Set compression mode for CUDA >=13.x.
-#
-if(VLLM_GPU_LANG STREQUAL "CUDA" AND
-   DEFINED CMAKE_CUDA_COMPILER_VERSION AND
-   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
-  list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
-endif()
-
-#
-# Set CUDA include flags for CXX compiler.
-#
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
-  if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
-  endif()
-endif()
-
 #
 # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
 # setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@ -282,7 +256,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")

  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -317,8 +291,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
+    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
    "csrc/cutlass_extensions/common.cpp"
+    "csrc/attention/mla/cutlass_mla_entry.cu"
    "csrc/quantization/fp8/per_token_group_quant.cu")

  set_gencode_flags_for_srcs(
@ -451,11 +427,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
  # CUDA 12.8 or later
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
@ -485,11 +457,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
  # require CUDA 12.8 or later
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
@ -569,11 +537,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
  # CUDA 12.8 or later
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@ -592,11 +556,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()

  # FP4 Archs and flags
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@ -618,13 +578,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()

  # CUTLASS MLA Archs and flags
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
    set(SRCS
+      "csrc/attention/mla/cutlass_mla_kernels.cu"
      "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -666,11 +623,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
    set_gencode_flags_for_srcs(
@ -691,11 +644,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()

  # moe_data.cu is used by all CUTLASS MoE kernels.
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
    set_gencode_flags_for_srcs(
@ -714,11 +663,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
    set_gencode_flags_for_srcs(
@ -834,17 +779,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

-  # Hadacore kernels
-  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
-  if(HADACORE_ARCHS)
-    set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${HADACORE_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    message(STATUS "Building hadacore")
-  endif()
-
 # if CUDA endif
 endif()

--- a/README.md
+++ b/README.md
@ -21,7 +21,6 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio

 *Latest News* 🔥

- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
 - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
@ -82,7 +81,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
 - Prefix caching support
 - Multi-LoRA support

--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -1,20 +1,874 @@
-# Benchmarks
+# Benchmarking vLLM

-This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation.
+This README guides you through running benchmark tests with the extensive
+datasets supported on vLLM. It’s a living document, updated as new features and datasets
+become available.

-## Contents
+## Dataset Overview

- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput)
- **Throughput benchmarks**: Scripts for testing offline batch inference performance
- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference
- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.)
+<table style="width:100%; border-collapse: collapse;">
+  <thead>
+    <tr>
+      <th style="width:15%; text-align: left;">Dataset</th>
+      <th style="width:10%; text-align: center;">Online</th>
+      <th style="width:10%; text-align: center;">Offline</th>
+      <th style="width:65%; text-align: left;">Data Path</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td><strong>ShareGPT</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
+    </tr>
+    <tr>
+      <td><strong>ShareGPT4V (Image)</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td>
+        <code>wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json</code>
+        <br>
+        <div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
+        <code>wget http://images.cocodataset.org/zips/train2017.zip</code>
+      </td>
+    </tr>
+        <tr>
+      <td><strong>ShareGPT4Video (Video)</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td>
+        <code>git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video</code>
+      </td>
+    </tr>
+    <tr>
+      <td><strong>BurstGPT</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
+    </tr>
+    <tr>
+      <td><strong>Sonnet (deprecated)</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td>Local file: <code>benchmarks/sonnet.txt</code></td>
+    </tr>
+    <tr>
+      <td><strong>Random</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>synthetic</code></td>
+    </tr>
+    <tr>
+      <td><strong>RandomMultiModal (Image/Video)</strong></td>
+      <td style="text-align: center;">🟡</td>
+      <td style="text-align: center;">🚧</td>
+      <td><code>synthetic</code> </td>
+    </tr>
+    <tr>
+      <td><strong>Prefix Repetition</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>synthetic</code></td>
+    </tr>
+    <tr>
+      <td><strong>HuggingFace-VisionArena</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>lmarena-ai/VisionArena-Chat</code></td>
+    </tr>
+    <tr>
+      <td><strong>HuggingFace-InstructCoder</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>likaixin/InstructCoder</code></td>
+    </tr>
+      <tr>
+      <td><strong>HuggingFace-AIMO</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
+    </tr>
+    <tr>
+      <td><strong>HuggingFace-Other</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
+    </tr>
+    <tr>
+      <td><strong>HuggingFace-MTBench</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>philschmid/mt-bench</code></td>
+    </tr>
+    <tr>
+      <td><strong>HuggingFace-Blazedit</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>vdaita/edit_5k_char</code>, <code>vdaita/edit_10k_char</code></td>
+    </tr>
+    <tr>
+      <td><strong>Spec Bench</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl</code></td>
+    </tr>
+    <tr>
+      <td><strong>Custom</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td>Local file: <code>data.jsonl</code></td>
+    </tr>
+  </tbody>
+</table>

-## Usage
+✅: supported

-For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
+🟡: Partial support

-For full CLI reference see:
+🚧: to be supported

- <https://docs.vllm.ai/en/latest/cli/bench/latency.html>
- <https://docs.vllm.ai/en/latest/cli/bench/serve.html>
- <https://docs.vllm.ai/en/latest/cli/bench/throughput.html>
+**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
+For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
+
+```bash
+--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
+```
+
+## 🚀 Example - Online Benchmark
+
+<details>
+<summary>Show more</summary>
+
+<br/>
+
+First start serving your model
+
+```bash
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B
+```
+
+Then run the benchmarking script
+
+```bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+vllm bench serve \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --endpoint /v1/completions \
+  --dataset-name sharegpt \
+  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
+  --num-prompts 10
+```
+
+If successful, you will see the following output
+
+```text
+============ Serving Benchmark Result ============
+Successful requests:                     10
+Benchmark duration (s):                  5.78
+Total input tokens:                      1369
+Total generated tokens:                  2212
+Request throughput (req/s):              1.73
+Output token throughput (tok/s):         382.89
+Total Token throughput (tok/s):          619.85
+---------------Time to First Token----------------
+Mean TTFT (ms):                          71.54
+Median TTFT (ms):                        73.88
+P99 TTFT (ms):                           79.49
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms):                          7.91
+Median TPOT (ms):                        7.96
+P99 TPOT (ms):                           8.03
+---------------Inter-token Latency----------------
+Mean ITL (ms):                           7.74
+Median ITL (ms):                         7.70
+P99 ITL (ms):                            8.39
+==================================================
+```
+
+### Custom Dataset
+
+If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
+
+```json
+{"prompt": "What is the capital of India?"}
+{"prompt": "What is the capital of Iran?"}
+{"prompt": "What is the capital of China?"}
+```
+
+```bash
+# start server
+VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
+```
+
+```bash
+# run benchmarking script
+vllm bench serve --port 9001 --save-result --save-detailed \
+  --backend vllm \
+  --model meta-llama/Llama-3.1-8B-Instruct \
+  --endpoint /v1/completions \
+  --dataset-name custom \
+  --dataset-path <path-to-your-data-jsonl> \
+  --custom-skip-chat-template \
+  --num-prompts 80 \
+  --max-concurrency 1 \
+  --temperature=0.3 \
+  --top-p=0.75 \
+  --result-dir "./log/"
+```
+
+You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
+
+### VisionArena Benchmark for Vision Language Models
+
+```bash
+# need a model with vision capability here
+vllm serve Qwen/Qwen2-VL-7B-Instruct
+```
+
+```bash
+vllm bench serve \
+  --backend openai-chat \
+  --endpoint-type openai-chat \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --endpoint /v1/chat/completions \
+  --dataset-name hf \
+  --dataset-path lmarena-ai/VisionArena-Chat \
+  --hf-split train \
+  --num-prompts 1000
+```
+
+### InstructCoder Benchmark with Speculative Decoding
+
+``` bash
+VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
+```
+
+``` bash
+vllm bench serve \
+    --model meta-llama/Meta-Llama-3-8B-Instruct \
+    --dataset-name hf \
+    --dataset-path likaixin/InstructCoder \
+    --num-prompts 2048
+```
+
+### Spec Bench Benchmark with Speculative Decoding
+
+``` bash
+VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
+```
+
+[SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
+
+Run all categories:
+
+``` bash
+# Download the dataset using:
+# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
+
+vllm bench serve \
+    --model meta-llama/Meta-Llama-3-8B-Instruct \
+    --dataset-name spec_bench \ 
+    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
+    --num-prompts -1
+```
+
+Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
+
+Run only a specific category like "summarization":
+
+``` bash
+vllm bench serve \
+    --model meta-llama/Meta-Llama-3-8B-Instruct \
+    --dataset-name spec_bench \ 
+    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
+    --num-prompts -1
+    --spec-bench-category "summarization"
+```
+
+### Other HuggingFaceDataset Examples
+
+```bash
+vllm serve Qwen/Qwen2-VL-7B-Instruct
+```
+
+`lmms-lab/LLaVA-OneVision-Data`:
+
+```bash
+vllm bench serve \
+  --backend openai-chat \
+  --endpoint-type openai-chat \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --endpoint /v1/chat/completions \
+  --dataset-name hf \
+  --dataset-path lmms-lab/LLaVA-OneVision-Data \
+  --hf-split train \
+  --hf-subset "chart2text(cauldron)" \
+  --num-prompts 10
+```
+
+`Aeala/ShareGPT_Vicuna_unfiltered`:
+
+```bash
+vllm bench serve \
+  --backend openai-chat \
+  --endpoint-type openai-chat \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --endpoint /v1/chat/completions \
+  --dataset-name hf \
+  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
+  --hf-split train \
+  --num-prompts 10
+```
+
+`AI-MO/aimo-validation-aime`:
+
+``` bash
+vllm bench serve \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path AI-MO/aimo-validation-aime \
+    --num-prompts 10 \
+    --seed 42
+```
+
+`philschmid/mt-bench`:
+
+``` bash
+vllm bench serve \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 80
+```
+
+`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
+
+``` bash
+vllm bench serve \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path vdaita/edit_5k_char \
+    --num-prompts 90 \
+    --blazedit-min-distance 0.01 \
+    --blazedit-max-distance 0.99
+```
+
+### Running With Sampling Parameters
+
+When using OpenAI-compatible backends such as `vllm`, optional sampling
+parameters can be specified. Example client command:
+
+```bash
+vllm bench serve \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --endpoint /v1/completions \
+  --dataset-name sharegpt \
+  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
+  --top-k 10 \
+  --top-p 0.9 \
+  --temperature 0.5 \
+  --num-prompts 10
+```
+
+### Running With Ramp-Up Request Rate
+
+The benchmark tool also supports ramping up the request rate over the
+duration of the benchmark run. This can be useful for stress testing the
+server or finding the maximum throughput that it can handle, given some latency budget.
+
+Two ramp-up strategies are supported:
+
+- `linear`: Increases the request rate linearly from a start value to an end value.
+- `exponential`: Increases the request rate exponentially.
+
+The following arguments can be used to control the ramp-up:
+
+- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
+- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
+- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
+
+</details>
+
+## 📈 Example - Offline Throughput Benchmark
+
+<details>
+<summary>Show more</summary>
+
+<br/>
+
+```bash
+vllm bench throughput \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset-name sonnet \
+  --dataset-path vllm/benchmarks/sonnet.txt \
+  --num-prompts 10
+```
+
+If successful, you will see the following output
+
+```text
+Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
+Total num prompt tokens:  5014
+Total num output tokens:  1500
+```
+
+### VisionArena Benchmark for Vision Language Models
+
+```bash
+vllm bench throughput \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --backend vllm-chat \
+  --dataset-name hf \
+  --dataset-path lmarena-ai/VisionArena-Chat \
+  --num-prompts 1000 \
+  --hf-split train
+```
+
+The `num prompt tokens` now includes image token counts
+
+```text
+Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
+Total num prompt tokens:  14527
+Total num output tokens:  1280
+```
+
+### InstructCoder Benchmark with Speculative Decoding
+
+``` bash
+VLLM_WORKER_MULTIPROC_METHOD=spawn \
+VLLM_USE_V1=1 \
+vllm bench throughput \
+    --dataset-name=hf \
+    --dataset-path=likaixin/InstructCoder \
+    --model=meta-llama/Meta-Llama-3-8B-Instruct \
+    --input-len=1000 \
+    --output-len=100 \
+    --num-prompts=2048 \
+    --async-engine \
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
+```
+
+```text
+Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
+Total num prompt tokens:  261136
+Total num output tokens:  204800
+```
+
+### Other HuggingFaceDataset Examples
+
+`lmms-lab/LLaVA-OneVision-Data`:
+
+```bash
+vllm bench throughput \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --backend vllm-chat \
+  --dataset-name hf \
+  --dataset-path lmms-lab/LLaVA-OneVision-Data \
+  --hf-split train \
+  --hf-subset "chart2text(cauldron)" \
+  --num-prompts 10
+```
+
+`Aeala/ShareGPT_Vicuna_unfiltered`:
+
+```bash
+vllm bench throughput \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --backend vllm-chat \
+  --dataset-name hf \
+  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
+  --hf-split train \
+  --num-prompts 10
+```
+
+`AI-MO/aimo-validation-aime`:
+
+```bash
+vllm bench throughput \
+  --model Qwen/QwQ-32B \
+  --backend vllm \
+  --dataset-name hf \
+  --dataset-path AI-MO/aimo-validation-aime \
+  --hf-split train \
+  --num-prompts 10
+```
+
+Benchmark with LoRA adapters:
+
+``` bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+vllm bench throughput \
+  --model meta-llama/Llama-2-7b-hf \
+  --backend vllm \
+  --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
+  --dataset_name sharegpt \
+  --num-prompts 10 \
+  --max-loras 2 \
+  --max-lora-rank 8 \
+  --enable-lora \
+  --lora-path yard1/llama-2-7b-sql-lora-test
+  ```
+
+</details>
+
+## 🛠️ Example - Structured Output Benchmark
+
+<details>
+<summary>Show more</summary>
+
+<br/>
+
+Benchmark the performance of structured output generation (JSON, grammar, regex).
+
+### Server Setup
+
+```bash
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B
+```
+
+### JSON Schema Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset json \
+  --structured-output-ratio 1.0 \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+### Grammar-based Generation Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset grammar \
+  --structure-type grammar \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+### Regex-based Generation Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset regex \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+### Choice-based Generation Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset choice \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+### XGrammar Benchmark Dataset
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset xgrammar_bench \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+</details>
+
+## 📚 Example - Long Document QA Benchmark
+
+<details>
+<summary>Show more</summary>
+
+<br/>
+
+Benchmark the performance of long document question-answering with prefix caching.
+
+### Basic Long Document QA Test
+
+```bash
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 16 \
+  --document-length 2000 \
+  --output-len 50 \
+  --repeat-count 5
+```
+
+### Different Repeat Modes
+
+```bash
+# Random mode (default) - shuffle prompts randomly
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 8 \
+  --document-length 3000 \
+  --repeat-count 3 \
+  --repeat-mode random
+
+# Tile mode - repeat entire prompt list in sequence
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 8 \
+  --document-length 3000 \
+  --repeat-count 3 \
+  --repeat-mode tile
+
+# Interleave mode - repeat each prompt consecutively
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 8 \
+  --document-length 3000 \
+  --repeat-count 3 \
+  --repeat-mode interleave
+```
+
+</details>
+
+## 🗂️ Example - Prefix Caching Benchmark
+
+<details>
+<summary>Show more</summary>
+
+<br/>
+
+Benchmark the efficiency of automatic prefix caching.
+
+### Fixed Prompt with Prefix Caching
+
+```bash
+python3 benchmarks/benchmark_prefix_caching.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-prompts 1 \
+  --repeat-count 100 \
+  --input-length-range 128:256
+```
+
+### ShareGPT Dataset with Prefix Caching
+
+```bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+python3 benchmarks/benchmark_prefix_caching.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
+  --enable-prefix-caching \
+  --num-prompts 20 \
+  --repeat-count 5 \
+  --input-length-range 128:256
+```
+
+### Prefix Repetition Dataset
+
+```bash
+vllm bench serve \
+  --backend openai \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --dataset-name prefix_repetition \
+  --num-prompts 100 \
+  --prefix-repetition-prefix-len 512 \
+  --prefix-repetition-suffix-len 128 \
+  --prefix-repetition-num-prefixes 5 \
+  --prefix-repetition-output-len 128
+```
+
+</details>
+
+## ⚡ Example - Request Prioritization Benchmark
+
+<details>
+<summary>Show more</summary>
+
+<br/>
+
+Benchmark the performance of request prioritization in vLLM.
+
+### Basic Prioritization Test
+
+```bash
+python3 benchmarks/benchmark_prioritization.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --input-len 128 \
+  --output-len 64 \
+  --num-prompts 100 \
+  --scheduling-policy priority
+```
+
+### Multiple Sequences per Prompt
+
+```bash
+python3 benchmarks/benchmark_prioritization.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --input-len 128 \
+  --output-len 64 \
+  --num-prompts 100 \
+  --scheduling-policy priority \
+  --n 2
+```
+
+</details>
+
+## 👁️ Example - Multi-Modal Benchmark
+
+<details>
+<summary>Show more</summary>
+
+<br/>
+
+Benchmark the performance of multi-modal requests in vLLM.
+
+### Images (ShareGPT4V)
+
+Start vLLM:
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dtype bfloat16 \
+  --limit-mm-per-prompt '{"image": 1}' \
+  --allowed-local-media-path /path/to/sharegpt4v/images
+```
+
+Send requests with images:
+
+```bash
+vllm bench serve \
+  --backend openai-chat \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dataset-name sharegpt \
+  --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
+  --num-prompts 100 \
+  --save-result \
+  --result-dir ~/vllm_benchmark_results \
+  --save-detailed \
+  --endpoint /v1/chat/completion
+```
+
+### Videos (ShareGPT4Video)
+
+Start vLLM:
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dtype bfloat16 \
+  --limit-mm-per-prompt '{"video": 1}' \
+  --allowed-local-media-path /path/to/sharegpt4video/videos
+```
+
+Send requests with videos:
+
+```bash
+vllm bench serve \
+  --backend openai-chat \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dataset-name sharegpt \
+  --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
+  --num-prompts 100 \
+  --save-result \
+  --result-dir ~/vllm_benchmark_results \
+  --save-detailed \
+  --endpoint /v1/chat/completion
+```
+
+### Synthetic Random Images (random-mm)
+
+Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
+
+Notes:
+
+- Works only with online benchmark via the OpenAI  backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
+- Video sampling is not yet implemented.
+
+Start the server (example):
+
+```bash
+vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
+  --dtype bfloat16 \
+  --max-model-len 16384 \
+  --limit-mm-per-prompt '{"image": 3, "video": 0}' \
+  --mm-processor-kwargs max_pixels=1003520
+```
+
+Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `random-output-len`.
+
+Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens:
+
+```bash
+vllm bench serve \
+  --backend openai-chat \
+  --model Qwen/Qwen2.5-VL-3B-Instruct \
+  --endpoint /v1/chat/completions \
+  --dataset-name random-mm \
+  --num-prompts 100 \
+  --max-concurrency 10 \
+  --random-prefix-len 25 \
+  --random-input-len 300 \
+  --random-output-len 40 \
+  --random-range-ratio 0.2 \
+  --random-mm-base-items-per-request 2 \
+  --random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
+  --random-mm-bucket-config '{(224, 224, 1): 1.0}' \
+  --request-rate inf \
+  --ignore-eos \
+  --seed 42
+```
+
+The number of items per request can be controlled by passing multiple image buckets:
+
+```bash
+  --random-mm-base-items-per-request 2 \
+  --random-mm-num-mm-items-range-ratio 0.5 \
+  --random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
+  --random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \
+```
+
+Flags specific to `random-mm`:
+
+- `--random-mm-base-items-per-request`: base number of multimodal items per request.
+- `--random-mm-num-mm-items-range-ratio`: vary item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]. Set r=0 to keep it fixed; r=1 allows 0 items.
+- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'.
+- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported).
+
+Behavioral notes:
+
+- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
+
+How sampling works:
+
+- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
+- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
+- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
+This should be seen as an edge case, and if this behavior can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that this might result in errors due to engine config `--limit-mm-per-prompt`.
+- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
+
+</details>
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@ -149,70 +149,3 @@ The script follows a systematic process to find the optimal parameters:
 4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.

 5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
-
-## Batched `auto_tune`
-
-The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.
-
-### Prerequisites
-
- **jq**: This script requires `jq` to parse the JSON configuration file.
- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.
-
-### How to Run
-
-1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.
-
-2. **Execute the script**:
-
-    ```bash
-    bash batch_auto_tune.sh <path_to_json_file> [gcs_upload_path]
-    ```
-
-    - `<path_to_json_file>`: **Required.** Path to your JSON configuration file.
-    - `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).
-
-### Configuration File
-
-The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run.
-
-Here is an example `runs_config.json` with two benchmark configurations:
-
-```json
-[
-  {
-    "base": "/home/user",
-    "model": "meta-llama/Llama-3.1-8B-Instruct",
-    "system": "TPU", # OR GPU
-    "tp": 8,
-    "input_len": 128,
-    "output_len": 2048,
-    "max_model_len": 2300,
-    "num_seqs_list": "128 256",
-    "num_batched_tokens_list": "8192 16384"
-  },
-  {
-    "base": "/home/user",
-    "model": "meta-llama/Llama-3.1-70B-Instruct",
-    "system": "TPU", # OR GPU
-    "tp": 8,
-    "input_len": 4000,
-    "output_len": 16,
-    "max_model_len": 4096,
-    "num_seqs_list": "64 128",
-    "num_batched_tokens_list": "4096 8192",
-    "max_latency_allowed_ms": 500
-  }
-]
-```
-
-### Output
-
-The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:
-
- `run_id`: A unique identifier for the run, derived from the timestamp.
- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).
-
-A summary of successful and failed runs is also printed to the console upon completion.
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@ -103,15 +103,10 @@ start_server() {
        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    fi
-    local server_pid=$!

    # wait for 10 minutes...
    server_started=0
    for i in {1..60}; do
-        # This line checks whether the server is still alive or not,
-        # since that we should always have permission to send signal to the server process.
-        kill -0 $server_pid 2> /dev/null || break
-
        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
        if [[ "$STATUS_CODE" -eq 200 ]]; then
@ -123,7 +118,7 @@ start_server() {
    done

    if (( ! server_started )); then
-        echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
+        echo "server did not start within 10 minutes. Please check server log at $vllm_log".
        return 1
    else
        return 0
--- a/benchmarks/auto_tune/batch_auto_tune.sh
+++ b/benchmarks/auto_tune/batch_auto_tune.sh
@ -1,128 +0,0 @@
-#!/bin/bash
-
-INPUT_JSON="$1"
-GCS_PATH="$2" # Optional GCS path for uploading results for each run
-
-SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
-AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"
-
-if [[ -z "$INPUT_JSON" ]]; then
-  echo "Error: Input JSON file not provided."
-  echo "Usage: $0 <path_to_json_file> [gcs_upload_path]"
-  exit 1
-fi
-
-if [[ ! -f "$INPUT_JSON" ]]; then
-  echo "Error: File not found at '$INPUT_JSON'"
-  exit 1
-fi
-
-if ! command -v jq &> /dev/null; then
-    echo "Error: 'jq' command not found. Please install jq to process the JSON input."
-    exit 1
-fi
-
-if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then
-    echo "Error: 'gcloud' command not found, but a GCS_PATH was provided."
-    exit 1
-fi
-
-SUCCESS_COUNT=0
-FAILURE_COUNT=0
-FAILED_RUNS=()
-SCRIPT_START_TIME=$(date +%s)
-
-json_content=$(cat "$INPUT_JSON")
-if ! num_runs=$(echo "$json_content" | jq 'length'); then
-  echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2
-  exit 1
-fi
-
-echo "Found $num_runs benchmark configurations in $INPUT_JSON."
-echo "Starting benchmark runs..."
-echo "--------------------------------------------------"
-
-for i in $(seq 0 $(($num_runs - 1))); do
-  run_object=$(echo "$json_content" | jq ".[$i]")
-
-  RUN_START_TIME=$(date +%s)
-  ENV_VARS_ARRAY=()
-  # Dynamically create env vars from the JSON object's keys
-  for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do
-    value=$(echo "$run_object" | jq -r ".$key")
-    var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_')
-    ENV_VARS_ARRAY+=("${var_name}=${value}")
-  done
-
-  echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}"
-
-  # Execute auto_tune.sh and capture output
-  RUN_OUTPUT_FILE=$(mktemp)
-  if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then
-    STATUS="SUCCESS"
-    ((SUCCESS_COUNT++))
-  else
-    STATUS="FAILURE"
-    ((FAILURE_COUNT++))
-    FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
-  fi
-
-  RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
-  rm "$RUN_OUTPUT_FILE"
-
-  # Parse results and optionally upload them to GCS
-  RUN_ID=""
-  RESULTS=""
-  GCS_RESULTS_URL=""
-  if [[ "$STATUS" == "SUCCESS" ]]; then
-    RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true)
-
-    if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then
-      RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")")
-      RESULT_DIR=$(dirname "$RESULT_FILE_PATH")
-      RESULTS=$(cat "$RESULT_FILE_PATH")
-
-      if [[ -n "$GCS_PATH" ]]; then
-        GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}"
-        echo "Uploading results to GCS..."
-        if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then
-          echo "GCS upload successful."
-        else
-          echo "Warning: GCS upload failed for RUN_ID $RUN_ID."
-        fi
-      fi
-    else
-      echo "Warning: Could not find result file for a successful run."
-      STATUS="WARNING_NO_RESULT_FILE"
-    fi
-  fi
-
-  # Add the results back into the JSON object for this run
-  json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \
-    '.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}')
-
-  RUN_END_TIME=$(date +%s)
-  echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS"
-  echo "--------------------------------------------------"
-
-  # Save intermediate progress back to the file
-  echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON"
-
-done
-
-SCRIPT_END_TIME=$(date +%s)
-echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds."
-echo
-echo "====================== SUMMARY ======================"
-echo "Successful runs: $SUCCESS_COUNT"
-echo "Failed runs:     $FAILURE_COUNT"
-echo "==================================================="
-
-if [[ $FAILURE_COUNT -gt 0 ]]; then
-  echo "Details of failed runs (see JSON file for full parameters):"
-  for failed in "${FAILED_RUNS[@]}"; do
-    echo "  - $failed"
-  done
-fi
-
-echo "Updated results have been saved to '$INPUT_JSON'."
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc

-from benchmark_utils import TimeCollector
 from tabulate import tabulate

+from benchmark_utils import TimeCollector
 from vllm.utils import FlexibleArgumentParser
 from vllm.v1.core.block_pool import BlockPool

--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@ -1,31 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
-import time
-from unittest import mock

 import numpy as np
-from benchmark_utils import TimeCollector
 from tabulate import tabulate

-from vllm.config import (
-    CacheConfig,
-    DeviceConfig,
-    LoadConfig,
-    ModelConfig,
-    ParallelConfig,
-    SchedulerConfig,
-    SpeculativeConfig,
-    VllmConfig,
-)
-from vllm.platforms import current_platform
+from benchmark_utils import TimeCollector
+from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
 from vllm.utils import FlexibleArgumentParser
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
-from vllm.v1.worker.gpu_input_batch import InputBatch
-from vllm.v1.worker.gpu_model_runner import GPUModelRunner


-def benchmark_propose(args):
+def main(args):
    rows = []
    for max_ngram in args.max_ngram:
        collector = TimeCollector(TimeCollector.US)
@ -83,88 +69,10 @@ def benchmark_propose(args):
    )


-def benchmark_batched_propose(args):
-    NUM_SPECULATIVE_TOKENS_NGRAM = 10
-    PROMPT_LOOKUP_MIN = 5
-    PROMPT_LOOKUP_MAX = 15
-    MAX_MODEL_LEN = int(1e7)
-    DEVICE = current_platform.device_type
-
-    model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
-
-    speculative_config = SpeculativeConfig(
-        target_model_config=model_config,
-        target_parallel_config=ParallelConfig(),
-        method="ngram",
-        num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
-        prompt_lookup_max=PROMPT_LOOKUP_MAX,
-        prompt_lookup_min=PROMPT_LOOKUP_MIN,
-    )
-
-    vllm_config = VllmConfig(
-        model_config=model_config,
-        cache_config=CacheConfig(),
-        speculative_config=speculative_config,
-        device_config=DeviceConfig(device=current_platform.device_type),
-        parallel_config=ParallelConfig(),
-        load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig(),
-    )
-
-    # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
-    mock_pp_group = mock.MagicMock()
-    mock_pp_group.world_size = 1
-    with mock.patch(
-        "vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
-    ):
-        runner = GPUModelRunner(vllm_config, DEVICE)
-
-        # hack max model len
-        runner.max_model_len = MAX_MODEL_LEN
-        runner.drafter.max_model_len = MAX_MODEL_LEN
-
-        dummy_input_batch = InputBatch(
-            max_num_reqs=args.num_req,
-            max_model_len=MAX_MODEL_LEN,
-            max_num_batched_tokens=args.num_req * args.num_token,
-            device=DEVICE,
-            pin_memory=False,
-            vocab_size=256000,
-            block_sizes=[16],
-        )
-        dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
-        dummy_input_batch.spec_decode_unsupported_reqs = ()
-        dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
-        dummy_input_batch.token_ids_cpu = np.random.randint(
-            0, 20, (args.num_req, args.num_token)
-        )
-
-        runner.input_batch = dummy_input_batch
-
-        sampled_token_ids = [[0]] * args.num_req
-
-        print("Starting benchmark")
-        # first run is warmup so ignore it
-        for _ in range(args.num_iteration):
-            start = time.time()
-            runner.drafter.propose(
-                sampled_token_ids,
-                dummy_input_batch.req_ids,
-                dummy_input_batch.num_tokens_no_spec,
-                dummy_input_batch.token_ids_cpu,
-                dummy_input_batch.spec_decode_unsupported_reqs,
-            )
-            end = time.time()
-            print(f"Iteration time (s): {end - start}")
-
-
 def invoke_main() -> None:
    parser = FlexibleArgumentParser(
        description="Benchmark the performance of N-gram speculative decode drafting"
    )
-    parser.add_argument(
-        "--batched", action="store_true", help="consider time to prepare batch"
-    )
    parser.add_argument(
        "--num-iteration",
        type=int,
@ -197,17 +105,8 @@ def invoke_main() -> None:
        help="Number of speculative tokens to generate",
    )
    args = parser.parse_args()
-
-    if not args.batched:
-        benchmark_propose(args)
-    else:
-        benchmark_batched_propose(args)
+    main(args)


-"""
-# Example command lines:
-# time python3 benchmarks/benchmark_ngram_proposer.py
-# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
-"""  # noqa: E501
 if __name__ == "__main__":
    invoke_main()  # pragma: no cover
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -37,13 +37,14 @@ from typing import Optional
 import datasets
 import numpy as np
 import pandas as pd
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
 from backend_request_func import (
    ASYNC_REQUEST_FUNCS,
    RequestFuncInput,
    RequestFuncOutput,
 )
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase

 try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
@ -448,8 +449,7 @@ async def benchmark(
    def prepare_extra_body(request) -> dict:
        extra_body = {}
        # Add the schema to the extra_body
-        extra_body["structured_outputs"] = {}
-        extra_body["structured_outputs"][request.structure_type] = request.schema
+        extra_body[request.structure_type] = request.schema
        return extra_body

    print("Starting initial single prompt test run...")
@ -696,11 +696,11 @@ def evaluate(ret, args):
        return re.match(args.regex, actual) is not None

    def _eval_correctness(expected, actual):
-        if args.structure_type == "json":
+        if args.structure_type == "guided_json":
            return _eval_correctness_json(expected, actual)
-        elif args.structure_type == "regex":
+        elif args.structure_type == "guided_regex":
            return _eval_correctness_regex(expected, actual)
-        elif args.structure_type == "choice":
+        elif args.structure_type == "guided_choice":
            return _eval_correctness_choice(expected, actual)
        else:
            return None
@ -780,18 +780,18 @@ def main(args: argparse.Namespace):
    )

    if args.dataset == "grammar":
-        args.structure_type = "grammar"
+        args.structure_type = "guided_grammar"
    elif args.dataset == "regex":
-        args.structure_type = "regex"
+        args.structure_type = "guided_regex"
    elif args.dataset == "choice":
-        args.structure_type = "choice"
+        args.structure_type = "guided_choice"
    else:
-        args.structure_type = "json"
+        args.structure_type = "guided_json"

    if args.no_structured_output:
        args.structured_output_ratio = 0
    if args.save_results:
-        result_file_name = f"{args.structured_output_ratio}so"
+        result_file_name = f"{args.structured_output_ratio}guided"
        result_file_name += f"_{backend}"
        result_file_name += f"_{args.request_rate}qps"
        result_file_name += f"_{args.model.split('/')[-1]}"
@ -909,13 +909,13 @@ def create_argument_parser():
    parser.add_argument(
        "--tokenizer",
        type=str,
-        help="Name or path of the tokenizer, if not using the default tokenizer.",
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--tokenizer-mode",
        type=str,
        default="auto",
-        help="Name or path of the tokenizer, if not using the default tokenizer.",
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--num-prompts",
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_triton_block_scaled_mm,
+    w8a8_block_fp8_matmul,
 )
 from vllm.utils import FlexibleArgumentParser, cdiv

@ -158,7 +158,7 @@ def bench_fp8(
        "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
        ),
-        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
+        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
            a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
        ),
        "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@ -55,7 +55,9 @@ benchmark() {
  output_len=$2


-  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=0 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8100 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
@ -63,7 +65,9 @@ benchmark() {
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


-  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=1 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@ -38,12 +38,16 @@ wait_for_server() {
 launch_chunked_prefill() {
  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  # disagg prefill
-  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=0 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8100 \
    --max-model-len 10000 \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.6 &
-  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=1 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8200 \
    --max-model-len 10000 \
    --enable-chunked-prefill \
@ -58,14 +62,18 @@ launch_chunked_prefill() {
 launch_disagg_prefill() {
  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  # disagg prefill
-  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=0 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8100 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

-  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=1 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
--- a/benchmarks/kernels/bench_nvfp4_gemm.py
+++ b/benchmarks/kernels/bench_nvfp4_gemm.py
@ -3,7 +3,6 @@
 import argparse
 import copy
 import itertools
-import os

 import torch
 from weight_shapes import WEIGHT_SHAPES
@ -24,45 +23,21 @@ PROVIDER_CFGS = {
    "torch-bf16": dict(enabled=True),
    "nvfp4": dict(no_a_quant=False, enabled=True),
    "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
-    "fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
-    "fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
 }

-_needs_fbgemm = any(
-    v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
-)
-if _needs_fbgemm:
-    try:
-        from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
-            triton_scale_nvfp4_quant,
-        )
-    except ImportError:
-        print(
-            "WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
-            "These providers will be skipped. Please install fbgemm_gpu with: "
-            "'pip install fbgemm-gpu-genai' to run them."
-        )
-        # Disable FBGEMM providers so the benchmark can run.
-        for cfg in PROVIDER_CFGS.values():
-            if cfg.get("fbgemm"):
-                cfg["enabled"] = False
-
 _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]


-def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
+def _quant_weight_nvfp4(b: torch.Tensor, device: str):
    # Compute global scale for weight
    b_amax = torch.abs(b).max().to(torch.float32)
    b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
-    if "fbgemm" in cfg and cfg["fbgemm"]:
-        b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
-    else:
-        b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
+    b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
    return b_fp4, scale_b_fp4, b_global_scale


 def build_nvfp4_runner(cfg, a, b, dtype, device):
-    b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)
+    b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)

    # Compute global scale for activation
    # NOTE: This is generally provided ahead-of-time by the model checkpoint.
@ -71,35 +46,6 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):

    # Alpha for the GEMM operation
    alpha = 1.0 / (a_global_scale * b_global_scale)
-    if "fbgemm" in cfg and cfg["fbgemm"]:
-        if cfg["no_a_quant"]:
-            a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
-
-            def run():
-                return torch.ops.fbgemm.f4f4bf16(
-                    a_fp4,
-                    b_fp4,
-                    scale_a_fp4,
-                    scale_b_fp4,
-                    global_scale=alpha,
-                    use_mx=False,
-                )
-
-            return run
-        else:
-
-            def run():
-                a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
-                return torch.ops.fbgemm.f4f4bf16(
-                    a_fp4,
-                    b_fp4,
-                    scale_a_fp4,
-                    scale_b_fp4,
-                    global_scale=alpha,
-                    use_mx=False,
-                )
-
-            return run

    if cfg["no_a_quant"]:
        # Pre-quantize activation
@ -184,13 +130,10 @@ if __name__ == "__main__":

    for K, N, model in prepare_shapes(args):
        print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
-        save_dir = f"bench_nvfp4_res_n{N}_k{K}"
-        os.makedirs(save_dir, exist_ok=True)
-
        benchmark.run(
            print_data=True,
            show_plots=True,
-            save_path=save_dir,
+            save_path=f"bench_nvfp4_res_n{N}_k{K}",
            N=N,
            K=K,
        )
--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@ -2,25 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
 from typing import Callable
-from unittest.mock import patch

-import pandas as pd
 import torch

+from vllm import _custom_ops as ops
+from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
-
-
-def with_triton_mode(fn):
-    """Temporarily force the Triton fallback path"""
-
-    def wrapped(*args, **kwargs):
-        with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
-            return fn(*args, **kwargs)
-
-    return wrapped


 # TODO(luka): use standalone_compile utility
@ -32,238 +21,78 @@ def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
    return inner


-def bench_compile(fn: Callable):
-    # recompile for different shapes
-    fwd = torch.compile(fn, fullgraph=True, dynamic=False)
+torch._dynamo.config.recompile_limit = 8888
+compilation_config = CompilationConfig(custom_ops=["none"])
+with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)):
+    torch_per_token_quant_fp8 = torch.compile(
+        QuantFP8(False, GroupShape.PER_TOKEN),
+        fullgraph=True,
+        dynamic=False,  # recompile for different shapes
+    )

    # First dim is explicitly dynamic to simulate vLLM usage
-    return with_dyn_arg(fwd, 0, 0)
+    torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0)


-torch._dynamo.config.recompile_limit = 8888
+def cuda_per_token_quant_fp8(
+    input: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    return ops.scaled_fp8_quant(input)


-def calculate_diff(
-    batch_size: int,
-    hidden_size: int,
-    group_shape: GroupShape,
-    dtype: torch.dtype,
-):
-    """Calculate the difference between Inductor and CUDA implementations."""
+def calculate_diff(batch_size: int, seq_len: int):
+    """Calculate difference between Triton and CUDA implementations."""
    device = torch.device("cuda")
-    x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)
+    x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device)

-    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
+    torch_out, torch_scale = torch_per_token_quant_fp8(x)
+    cuda_out, cuda_scale = cuda_per_token_quant_fp8(x)

-    torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x)
-    torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
-    cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
-
-    try:
-        torch.testing.assert_close(
-            cuda_out.to(torch.float32),
-            torch_out.to(torch.float32),
-            rtol=1e-3,
-            atol=1e-5,
-        )
-        torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
-        torch.testing.assert_close(
-            cuda_out.to(torch.float32),
-            torch_eager_out.to(torch.float32),
-            rtol=1e-3,
-            atol=1e-5,
-        )
-        torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
+    if torch.allclose(
+        cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5
+    ) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5):
        print("✅ All implementations match")
-    except AssertionError as e:
+    else:
        print("❌ Implementations differ")
-        print(e)


-configs = []
+batch_size_range = [1, 16, 32, 64, 128]
+seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+
+configs = list(itertools.product(batch_size_range, seq_len_range))


-def benchmark_quantization(
-    batch_size,
-    hidden_size,
-    provider,
-    group_shape: GroupShape,
-    col_major: bool,
-    dtype: torch.dtype,
-):
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size", "seq_len"],
+        x_vals=configs,
+        line_arg="provider",
+        line_vals=["torch", "cuda"],
+        line_names=["Torch", "CUDA"],
+        styles=[("blue", "-"), ("green", "-")],
+        ylabel="us",
+        plot_name="per-token-dynamic-quant-fp8-performance",
+        args={},
+    )
+)
+def benchmark_quantization(batch_size, seq_len, provider):
+    dtype = torch.float16
    device = torch.device("cuda")

-    x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)
+    x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype)

    quantiles = [0.5, 0.2, 0.8]
-    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)

    if provider == "torch":
-        fn = lambda: bench_compile(quant_fp8.forward_native)(x.clone())
+        fn = lambda: torch_per_token_quant_fp8(x.clone())
    elif provider == "cuda":
-        fn = lambda: quant_fp8.forward_cuda(x.clone())
-    elif provider == "triton":
-        if not group_shape.is_per_group():
-            # Triton only supported for per-group
-            return 0, 0, 0
-
-        fn = lambda: with_triton_mode(quant_fp8.forward_cuda)(x.clone())
+        fn = lambda: cuda_per_token_quant_fp8(x.clone())

    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)

    return 1000 * ms, 1000 * max_ms, 1000 * min_ms


-# TODO(luka) extract to utils
-def compute_geomean_speedups(
-    df: pd.DataFrame,
-    baseline_col: str,
-    speedup_cols: list[str],
-    groupby_cols: list[str] | None = None,
-) -> pd.DataFrame:
-    """
-    Compute geometric mean speedups over a baseline column.
-
-    Args:
-        df: Input dataframe
-        baseline_col: Column to use as baseline
-        speedup_cols: Columns to compute speedups for
-        groupby_cols: Columns to group by. If None, compute over entire df.
-
-    Returns:
-        pd.DataFrame with geometric mean speedups
-    """
-    from scipy.stats import gmean
-
-    def geo_speedup(group: pd.DataFrame) -> pd.Series:
-        ratios = {
-            col: (group[baseline_col] / group[col]).values for col in speedup_cols
-        }
-        return pd.Series({col: gmean(vals) for col, vals in ratios.items()})
-
-    if groupby_cols is None:
-        result = geo_speedup(df).to_frame().T
-    else:
-        result = (
-            df.groupby(groupby_cols)
-            .apply(geo_speedup, include_groups=False)
-            .reset_index()
-        )
-
-    return result
-
-
 if __name__ == "__main__":
-    parser = FlexibleArgumentParser(
-        description="Benchmark the various implementations of QuantFP8 (dynamic-only)"
-    )
-    parser.add_argument("-c", "--check", action="store_true")
-    parser.add_argument(
-        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
-    )
-    parser.add_argument(
-        "--hidden-sizes",
-        type=int,
-        nargs="+",
-        default=[896, 1024, 2048, 4096, 7168],
-        help="Hidden sizes to benchmark",
-    )
-    parser.add_argument(
-        "--batch-sizes",
-        type=int,
-        nargs="+",
-        default=[1, 16, 128, 512, 1024],
-        help="Batch sizes to benchmark",
-    )
-    parser.add_argument(
-        "--group-sizes",
-        type=int,
-        nargs="+",
-        default=None,
-        help="Group sizes for GroupShape(1,N) to benchmark. "
-        "Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)",
-    )
-    parser.add_argument(
-        "--no-column-major",
-        action="store_true",
-        help="Disable column-major scales testing",
-    )
-
-    args = parser.parse_args()
-    assert args
-
-    dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
-
-    hidden_sizes = args.hidden_sizes
-    batch_sizes = args.batch_sizes
-
-    if args.group_sizes is not None:
-        group_shapes = []
-        for size in args.group_sizes:
-            if size == 0:
-                group_shapes.append(GroupShape.PER_TENSOR)
-            elif size == -1:
-                group_shapes.append(GroupShape.PER_TOKEN)
-            else:
-                group_shapes.append(GroupShape(1, size))
-    else:
-        group_shapes = [
-            GroupShape.PER_TENSOR,
-            GroupShape.PER_TOKEN,
-            GroupShape(1, 64),
-            GroupShape(1, 128),
-        ]
-
-    column_major_scales = [False] if args.no_column_major else [True, False]
-
-    config_gen = itertools.product(
-        group_shapes,
-        column_major_scales,
-        batch_sizes,
-        hidden_sizes,
-    )
-
-    # filter out column-major scales for non-group, reverse order
-    configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1]))
-
-    print(f"Running {len(configs)} configurations:")
-    print(f"  Hidden sizes: {hidden_sizes}")
-    print(f"  Batch sizes: {batch_sizes}")
-    print(f"  Group shapes: {[str(g) for g in group_shapes]}")
-    print(f"  Column major scales: {column_major_scales}")
-    print()
-
-    if args.check:
-        for group_shape in group_shapes:
-            group_size = group_shape[1]
-            print(f"{group_size=}")
-            calculate_diff(
-                batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype
-            )
-
-    benchmark = triton.testing.perf_report(
-        triton.testing.Benchmark(
-            x_names=["hidden_size", "batch_size", "col_major", "group_shape"],
-            x_vals=configs,
-            line_arg="provider",
-            line_vals=["torch", "cuda", "triton"],
-            line_names=["Torch (Compiled)", "CUDA", "Triton"],
-            styles=[("blue", "-"), ("green", "-"), ("black", "-")],
-            ylabel="us",
-            plot_name="QuantFP8 performance",
-            args={},
-        )
-    )(benchmark_quantization)
-
-    df = benchmark.run(print_data=True, dtype=dtype, return_df=True)
-
-    # Print geomean speedups
-    geo_table_grouped = compute_geomean_speedups(
-        df,
-        baseline_col="Torch (Compiled)",
-        speedup_cols=["CUDA", "Triton"],
-        groupby_cols=["col_major", "group_shape"],
-    )
-
-    print("Speedup over Torch (Compiled)")
-    print(geo_table_grouped.to_string(index=False))
+    calculate_diff(batch_size=4, seq_len=4096)
+    benchmark_quantization.run(print_data=True)
--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@ -13,10 +13,6 @@ import torch.utils.benchmark as benchmark

 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe.config import (
-    fp8_w8a8_moe_quant_config,
-    nvfp4_moe_quant_config,
-)
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.scalar_type import scalar_types
@ -144,12 +140,6 @@ def bench_run(
        a_fp8_scale: torch.Tensor,
        num_repeats: int,
    ):
-        quant_config = fp8_w8a8_moe_quant_config(
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            a1_scale=a_fp8_scale,
-        )
-
        for _ in range(num_repeats):
            fused_experts(
                a,
@ -157,7 +147,10 @@ def bench_run(
                w2,
                topk_weights,
                topk_ids,
-                quant_config=quant_config,
+                use_fp8_w8a8=True,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                a1_scale=a_fp8_scale,
            )

    def run_cutlass_moe_fp4(
@ -179,27 +172,25 @@ def bench_run(
        device: torch.device,
        num_repeats: int,
    ):
-        quant_config = nvfp4_moe_quant_config(
-            a1_gscale=a1_gs,
-            a2_gscale=a2_gs,
-            w1_scale=w1_blockscale,
-            w2_scale=w2_blockscale,
-            g1_alphas=w1_gs,
-            g2_alphas=w2_gs,
-        )
        for _ in range(num_repeats):
            with nvtx.annotate("cutlass_moe_fp4", color="green"):
                cutlass_moe_fp4(
                    a=a,
+                    a1_gscale=a1_gs,
+                    a2_gscale=a2_gs,
                    w1_fp4=w1_fp4,
+                    w1_blockscale=w1_blockscale,
+                    w1_alphas=w1_gs,
                    w2_fp4=w2_fp4,
+                    w2_blockscale=w2_blockscale,
+                    w2_alphas=w2_gs,
                    topk_weights=topk_weights,
                    topk_ids=topk_ids,
                    m=m,
                    n=n,
                    k=k,
                    e=num_experts,
-                    quant_config=quant_config,
+                    device=device,
                )

    def run_cutlass_from_graph(
@ -220,29 +211,26 @@ def bench_run(
        e: int,
        device: torch.device,
    ):
-        quant_config = nvfp4_moe_quant_config(
-            a1_gscale=a1_gs,
-            a2_gscale=a2_gs,
-            w1_scale=w1_blockscale,
-            w2_scale=w2_blockscale,
-            g1_alphas=w1_gs,
-            g2_alphas=w2_gs,
-        )
-
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
        ):
            return cutlass_moe_fp4(
                a=a,
+                a1_gscale=a1_gs,
                w1_fp4=w1_fp4,
+                w1_blockscale=w1_blockscale,
+                w1_alphas=w1_alphas,
+                a2_gscale=a2_gs,
                w2_fp4=w2_fp4,
+                w2_blockscale=w2_blockscale,
+                w2_alphas=w2_alphas,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                m=m,
                n=n,
                k=k,
                e=num_experts,
-                quant_config=quant_config,
+                device=device,
            )

    def run_triton_from_graph(
@ -258,18 +246,16 @@ def bench_run(
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
        ):
-            quant_config = fp8_w8a8_moe_quant_config(
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_fp8_scale,
-            )
            return fused_experts(
                a,
                w1,
                w2,
                topk_weights,
                topk_ids,
-                quant_config=quant_config,
+                use_fp8_w8a8=True,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                a1_scale=a_fp8_scale,
            )

    def replay_graph(graph, num_repeats):
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@ -1,406 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
-kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
-but use different quantization strategies and backends.
-"""
-
-import nvtx
-import torch
-
-from vllm import _custom_ops as ops
-from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
-from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
-from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
-
-# Weight shapes for different models: [num_experts, topk, hidden_size,
-# intermediate_size]
-WEIGHT_SHAPES_MOE = {
-    "mixtral-8x7b": [
-        [8, 2, 4096, 14336],
-    ],
-    "deepseek-v2": [
-        [160, 6, 5120, 12288],
-    ],
-    "custom-small": [
-        [8, 2, 2048, 7168],
-    ],
-    "glm45-fp8": [
-        [128, 8, 4096, 1408],
-    ],
-    "Llama-4-Maverick-17B-128E-Instruct-FP8": [
-        [128, 1, 5120, 8192],
-    ],
-}
-
-DEFAULT_MODELS = [
-    "mixtral-8x7b",
-]
-
-DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
-DEFAULT_TP_SIZES = [1]
-
-PER_ACT_TOKEN_OPTS = [False, True]
-PER_OUT_CH_OPTS = [False, True]
-
-FP8_DTYPE = current_platform.fp8_dtype()
-
-
-def bench_run(
-    results: list,
-    model: str,
-    num_experts: int,
-    topk: int,
-    per_act_token: bool,
-    per_out_ch: bool,
-    mkn: tuple[int, int, int],
-):
-    (m, k, n) = mkn
-
-    dtype = torch.half
-    device = "cuda"
-
-    # Create input activations
-    a = torch.randn((m, k), device=device, dtype=dtype) / 10
-
-    # Create weights
-    w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
-    w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10
-
-    # Create FP8 quantized weights and scales for both kernels
-    w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
-    w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)
-
-    # Create scales based on quantization strategy
-    if per_out_ch:
-        # Per-channel quantization
-        w1_scale = torch.empty(
-            (num_experts, 2 * n, 1), device=device, dtype=torch.float32
-        )
-        w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
-    else:
-        # Per-tensor quantization
-        w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
-        w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
-
-    # Quantize weights
-    for expert in range(num_experts):
-        if per_out_ch:
-            # Per-channel quantization - not yet implemented properly
-            # For now, fall back to per-tensor quantization
-            w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
-            w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
-            # Expand scalar scales to the expected per-channel shape
-            w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
-            w2_scale[expert] = w2_scale_temp.expand(k, 1)
-        else:
-            # Per-tensor quantization
-            w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
-            w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
-            # Store scalar scales in [1, 1] tensors
-            w1_scale[expert, 0, 0] = w1_scale_temp
-            w2_scale[expert, 0, 0] = w2_scale_temp
-
-    # Prepare weights for CUTLASS (no transpose needed)
-    w1_fp8q_cutlass = w1_fp8q  # Keep original [E, 2N, K]
-    w2_fp8q_cutlass = w2_fp8q  # Keep original [E, K, N]
-
-    # Create router scores and get topk
-    score = torch.randn((m, num_experts), device=device, dtype=dtype)
-    topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
-
-    # WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization
-    # Force per-tensor quantization for all cases to match working e2e setup
-    a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
-    a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
-
-    # Force per-tensor quantization for all cases
-    per_act_token = False
-
-    # Create stride tensors for CUTLASS
-    ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
-    ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
-    c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
-    c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
-
-    def run_triton_moe(
-        a: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        w1_scale: torch.Tensor,
-        w2_scale: torch.Tensor,
-        a1_scale: torch.Tensor,
-        a2_scale: torch.Tensor,
-        num_repeats: int,
-    ):
-        quant_config = fp8_w8a8_moe_quant_config(
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            a1_scale=a1_scale,
-            a2_scale=a2_scale,
-            per_act_token_quant=per_act_token,
-            per_out_ch_quant=per_out_ch,
-        )
-
-        for _ in range(num_repeats):
-            fused_experts(
-                a,
-                w1,
-                w2,
-                topk_weights,
-                topk_ids,
-                quant_config=quant_config,
-            )
-
-    def run_cutlass_moe_fp8(
-        a: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        ab_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides1: torch.Tensor,
-        c_strides2: torch.Tensor,
-        w1_scale: torch.Tensor,
-        w2_scale: torch.Tensor,
-        a1_scale: torch.Tensor,
-        a2_scale: torch.Tensor,
-        num_repeats: int,
-    ):
-        quant_config = fp8_w8a8_moe_quant_config(
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            a1_scale=a1_scale,
-            a2_scale=a2_scale,
-            per_act_token_quant=per_act_token,
-            per_out_ch_quant=per_out_ch,
-        )
-
-        for _ in range(num_repeats):
-            with nvtx.annotate("cutlass_moe_fp8", color="blue"):
-                cutlass_moe_fp8(
-                    a=a,
-                    w1_q=w1,
-                    w2_q=w2,
-                    topk_weights=topk_weights,
-                    topk_ids=topk_ids,
-                    ab_strides1=ab_strides1,
-                    ab_strides2=ab_strides2,
-                    c_strides1=c_strides1,
-                    c_strides2=c_strides2,
-                    quant_config=quant_config,
-                    activation="silu",
-                    global_num_experts=num_experts,
-                )
-
-    # Pre-create quantization config to avoid creating it inside CUDA graph
-    quant_config = fp8_w8a8_moe_quant_config(
-        w1_scale=w1_scale,
-        w2_scale=w2_scale,
-        a1_scale=a1_scale,
-        a2_scale=a2_scale,
-        per_act_token_quant=per_act_token,
-        per_out_ch_quant=per_out_ch,
-    )
-
-    # Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
-    cutlass_stream = torch.cuda.Stream()
-    cutlass_graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
-        # Capture 10 invocations like benchmark_moe.py
-        for _ in range(10):
-            cutlass_moe_fp8(
-                a=a,
-                w1_q=w1_fp8q_cutlass,
-                w2_q=w2_fp8q_cutlass,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                ab_strides1=ab_strides1,
-                ab_strides2=ab_strides2,
-                c_strides1=c_strides1,
-                c_strides2=c_strides2,
-                quant_config=quant_config,
-                activation="silu",
-                global_num_experts=num_experts,
-            )
-    torch.cuda.synchronize()
-
-    # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
-    triton_stream = torch.cuda.Stream()
-    triton_graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(triton_graph, stream=triton_stream):
-        # Capture 10 invocations like benchmark_moe.py
-        for _ in range(10):
-            fused_experts(
-                a,
-                w1_fp8q,
-                w2_fp8q,
-                topk_weights,
-                topk_ids,
-                quant_config=quant_config,
-            )
-    torch.cuda.synchronize()
-
-    def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
-        """Benchmark CUDA graph using events like benchmark_moe.py"""
-        # Warmup
-        for _ in range(num_warmup):
-            graph.replay()
-        torch.cuda.synchronize()
-
-        # Timing
-        start_event = torch.cuda.Event(enable_timing=True)
-        end_event = torch.cuda.Event(enable_timing=True)
-
-        latencies = []
-        for _ in range(num_iters):
-            torch.cuda.synchronize()
-            start_event.record()
-            graph.replay()
-            end_event.record()
-            end_event.synchronize()
-            latencies.append(start_event.elapsed_time(end_event))
-
-        # Divide by 10 since graph contains 10 calls
-        return sum(latencies) / (num_iters * 10)
-
-    # Benchmark parameters
-    num_warmup = 5
-    num_iters = 100
-
-    # Benchmark only CUDA graphs (more reliable and faster)
-    # Benchmark Triton MoE with CUDA graphs
-    triton_graph_time = bench_cuda_graph(
-        triton_graph, num_warmup=num_warmup, num_iters=num_iters
-    )
-
-    # Benchmark CUTLASS MoE with CUDA graphs
-    cutlass_graph_time = bench_cuda_graph(
-        cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
-    )
-
-    # Convert ms to us and return results
-    triton_time_us = triton_graph_time * 1000
-    cutlass_time_us = cutlass_graph_time * 1000
-
-    return {
-        "batch_size": m,
-        "triton_time_us": triton_time_us,
-        "cutlass_time_us": cutlass_time_us,
-    }
-
-
-def main(args):
-    print("Benchmarking models:")
-    for i, model in enumerate(args.models):
-        print(f"[{i}]  {model}")
-
-    all_results = []
-
-    for model in args.models:
-        for tp in args.tp_sizes:
-            for layer in WEIGHT_SHAPES_MOE[model]:
-                num_experts = layer[0]
-                topk = layer[1]
-                size_k = layer[2]
-                size_n = layer[3] // tp
-
-                if len(args.limit_k) > 0 and size_k not in args.limit_k:
-                    continue
-
-                if len(args.limit_n) > 0 and size_n not in args.limit_n:
-                    continue
-
-                for per_act_token in args.per_act_token_opts:
-                    for per_out_ch in args.per_out_ch_opts:
-                        print(
-                            f"\n=== {model}, experts={num_experts}, topk={topk},"
-                            f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
-                        )
-
-                        config_results = []
-                        for size_m in args.batch_sizes:
-                            mkn = (size_m, size_k, size_n)
-                            result = bench_run(
-                                [],  # Not used anymore
-                                model,
-                                num_experts,
-                                topk,
-                                per_act_token,
-                                per_out_ch,
-                                mkn,
-                            )
-                            if result:
-                                config_results.append(result)
-
-                        # Print results table for this configuration
-                        if config_results:
-                            print(
-                                f"\n{'Batch Size':<12}"
-                                f"{'Triton (us)':<15}"
-                                f"{'CUTLASS (us)':<15}"
-                            )
-                            print("-" * 45)
-                            for result in config_results:
-                                print(
-                                    f"{result['batch_size']:<12}"
-                                    f"{result['triton_time_us']:<15.2f}"
-                                    f"{result['cutlass_time_us']:<15.2f}"
-                                )
-
-                            all_results.extend(config_results)
-
-    print(f"\nTotal benchmarks completed: {len(all_results)}")
-
-
-if __name__ == "__main__":
-    parser = FlexibleArgumentParser(
-        description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
-         across specified models/shapes/batches
-
-        Example usage:
-        python benchmark_cutlass_moe_fp8.py  \
-            --model "Llama-4-Maverick-17B-128E-Instruct-FP8"  \
-            --tp-sizes 8 \
-            --batch-size 2 4 8  \
-            --per-act-token-opts false \
-            --per-out-ch-opts false
-
-        """
-    )
-    parser.add_argument(
-        "--models",
-        nargs="+",
-        type=str,
-        default=DEFAULT_MODELS,
-        choices=WEIGHT_SHAPES_MOE.keys(),
-    )
-    parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
-    parser.add_argument(
-        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
-    )
-    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
-    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
-    parser.add_argument(
-        "--per-act-token-opts",
-        nargs="+",
-        type=lambda x: x.lower() == "true",
-        default=[False, True],
-        help="Per-activation token quantization options (true/false)",
-    )
-    parser.add_argument(
-        "--per-out-ch-opts",
-        nargs="+",
-        type=lambda x: x.lower() == "true",
-        default=[False, True],
-        help="Per-output channel quantization options (true/false)",
-    )
-
-    args = parser.parse_args()
-    main(args)
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@ -7,10 +7,6 @@ Benchmark script for device communicators:
 CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
 and SymmMemCommunicator (multimem, two-shot).

-for NCCL symmetric memory you need to set the environment variables
-NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1, otherwise NCCL does
-not use fast NVLS implementation for all reduce.
-
 Usage:
    torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]

@ -30,13 +26,7 @@ import torch.distributed as dist
 from torch.distributed import ProcessGroup

 from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
-from vllm.distributed.device_communicators.pynccl import (
-    PyNcclCommunicator,
-    register_nccl_symmetric_ops,
-)
-from vllm.distributed.device_communicators.pynccl_allocator import (
-    set_graph_pool_id,
-)
+from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 from vllm.logger import init_logger
 from vllm.utils import FlexibleArgumentParser
@ -108,7 +98,6 @@ class CommunicatorBenchmark:
            )
            if not self.pynccl_comm.disabled:
                logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
-                register_nccl_symmetric_ops(self.pynccl_comm)
            else:
                logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
                self.pynccl_comm = None
@ -205,15 +194,6 @@ class CommunicatorBenchmark:
                    None,  # no env variable needed
                )
            )
-            communicators.append(
-                (
-                    "pynccl-symm",
-                    lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
-                    lambda t: True,  # Always available if initialized
-                    nullcontext(),
-                    None,  # no env variable needed
-                )
-            )

        if self.symm_mem_comm_multimem is not None:
            comm = self.symm_mem_comm_multimem
@ -291,9 +271,7 @@ class CommunicatorBenchmark:
                # Capture the graph using context manager
                with context:
                    graph = torch.cuda.CUDAGraph()
-                    graph_pool = torch.cuda.graph_pool_handle()
-                    set_graph_pool_id(graph_pool)
-                    with torch.cuda.graph(graph, pool=graph_pool):
+                    with torch.cuda.graph(graph):
                        for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
                            allreduce_fn(graph_input)

--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@ -7,7 +7,6 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE

 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import (
    fused_experts,
@ -97,11 +96,6 @@ def bench_run(
        a_scale: torch.Tensor,
        num_repeats: int,
    ):
-        quant_config = fp8_w8a8_moe_quant_config(
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            a1_scale=a_scale,
-        )
        for _ in range(num_repeats):
            fused_experts(
                a,
@ -109,7 +103,10 @@ def bench_run(
                w2,
                topk_weights,
                topk_ids,
-                quant_config=quant_config,
+                use_fp8_w8a8=True,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                a1_scale=a_scale,
            )

    def run_cutlass_moe(
@ -128,12 +125,6 @@ def bench_run(
        per_act_token: bool,
        num_repeats: int,
    ):
-        quant_config = fp8_w8a8_moe_quant_config(
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            per_act_token_quant=per_act_token,
-        )
-
        for _ in range(num_repeats):
            cutlass_moe_fp8(
                a,
@ -141,11 +132,14 @@ def bench_run(
                w2,
                topk_weights,
                topk_ids,
+                w1_scale,
+                w2_scale,
                ab_strides1,
                ab_strides2,
                c_strides1,
                c_strides2,
-                quant_config=quant_config,
+                per_act_token,
+                a1_scale=None,
            )

    def run_cutlass_from_graph(
@ -162,12 +156,6 @@ def bench_run(
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
    ):
-        quant_config = fp8_w8a8_moe_quant_config(
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            per_act_token_quant=per_act_token,
-        )
-
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
        ):
@ -177,11 +165,14 @@ def bench_run(
                w2_q,
                topk_weights,
                topk_ids,
+                w1_scale,
+                w2_scale,
                ab_strides1,
                ab_strides2,
                c_strides1,
                c_strides2,
-                quant_config=quant_config,
+                per_act_token,
+                a1_scale=None,
            )

    def run_triton_from_graph(
@ -194,11 +185,6 @@ def bench_run(
        w2_scale: torch.Tensor,
        a_scale: torch.Tensor,
    ):
-        quant_config = fp8_w8a8_moe_quant_config(
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            a1_scale=a_scale,
-        )
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
        ):
@ -208,7 +194,10 @@ def bench_run(
                w2,
                topk_weights,
                topk_ids,
-                quant_config=quant_config,
+                use_fp8_w8a8=True,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                a1_scale=a_scale,
            )

    def replay_graph(graph, num_repeats):
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@ -79,9 +79,9 @@ def make_rand_lora_weight_tensor(


 def make_rand_tensors(
-    a_shape: tuple[int, ...],
-    b_shape: tuple[int, ...],
-    c_shape: tuple[int, ...],
+    a_shape: tuple[int],
+    b_shape: tuple[int],
+    c_shape: tuple[int],
    a_dtype: torch.dtype,
    b_dtype: torch.dtype,
    c_dtype: torch.dtype,
@ -243,7 +243,7 @@ class OpType(Enum):
        lora_rank: int,
        num_loras: int,
        num_slices: int,
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+    ) -> tuple[tuple[int], tuple[int], tuple[int]]:
        """
        Given num_slices, return the shapes of the A, B, and C matrices
        in A x B = C, for the op_type
@ -464,11 +464,7 @@ class BenchmarkTensors:
        for field_name in LoRAKernelMeta.__dataclass_fields__:
            field = getattr(self.lora_kernel_meta, field_name)
            assert isinstance(field, torch.Tensor)
-            setattr(
-                self.lora_kernel_meta,
-                field_name,
-                to_device(field) if field_name != "no_lora_flag_cpu" else field,
-            )
+            setattr(self.lora_kernel_meta, field_name, to_device(field))

    def metadata(self) -> tuple[int, int, int]:
        """
@ -516,7 +512,6 @@ class BenchmarkTensors:
            "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc,
            "lora_ids": self.lora_kernel_meta.active_lora_ids,
            "scaling": 1.0,
-            "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
        }

    def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
@ -557,7 +552,6 @@ class BenchmarkTensors:
            "lora_ids": self.lora_kernel_meta.active_lora_ids,
            "offset_start": 0,
            "add_inputs": add_inputs,
-            "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
        }

    def bench_fn_kwargs(
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -14,10 +14,6 @@ import ray
 import torch
 from ray.experimental.tqdm_ray import tqdm

-from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEQuantConfig,
-    _get_config_dtype_str,
-)
 from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
@ -138,36 +134,43 @@ def benchmark_config(
    def run():
        from vllm.model_executor.layers.fused_moe import override_config

-        if use_fp8_w8a8:
-            quant_dtype = torch.float8_e4m3fn
-        elif use_int8_w8a16:
-            quant_dtype = torch.int8
-        else:
-            quant_dtype = None
-
-        quant_config = FusedMoEQuantConfig.make(
-            quant_dtype=quant_dtype,
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            a1_scale=a1_scale,
-            a2_scale=a2_scale,
-            block_shape=block_quant_shape,
-        )
-
        with override_config(config):
-            topk_weights, topk_ids, token_expert_indices = fused_topk(
-                x, input_gating, topk, renormalize=not use_deep_gemm
-            )
-            return fused_experts(
-                x,
-                w1,
-                w2,
-                topk_weights,
-                topk_ids,
-                inplace=True,
-                quant_config=quant_config,
-                allow_deep_gemm=use_deep_gemm,
-            )
+            if use_deep_gemm:
+                topk_weights, topk_ids, token_expert_indices = fused_topk(
+                    x, input_gating, topk, False
+                )
+                return fused_experts(
+                    x,
+                    w1,
+                    w2,
+                    topk_weights,
+                    topk_ids,
+                    inplace=True,
+                    use_fp8_w8a8=use_fp8_w8a8,
+                    w1_scale=w1_scale,
+                    w2_scale=w2_scale,
+                    a1_scale=a1_scale,
+                    a2_scale=a2_scale,
+                    block_shape=block_quant_shape,
+                    allow_deep_gemm=True,
+                )
+            else:
+                fused_moe(
+                    x,
+                    w1,
+                    w2,
+                    input_gating,
+                    topk,
+                    renormalize=True,
+                    inplace=True,
+                    use_fp8_w8a8=use_fp8_w8a8,
+                    use_int8_w8a16=use_int8_w8a16,
+                    w1_scale=w1_scale,
+                    w2_scale=w2_scale,
+                    a1_scale=a1_scale,
+                    a2_scale=a2_scale,
+                    block_shape=block_quant_shape,
+                )

    # JIT compilation & warmup
    run()
@ -411,7 +414,7 @@ class BenchmarkWorker:
        use_deep_gemm: bool = False,
    ) -> tuple[dict[str, int], float]:
        current_platform.seed_everything(self.seed)
-        dtype_str = _get_config_dtype_str(
+        dtype_str = get_config_dtype_str(
            dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
        )
        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
@ -544,7 +547,7 @@ def save_configs(
    block_quant_shape: list[int],
    save_dir: str,
 ) -> None:
-    dtype_str = _get_config_dtype_str(
+    dtype_str = get_config_dtype_str(
        dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
    )

@ -557,7 +560,7 @@ def save_configs(
    filename = os.path.join(save_dir, filename)
    print(f"Writing best config to {filename}...")
    with open(filename, "w") as f:
-        json.dump({"triton_version": triton.__version__, **configs}, f, indent=4)
+        json.dump(configs, f, indent=4)
        f.write("\n")


@ -584,9 +587,8 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
    elif config.architectures[0] in (
-        "DeepseekV2ForCausalLM",
        "DeepseekV3ForCausalLM",
-        "DeepseekV32ForCausalLM",
+        "DeepseekV2ForCausalLM",
        "Glm4MoeForCausalLM",
    ):
        E = config.n_routed_experts
--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@ -1,174 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-import random
-import time
-
-import torch
-from tabulate import tabulate
-
-from vllm import _custom_ops as ops
-from vllm.logger import init_logger
-from vllm.platforms import current_platform
-from vllm.utils import (
-    STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
-    create_kv_caches_with_random,
-)
-
-logger = init_logger(__name__)
-
-
-@torch.inference_mode()
-def run_benchmark(
-    num_tokens: int,
-    num_heads: int,
-    head_size: int,
-    block_size: int,
-    num_blocks: int,
-    dtype: torch.dtype,
-    kv_cache_dtype: str,
-    num_iters: int,
-    benchmark_mode: str,
-    device: str = "cuda",
-) -> float:
-    """Return latency (seconds) for given num_tokens."""
-
-    if kv_cache_dtype == "fp8" and head_size % 16:
-        raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
-
-    current_platform.seed_everything(42)
-    torch.set_default_device(device)
-
-    # create random key / value tensors [T, H, D].
-    key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
-    value = torch.randn_like(key)
-
-    # prepare the slot mapping.
-    # each token is assigned a unique slot in the KV-cache.
-    num_slots = block_size * num_blocks
-    if num_tokens > num_slots:
-        raise ValueError("num_tokens cannot exceed the total number of cache slots")
-    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
-
-    key_caches, value_caches = create_kv_caches_with_random(
-        num_blocks,
-        block_size,
-        1,  # num_layers
-        num_heads,
-        head_size,
-        kv_cache_dtype,
-        dtype,
-        device=device,
-    )
-    key_cache, value_cache = key_caches[0], value_caches[0]
-    # to free unused memory
-    del key_caches, value_caches
-
-    # compute per-kernel scaling factors for fp8 conversion (if used).
-    k_scale = (key.amax() / 64.0).to(torch.float32)
-    v_scale = (value.amax() / 64.0).to(torch.float32)
-
-    function_under_test = lambda: ops.reshape_and_cache(
-        key,  # noqa: F821
-        value,  # noqa: F821
-        key_cache,  # noqa: F821
-        value_cache,  # noqa: F821
-        slot_mapping,  # noqa: F821
-        kv_cache_dtype,
-        k_scale,
-        v_scale,
-    )
-
-    if benchmark_mode == "cudagraph":
-        g = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(g):
-            function_under_test()
-        torch.cuda.synchronize()
-        function_under_test = lambda: g.replay()
-
-    def run_cuda_benchmark(n_iters: int) -> float:
-        nonlocal key, value, key_cache, value_cache, slot_mapping
-        torch.cuda.synchronize()
-        start = time.perf_counter()
-        for _ in range(n_iters):
-            function_under_test()
-            torch.cuda.synchronize()
-        end = time.perf_counter()
-        return (end - start) / n_iters
-
-    # warm-up
-    run_cuda_benchmark(3)
-
-    lat = run_cuda_benchmark(num_iters)
-
-    # free tensors to mitigate OOM when sweeping
-    del key, value, key_cache, value_cache, slot_mapping
-    torch.cuda.empty_cache()
-
-    return lat
-
-
-def main(args):
-    rows = []
-    for exp in range(1, 17):
-        n_tok = 2**exp
-        lat = run_benchmark(
-            num_tokens=n_tok,
-            num_heads=args.num_heads,
-            head_size=args.head_size,
-            block_size=args.block_size,
-            num_blocks=args.num_blocks,
-            dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
-            kv_cache_dtype=args.kv_cache_dtype,
-            num_iters=args.iters,
-            benchmark_mode=args.mode,
-            device="cuda",
-        )
-        rows.append([n_tok, lat * 1e6])  # convert to microseconds
-
-    print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
-    print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
-
-
-if __name__ == "__main__":
-    parser = FlexibleArgumentParser()
-
-    parser.add_argument("--num-heads", type=int, default=128)
-    parser.add_argument(
-        "--head-size",
-        type=int,
-        choices=[64, 80, 96, 112, 120, 128, 192, 256],
-        default=128,
-    )
-    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
-    parser.add_argument("--num-blocks", type=int, default=128 * 128)
-
-    parser.add_argument(
-        "--dtype",
-        type=str,
-        choices=["half", "bfloat16", "float"],
-        default="bfloat16",
-    )
-
-    parser.add_argument(
-        "--kv-cache-dtype",
-        type=str,
-        choices=["auto", "fp8"],
-        default="auto",
-    )
-
-    parser.add_argument("--iters", type=int, default=200)
-
-    parser.add_argument(
-        "--mode",
-        type=str,
-        choices=["cudagraph", "no_graph"],
-        default="cudagraph",
-    )
-
-    args = parser.parse_args()
-
-    main(args)
--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@ -9,9 +9,6 @@ import torch
 from tabulate import tabulate

 from vllm import _custom_ops as ops
-from vllm.attention.ops.triton_reshape_and_cache_flash import (
-    triton_reshape_and_cache_flash,
-)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils import (
@ -34,8 +31,6 @@ def run_benchmark(
    kv_cache_dtype: str,
    kv_cache_layout: str,
    num_iters: int,
-    implementation: str,
-    benchmark_mode: str,
    device: str = "cuda",
 ) -> float:
    """Return latency (seconds) for given num_tokens."""
@ -43,14 +38,6 @@ def run_benchmark(
    if kv_cache_dtype == "fp8" and head_size % 16:
        raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")

-    if implementation not in ("cuda", "triton"):
-        raise ValueError(
-            f"Unsupported implementation: {implementation}. "
-            "Only 'cuda' and 'triton' are supported."
-        )
-    if implementation == "triton" and kv_cache_layout == "HND":
-        return float("nan")  # Triton does not support HND layout yet.
-
    current_platform.seed_everything(42)
    torch.set_default_device(device)

@ -78,49 +65,27 @@ def run_benchmark(
        cache_layout=kv_cache_layout,
    )
    key_cache, value_cache = key_caches[0], value_caches[0]
-    # to free unused memory
-    del key_caches, value_caches

    # compute per-kernel scaling factors for fp8 conversion (if used).
    k_scale = (key.amax() / 64.0).to(torch.float32)
    v_scale = (value.amax() / 64.0).to(torch.float32)

-    if implementation == "cuda":
-        function_under_test = lambda: ops.reshape_and_cache_flash(
-            key,  # noqa: F821
-            value,  # noqa: F821
-            key_cache,  # noqa: F821
-            value_cache,  # noqa: F821
-            slot_mapping,  # noqa: F821
-            kv_cache_dtype,
-            k_scale,
-            v_scale,
-        )
-    else:
-        function_under_test = lambda: triton_reshape_and_cache_flash(
-            key,  # noqa: F821
-            value,  # noqa: F821
-            key_cache,  # noqa: F821
-            value_cache,  # noqa: F821
-            slot_mapping,  # noqa: F821
-            kv_cache_dtype,
-            k_scale,
-            v_scale,
-        )
-    if benchmark_mode == "cudagraph":
-        g = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(g):
-            function_under_test()
-        torch.cuda.synchronize()
-        function_under_test = lambda: g.replay()
-
    def run_cuda_benchmark(n_iters: int) -> float:
        nonlocal key, value, key_cache, value_cache, slot_mapping
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(n_iters):
-            function_under_test()
-            torch.cuda.synchronize()
+            ops.reshape_and_cache_flash(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                slot_mapping,
+                kv_cache_dtype,
+                k_scale,
+                v_scale,
+            )
+        torch.cuda.synchronize()
        end = time.perf_counter()
        return (end - start) / n_iters

@ -151,16 +116,10 @@ def main(args):
                kv_cache_dtype=args.kv_cache_dtype,
                kv_cache_layout=layout,
                num_iters=args.iters,
-                implementation=args.implementation,
-                benchmark_mode=args.mode,
                device="cuda",
            )
            rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])

-    print(
-        f"Benchmark results for implementation {args.implementation}"
-        f" (measuring with {args.mode}):"
-    )
    print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))


@ -192,21 +151,6 @@ if __name__ == "__main__":
    )

    parser.add_argument("--iters", type=int, default=100)
-
-    parser.add_argument(
-        "--implementation",
-        type=str,
-        choices=["cuda", "triton"],
-        default="cuda",
-    )
-
-    parser.add_argument(
-        "--mode",
-        type=str,
-        choices=["cudagraph", "no_graph"],
-        default="cudagraph",
-    )
-
    args = parser.parse_args()

    main(args)
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@ -1,675 +1,77 @@
+#!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Callable
+import time

-import matplotlib.pyplot as plt
-import numpy as np
 import torch

 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    silu_mul_fp8_quant_deep_gemm_cuda,
+    silu_mul_fp8_quant_deep_gemm,
 )
 from vllm.platforms import current_platform
-from vllm.triton_utils import tl, triton
-from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used


-@triton.jit
-def _silu_mul_fp8_quant_deep_gemm(
-    # Pointers ------------------------------------------------------------
-    input_ptr,  # 16-bit activations (E, T, 2*H)
-    y_q_ptr,  # fp8 quantized activations (E, T, H)
-    y_s_ptr,  # 16-bit scales (E, T, G)
-    counts_ptr,  # int32 num tokens per expert (E)
-    # Sizes ---------------------------------------------------------------
-    H: tl.constexpr,  # hidden dimension (per output)
-    GROUP_SIZE: tl.constexpr,  # elements per group (usually 128)
-    # Strides for input (elements) ---------------------------------------
-    stride_i_e,
-    stride_i_t,
-    stride_i_h,
-    # Strides for y_q (elements) -----------------------------------------
-    stride_yq_e,
-    stride_yq_t,
-    stride_yq_h,
-    # Strides for y_s (elements) -----------------------------------------
-    stride_ys_e,
-    stride_ys_t,
-    stride_ys_g,
-    # Stride for counts (elements)
-    stride_counts_e,
-    # Numeric params ------------------------------------------------------
-    eps: tl.constexpr,
-    fp8_min: tl.constexpr,
-    fp8_max: tl.constexpr,
-    use_ue8m0: tl.constexpr,
-    # Meta ---------------------------------------------------------------
-    BLOCK: tl.constexpr,
-    NUM_STAGES: tl.constexpr,
-):
-    G = H // GROUP_SIZE
-
-    # map program id -> (e, g)
-    pid = tl.program_id(0)
-    e = pid // G
-    g = pid % G
-
-    e = e.to(tl.int64)
-    g = g.to(tl.int64)
-
-    # number of valid tokens for this expert
-    n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64)
-
-    cols = tl.arange(0, BLOCK).to(tl.int64)
-    mask = cols < BLOCK
-
-    base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h
-    base_gate_offset = base_input_offset + cols * stride_i_h
-    base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h
-    base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h
-    base_ys_offset = e * stride_ys_e + g * stride_ys_g
-
-    for t in tl.range(0, n_tokens, num_stages=NUM_STAGES):
-        gate = tl.load(
-            input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0
-        ).to(tl.float32)
-        up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0)
-
-        gate = gate * (1.0 / (1.0 + tl.exp(-gate)))
-        y = gate * up
-
-        y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max
-        if use_ue8m0:
-            y_s = tl.exp2(tl.ceil(tl.log2(y_s)))
-
-        y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
-
-        tl.store(y_q_ptr + base_yq_offset + t * stride_yq_t, y_q, mask=mask)
-        tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s)
-
-
-def silu_mul_fp8_quant_deep_gemm_triton(
-    y: torch.Tensor,  # (E, T, 2*H)
-    tokens_per_expert: torch.Tensor,  # (E,) number of valid tokens per expert
-    num_parallel_tokens,
-    group_size: int = 128,
-    eps: float = 1e-10,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
-
-    y has shape (E, T, 2*H). The first half of the last dimension is
-    silu-activated, multiplied by the second half, then quantized into FP8.
-
-    Returns `(y_q, y_s)` where
-    * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H]
-    * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
-    """
-    assert y.ndim == 3, "y must be (E, T, 2*H)"
-    E, T, H2 = y.shape
-    assert H2 % 2 == 0, "last dim of y must be even (2*H)"
-    H = H2 // 2
-    G = (H + group_size - 1) // group_size
-    assert H % group_size == 0, "H must be divisible by group_size"
-    assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, (
-        "tokens_per_expert must be shape (E,)"
+def benchmark(E, T, H, G=128, runs=50):
+    current_platform.seed_everything(42)
+    y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
+    tokens_per_expert = torch.randint(
+        T // 2, T, size=(E,), dtype=torch.int32, device="cuda"
    )
-    tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32)
-
-    # allocate outputs
-    fp8_dtype = torch.float8_e4m3fn
-    y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
-
-    # strides (elements)
-    stride_i_e, stride_i_t, stride_i_h = y.stride()
-    stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()
-
-    # desired scale strides (elements): (T*G, 1, T)
-    stride_ys_e = T * G
-    stride_ys_t = 1
-    stride_ys_g = T
-    y_s = torch.empty_strided(
-        (E, T, G),
-        (stride_ys_e, stride_ys_t, stride_ys_g),
-        dtype=torch.float32,
-        device=y.device,
-    )
-
-    stride_cnt_e = tokens_per_expert.stride()[0]
-
-    # Static grid over experts and H-groups.
-    # A loop inside the kernel handles the token dim
-    grid = (E * G,)
-
-    f_info = torch.finfo(fp8_dtype)
-    fp8_max = f_info.max
-    fp8_min = f_info.min
-
-    _silu_mul_fp8_quant_deep_gemm[grid](
-        y,
-        y_q,
-        y_s,
-        tokens_per_expert,
-        H,
-        group_size,
-        stride_i_e,
-        stride_i_t,
-        stride_i_h,
-        stride_yq_e,
-        stride_yq_t,
-        stride_yq_h,
-        stride_ys_e,
-        stride_ys_t,
-        stride_ys_g,
-        stride_cnt_e,
-        eps,
-        fp8_min,
-        fp8_max,
-        is_deep_gemm_e8m0_used(),
-        BLOCK=group_size,
-        NUM_STAGES=4,
-        num_warps=1,
-    )
-
-    return y_q, y_s
-
-
-# Parse generation strategies
-strategies = ["uniform", "max_t", "first_t"]
-
-
-def benchmark(
-    kernel: Callable,
-    E: int,
-    T: int,
-    H: int,
-    total_tokens: int,
-    num_parallel_tokens: int = 64,
-    G: int = 128,
-    runs: int = 200,
-    num_warmups: int = 20,
-    gen_strategy: str = "default",
-    iterations_per_run: int = 20,
-):
-    def generate_data(seed_offset=0):
-        """Generate input data with given seed offset"""
-        current_platform.seed_everything(42 + seed_offset)
-        y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
-
-        if gen_strategy == "uniform":
-            r = torch.rand(size=(E,), device="cuda")
-            r /= r.sum()
-            r *= total_tokens
-            tokens_per_expert = r.int()
-            tokens_per_expert = torch.minimum(
-                tokens_per_expert,
-                torch.ones((E,), device=r.device, dtype=torch.int) * T,
-            )
-        elif gen_strategy == "max_t":
-            tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
-            tokens_per_expert.fill_(total_tokens / E)
-        elif gen_strategy == "first_t":
-            tokens_per_expert = torch.zeros(size=(E,), dtype=torch.int32, device="cuda")
-            tokens_per_expert[0] = min(T, total_tokens)
-        else:
-            raise ValueError(f"Unknown generation strategy: {gen_strategy}")
-        return y, tokens_per_expert
-
-    dataset_count = 4
-    # Pre-generate different input matrices for each iteration to avoid cache effects
-    data_sets = [generate_data(i) for i in range(dataset_count)]

    # Warmup
-    y, tokens_per_expert = data_sets[0]
-    for _ in range(num_warmups):
-        kernel(
-            y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
-        )
-    torch.cuda.synchronize()
-
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-
-    # Benchmark
-    latencies: list[float] = []
-    for _ in range(runs):
+    for _ in range(10):
+        silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
        torch.cuda.synchronize()

-        start_event.record()
-        for i in range(iterations_per_run):
-            y, tokens_per_expert = data_sets[i % dataset_count]
-            kernel(
-                y,
-                tokens_per_expert,
-                num_parallel_tokens=num_parallel_tokens,
-                group_size=G,
-            )
-        end_event.record()
-        end_event.synchronize()
+    # Benchmark
+    torch.cuda.synchronize()
+    start = time.perf_counter()
+    for _ in range(runs):
+        silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
+    torch.cuda.synchronize()

-        total_time_ms = start_event.elapsed_time(end_event)
-        per_iter_time_ms = total_time_ms / iterations_per_run
-        latencies.append(per_iter_time_ms)
+    avg_time = (time.perf_counter() - start) / runs * 1000

-    # Use median instead of average for better outlier handling
-    median_time_ms = np.median(latencies)
-    median_time_s = median_time_ms / 1000
-
-    # Calculate actual work done (using first dataset for consistency)
-    _, tokens_per_expert = data_sets[0]
+    # Calculate actual work done (only count valid tokens)
    actual_tokens = tokens_per_expert.sum().item()
    actual_elements = actual_tokens * H

    # GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops
    ops_per_element = 8
    total_ops = actual_elements * ops_per_element
-    gflops = total_ops / median_time_s / 1e9
+    gflops = total_ops / (avg_time / 1000) / 1e9

    # Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes)
    input_bytes = actual_tokens * 2 * H * 2  # 2*H bfloat16 inputs
    output_bytes = actual_tokens * H * 1  # H fp8 outputs
    scale_bytes = actual_tokens * (H // G) * 4  # scales in float32
    total_bytes = input_bytes + output_bytes + scale_bytes
-    memory_bw = total_bytes / median_time_s / 1e9
+    memory_bw = total_bytes / (avg_time / 1000) / 1e9

-    HOPPER_BANDWIDTH_TBPS = 3.35
-    return (
-        median_time_ms,
-        gflops,
-        memory_bw,
-        (memory_bw / (HOPPER_BANDWIDTH_TBPS * 1024)) * 100,
-    )
+    return avg_time, gflops, memory_bw


-def create_comparison_plot(
-    ratio, cuda_times, baseline_times, config_labels, strategy_name, id
-):
-    """Create a comparison plot for a specific generation strategy"""
-    fig, ax = plt.subplots(1, 1, figsize=(16, 6))
-
-    # Configure x-axis positions
-    x = np.arange(len(config_labels))
-    width = 0.35
-
-    # Execution Time plot (lower is better)
-    ax.bar(
-        x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
-    )
-    ax.bar(
-        x + width / 2,
-        baseline_times,
-        width,
-        label="Baseline",
-        alpha=0.8,
-        color="orange",
-    )
-
-    # Add speedup labels over each bar pair
-    for i in range(len(x)):
-        speedup = ratio[i]
-        max_height = max(cuda_times[i], baseline_times[i])
-        ax.text(
-            x[i],
-            max_height + max_height * 0.02,
-            f"{speedup:.2f}x",
-            ha="center",
-            va="bottom",
-            fontweight="bold",
-            fontsize=9,
-        )
-
-    ax.set_xlabel("Configuration")
-    ax.set_ylabel("% Utilization")
-    ax.set_title(
-        f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
-    )
-    ax.set_xticks(x)
-    ax.set_xticklabels(config_labels, rotation=45, ha="right")
-    ax.legend()
-    ax.grid(True, alpha=0.3)
-
-    plt.tight_layout()
-    return fig, ax
-
-
-def create_combined_plot(all_results):
-    """Create a combined plot with all strategies in one PNG"""
-    num_strategies = len(all_results)
-    fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
-
-    if num_strategies == 1:
-        axes = [axes]
-
-    for idx, (
-        strategy_name,
-        ratio,
-        cuda_times,
-        baseline_times,
-        config_labels,
-    ) in enumerate(all_results):
-        ax = axes[idx]
-
-        # Configure x-axis positions
-        x = np.arange(len(config_labels))
-        width = 0.35
-
-        # Execution Time plot (lower is better)
-        ax.bar(
-            x - width / 2,
-            cuda_times,
-            width,
-            label="CUDA Kernel",
-            alpha=0.8,
-            color="blue",
-        )
-        ax.bar(
-            x + width / 2,
-            baseline_times,
-            width,
-            label="Baseline",
-            alpha=0.8,
-            color="orange",
-        )
-
-        # Add speedup labels over each bar pair
-        for i in range(len(x)):
-            speedup = ratio[i]
-            max_height = max(cuda_times[i], baseline_times[i])
-            ax.text(
-                x[i],
-                max_height + max_height * 0.02,
-                f"{speedup:.2f}x",
-                ha="center",
-                va="bottom",
-                fontweight="bold",
-                fontsize=9,
-            )
-
-        ax.set_xlabel("Configuration")
-        ax.set_ylabel("% Utilization")
-        ax.set_title(
-            f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
-        )
-        ax.set_xticks(x)
-        ax.set_xticklabels(config_labels, rotation=45, ha="right")
-        ax.legend()
-        ax.grid(True, alpha=0.3)
-
-    plt.tight_layout()
-    filename = "../../silu_bench/silu_benchmark_combined.png"
-    plt.savefig(filename, dpi=300, bbox_inches="tight")
-    plt.show()
-
-    return filename
-
-
-outer_dim = 7168
 configs = [
+    (8, 32, 1024),
+    (16, 64, 2048),
+    (32, 128, 4096),
    # DeepSeekV3 Configs
-    (8, 1024, 7168),
-    # DeepSeekV3 Configs
-    (32, 1024, 7168),
-    # DeepSeekV3 Configs
+    (256, 16, 7168),
+    (256, 32, 7168),
+    (256, 64, 7168),
+    (256, 128, 7168),
+    (256, 256, 7168),
+    (256, 512, 7168),
    (256, 1024, 7168),
 ]

-runs = 100
-num_warmups = 20
-
-strategy_descriptions = {
-    "uniform": "Uniform Random",
-    "max_t": "Even Assignment",
-    "first_t": "experts[0] = T, experts[1:] = 0",
-}
-
 print(f"GPU: {torch.cuda.get_device_name()}")
-print(f"Testing strategies: {', '.join(strategies)}")
-print(f"Configurations: {len(configs)} configs")
+print(f"{'Config':<20} {'Time(ms)':<10} {'GFLOPS':<10} {'GB/s':<10}")
+print("-" * 50)

-all_results = []
-
-# Run benchmarks for each strategy
-for id, strategy in enumerate(strategies):
-    print(f"\n{'=' * 60}")
-    print(f"Testing strategy: {strategy_descriptions[strategy]}")
-    print(f"{'=' * 60}")
-
-    # Collect benchmark data for both algorithms
-    config_labels = []
-    config_x_axis = []
-    all_cuda_results = []
-    all_baseline_results = []
-    all_ratios = []
-
-    for E, T, H in configs:
-        total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
-        config_x_axis.append(total_tokens_config)
-
-        cuda_results = []
-        baseline_results = []
-        ratios = []
-
-        for total_tokens in total_tokens_config:
-            config_label = f"E={E},T={T},H={H},TT={total_tokens}"
-            config_labels.append(config_label)
-
-            # CUDA kernel results
-            time_ms_cuda, gflops, gbps, perc = benchmark(
-                silu_mul_fp8_quant_deep_gemm_cuda,
-                E,
-                T,
-                H,
-                total_tokens,
-                runs=runs,
-                num_warmups=num_warmups,
-                gen_strategy=strategy,
-            )
-            cuda_results.append((time_ms_cuda, gflops, gbps, perc))
-
-            # Baseline results
-            time_ms_triton, gflops, gbps, perc = benchmark(
-                silu_mul_fp8_quant_deep_gemm_triton,
-                E,
-                T,
-                H,
-                total_tokens,
-                runs=runs,
-                num_warmups=num_warmups,
-                gen_strategy=strategy,
-            )
-            baseline_results.append((time_ms_triton, gflops, gbps, perc))
-            ratios.append(time_ms_triton / time_ms_cuda)
-
-            print(f"Completed: {config_label}")
-        all_cuda_results.append(cuda_results)
-        all_baseline_results.append(baseline_results)
-        all_ratios.append(ratios)
-
-    # Store results for combined plotting
-    all_results.append(
-        (
-            strategy_descriptions[strategy],
-            all_ratios,
-            all_cuda_results,
-            all_baseline_results,
-            config_labels,
-            config_x_axis,
-        )
-    )
-
-    # Print summary table for this strategy
-    print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
-    print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
-    print("-" * 60)
-
-    for i, (E, T, H) in enumerate(configs):
-        speedup = baseline_results[i][0] / cuda_results[i][0]
-        config_label = f"E={E:3d},T={T:4d},H={H:4d}"
-        print(
-            f"{config_label:<20} {cuda_results[i][0]:8.5f} "
-            f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
-        )
-
-
-def create_total_tokens_plot(all_results):
-    num_strategies = len(all_results)
-    num_configs = len(configs)
-
-    # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
-    fig, axs = plt.subplots(
-        num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
-    )
-
-    # Add main title to the entire figure
-    fig.suptitle(
-        "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
-        fontsize=16,
-        fontweight="bold",
-        y=0.98,
-    )
-
-    # Handle single strategy case
-    if num_strategies == 1:
-        axs = axs.reshape(1, -1)
-
-    # Handle single config case
-    if num_configs == 1:
-        axs = axs.reshape(-1, 2)
-
-    for strategy_idx, result in enumerate(all_results):
-        (
-            strategy_name,
-            all_ratios,
-            all_cuda_results,
-            all_baseline_results,
-            config_labels,
-            config_x_axis,
-        ) = result
-
-        for config_idx in range(num_configs):
-            # Speedup plot (left column)
-            ax_speedup = axs[strategy_idx, config_idx * 2]
-            # Bandwidth plot (right column)
-            ax_bandwidth = axs[strategy_idx, config_idx * 2 + 1]
-
-            E, T, H = configs[config_idx]
-            ratios = all_ratios[config_idx]
-            total_tokens_values = config_x_axis[config_idx]
-
-            # Extract CUDA and Triton bandwidth percentages
-            cuda_bandwidth_percentages = [
-                result[3] for result in all_cuda_results[config_idx]
-            ]
-            triton_bandwidth_percentages = [
-                result[3] for result in all_baseline_results[config_idx]
-            ]
-
-            # Plot speedup ratios vs total tokens (left plot)
-            ax_speedup.plot(
-                total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
-            )
-            ax_speedup.set_title(
-                f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
-                fontsize=12,
-                fontweight="bold",
-            )
-            ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
-            ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
-            ax_speedup.grid(True, alpha=0.3)
-
-            ax_bandwidth.plot(
-                total_tokens_values,
-                cuda_bandwidth_percentages,
-                "ro-",
-                linewidth=3,
-                markersize=8,
-                label="CUDA",
-            )
-            ax_bandwidth.plot(
-                total_tokens_values,
-                triton_bandwidth_percentages,
-                "go-",
-                linewidth=3,
-                markersize=8,
-                label="Triton",
-            )
-            ax_bandwidth.set_title(
-                f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
-                fontsize=12,
-                fontweight="bold",
-            )
-            ax_bandwidth.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
-            ax_bandwidth.set_ylabel(
-                "% of Peak Bandwidth", fontweight="bold", fontsize=11
-            )
-            ax_bandwidth.legend(prop={"weight": "bold"})
-            ax_bandwidth.grid(True, alpha=0.3)
-
-            # Format x-axis labels for both plots
-            for ax in [ax_speedup, ax_bandwidth]:
-                ax.set_xticks(total_tokens_values)
-                ax.set_xticklabels(
-                    [
-                        f"{tt // 1000}K" if tt >= 1000 else str(tt)
-                        for tt in total_tokens_values
-                    ],
-                    fontweight="bold",
-                )
-                # Make tick labels bold
-                for label in ax.get_xticklabels() + ax.get_yticklabels():
-                    label.set_fontweight("bold")
-
-            # Add value labels on speedup points
-            for x, y in zip(total_tokens_values, ratios):
-                ax_speedup.annotate(
-                    f"{y:.2f}x",
-                    (x, y),
-                    textcoords="offset points",
-                    xytext=(0, 12),
-                    ha="center",
-                    fontsize=10,
-                    fontweight="bold",
-                    bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
-                )
-
-            # Add value labels on CUDA bandwidth points
-            for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
-                ax_bandwidth.annotate(
-                    f"{y:.1f}%",
-                    (x, y),
-                    textcoords="offset points",
-                    xytext=(0, 12),
-                    ha="center",
-                    fontsize=9,
-                    fontweight="bold",
-                    bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
-                )
-
-            # Add value labels on Triton bandwidth points
-            for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
-                ax_bandwidth.annotate(
-                    f"{y:.1f}%",
-                    (x, y),
-                    textcoords="offset points",
-                    xytext=(0, -15),
-                    ha="center",
-                    fontsize=9,
-                    fontweight="bold",
-                    bbox=dict(boxstyle="round,pad=0.2", facecolor="green", alpha=0.3),
-                )
-
-    plt.tight_layout()
-    plt.subplots_adjust(top=0.93)  # Make room for main title
-    filename = "silu_benchmark_total_tokens.png"
-    plt.savefig(filename, dpi=300, bbox_inches="tight")
-    plt.show()
-
-    return filename
-
-
-# Create combined plot with all strategies
-combined_plot_filename = create_total_tokens_plot(all_results)
-
-print(f"\n{'=' * 60}")
-print("Benchmark Complete!")
-print(f"Generated combined plot: {combined_plot_filename}")
-print(f"{'=' * 60}")
+for E, T, H in configs:
+    try:
+        time_ms, gflops, gbps = benchmark(E, T, H)
+        print(f"E={E:3d},T={T:4d},H={H:4d} {time_ms:8.3f} {gflops:8.1f} {gbps:8.1f}")
+    except Exception:
+        print(f"E={E:3d},T={T:4d},H={H:4d} FAILED")
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@ -11,13 +11,13 @@ from datetime import datetime
 from typing import Any

 import torch
+import triton
 from tqdm import tqdm

 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    _w8a8_block_fp8_matmul,
 )
 from vllm.platforms import current_platform
-from vllm.triton_utils import triton
 from vllm.utils import FlexibleArgumentParser

 mp.set_start_method("spawn", force=True)
@ -56,7 +56,7 @@ def w8a8_block_matmul(
        Bs: The per-block quantization scale for `B`.
        block_size: The block size for per-block quantization.
                    It should be 2-dim, e.g., [128, 128].
-        output_dtype: The dtype of the returned tensor.
+        output_dytpe: The dtype of the returned tensor.

    Returns:
        torch.Tensor: The result of matmul.
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# fmt: off
 # ruff: noqa: E501
 import time

@ -7,33 +8,27 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    get_col_major_tma_aligned_tensor,
    per_token_group_quant_fp8,
-    w8a8_triton_block_scaled_mm,
+    w8a8_block_fp8_matmul,
 )
 from vllm.triton_utils import triton
-from vllm.utils.deep_gemm import (
-    calc_diff,
-    fp8_gemm_nt,
-    get_col_major_tma_aligned_tensor,
-    per_block_cast_to_fp8,
-)
+from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8


-def benchmark_shape(
-    m: int,
-    n: int,
-    k: int,
-    warmup: int = 100,
-    repeat: int = 10000,
-    verbose: bool = False,
-) -> dict:
+def benchmark_shape(m: int,
+                    n: int,
+                    k: int,
+                    warmup: int = 100,
+                    repeat: int = 10000,
+                    verbose: bool = False) -> dict:
    """Benchmark all implementations for a specific (m, n, k) shape."""
    if verbose:
        print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")

    # Create test tensors
-    A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
-    B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
+    A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
+    B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)

    # Reference result in BF16
    torch.cuda.synchronize()
@ -50,39 +45,34 @@ def benchmark_shape(
    # Pre-quantize A for all implementations
    A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
    A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
-    C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
+    C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
    A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
    A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
-        A, block_size[1], column_major_scales=True
-    )
+        A, block_size[1], column_major_scales=True)

    # === DeepGEMM Implementation ===
    def deepgemm_gemm():
-        fp8_gemm_nt(
-            (A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
-        )
+        fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
+                                       (B_deepgemm, B_scale_deepgemm),
+                                       C_deepgemm)
        return C_deepgemm

    # === vLLM Triton Implementation ===
    def vllm_triton_gemm():
-        return w8a8_triton_block_scaled_mm(
-            A_vllm,
-            B_vllm,
-            A_scale_vllm,
-            B_scale_vllm,
-            block_size,
-            output_dtype=torch.bfloat16,
-        )
+        return w8a8_block_fp8_matmul(A_vllm,
+                                     B_vllm,
+                                     A_scale_vllm,
+                                     B_scale_vllm,
+                                     block_size,
+                                     output_dtype=torch.bfloat16)

    # === vLLM CUTLASS Implementation ===
    def vllm_cutlass_gemm():
-        return ops.cutlass_scaled_mm(
-            A_vllm_cutlass,
-            B_vllm.T,
-            scale_a=A_scale_vllm_cutlass,
-            scale_b=B_scale_vllm.T,
-            out_dtype=torch.bfloat16,
-        )
+        return ops.cutlass_scaled_mm(A_vllm_cutlass,
+                                     B_vllm.T,
+                                     scale_a=A_scale_vllm_cutlass,
+                                     scale_b=B_scale_vllm.T,
+                                     out_dtype=torch.bfloat16)

    # Run correctness check first
    if verbose:
@ -99,23 +89,26 @@ def benchmark_shape(
        print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
        print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
        print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
-        print(
-            "vLLM Triton vs DeepGEMM difference: "
-            f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
-        )
-        print(
-            "vLLM CUTLASS vs DeepGEMM difference: "
-            f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
-        )
+        print("vLLM Triton vs DeepGEMM difference: "
+              f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
+        print("vLLM CUTLASS vs DeepGEMM difference: "
+              f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")

    # Benchmark implementations
    implementations = {
        "DeepGEMM": deepgemm_gemm,
        "vLLM Triton": vllm_triton_gemm,
-        "vLLM CUTLASS": vllm_cutlass_gemm,
+        "vLLM CUTLASS": vllm_cutlass_gemm
    }

-    benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}
+    benchmark_results = {
+        "shape": {
+            "m": m,
+            "n": n,
+            "k": k
+        },
+        "implementations": {}
+    }

    for name, func in implementations.items():
        # Warmup
@ -143,36 +136,38 @@ def benchmark_shape(
            "tflops": tflops,
            "gb_s": gb_s,
            "diff": {
-                "DeepGEMM": 0.0
-                if name == "DeepGEMM"
-                else calc_diff(func(), C_deepgemm),
-                "Reference": deepgemm_diff
-                if name == "DeepGEMM"
-                else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
-            },
+                "DeepGEMM":
+                0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
+                "Reference":
+                deepgemm_diff if name == "DeepGEMM" else
+                (vllm_triton_diff
+                 if name == "vLLM Triton" else vllm_cutlass_diff)
+            }
        }

        if verbose:
-            print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
+            print(
+                f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
+            )

    # Calculate speedups
    baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
    for name, data in benchmark_results["implementations"].items():
        if name != "DeepGEMM":
            speedup = baseline / data["time_ms"]
-            benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
+            benchmark_results["implementations"][name][
+                "speedup_vs_deepgemm"] = speedup
            if verbose:
-                print(
-                    f"DeepGEMM is {1 / speedup:.2f}x "
-                    f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
-                )
+                print(f"DeepGEMM is {1/speedup:.2f}x "
+                      f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")

-    vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
-    vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
+    vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
+        "time_ms"]
+    vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
+        "time_ms"]
    cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
-    benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
-        cutlass_vs_triton
-    )
+    benchmark_results["implementations"]["vLLM CUTLASS"][
+        "speedup_vs_triton"] = cutlass_vs_triton
    if verbose:
        print(
            f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
@ -184,7 +179,8 @@ def benchmark_shape(

 def format_table_row(values, widths):
    """Format a row with specified column widths."""
-    return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"
+    return "| " + " | ".join(f"{val:{w}}"
+                             for val, w in zip(values, widths)) + " |"


 def print_table(headers, rows, title=None):
@ -292,50 +288,38 @@ def run_benchmarks(verbose: bool = False):
    for result in all_results:
        shape = result["shape"]
        impl_data = result["implementations"]["DeepGEMM"]
-        deepgemm_rows.append(
-            [
-                shape["m"],
-                shape["n"],
-                shape["k"],
-                f"{impl_data['time_us']:.1f}",
-                f"{impl_data['tflops']:.1f}",
-                f"{impl_data['gb_s']:.1f}",
-            ]
-        )
+        deepgemm_rows.append([
+            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
+            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
+        ])

-    print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")
+    print_table(deepgemm_headers,
+                deepgemm_rows,
+                title="DeepGEMM Implementation:")

    # Print vLLM Triton table
-    triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
+    triton_headers = [
+        "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
+    ]
    triton_rows = []
    for result in all_results:
        shape = result["shape"]
        impl_data = result["implementations"]["vLLM Triton"]
        speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
-        triton_rows.append(
-            [
-                shape["m"],
-                shape["n"],
-                shape["k"],
-                f"{impl_data['time_us']:.1f}",
-                f"{impl_data['tflops']:.1f}",
-                f"{impl_data['gb_s']:.1f}",
-                format_speedup(speedup),
-            ]
-        )
+        triton_rows.append([
+            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
+            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
+            format_speedup(speedup)
+        ])

-    print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")
+    print_table(triton_headers,
+                triton_rows,
+                title="vLLM Triton Implementation:")

    # Print vLLM CUTLASS table
    cutlass_headers = [
-        "m",
-        "n",
-        "k",
-        "Time (μs)",
-        "TFLOPS",
-        "GB/s",
-        "vs DeepGEMM",
-        "vs Triton",
+        "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
+        "vs Triton"
    ]
    cutlass_rows = []
    for result in all_results:
@ -343,27 +327,28 @@ def run_benchmarks(verbose: bool = False):
        impl_data = result["implementations"]["vLLM CUTLASS"]
        vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
        vs_triton = impl_data.get("speedup_vs_triton", 1.0)
-        cutlass_rows.append(
-            [
-                shape["m"],
-                shape["n"],
-                shape["k"],
-                f"{impl_data['time_us']:.1f}",
-                f"{impl_data['tflops']:.1f}",
-                f"{impl_data['gb_s']:.1f}",
-                format_speedup(vs_deepgemm),
-                format_speedup(vs_triton),
-            ]
-        )
+        cutlass_rows.append([
+            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
+            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
+            format_speedup(vs_deepgemm),
+            format_speedup(vs_triton)
+        ])

-    print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")
+    print_table(cutlass_headers,
+                cutlass_rows,
+                title="vLLM CUTLASS Implementation:")

    # Calculate and print averages
    print("\n===== AVERAGE PERFORMANCE =====")

    implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
    avg_metrics = {
-        impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
+        impl: {
+            "tflops": 0,
+            "gb_s": 0,
+            "time_ms": 0
+        }
+        for impl in implementations
    }

    for result in all_results:
@ -381,9 +366,9 @@ def run_benchmarks(verbose: bool = False):
        avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
        avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
        avg_time = avg_metrics[impl]["time_ms"] / num_shapes
-        avg_rows.append(
-            [impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
-        )
+        avg_rows.append([
+            impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
+        ])

    print_table(avg_headers, avg_rows)

@ -391,19 +376,21 @@ def run_benchmarks(verbose: bool = False):
    avg_speedups = {
        "DeepGEMM vs vLLM Triton": 0,
        "DeepGEMM vs vLLM CUTLASS": 0,
-        "vLLM CUTLASS vs vLLM Triton": 0,
+        "vLLM CUTLASS vs vLLM Triton": 0
    }

    for result in all_results:
        deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
        vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
-        vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]
+        vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
+            "time_ms"]

-        avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
-        avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
-        avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
-            vllm_triton_time / vllm_cutlass_time
-        )
+        avg_speedups[
+            "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
+        avg_speedups[
+            "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
+        avg_speedups[
+            "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time

    print("\n===== AVERAGE SPEEDUPS =====")
    speedup_headers = ["Comparison", "Speedup"]
@ -421,7 +408,8 @@ def run_benchmarks(verbose: bool = False):

    for result in all_results:
        for impl in implementations:
-            avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]
+            avg_diff[impl] += result["implementations"][impl]["diff"][
+                "Reference"]

    diff_headers = ["Implementation", "Avg Diff vs Reference"]
    diff_rows = []
--- a/benchmarks/multi_turn/README.md
+++ b/benchmarks/multi_turn/README.md
@ -55,107 +55,6 @@ output_num_chunks  166.0    99.01   11.80    79.00    90.00    98.00   108.75
 ----------------------------------------------------------------------------------------------------
 ```

-### JSON configuration file for synthetic conversations generation
-
-The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
-When the input is a JSON file with the field `"filetype": "generate_conversations"` the tool will generate synthetic multi-turn (questions and answers) conversations.
-
-The file `generate_multi_turn.json` is an example file.
-
-The file must contain the sections `prompt_input` and `prompt_output`.
-
-The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`:
-
-* `num_turns` - Number of total turns in the conversation (both user & assistant).<br/>
-The final value will always be rounded to an even number so each user turn has a reply.
-* `prefix_num_tokens` - Tokens added at the start of only the **first user turn** in a conversation (unique per conversation).
-* `num_tokens` - Total token length of each **user** message (one turn).
-
-The `prompt_output` section must contain `num_tokens`:
-
-* `num_tokens` - Total token length of each **assistant** message (one turn).
-
-### Random distributions for synthetic conversations generation
-
-When creating an input JSON file (such as `generate_multi_turn.json`),<br/>
-every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.<br/>
-The distribution determines how to randomly sample values for the field.
-
-The available distributions are listed below.
-
-**Note:** The optional `max` field (for lognormal, zipf, and poisson) can be used to cap sampled values at an upper bound.</br>
-Can be used to make sure that the total number of tokens in every request does not exceed `--max-model-len`.
-
-#### constant
-
-```json
-{
-    "distribution": "constant",
-    "value": 500
-}
-```
-
-* `value` - the fixed integer value (always returns the same number).
-
-#### uniform
-
-```json
-{
-    "distribution": "uniform",
-    "min": 12,
-    "max": 18
-}
-```
-
-* `min` - minimum value (inclusive).
-* `max` - maximum value (inclusive), should be equal or larger than min.
-
-#### lognormal
-
-```json
-{
-    "distribution": "lognormal",
-    "average": 1000,
-    "max": 5000
-}
-```
-
-You can parameterize the lognormal distribution in one of two ways:
-
-Using the average and optional median ratio:
-
-* `average` - target average value of the distribution.
-* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1).
-
-Using the parameters of the underlying normal distribution:
-
-* `mean` - mean of the underlying normal distribution.
-* `sigma` - standard deviation of the underlying normal distribution.
-
-#### zipf
-
-```json
-{
-    "distribution": "zipf",
-    "alpha": 1.2,
-    "max": 100
-}
-```
-
-* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers.
-
-#### poisson
-
-```json
-{
-    "distribution": "poisson",
-    "alpha": 10,
-    "max": 50
-}
-```
-
-* `alpha` - expected value (λ). Also the variance of the distribution.
-
 ## ShareGPT Conversations

 To run with the ShareGPT data, download the following ShareGPT dataset:
--- a/benchmarks/multi_turn/bench_dataset.py
+++ b/benchmarks/multi_turn/bench_dataset.py
@ -99,105 +99,21 @@ class PoissonDistribution(Distribution):

 class LognormalDistribution(Distribution):
    def __init__(
-        self,
-        mean: Optional[float] = None,
-        sigma: Optional[float] = None,
-        average: Optional[int] = None,
-        median_ratio: Optional[float] = None,
-        max_val: Optional[int] = None,
+        self, mean: float, sigma: float, max_val: Optional[int] = None
    ) -> None:
-        self.average = average
-        self.median_ratio = median_ratio
-        self.max_val = max_val
-
-        if average is not None:
-            if average < 1:
-                raise ValueError("Lognormal average must be positive")
-
-            if mean or sigma:
-                raise ValueError(
-                    "When using lognormal average, you can't provide mean/sigma"
-                )
-
-            if self.median_ratio is None:
-                # Default value that provides relatively wide range of values
-                self.median_ratio = 0.85
-
-            # Calculate mean/sigma of np.random.lognormal based on the average
-            mean, sigma = self._generate_lognormal_by_median(
-                target_average=self.average, median_ratio=self.median_ratio
-            )
-        else:
-            if mean is None or sigma is None:
-                raise ValueError(
-                    "Must provide both mean and sigma if average is not used"
-                )
-
-            if mean <= 0 or sigma < 0:
-                raise ValueError(
-                    "Lognormal mean must be positive and sigma must be non-negative"
-                )
-
-        # Mean and standard deviation of the underlying normal distribution
-        # Based on numpy.random.lognormal
        self.mean = mean
        self.sigma = sigma
-
-    @staticmethod
-    def _generate_lognormal_by_median(
-        target_average: int, median_ratio: float
-    ) -> tuple[float, float]:
-        """
-        Compute (mu, sigma) for a lognormal distribution given:
-        - a target average (mean of the distribution)
-        - a ratio of median / mean (controls skewness), assume mean > median
-
-        Background:
-        If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma).
-        * mean(X)   = exp(mu + sigma^2 / 2)
-        * median(X) = exp(mu)
-
-        So:
-        median / mean = exp(mu) / exp(mu + sigma^2 / 2)
-                      = exp(-sigma^2 / 2)
-
-        Rearranging:
-        sigma^2 = 2 * ln(mean / median)
-        mu      = ln(median)
-
-        This gives a unique (mu, sigma) for any valid mean and median.
-        """
-        # Check input validity: median must be smaller than mean
-        if median_ratio <= 0 or median_ratio >= 1:
-            raise ValueError("median_ratio must be in range (0, 1)")
-
-        target_median = target_average * median_ratio
-
-        # Solve sigma^2 = 2 * ln(mean / median)
-        sigma = np.sqrt(2 * np.log(target_average / target_median))
-        mu = np.log(target_median)
-
-        return mu, sigma
+        self.max_val = max_val

    def sample(self, size: int = 1) -> np.ndarray:
        samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size)
-
-        if self.average is not None:
-            # Scale to average
-            samples *= self.average / samples.mean()
-
        if self.max_val:
            samples = np.minimum(samples, self.max_val)

        return np.round(samples).astype(int)

    def __repr__(self) -> str:
-        if self.average:
-            return (
-                f"LognormalDistribution[{self.average}, "
-                f"{self.median_ratio}, {self.max_val}]"
-            )
-        return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]"
+        return f"LognormalDistribution[{self.mean}, {self.sigma}]"


 class GenConvArgs(NamedTuple):
@ -257,21 +173,10 @@ def get_random_distribution(
        return PoissonDistribution(conf["alpha"], max_val=max_val)

    elif distribution == "lognormal":
-        max_val = conf.get("max", None)
-
-        if "average" in conf:
-            # Infer lognormal mean/sigma (numpy) from input average
-            median_ratio = conf.get("median_ratio", None)
-            return LognormalDistribution(
-                average=conf["average"], median_ratio=median_ratio, max_val=max_val
-            )
-
-        # Use mean/sigma directly (for full control over the distribution)
        verify_field_exists(conf, "mean", section, subsection)
        verify_field_exists(conf, "sigma", section, subsection)
-        return LognormalDistribution(
-            mean=conf["mean"], sigma=conf["sigma"], max_val=max_val
-        )
+        max_val = conf.get("max", None)
+        return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val)

    elif distribution == "uniform":
        verify_field_exists(conf, "min", section, subsection)
--- a/benchmarks/multi_turn/generate_multi_turn.json
+++ b/benchmarks/multi_turn/generate_multi_turn.json
@ -15,8 +15,9 @@
        },
        "prefix_num_tokens": {
            "distribution": "lognormal",
-            "average": 1000,
-            "max": 5000
+            "mean": 6,
+            "sigma": 4,
+            "max": 1500
        },
        "num_tokens": {
            "distribution": "uniform",
--- a/benchmarks/pyproject.toml
+++ b/benchmarks/pyproject.toml
@ -0,0 +1,49 @@
+# This local pyproject file is part of the migration from yapf to ruff format.
+# It uses the same core rules as the main pyproject.toml file, but with the
+# following differences:
+# - ruff line length is overridden to 88
+# - deprecated typing ignores (UP006, UP035) have been removed
+
+[tool.ruff]
+line-length = 88
+
+[tool.ruff.lint.per-file-ignores]
+"vllm/third_party/**" = ["ALL"]
+"vllm/version.py" = ["F401"]
+"vllm/_version.py" = ["ALL"]
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+    # flake8-logging-format
+    "G",
+]
+ignore = [
+    # star imports
+    "F405", "F403",
+    # lambda expression assignment
+    "E731",
+    # Loop control variable not used within loop body
+    "B007",
+    # f-string format
+    "UP032",
+    # Can remove once 3.10+ is the minimum Python version
+    "UP007",
+]
+
+[tool.ruff.lint.isort]
+known-first-party = ["vllm"]
+
+[tool.ruff.format]
+docstring-code-format = true
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -101,7 +101,6 @@ else()
    find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
    find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
    find_isa(${CPUINFO} "S390" S390_FOUND)
-    find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
 endif()

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
@ -178,14 +177,8 @@ elseif (S390_FOUND)
        "-mzvector"
        "-march=native"
        "-mtune=native")
-elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
-    if(RVV_FOUND)
-	    message(FAIL_ERROR "Can't support rvv now.")
-    else()
-        list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
-    endif()
 else()
-    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
+    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
 endif()

 #
@ -213,7 +206,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
        endif()
        set(ONEDNN_AARCH64_USE_ACL "ON")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
-        add_compile_definitions(VLLM_USE_ACL)
    endif()

    set(ONEDNN_LIBRARY_TYPE "STATIC")
@ -227,7 +219,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
    set(ONEDNN_ENABLE_ITT_TASKS "OFF")
    set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
    set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
-    set(ONEDNN_VERBOSE "ON")
+    set(ONEDNN_VERBOSE "OFF")
    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

    FetchContent_MakeAvailable(oneDNN)
@ -266,8 +258,7 @@ set(VLLM_EXT_SRC
    "csrc/cpu/layernorm.cpp"
    "csrc/cpu/mla_decode.cpp"
    "csrc/cpu/pos_encoding.cpp"
-    "csrc/cpu/torch_bindings.cpp"
-    "csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
+    "csrc/cpu/torch_bindings.cpp")

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
    set(VLLM_EXT_SRC
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@ -18,8 +18,8 @@ if(FLASH_MLA_SRC_DIR)
 else()
  FetchContent_Declare(
        flashmla
-        GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
-        GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
+        GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
+        GIT_TAG a757314c04eedd166e329e846c820eb1bdd702de
        GIT_PROGRESS TRUE
        CONFIGURE_COMMAND ""
        BUILD_COMMAND ""
@ -33,64 +33,23 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
 # The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
 # Only build FlashMLA kernels if we are building for something compatible with 
 # sm90a
-
-set(SUPPORT_ARCHS)
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
-    list(APPEND SUPPORT_ARCHS 9.0a)
-endif()
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
-    list(APPEND SUPPORT_ARCHS 10.0a)
-endif()
-
-
-cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
-if(FLASH_MLA_ARCHS)
-    set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
-    list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
-
+cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
    set(FlashMLA_SOURCES
-        ${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
-        ${flashmla_SOURCE_DIR}/csrc/pybind.cpp
-        ${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
-        ${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
-        ${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
-        ${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
-        ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
-        ${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
-        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
-        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
-        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
-    )
-
-    set(FlashMLA_Extension_SOURCES
-        ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
-        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
-        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
-    )
+        ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
+        ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
+        ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
+        ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
+        ${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu)

    set(FlashMLA_INCLUDES
-        ${flashmla_SOURCE_DIR}/csrc
-        ${flashmla_SOURCE_DIR}/csrc/sm90
        ${flashmla_SOURCE_DIR}/csrc/cutlass/include
-        ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
-    )
-
-    set(FlashMLA_Extension_INCLUDES
-        ${flashmla_SOURCE_DIR}/csrc
-        ${flashmla_SOURCE_DIR}/csrc/sm90
-        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
-        ${flashmla_SOURCE_DIR}/csrc/cutlass/include
-        ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
-    )
+        ${flashmla_SOURCE_DIR}/csrc)

    set_gencode_flags_for_srcs(
        SRCS "${FlashMLA_SOURCES}"
        CUDA_ARCHS "${FLASH_MLA_ARCHS}")

-    set_gencode_flags_for_srcs(
-        SRCS "${FlashMLA_Extension_SOURCES}"
-        CUDA_ARCHS "${FLASH_MLA_ARCHS}")
-
    define_gpu_extension_target(
        _flashmla_C
        DESTINATION vllm
@ -101,32 +60,8 @@ if(FLASH_MLA_ARCHS)
        INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
        USE_SABI 3
        WITH_SOABI)
-
-    # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
-    # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
-    target_compile_options(_flashmla_C PRIVATE
-        $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
-        $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
-
-    define_gpu_extension_target(
-        _flashmla_extension_C
-        DESTINATION vllm
-        LANGUAGE ${VLLM_GPU_LANG}
-        SOURCES ${FlashMLA_Extension_SOURCES}
-        COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS}
-        ARCHITECTURES ${VLLM_GPU_ARCHES}
-        INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES}
-        USE_SABI 3
-        WITH_SOABI)
-
-    # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
-    # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
-    target_compile_options(_flashmla_extension_C PRIVATE
-        $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
-        $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
 else()
-    # Create empty targets for setup.py when not targeting sm90a systems
+    # Create an empty target for setup.py when not targeting sm90a systems
    add_custom_target(_flashmla_C)
-    add_custom_target(_flashmla_extension_C)
 endif()

--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
+          GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/cmake/hipify.py
+++ b/cmake/hipify.py
@ -16,7 +16,7 @@ import shutil

 from torch.utils.hipify.hipify_python import hipify

-if __name__ == "__main__":
+if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # Project directory where all the source + include files live.
@ -34,14 +34,15 @@ if __name__ == "__main__":
    )

    # Source files to convert.
-    parser.add_argument(
-        "sources", help="Source files to hipify.", nargs="*", default=[]
-    )
+    parser.add_argument("sources",
+                        help="Source files to hipify.",
+                        nargs="*",
+                        default=[])

    args = parser.parse_args()

    # Limit include scope to project_dir only
-    includes = [os.path.join(args.project_dir, "*")]
+    includes = [os.path.join(args.project_dir, '*')]

    # Get absolute path for all source files.
    extra_files = [os.path.abspath(s) for s in args.sources]
@ -50,31 +51,25 @@ if __name__ == "__main__":
    # The directory might already exist to hold object files so we ignore that.
    shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)

-    hipify_result = hipify(
-        project_directory=args.project_dir,
-        output_directory=args.output_dir,
-        header_include_dirs=[],
-        includes=includes,
-        extra_files=extra_files,
-        show_detailed=True,
-        is_pytorch_extension=True,
-        hipify_extra_files_only=True,
-    )
+    hipify_result = hipify(project_directory=args.project_dir,
+                           output_directory=args.output_dir,
+                           header_include_dirs=[],
+                           includes=includes,
+                           extra_files=extra_files,
+                           show_detailed=True,
+                           is_pytorch_extension=True,
+                           hipify_extra_files_only=True)

    hipified_sources = []
    for source in args.sources:
        s_abs = os.path.abspath(source)
-        hipified_s_abs = (
-            hipify_result[s_abs].hipified_path
-            if (
-                s_abs in hipify_result
-                and hipify_result[s_abs].hipified_path is not None
-            )
-            else s_abs
-        )
+        hipified_s_abs = (hipify_result[s_abs].hipified_path if
+                          (s_abs in hipify_result
+                           and hipify_result[s_abs].hipified_path is not None)
+                          else s_abs)
        hipified_sources.append(hipified_s_abs)

-    assert len(hipified_sources) == len(args.sources)
+    assert (len(hipified_sources) == len(args.sources))

    # Print hipified source files.
    print("\n".join(hipified_sources))
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -310,13 +310,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
  list(REMOVE_DUPLICATES _PTX_ARCHS)
  list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)

-  # If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
-  # remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS
+  # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
+  # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
  set(_CUDA_ARCHS)
  foreach(_arch ${_SRC_CUDA_ARCHS})
-    if(_arch MATCHES "[af]$")
+    if(_arch MATCHES "\\a$")
      list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
-      string(REGEX REPLACE "[af]$" "" _base "${_arch}")
+      string(REPLACE "a" "" _base "${_arch}")
      if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
        list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
        list(APPEND _CUDA_ARCHS "${_arch}")
@ -480,6 +480,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
      ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
  endif()

+  set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)

  target_compile_options(${GPU_MOD_NAME} PRIVATE
    $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
--- a/csrc/attention/mla/cutlass_mla_entry.cu
+++ b/csrc/attention/mla/cutlass_mla_entry.cu
@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
+void cutlass_mla_decode_sm100a(torch::Tensor const& out,
+                               torch::Tensor const& q_nope,
+                               torch::Tensor const& q_pe,
+                               torch::Tensor const& kv_c_and_k_pe_cache,
+                               torch::Tensor const& seq_lens,
+                               torch::Tensor const& page_table, double scale);
+#endif
+
+void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
+                        torch::Tensor const& q_pe,
+                        torch::Tensor const& kv_c_and_k_pe_cache,
+                        torch::Tensor const& seq_lens,
+                        torch::Tensor const& page_table, double scale) {
+#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
+  return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache,
+                                   seq_lens, page_table, scale);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA");
+}
--- a/csrc/attention/mla/cutlass_mla_kernels.cu
+++ b/csrc/attention/mla/cutlass_mla_kernels.cu
@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "cute/tensor.hpp"
+
+#include "cutlass/cutlass.h"
+#include "cutlass/kernel_hardware_info.h"
+
+#include "cutlass_extensions/common.hpp"
+
+#include "device/sm100_mla.hpp"
+#include "kernel/sm100_mla_tile_scheduler.hpp"
+
+using namespace cute;
+using namespace cutlass::fmha::kernel;
+
+template <typename T, bool PersistenceOption = true>
+struct MlaSm100 {
+  using Element = T;
+  using ElementAcc = float;
+  using ElementOut = T;
+
+  using TileShape = Shape<_128, _128, Shape<_512, _64>>;
+  using TileShapeH = cute::tuple_element_t<0, TileShape>;
+  using TileShapeD = cute::tuple_element_t<2, TileShape>;
+
+  // H K (D_latent D_rope) B
+  using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;
+
+  using StrideQ = cute::tuple<int64_t, _1, int64_t>;  // H D B
+  using StrideK = cute::tuple<int64_t, _1, int64_t>;  // K D B
+  using StrideO = StrideK;                            // H D B
+  using StrideLSE = cute::tuple<_1, int>;             // H B
+
+  using TileScheduler =
+      std::conditional_t<PersistenceOption, Sm100MlaPersistentTileScheduler,
+                         Sm100MlaIndividualTileScheduler>;
+
+  using FmhaKernel =
+      cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
+          TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler,
+          /*kIsCpAsync=*/true>;
+  using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
+};
+
+template <typename T>
+typename T::Fmha::Arguments args_from_options(
+    at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe,
+    at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens,
+    at::Tensor const& page_table, double scale) {
+  cutlass::KernelHardwareInfo hw_info;
+  hw_info.device_id = q_nope.device().index();
+  hw_info.sm_count =
+      cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+          hw_info.device_id);
+
+  int batches = q_nope.sizes()[0];
+  int page_count_per_seq = page_table.sizes()[1];
+  int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
+  int page_size = kv_c_and_k_pe_cache.sizes()[1];
+  int max_seq_len = page_size * page_count_per_seq;
+  using TileShapeH = typename T::TileShapeH;
+  using TileShapeD = typename T::TileShapeD;
+  auto problem_shape =
+      cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);
+
+  auto [H, K, D, B] = problem_shape;
+  auto [D_latent, D_rope] = D;
+
+  using StrideQ = typename T::StrideQ;
+  using StrideK = typename T::StrideK;
+  using StrideO = typename T::StrideO;
+  using StrideLSE = typename T::StrideLSE;
+
+  StrideQ stride_Q_latent = cute::make_tuple(
+      static_cast<int64_t>(D_latent), _1{}, static_cast<int64_t>(H * D_latent));
+  StrideQ stride_Q_rope = cute::make_tuple(static_cast<int64_t>(D_rope), _1{},
+                                           static_cast<int64_t>(H * D_rope));
+  StrideK stride_C =
+      cute::make_tuple(static_cast<int64_t>(D_latent + D_rope), _1{},
+                       static_cast<int64_t>(page_size * (D_latent + D_rope)));
+  StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
+  StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast<int>(H));
+  StrideO stride_O = cute::make_tuple(static_cast<int64_t>(D_latent), _1{},
+                                      static_cast<int64_t>(H * D_latent));
+
+  using Element = typename T::Element;
+  using ElementOut = typename T::ElementOut;
+  using ElementAcc = typename T::ElementAcc;
+  auto Q_latent_ptr = static_cast<Element*>(q_nope.data_ptr());
+  auto Q_rope_ptr = static_cast<Element*>(q_pe.data_ptr());
+  auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
+  auto scale_f = static_cast<float>(scale);
+  typename T::Fmha::Arguments arguments{
+      problem_shape,
+      {scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr,
+       stride_C, C_ptr + D_latent, stride_C,
+       static_cast<int*>(seq_lens.data_ptr()),
+       static_cast<int*>(page_table.data_ptr()), stride_PT, page_count_total,
+       page_size},
+      {static_cast<ElementOut*>(out.data_ptr()), stride_O,
+       static_cast<ElementAcc*>(nullptr), stride_LSE},
+      hw_info,
+      1,        // split_kv
+      nullptr,  // is_var_split_kv
+  };
+  // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
+  // split_kv automatically based on batch size and sequence length to balance
+  // workload across available SMs. Consider using var_split_kv for manual
+  // control if needed.
+  T::Fmha::set_split_kv(arguments);
+  return arguments;
+}
+
+template <typename Element>
+void runMla(at::Tensor const& out, at::Tensor const& q_nope,
+            at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache,
+            at::Tensor const& seq_lens, at::Tensor const& page_table,
+            float scale, cudaStream_t stream) {
+  using MlaSm100Type = MlaSm100<Element>;
+  typename MlaSm100Type::Fmha fmha;
+  auto arguments = args_from_options<MlaSm100Type>(
+      out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale);
+  size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments);
+  auto const workspace_options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  CUTLASS_CHECK(fmha.can_implement(arguments));
+
+  CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));
+
+  CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
+}
+
+void cutlass_mla_decode_sm100a(torch::Tensor const& out,
+                               torch::Tensor const& q_nope,
+                               torch::Tensor const& q_pe,
+                               torch::Tensor const& kv_c_and_k_pe_cache,
+                               torch::Tensor const& seq_lens,
+                               torch::Tensor const& page_table, double scale) {
+  TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA");
+  TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor");
+  TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor");
+  TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3,
+              "kv_c_and_k_pe_cache must be a 3D tensor");
+  TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor");
+  TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor");
+  TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor");
+
+  auto B_q_nope = q_nope.size(0);
+  auto H_q_nope = q_nope.size(1);
+  auto D_q_nope = q_nope.size(2);
+  auto B_q_pe = q_pe.size(0);
+  auto H_q_pe = q_pe.size(1);
+  auto D_q_pe = q_pe.size(2);
+  auto B_pt = page_table.size(0);
+  auto PAGE_NUM = page_table.size(1);
+  auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1);
+  auto D_ckv = kv_c_and_k_pe_cache.size(2);
+  auto B_o = out.size(0);
+  auto H_o = out.size(1);
+  auto D_o = out.size(2);
+
+  TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512");
+  TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64");
+  TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576");
+  TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128,
+              "H_q_nope, H_q_pe, and H_o must be equal to 128");
+  TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0,
+              "PAGE_SIZE must be a power of 2");
+  TORCH_CHECK(
+      B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o,
+      "Batch dims must be same for page_table, q_nope and q_pe, and out");
+  TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0,
+              "PAGE_NUM must be divisible by 128 / PAGE_SIZE");
+  TORCH_CHECK(D_o == 512, "D_o must be equal to 512");
+
+  TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half ||
+                  q_nope.dtype() == at::ScalarType::BFloat16 ||
+                  q_nope.dtype() == at::ScalarType::Float8_e4m3fn,
+              "q_nope must be a half, bfloat16, or float8_e4m3fn tensor");
+  TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() &&
+                  q_nope.dtype() == q_pe.dtype(),
+              "kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type");
+  TORCH_CHECK(seq_lens.dtype() == torch::kInt32,
+              "seq_lens must be a 32-bit integer tensor");
+  TORCH_CHECK(page_table.dtype() == torch::kInt32,
+              "page_table must be a 32-bit integer tensor");
+
+  auto in_dtype = q_nope.dtype();
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope));
+  const cudaStream_t stream =
+      at::cuda::getCurrentCUDAStream(q_nope.get_device());
+  if (in_dtype == at::ScalarType::Half) {
+    runMla<cutlass::half_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens,
+                            page_table, scale, stream);
+  } else if (in_dtype == at::ScalarType::BFloat16) {
+    runMla<cutlass::bfloat16_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
+                                seq_lens, page_table, scale, stream);
+  } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
+    runMla<cutlass::float_e4m3_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
+                                  seq_lens, page_table, scale, stream);
+  } else {
+    TORCH_CHECK(false, "Unsupported input data type of MLA");
+  }
+}
--- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
@ -133,14 +133,6 @@ public:
    // printf("    sm_count = %d\n", sm_count);
    int max_splits = ceil_div(K, 128);
    max_splits = min(16, max_splits);
-
-    // TODO: This avoids a hang when the batch size larger than 1 and 
-    // there is more than 1 kv_splits. 
-    // Discuss with NVIDIA how this can be fixed.
-    if (B > 1) {
-      max_splits = min(1, max_splits);
-    }
-    
    // printf("    max_splits = %d\n", max_splits);
    int sms_per_batch = max(1, sm_count / B);
    // printf("    sms_per_batch = %d\n", sms_per_batch);
--- a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
@ -580,22 +580,22 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
      for (; tile_scheduler.is_valid(); ++tile_scheduler) {
        auto blk_coord = tile_scheduler.get_block_coord();
        auto problem_shape = params.problem_shape;
-        auto local_split_kv = params.split_kv;
+	auto local_split_kv = params.split_kv;
        if (params.mainloop.ptr_seq != nullptr) {
          get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
-          if (params.ptr_split_kv != nullptr) {
+	  if (params.ptr_split_kv != nullptr) {
            local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
          }
        }
-        if (local_split_kv <= get<3>(blk_coord))
-          continue;
+	if (local_split_kv <= get<3>(blk_coord))
+	  continue;
        load_page_table(
          blk_coord,
          problem_shape,
          params.mainloop,
          shared_storage.tensors,
          pipeline_page_table, pipeline_pt_producer_state,
-          local_split_kv
+	  local_split_kv
        );
      }
    }
@ -604,15 +604,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
        CUTLASS_PRAGMA_NO_UNROLL
        for (; tile_scheduler.is_valid(); ++tile_scheduler) {
          auto blk_coord = tile_scheduler.get_block_coord();
-          auto problem_shape = params.problem_shape;
-          auto local_split_kv = params.split_kv;
+	  auto problem_shape = params.problem_shape;
+	  auto local_split_kv = params.split_kv;
          if (params.mainloop.ptr_seq != nullptr) {
            get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
-            if (params.ptr_split_kv != nullptr) {
+	    if (params.ptr_split_kv != nullptr) {
              local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
            }
          }
-          if (local_split_kv <= get<3>(blk_coord))
+	  if (local_split_kv <= get<3>(blk_coord))
            continue;
          load_cpasync(
            blk_coord,
@ -621,7 +621,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
            params.mainloop_params,
            shared_storage.tensors,
            pipeline_load_qk, pipeline_load_qk_producer_state,
-            local_split_kv,
+	    local_split_kv,
            /* must be shared pipe */
            pipeline_page_table, pipeline_pt_consumer_state
          );
@ -633,15 +633,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
          CUTLASS_PRAGMA_NO_UNROLL
          for (; tile_scheduler.is_valid(); ++tile_scheduler) {
            auto blk_coord = tile_scheduler.get_block_coord();
-            auto problem_shape = params.problem_shape;
-            auto local_split_kv = params.split_kv;
+	    auto problem_shape = params.problem_shape;
+	    auto local_split_kv = params.split_kv;
            if (params.mainloop.ptr_seq != nullptr) {
              get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
-              if (params.ptr_split_kv != nullptr) {
-                local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
-              }
+	      if (params.ptr_split_kv != nullptr) {
+	        local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
+	      }
            }
-            if (local_split_kv <= get<3>(blk_coord))
+	    if (local_split_kv <= get<3>(blk_coord))
              continue;
            load_tma</* paged= */ true>(
              blk_coord,
@ -651,7 +651,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
              shared_storage.tensors,
              pipeline_load_qk, pipeline_load_qk_producer_state,
              pipeline_load_qk, pipeline_load_qk_producer_state,
-              local_split_kv
+	      local_split_kv
            );
            cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
          }
@ -660,15 +660,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
          CUTLASS_PRAGMA_NO_UNROLL
          for (; tile_scheduler.is_valid(); ++tile_scheduler) {
            auto blk_coord = tile_scheduler.get_block_coord();
-            auto problem_shape = params.problem_shape;
-            auto local_split_kv = params.split_kv;
+	    auto problem_shape = params.problem_shape;
+	    auto local_split_kv = params.split_kv;
            if (params.mainloop.ptr_seq != nullptr) {
              get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
-              if (params.ptr_split_kv != nullptr) {
+	      if (params.ptr_split_kv != nullptr) {
                local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
-              }
+	      }
            }
-            if (local_split_kv <= get<3>(blk_coord))
+	    if (local_split_kv <= get<3>(blk_coord))
              continue;
            load_tma<false>(
              blk_coord,
@ -678,7 +678,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
              shared_storage.tensors,
              pipeline_load_qk, pipeline_load_qk_producer_state,
              pipeline_load_qk, pipeline_load_qk_producer_state,
-              local_split_kv
+	      local_split_kv
            );
            cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
          }
@ -694,14 +694,14 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
        for (; tile_scheduler.is_valid(); ++tile_scheduler) {
          auto blk_coord = tile_scheduler.get_block_coord();
          auto problem_shape = params.problem_shape;
-          auto local_split_kv = params.split_kv;
+	  auto local_split_kv = params.split_kv;
          if (params.mainloop.ptr_seq != nullptr) {
            get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
            if (params.ptr_split_kv != nullptr) {
                local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
            }
          }
-          if (local_split_kv <= get<3>(blk_coord))
+	  if (local_split_kv <= get<3>(blk_coord))
            continue;
          mma(blk_coord,
            problem_shape,
@ -711,7 +711,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
            pipeline_mma_s, pipeline_mma_s_producer_state,
            pipeline_p_mma, pipeline_p_mma_consumer_state,
            pipeline_mma_o, pipeline_mma_o_producer_state,
-            local_split_kv
+	    local_split_kv
          );
        }
      }
@ -726,15 +726,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
      for (; tile_scheduler.is_valid(); ++tile_scheduler) {
        auto blk_coord = tile_scheduler.get_block_coord();
        auto problem_shape = params.problem_shape;
-        auto split_kv = params.split_kv;
-        auto local_split_kv = split_kv;
+	auto split_kv = params.split_kv;
+	auto local_split_kv = split_kv;
        if (params.mainloop.ptr_seq != nullptr) {
          get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
-          if (params.ptr_split_kv != nullptr) {
+	  if (params.ptr_split_kv != nullptr) {
            local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
          }
        }
-        if (local_split_kv <= get<3>(blk_coord))
+	if (local_split_kv <= get<3>(blk_coord))
          continue;
        compute(
          blk_coord,
@ -745,7 +745,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
          pipeline_mma_s, pipeline_mma_s_consumer_state,
          pipeline_p_mma, pipeline_p_mma_producer_state,
          pipeline_mma_o, pipeline_mma_o_consumer_state,
-          local_split_kv
+	  local_split_kv
        );
      }

@ -1900,7 +1900,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
      cutlass::arch::NamedBarrier(
          (kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp,
          kNamedBarrierEpilogue
-      ).arrive_and_wait();
+      ).arrive();

      return;
    }
--- a/csrc/cache.h
+++ b/csrc/cache.h
@ -56,11 +56,3 @@ void cp_gather_cache(
    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
    int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
-
-// Indexer K quantization and cache function
-void indexer_k_quant_and_cache(
-    torch::Tensor& k,             // [num_tokens, head_dim]
-    torch::Tensor& kv_cache,      // [num_blocks, block_size, cache_stride]
-    torch::Tensor& slot_mapping,  // [num_tokens]
-    int64_t quant_block_size,     // quantization block size
-    const std::string& scale_fmt);
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@ -16,7 +16,8 @@

 #include <algorithm>
 #include <cassert>
-#include <cfloat>
+#include <map>
+#include <vector>

 #ifdef USE_ROCM
  #include <hip/hip_bf16.h>
@ -208,20 +209,6 @@ void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,

 namespace vllm {

-// Used to copy/convert one element
-template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
-struct CopyWithScaleOp {
-  float scale;
-
-  __device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
-    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-      dst = static_cast<OutT>(src);
-    } else {
-      dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
-    }
-  }
-};
-
 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
 __global__ void reshape_and_cache_kernel(
    const scalar_t* __restrict__ key,    // [num_tokens, num_heads, head_size]
@ -237,51 +224,59 @@ __global__ void reshape_and_cache_kernel(
  const int64_t token_idx = blockIdx.x;
  const int64_t slot_idx = slot_mapping[token_idx];
  if (slot_idx < 0) {
+    // Padding token that should be ignored.
    return;
  }

  const int64_t block_idx = slot_idx / block_size;
  const int64_t block_offset = slot_idx % block_size;
-  const int h_block_count = head_size / x;  // head_size//x

-  const int h_block_idx = threadIdx.x;
-  if (h_block_idx >= num_heads * h_block_count) {
-    return;
-  }
+  const int n = num_heads * head_size;
+  for (int i = threadIdx.x; i < n; i += blockDim.x) {
+    const int64_t src_key_idx = token_idx * key_stride + i;
+    const int64_t src_value_idx = token_idx * value_stride + i;

-  const int head_idx = h_block_idx / h_block_count;
-  const int h_block = h_block_idx % h_block_count;
+    const int head_idx = i / head_size;
+    const int head_offset = i % head_size;
+    const int x_idx = head_offset / x;
+    const int x_offset = head_offset % x;

-  const scalar_t* __restrict__ key_src =
-      key + token_idx * key_stride + head_idx * head_size + h_block * x;
-  const int64_t src_value_start =
-      token_idx * value_stride + head_idx * head_size + h_block * x;
-
-  cache_t* __restrict__ key_dst =
-      key_cache + block_idx * num_heads * h_block_count * block_size * x +
-      head_idx * h_block_count * block_size * x + h_block * block_size * x +
-      block_offset * x;
-  const int64_t tgt_value_start =
-      block_idx * num_heads * h_block_count * x * block_size +
-      head_idx * h_block_count * x * block_size + h_block * x * block_size +
-      block_offset;
-
-  constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
-  float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
-  CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
-  float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
-  CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
-
-  vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op);
-
-  const scalar_t* __restrict__ value_src = value + src_value_start;
-  cache_t* __restrict__ value_dst = value_cache + tgt_value_start;
-#pragma unroll
-  for (int i = 0; i < x; i++) {
-    v_op(value_dst[i * block_size], value_src[i]);
+    const int64_t tgt_key_idx =
+        block_idx * num_heads * (head_size / x) * block_size * x +
+        head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
+        block_offset * x + x_offset;
+    const int64_t tgt_value_idx =
+        block_idx * num_heads * head_size * block_size +
+        head_idx * head_size * block_size + head_offset * block_size +
+        block_offset;
+    scalar_t tgt_key = key[src_key_idx];
+    scalar_t tgt_value = value[src_value_idx];
+    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      key_cache[tgt_key_idx] = tgt_key;
+      value_cache[tgt_value_idx] = tgt_value;
+    } else {
+      key_cache[tgt_key_idx] =
+          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
+      value_cache[tgt_value_idx] =
+          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
+    }
  }
 }

+// Used by vectorization_utils to copy/convert one element
+template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
+struct CopyWithScaleOp {
+  float scale;
+
+  __device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
+    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      dst = static_cast<OutT>(src);
+    } else {
+      dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
+    }
+  }
+};
+
 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
 __global__ void reshape_and_cache_flash_kernel(
    const scalar_t* __restrict__ key,    // [num_tokens, num_heads, head_size]
@ -401,177 +396,6 @@ __global__ void concat_and_cache_mla_kernel(
  copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
 }

-template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
-__global__ void concat_and_cache_ds_mla_kernel(
-    const scalar_t* __restrict__ kv_c,  // [num_tokens, kv_lora_rank]
-    const scalar_t* __restrict__ k_pe,  // [num_tokens, pe_dim]
-    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank
-                                     // + pe_dim)]
-    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
-    const int block_stride,                    //
-    const int entry_stride,                    //
-    const int kv_c_stride,                     //
-    const int k_pe_stride,                     //
-    const int kv_lora_rank,                    //
-    const int pe_dim,                          //
-    const int block_size,                      //
-    const float* scale                         //
-) {
-  const int64_t token_idx = blockIdx.x;
-  const int64_t slot_idx = slot_mapping[token_idx];
-  // NOTE: slot_idx can be -1 if the token is padded
-  if (slot_idx < 0) {
-    return;
-  }
-  const int64_t block_idx = slot_idx / block_size;
-  const int64_t block_offset = slot_idx % block_size;
-  const int64_t dst_idx_start =
-      block_idx * block_stride + block_offset * entry_stride;
-
-  // For the NoPE part, each tile of 128 elements is handled by half of one warp
-  // (16 threads). There are 4 total tiles, so 2 warps (64 threads).
-  // Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
-  // The RoPE part (last 64 elements) is handled by another 1 warp (32 threads).
-  // So in total, we use 3 warps (96 threads) per block.
-
-  // Cast kv_cache to 16_bit for RoPE values
-  scalar_t* kv_cache_16bit =
-      reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);
-
-  // The last warp handles the RoPE part
-  if (threadIdx.x >= 64) {
-    // Each thread handles two elements of RoPE
-    const int8_t pe_idx_start = (threadIdx.x - 64) * 2;
-    const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start;
-    // Vectorized load of two 16-bit values, performed as one 32-bit load
-    const int32_t vals = *reinterpret_cast<const int32_t*>(&k_pe[src_idx]);
-    // RoPE values start after the packed 8-bit NoPE values and the
-    // 32-bit scales
-    const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start;
-    // Vectorized store of two 16-bit values, performed as one 32-bit store
-    *reinterpret_cast<int32_t*>(&kv_cache_16bit[dst_idx]) = vals;
-    return;
-  }
-
-  // The first two warps handle the NoPE part
-  const int8_t warp_idx = threadIdx.x >> 5;
-  const int8_t lane_idx = threadIdx.x & 31;
-  const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4);
-
-  // Each thread handles 8 elements of NoPE
-  // Load the NoPE elements for this thread into registers
-  const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8);
-  // Vectorized load of eight 16-bit values, performed as an int4 load
-  const int4 vals_i4 = *reinterpret_cast<const int4*>(&kv_c[src_idx_start]);
-  const scalar_t* vals = reinterpret_cast<const scalar_t*>(&vals_i4);
-
-  // Max absolute value of this thread's elements
-  float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])),
-                              fmaxf(fabsf(vals[2]), fabsf(vals[3]))),
-                        fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])),
-                              fmaxf(fabsf(vals[6]), fabsf(vals[7]))));
-
-  // Warp-level reduction to find the max absolute value in each half-warp
-#pragma unroll
-  for (int offset = 8; offset > 0; offset /= 2) {
-    max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16));
-  }
-
-  // Compute the scale for the tile
-  float tile_scale = max_abs / 448.f;
-  tile_scale = fmaxf(tile_scale, FLT_MIN);
-
-  // The first lane of each half-warp writes the scale to kv_cache
-  if ((lane_idx == 0) || (lane_idx == 16)) {
-    float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
-    const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
-    kv_cache_32bit[dst_idx] = tile_scale;
-  }
-
-  // Now all threads in the block scale and write their elements
-  // NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes)
-  const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8);
-
-  uint8_t result[8];
-#pragma unroll
-  for (int i = 0; i < 8; i++) {
-    result[i] =
-        fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
-            vals[i], tile_scale);
-  }
-
-  // Store as aligned 64-bit writes
-  *reinterpret_cast<uint64_t*>(&kv_cache[dst_idx_base]) =
-      *reinterpret_cast<const uint64_t*>(result);
-}
-
-template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
-__global__ void indexer_k_quant_and_cache_kernel(
-    const scalar_t* __restrict__ k,  // [num_tokens, head_dim]
-    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, cache_stride]
-    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
-    const int head_dim,                        // dimension of each head
-    const int quant_block_size,                // quantization block size
-    const int cache_block_size,                // cache block size
-    const int cache_stride,  // stride for each token in kv_cache
-    const bool use_ue8m0     // use ue8m0 scale format
-) {
-  constexpr int VEC_SIZE = 4;
-  const int64_t token_idx = blockIdx.x;
-  const int64_t head_dim_idx = (blockIdx.y * blockDim.y * blockDim.x +
-                                threadIdx.y * blockDim.x + threadIdx.x) *
-                               VEC_SIZE;
-  const int64_t slot_idx = slot_mapping[token_idx];
-  const int64_t block_idx = slot_idx / cache_block_size;
-  const int64_t block_offset = slot_idx % cache_block_size;
-
-  // NOTE: slot_idx can be -1 if the token is padded
-  if (slot_idx < 0 || (head_dim_idx >= head_dim)) {
-    return;
-  }
-
-  float2 k_val = (reinterpret_cast<const float2*>(
-      k))[(token_idx * head_dim + head_dim_idx) / VEC_SIZE];
-  scalar_t* k_val_ptr = reinterpret_cast<scalar_t*>(&k_val);
-  float amax = 0.0f;
-  for (int i = 0; i < VEC_SIZE; i++) {
-    amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
-  }
-#ifndef USE_ROCM
-  __syncwarp();
-#endif
-
-  // Reduced amax
-  for (int mask = 16; mask > 0; mask /= 2) {
-#ifdef USE_ROCM
-    amax = fmaxf(amax, __shfl_xor_sync(uint64_t(-1), amax, mask));
-#else
-    amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
-#endif
-  }
-#ifndef USE_ROCM
-  __syncwarp();
-#endif
-  float scale = fmaxf(amax, 1e-4) / 448.0f;
-  if (use_ue8m0) {
-    scale = exp2f(ceilf(log2f(scale)));
-  }
-
-  const int64_t dst_offset = block_idx * cache_block_size * cache_stride +
-                             block_offset * head_dim + head_dim_idx;
-  for (int i = 0; i < VEC_SIZE; i++) {
-    kv_cache[dst_offset + i] =
-        fp8::scaled_convert<cache_t, scalar_t, kv_dt>(k_val_ptr[i], scale);
-  }
-  if (threadIdx.x == 0) {
-    const int64_t dst_scale_idx =
-        block_idx * cache_block_size * cache_stride +
-        cache_block_size * head_dim +
-        (block_offset * head_dim + head_dim_idx) * 4 / quant_block_size;
-    reinterpret_cast<float*>(kv_cache)[dst_scale_idx / 4] = scale;
-  }
-}
-
 }  // namespace vllm

 // KV_T is the data type of key and value tensors.
@ -607,15 +431,14 @@ void reshape_and_cache(

  int key_stride = key.stride(0);
  int value_stride = value.stride(0);
-  int head_div_x = head_size / x;

  dim3 grid(num_tokens);
-  dim3 block(std::min(num_heads * head_div_x, 512));
+  dim3 block(std::min(num_heads * head_size, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype,
-                             CALL_RESHAPE_AND_CACHE);
+                             CALL_RESHAPE_AND_CACHE)
 }

 // KV_T is the data type of key and value tensors.
@ -686,18 +509,6 @@ void reshape_and_cache_flash(
          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \
          reinterpret_cast<const float*>(scale.data_ptr()));

-// KV_T is the data type of key and value tensors.
-// CACHE_T is the stored data type of kv-cache.
-#define CALL_CONCAT_AND_CACHE_DS_MLA(KV_T, CACHE_T, KV_DTYPE)           \
-  vllm::concat_and_cache_ds_mla_kernel<KV_T, CACHE_T, KV_DTYPE>         \
-      <<<grid, block, 0, stream>>>(                                     \
-          reinterpret_cast<KV_T*>(kv_c.data_ptr()),                     \
-          reinterpret_cast<KV_T*>(k_pe.data_ptr()),                     \
-          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),              \
-          slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
-          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \
-          reinterpret_cast<const float*>(scale.data_ptr()));
-
 void concat_and_cache_mla(
    torch::Tensor& kv_c,          // [num_tokens, kv_lora_rank]
    torch::Tensor& k_pe,          // [num_tokens, pe_dim]
@ -720,43 +531,20 @@ void concat_and_cache_mla(
  int pe_dim = k_pe.size(1);
  int block_size = kv_cache.size(1);

-  if (kv_cache_dtype == "fp8_ds_mla") {
-    TORCH_CHECK(kv_lora_rank == 512, "kv_lora_rank must be 512 for fp8_ds_mla");
-    TORCH_CHECK(pe_dim == 64, "pe_dim must be 64 for fp8_ds_mla");
-    TORCH_CHECK(kv_cache.size(2) == 656 / kv_cache.itemsize(),
-                "kv_cache.size(2) must be 656 bytes for fp8_ds_mla");
-    TORCH_CHECK(kv_c.itemsize() == 2,
-                "kv_c.itemsize() must be 2 for fp8_ds_mla");
-    TORCH_CHECK(k_pe.itemsize() == 2,
-                "k_pe.itemsize() must be 2 for fp8_ds_mla");
-  } else {
-    TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
-  }
+  TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);

  int kv_c_stride = kv_c.stride(0);
  int k_pe_stride = k_pe.stride(0);
  int block_stride = kv_cache.stride(0);
  int entry_stride = kv_cache.stride(1);

+  dim3 grid(num_tokens);
+  dim3 block(std::min(kv_lora_rank, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

-  if (kv_cache_dtype == "fp8_ds_mla") {
-    dim3 grid(num_tokens);
-    // For the NoPE part, each tile of 128 elements is handled by half of one
-    // warp (16 threads). There are 4 total tiles, so 2 warps (64 threads).
-    // Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
-    // The RoPE part (last 64 elements) is handled by another 1 warp (32
-    // threads). So in total, we use 3 warps (96 threads) per block.
-    dim3 block(96);
-    DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
-                               CALL_CONCAT_AND_CACHE_DS_MLA);
-  } else {
-    dim3 grid(num_tokens);
-    dim3 block(std::min(kv_lora_rank, 512));
-    DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
-                               CALL_CONCAT_AND_CACHE_MLA);
-  }
+  DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
+                             CALL_CONCAT_AND_CACHE_MLA);
 }

 namespace vllm {
@ -1134,42 +922,3 @@ void cp_gather_cache(
    TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
  }
 }
-
-// Macro to dispatch the kernel based on the data type.
-#define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE)         \
-  vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE>       \
-      <<<grid, block, 0, stream>>>(                                     \
-          reinterpret_cast<KV_T*>(k.data_ptr()),                        \
-          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),              \
-          slot_mapping.data_ptr<int64_t>(), head_dim, quant_block_size, \
-          cache_block_size, cache_stride, use_ue8m0);
-
-void indexer_k_quant_and_cache(
-    torch::Tensor& k,             // [num_tokens, head_dim]
-    torch::Tensor& kv_cache,      // [num_blocks, block_size, cache_stride]
-    torch::Tensor& slot_mapping,  // [num_tokens]
-    int64_t quant_block_size,     // quantization block size
-    const std::string& scale_fmt) {
-  int num_tokens = k.size(0);
-  int head_dim = k.size(1);
-  int cache_block_size = kv_cache.size(1);
-  int cache_stride = kv_cache.size(2);
-  bool use_ue8m0 = scale_fmt == "ue8m0";
-
-  TORCH_CHECK(k.device() == kv_cache.device(),
-              "k and kv_cache must be on the same device");
-  TORCH_CHECK(k.device() == slot_mapping.device(),
-              "k and slot_mapping must be on the same device");
-  TORCH_CHECK(head_dim % quant_block_size == 0,
-              "head_dim must be divisible by quant_block_size");
-
-  constexpr int vec_size = 4;
-  dim3 grid(num_tokens, (head_dim + quant_block_size * vec_size - 1) /
-                            (quant_block_size * vec_size));
-  dim3 block(32, vec_size);
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(k));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
-                             CALL_INDEXER_K_QUANT_AND_CACHE);
-}
--- a/csrc/core/batch_invariant.hpp
+++ b/csrc/core/batch_invariant.hpp
@ -1,16 +0,0 @@
-#pragma once
-#include <cstdlib>
-#include <string>
-#include <cctype>
-
-namespace vllm {
-
-// vllm_kernel_override_batch_invariant(); returns true
-// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
-inline bool vllm_kernel_override_batch_invariant() {
-  std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
-  const char* val = std::getenv(env_key.c_str());
-  return (val && std::atoi(val) != 0) ? 1 : 0;
-}
-
-}  // namespace vllm
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@ -14,12 +14,7 @@
  // arm implementation
  #include "cpu_types_arm.hpp"
 #else
-  #warning "unsupported vLLM cpu implementation, vLLM will compile with scalar"
-  #include "cpu_types_scalar.hpp"
-#endif
-
-#ifdef _OPENMP
-  #include <omp.h>
+  #warning "unsupported vLLM cpu implementation"
 #endif

 #endif
--- a/csrc/cpu/cpu_types_scalar.hpp
+++ b/csrc/cpu/cpu_types_scalar.hpp
@ -1,513 +0,0 @@
-#include <cmath>
-#include <cstdint>
-#include <cstring>
-#include <torch/all.h>
-#include "float_convert.hpp"
-
-namespace vec_op {
-
-#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)            \
-  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
-  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
-  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
-
-#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
-  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
-
-#ifndef CPU_OP_GUARD
-  #define CPU_KERNEL_GUARD_IN(NAME)
-  #define CPU_KERNEL_GUARD_OUT(NAME)
-#else
-  #define CPU_KERNEL_GUARD_IN(NAME) \
-    std::cout << #NAME << " invoked." << std::endl;
-  #define CPU_KERNEL_GUARD_OUT(NAME) \
-    std::cout << #NAME << " exit." << std::endl;
-#endif
-
-#define FORCE_INLINE __attribute__((always_inline)) inline
-
-#define __max(a, b) ((a) > (b) ? (a) : (b))
-#define __min(a, b) ((a) < (b) ? (a) : (b))
-#define __abs(a) ((a) < (0) ? (0 - a) : (a))
-
-typedef struct f16x8_t {
-  uint16_t val[8];
-} f16x8_t;
-
-typedef struct f16x16_t {
-  uint16_t val[16];
-} f16x16_t;
-
-typedef struct f16x32_t {
-  uint16_t val[32];
-} f16x32_t;
-
-typedef struct f32x4_t {
-  float val[4];
-} f32x4_t;
-
-typedef struct f32x8_t {
-  float val[8];
-} f32x8_t;
-
-typedef struct f32x16_t {
-  float val[16];
-} f32x16_t;
-
-namespace {
-template <typename T, T... indexes, typename F>
-constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
-  (f(std::integral_constant<T, indexes>{}), ...);
-};
-};  // namespace
-
-template <typename T, T count, typename F,
-          typename = std::enable_if_t<std::is_invocable_v<F, T> > >
-constexpr void unroll_loop(F&& f) {
-  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
-}
-
-template <typename T>
-struct Vec {
-  constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
-};
-
-struct FP32Vec8;
-struct FP32Vec16;
-
-struct FP16Vec8 : public Vec<FP16Vec8> {
-  constexpr static int VEC_ELEM_NUM = 8;
-  f16x8_t reg;
-
-  explicit FP16Vec8(const void* ptr)
-      : reg(*reinterpret_cast<const f16x8_t*>(ptr)) {};
-
-  explicit FP16Vec8(const FP32Vec8&);
-
-  void save(void* ptr) const { *reinterpret_cast<f16x8_t*>(ptr) = reg; }
-};
-
-struct FP16Vec16 : public Vec<FP16Vec16> {
-  constexpr static int VEC_ELEM_NUM = 16;
-  f16x16_t reg;
-
-  explicit FP16Vec16(const void* ptr)
-      : reg(*reinterpret_cast<const f16x16_t*>(ptr)) {};
-
-  explicit FP16Vec16(const FP32Vec16&);
-
-  void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }
-
-  void save(void* ptr, const int elem_num) const {
-    int num = __min(elem_num, VEC_ELEM_NUM);
-    std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
-  }
-};
-
-struct BF16Vec8 : public Vec<BF16Vec8> {
-  constexpr static int VEC_ELEM_NUM = 8;
-  f16x8_t reg;
-
-  explicit BF16Vec8(const void* ptr)
-      : reg(*reinterpret_cast<const f16x8_t*>(ptr)) {};
-
-  explicit BF16Vec8(const FP32Vec8&);
-
-  void save(void* ptr) const { *reinterpret_cast<f16x8_t*>(ptr) = reg; }
-};
-
-struct BF16Vec16 : public Vec<BF16Vec16> {
-  constexpr static int VEC_ELEM_NUM = 16;
-  f16x16_t reg;
-
-  explicit BF16Vec16(const void* ptr)
-      : reg(*reinterpret_cast<const f16x16_t*>(ptr)) {};
-
-  explicit BF16Vec16(const FP32Vec16&);
-
-  void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }
-
-  void save(void* ptr, const int elem_num) const {
-    int num = __min(elem_num, VEC_ELEM_NUM);
-    std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
-  }
-};
-
-struct BF16Vec32 : public Vec<BF16Vec32> {
-  constexpr static int VEC_ELEM_NUM = 32;
-  f16x32_t reg;
-
-  explicit BF16Vec32(const void* ptr)
-      : reg(*reinterpret_cast<const f16x32_t*>(ptr)) {};
-
-  explicit BF16Vec32(f16x32_t data) : reg(data) {};
-
-  explicit BF16Vec32(BF16Vec8& vec8_data) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = vec8_data.reg.val[i % BF16Vec8::VEC_ELEM_NUM];
-    }
-  }
-
-  void save(void* ptr) const { *reinterpret_cast<f16x32_t*>(ptr) = reg; }
-};
-
-struct FP32Vec4 : public Vec<FP32Vec4> {
-  constexpr static int VEC_ELEM_NUM = 4;
-
-  f32x4_t reg;
-
-  explicit FP32Vec4(float v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = v;
-    }
-  }
-
-  explicit FP32Vec4() {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = 0.0f;
-    }
-  }
-
-  explicit FP32Vec4(const float* ptr)
-      : reg(*reinterpret_cast<const f32x4_t*>(ptr)) {};
-
-  explicit FP32Vec4(f32x4_t data) : reg(data) {};
-
-  explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {};
-};
-
-struct FP32Vec8 : public Vec<FP32Vec8> {
-  constexpr static int VEC_ELEM_NUM = 8;
-
-  f32x8_t reg;
-
-  explicit FP32Vec8(float v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = v;
-    }
-  }
-
-  explicit FP32Vec8() {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = 0.0f;
-    }
-  }
-
-  explicit FP32Vec8(const float* ptr)
-      : reg(*reinterpret_cast<const f32x8_t*>(ptr)) {};
-
-  explicit FP32Vec8(f32x8_t data) : reg(data) {};
-
-  explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {};
-
-  explicit FP32Vec8(const FP16Vec8& v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = fp16_to_float(v.reg.val[i]);
-    }
-  }
-
-  FP32Vec8(const BF16Vec8& v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = bf16_to_float(v.reg.val[i]);
-    }
-  }
-
-  float reduce_sum() const {
-    float result = 0;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result += reg.val[i];
-    }
-    return result;
-  }
-
-  FP32Vec8 exp() const {
-    f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = expf(reg.val[i]);
-    }
-    return FP32Vec8(ret);
-  }
-
-  FP32Vec8 tanh() const {
-    f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = tanhf(reg.val[i]);
-    }
-    return FP32Vec8(ret);
-  }
-
-  FP32Vec8 er() const {
-    f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = erf(reg.val[i]);
-    }
-    return FP32Vec8(ret);
-  }
-
-  FP32Vec8 operator*(const FP32Vec8& b) const {
-    f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = reg.val[i] * b.reg.val[i];
-    }
-    return FP32Vec8(ret);
-  }
-
-  FP32Vec8 operator+(const FP32Vec8& b) const {
-    f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = reg.val[i] + b.reg.val[i];
-    }
-    return FP32Vec8(ret);
-  }
-
-  FP32Vec8 operator-(const FP32Vec8& b) const {
-    f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = reg.val[i] - b.reg.val[i];
-    }
-    return FP32Vec8(ret);
-  }
-
-  FP32Vec8 operator/(const FP32Vec8& b) const {
-    f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = reg.val[i] / b.reg.val[i];
-    }
-    return FP32Vec8(ret);
-  }
-
-  void save(void* ptr) const { *reinterpret_cast<f32x8_t*>(ptr) = reg; }
-};
-
-struct FP32Vec16 : public Vec<FP32Vec16> {
-  constexpr static int VEC_ELEM_NUM = 16;
-  f32x16_t reg;
-
-  explicit FP32Vec16(float v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = v;
-    }
-  }
-
-  explicit FP32Vec16() {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = 0.0f;
-    }
-  }
-
-  explicit FP32Vec16(const float* ptr)
-      : reg(*reinterpret_cast<const f32x16_t*>(ptr)) {};
-
-  explicit FP32Vec16(f32x16_t data) : reg(data) {};
-
-  FP32Vec16(const FP32Vec4& data) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM];
-    }
-  }
-
-  FP32Vec16(const FP32Vec8& data) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM];
-    }
-  }
-
-  FP32Vec16(const FP32Vec16& data) : reg(data.reg) {};
-
-  explicit FP32Vec16(const FP16Vec16& v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = fp16_to_float(v.reg.val[i]);
-    }
-  }
-
-  explicit FP32Vec16(const BF16Vec16& v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = bf16_to_float(v.reg.val[i]);
-    }
-  }
-
-  explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
-
-  FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
-
-  FP32Vec16 operator*(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = reg.val[i] * b.reg.val[i];
-    }
-    return result;
-  }
-
-  FP32Vec16 operator+(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = reg.val[i] + b.reg.val[i];
-    }
-    return result;
-  }
-
-  FP32Vec16 operator-(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = reg.val[i] - b.reg.val[i];
-    }
-    return result;
-  }
-
-  FP32Vec16 operator/(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = reg.val[i] / b.reg.val[i];
-    }
-    return result;
-  }
-
-  FP32Vec16 max(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = __max(reg.val[i], b.reg.val[i]);
-    }
-    return result;
-  }
-
-  FP32Vec16 min(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = __min(reg.val[i], b.reg.val[i]);
-    }
-    return result;
-  }
-
-  FP32Vec16 abs() const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = __abs(reg.val[i]);
-    }
-    return result;
-  }
-
-  float reduce_sum() const {
-    float result = 0.0f;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result += reg.val[i];
-    }
-    return result;
-  }
-
-  float reduce_max() const {
-    float result = reg.val[0];
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result = __max(reg.val[i], result);
-    }
-    return result;
-  }
-
-  float reduce_min() const {
-    float result = reg.val[0];
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result = __min(reg.val[i], result);
-    }
-    return result;
-  }
-
-  template <int group_size>
-  float reduce_sub_sum(int idx) {
-    static_assert(VEC_ELEM_NUM % group_size == 0);
-    float sum = 0.0;
-    int start = idx * group_size;
-    int end = (idx + 1) * group_size;
-
-    for (; (start < VEC_ELEM_NUM) && (start < end); ++start) {
-      sum += reg.val[start];
-    }
-
-    return sum;
-  }
-
-  void save(void* ptr) const { *reinterpret_cast<f32x16_t*>(ptr) = reg; }
-};
-
-template <typename T>
-struct VecType {
-  using vec_type = void;
-};
-
-template <typename T>
-using vec_t = typename VecType<T>::vec_type;
-
-template <>
-struct VecType<float> {
-  using vec_type = FP32Vec8;
-};
-
-template <>
-struct VecType<c10::Half> {
-  using vec_type = FP16Vec8;
-};
-
-template <>
-struct VecType<c10::BFloat16> {
-  using vec_type = BF16Vec8;
-};
-
-template <typename T>
-void storeFP32(float v, T* ptr) {
-  *ptr = v;
-}
-
-/*
-template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
-  c10::Half __attribute__((__may_alias__)) *v_ptr =
-      reinterpret_cast<c10::Half *>(&v);
-  *ptr = *(v_ptr + 1);
-}
-*/
-
-template <>
-inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
-  uint16_t fp16 = float_to_fp16(v);
-  *reinterpret_cast<uint16_t*>(ptr) = fp16;
-}
-
-template <>
-inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
-  c10::BFloat16 __attribute__((__may_alias__))* v_ptr =
-      reinterpret_cast<c10::BFloat16*>(&v);
-  *ptr = *(v_ptr + 1);
-}
-
-inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
-  int i = 0;
-  for (i = 0; i < FP16Vec16::VEC_ELEM_NUM; ++i) {
-    reg.val[i] = float_to_fp16(v.reg.val[i]);
-  }
-}
-
-inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) {
-  int i = 0;
-  for (i = 0; i < FP16Vec8::VEC_ELEM_NUM; ++i) {
-    reg.val[i] = float_to_fp16(v.reg.val[i]);
-  }
-}
-
-inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
-  acc = acc + a * b;
-}
-
-inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
-  int i = 0;
-  for (i = 0; i < BF16Vec8::VEC_ELEM_NUM; ++i) {
-    reg.val[i] = float_to_bf16(v.reg.val[i]);
-  }
-}
-
-inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
-  int i = 0;
-  for (i = 0; i < BF16Vec16::VEC_ELEM_NUM; ++i) {
-    reg.val[i] = float_to_bf16(v.reg.val[i]);
-  }
-}
-
-inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 3); }
-
-};  // namespace vec_op
--- a/csrc/cpu/cpu_types_vxe.hpp
+++ b/csrc/cpu/cpu_types_vxe.hpp
@ -12,7 +12,7 @@ namespace vec_op {
 #define vec_sub(a, b) ((a) - (b))
 #define vec_mul(a, b) ((a) * (b))
 #define vec_div(a, b) ((a) / (b))
-#define vec_sr(a, b) ((a) >> (b))  // Vector Shift Right Algebraic
+#define vec_sr(a, b) ((a) >> (b))  // Vector Shift Right Algebaic
 #define vec_sl(a, b) ((a) << (b))  // Vector Shift Left

 // FIXME: FP16 is not fully supported in Torch-CPU
--- a/csrc/cpu/dnnl_helper.cpp
+++ b/csrc/cpu/dnnl_helper.cpp
@ -137,8 +137,9 @@ DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler(
 }

 void DNNLMatMulPrimitiveHandler::prepack_weight(
-    void* original_b_ptr, dnnl::memory::desc original_b_md,
-    dnnl::memory::desc b_target_mem_desc) {
+    void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) {
+  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
+                                   {b_k_stride_, b_n_stride_});
  dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr);
  dnnl::memory packed_weight(b_target_mem_desc, default_engine());
  {
@ -249,9 +250,7 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
  if (a_qs_ == QuantizationStrategy::PER_TOKEN) {
    assert(!use_azp_);
  };
-  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
-                                   {b_k_stride_, b_n_stride_});
-  prepack_weight(args.b_ptr, original_b_md,
+  prepack_weight(args.b_ptr,
                 create_primitive_desc(
                     MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
                                   .use_bias = false,
@ -413,25 +412,12 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
  assert(ab_type_ == dnnl::memory::data_type::f32 ||
         ab_type_ == dnnl::memory::data_type::bf16 ||
         ab_type_ == dnnl::memory::data_type::f16);
-
-  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
-                                   {b_k_stride_, b_n_stride_});
-
-  prepack_weight(args.b_ptr, original_b_md,
+  prepack_weight(args.b_ptr,
                 create_primitive_desc(
-                     MSizeCacheKey{
-#ifdef VLLM_USE_ACL
-                         // Arm Compute Library (ACL) backend for oneDNN does
-                         // not support runtime
-                         // dimensions, so we set M to a default value
-                         .a_m_size = 128,
-                         .a_m_stride = b_k_size_,
-#else
-                         .a_m_size = DNNL_RUNTIME_DIM_VAL,
-                         .a_m_stride = DNNL_RUNTIME_DIM_VAL,
-#endif
-                         .use_bias = false,
-                         .bias_type = dnnl::memory::data_type::undef},
+                     MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
+                                   .a_m_stride = DNNL_RUNTIME_DIM_VAL,
+                                   .use_bias = false,
+                                   .bias_type = dnnl::memory::data_type::undef},
                     true)
                     .weights_desc());
  init_runtime_memory_cache(args);
@ -457,30 +443,12 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
  c_storage->set_data_handle((void*)args.c_ptr);
  c_mem_desc->dims[0] = args.a_m_size;

-#ifndef VLLM_USE_ACL
-  // We do not support in ACL backend of oneDNN, we handle bias by:
-  // 1. copying it into the result tensor
-  // 2. attaching a fused-sum post-op to the matmul primitive
  if (args.use_bias) {
    auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
    bias_storage->set_data_handle((void*)args.bias_ptr);
  }
-#endif
-  dnnl::matmul matmul = get_matmul_cache(args);

-// With ACL backend of oneDNN, the required memory format might change when the
-// source tensor dims change. This does not really happen in practice, so isn't
-// a performance hit, but we need to support it because the API allows for it.
-#ifdef VLLM_USE_ACL
-  auto new_expected_wei_desc =
-      dnnl::matmul::primitive_desc(
-          const_cast<dnnl_primitive_desc_t>(matmul.get_primitive_desc()))
-          .weights_desc();
-  if (new_expected_wei_desc != b_target_mem_desc_) {
-    prepack_weight(memory_cache_[DNNL_ARG_WEIGHTS].get_data_handle(),
-                   b_target_mem_desc_, new_expected_wei_desc);
-  }
-#endif
+  dnnl::matmul matmul = get_matmul_cache(args);

  auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
  scratchpad_storage->set_data_handle(
@ -516,13 +484,7 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
  } else {
    a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
                              {key.a_m_stride, 1});
-#ifdef VLLM_USE_ACL
-    // ACL's backend of oneDNN always expects the weight format to be "any"
-    b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
-                              dnnl::memory::format_tag::any);
-#else
    b_md = b_target_mem_desc_;
-#endif
  }
  dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
                          dnnl::memory::format_tag::ab);
@ -532,18 +494,8 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(

  if (key.use_bias) {
    dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
-// Since ACL's matmuls don't support passing a bias_md, we apply the bias
-// through a fused-sum post-op
-#ifdef VLLM_USE_ACL
-    dnnl::post_ops post_ops;
-    post_ops.append_sum();
-    attr.set_post_ops(post_ops);
-    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
-                                        attr);
-#else
    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
                                        c_md, attr);
-#endif
  } else {
    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
                                        attr);
@ -559,23 +511,13 @@ void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
                   default_engine(), nullptr);
  set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());

-// ACL matmuls don't support bias_md, so we don't need these
-#ifndef VLLM_USE_ACL
  memory_cache_[DNNL_ARG_BIAS] =
      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                   default_engine(), nullptr);
  set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
-#endif
+
  memory_cache_[DNNL_ARG_SCRATCHPAD] =
      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                   default_engine(), nullptr);
  set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
 }
-
-bool is_onednn_acl_supported() {
-#ifdef VLLM_USE_ACL
-  return true;
-#else
-  return false;
-#endif
-}
--- a/csrc/cpu/dnnl_helper.h
+++ b/csrc/cpu/dnnl_helper.h
@ -101,7 +101,7 @@ class DNNLMatMulPrimitiveHandler {
 protected:
  DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type);

-  void prepack_weight(void* original_b_ptr, dnnl::memory::desc original_b_md,
+  void prepack_weight(void* original_b_ptr,
                      dnnl::memory::desc b_target_mem_desc);

  void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr);
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@ -523,46 +523,25 @@ void onednn_mm(torch::Tensor& c,        // [M, OC], row-major
  CPU_KERNEL_GUARD_IN(onednn_mm)
  TORCH_CHECK(a.dim() == 2);
  TORCH_CHECK(a.stride(-1) == 1);
-  TORCH_CHECK(c.stride(-1) == 1);
+  TORCH_CHECK(c.is_contiguous());
  MatMulPrimitiveHandler* ptr =
      reinterpret_cast<MatMulPrimitiveHandler*>(handler);

-// ACL matmuls expect contiguous source tensors
-#ifdef VLLM_USE_ACL
-  torch::Tensor a_contig = a.contiguous();
-#endif
-
  MatMulPrimitiveHandler::ExecArgs exec_args;
-
-#ifdef VLLM_USE_ACL
-  exec_args.a_m_size = a_contig.size(0);
-  exec_args.a_m_stride = a_contig.stride(0);
-#else
  exec_args.a_m_size = a.size(0);
  exec_args.a_m_stride = a.stride(0);
-#endif
+
  VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
    if (bias.has_value()) {
      exec_args.use_bias = true;
      exec_args.bias_type = get_dnnl_type<scalar_t>();
-#ifdef VLLM_USE_ACL
-      // ACL matmuls in oneDNN do not support a bias.
-      // We handle a matmul with bias by doing: c = bias; c += matmul(a, b)
-      c.copy_(bias.value());
-#else
      exec_args.bias_ptr = bias->data_ptr<scalar_t>();
-#endif
    } else {
      exec_args.use_bias = false;
      exec_args.bias_type = get_dnnl_type<void>();
      exec_args.bias_ptr = nullptr;
    }
-#ifdef VLLM_USE_ACL
-    exec_args.a_ptr = a_contig.data_ptr<scalar_t>();
-#else
    exec_args.a_ptr = a.data_ptr<scalar_t>();
-
-#endif
    exec_args.c_ptr = c.data_ptr<scalar_t>();

    ptr->execute(exec_args);
--- a/csrc/cpu/float_convert.hpp
+++ b/csrc/cpu/float_convert.hpp
@ -1,106 +0,0 @@
-
-static float bf16_to_float(uint16_t bf16) {
-  uint32_t bits = static_cast<uint32_t>(bf16) << 16;
-  float fp32;
-  std::memcpy(&fp32, &bits, sizeof(fp32));
-  return fp32;
-}
-
-static uint16_t float_to_bf16(float fp32) {
-  uint32_t bits;
-  std::memcpy(&bits, &fp32, sizeof(fp32));
-  return static_cast<uint16_t>(bits >> 16);
-}
-
-/************************************************
- * Copyright (c) 2015 Princeton Vision Group
- * Licensed under the MIT license.
- * Codes below copied from
- * https://github.com/PrincetonVision/marvin/tree/master/tools/tensorIO_matlab
- *************************************************/
-static uint16_t float_to_fp16(float fp32) {
-  uint16_t fp16;
-
-  unsigned x;
-  unsigned u, remainder, shift, lsb, lsb_s1, lsb_m1;
-  unsigned sign, exponent, mantissa;
-
-  std::memcpy(&x, &fp32, sizeof(fp32));
-  u = (x & 0x7fffffff);
-
-  // Get rid of +NaN/-NaN case first.
-  if (u > 0x7f800000) {
-    fp16 = 0x7fffU;
-    return fp16;
-  }
-
-  sign = ((x >> 16) & 0x8000);
-
-  // Get rid of +Inf/-Inf, +0/-0.
-  if (u > 0x477fefff) {
-    fp16 = sign | 0x7c00U;
-    return fp16;
-  }
-  if (u < 0x33000001) {
-    fp16 = (sign | 0x0000);
-    return fp16;
-  }
-
-  exponent = ((u >> 23) & 0xff);
-  mantissa = (u & 0x7fffff);
-
-  if (exponent > 0x70) {
-    shift = 13;
-    exponent -= 0x70;
-  } else {
-    shift = 0x7e - exponent;
-    exponent = 0;
-    mantissa |= 0x800000;
-  }
-  lsb = (1 << shift);
-  lsb_s1 = (lsb >> 1);
-  lsb_m1 = (lsb - 1);
-
-  // Round to nearest even.
-  remainder = (mantissa & lsb_m1);
-  mantissa >>= shift;
-  if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
-    ++mantissa;
-    if (!(mantissa & 0x3ff)) {
-      ++exponent;
-      mantissa = 0;
-    }
-  }
-
-  fp16 = (sign | (exponent << 10) | mantissa);
-
-  return fp16;
-}
-
-static float fp16_to_float(uint16_t fp16) {
-  unsigned sign = ((fp16 >> 15) & 1);
-  unsigned exponent = ((fp16 >> 10) & 0x1f);
-  unsigned mantissa = ((fp16 & 0x3ff) << 13);
-  int temp;
-  float fp32;
-  if (exponent == 0x1f) { /* NaN or Inf */
-    mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
-    exponent = 0xff;
-  } else if (!exponent) { /* Denorm or Zero */
-    if (mantissa) {
-      unsigned int msb;
-      exponent = 0x71;
-      do {
-        msb = (mantissa & 0x400000);
-        mantissa <<= 1; /* normalize */
-        --exponent;
-      } while (!msb);
-      mantissa &= 0x7fffff; /* 1.mantissa is implicit */
-    }
-  } else {
-    exponent += 0x70;
-  }
-  temp = ((sign << 31) | (exponent << 23) | mantissa);
-  std::memcpy(&fp32, &temp, sizeof(temp));
-  return fp32;
-}
--- a/csrc/cpu/sgl-kernels/moe.cpp
+++ b/csrc/cpu/sgl-kernels/moe.cpp
@ -215,7 +215,7 @@ int moe_align_block_size(
      offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M);
    }
  });
-  // TODO: do we need to vectorize this ?
+  // TODO: do we need to vecterize this ?
  for (int mb = 0; mb < num_token_blocks; ++mb) {
    offsets[mb + 1] += offsets[mb];
  }
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@ -27,8 +27,6 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
 void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
               const std::optional<torch::Tensor>& bias, int64_t handler);

-bool is_onednn_acl_supported();
-
 void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                        torch::Tensor& kv_cache, double scale,
                        torch::Tensor& block_tables, torch::Tensor& seq_lens);
@ -90,18 +88,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "    int tp_rank, int blocksparse_local_blocks,"
      "    int blocksparse_vert_stride, int blocksparse_block_size,"
      "    int blocksparse_head_sliding_step) -> ()");
-
  ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);

-  ops.def(
-      "dynamic_4bit_int_moe("
-      "Tensor x, Tensor topk_ids, Tensor topk_weights,"
-      "Tensor w13_packed, Tensor w2_packed, int H, int I, int I2,"
-      "int group_size, bool apply_router_weight_on_input, int activation_kind"
-      ") -> Tensor");
-
-  ops.impl("dynamic_4bit_int_moe", torch::kCPU, &dynamic_4bit_int_moe_cpu);
-
  // PagedAttention V2.
  ops.def(
      "paged_attention_v2("
@ -183,9 +171,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "int handler) -> ()");
  ops.impl("onednn_mm", torch::kCPU, &onednn_mm);

-  // Check if oneDNN was built with ACL backend
-  ops.def("is_onednn_acl_supported() -> bool", &is_onednn_acl_supported);
-
  // Create oneDNN W8A8 handler
  ops.def(
      "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "
--- a/csrc/cub_helpers.h
+++ b/csrc/cub_helpers.h
@ -1,17 +0,0 @@
-#pragma once
-
-#ifndef USE_ROCM
-  #include <cub/cub.cuh>
-  #if CUB_VERSION >= 200800
-    #include <cuda/std/functional>
-using CubAddOp = cuda::std::plus<>;
-using CubMaxOp = cuda::maximum<>;
-  #else   // if CUB_VERSION < 200800
-using CubAddOp = cub::Sum;
-using CubMaxOp = cub::Max;
-  #endif  // CUB_VERSION
-#else
-  #include <hipcub/hipcub.hpp>
-using CubAddOp = cub::Sum;
-using CubMaxOp = cub::Max;
-#endif  // USE_ROCM
--- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
@ -27,7 +27,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
    **{
        VLLMDataType.u4b8: "u4b8",
        VLLMDataType.u8b128: "u8b128",
-    },
+    }
 }

 VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
@ -35,7 +35,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
    **{
        VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
        VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t",
-    },
+    }
 }

 VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
@ -43,7 +43,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
    **{
        VLLMDataType.u4b8: 4,
        VLLMDataType.u8b128: 8,
-    },
+    }
 }

 VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
@ -67,13 +67,15 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
    DataType.f32: "at::ScalarType::Float",
 }

-VLLMKernelScheduleTag: dict[
-    Union[MixedInputKernelScheduleType, KernelScheduleType], str
-] = {
-    **KernelScheduleTag,  # type: ignore
-    **{
-        MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",  # noqa: E501
-        MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong",  # noqa: E501
-        MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative",  # noqa: E501
-    },
-}
+VLLMKernelScheduleTag: dict[Union[
+    MixedInputKernelScheduleType, KernelScheduleType], str] = {
+        **KernelScheduleTag,  # type: ignore
+        **{
+            MixedInputKernelScheduleType.TmaWarpSpecialized:
+            "cutlass::gemm::KernelTmaWarpSpecialized",
+            MixedInputKernelScheduleType.TmaWarpSpecializedPingpong:
+            "cutlass::gemm::KernelTmaWarpSpecializedPingpong",
+            MixedInputKernelScheduleType.TmaWarpSpecializedCooperative:
+            "cutlass::gemm::KernelTmaWarpSpecializedCooperative",
+        }
+    }
--- a/csrc/launch_bounds_utils.h
+++ b/csrc/launch_bounds_utils.h
@ -1,64 +0,0 @@
-#pragma once
-
-#include <cuda_runtime_api.h>
-#include <algorithm>
-
-// maximum blocks per SM cap
-#ifndef VLLM_LAUNCH_BLOCKS_CAP
-  #define VLLM_LAUNCH_BLOCKS_CAP 4
-#endif
-
-// Compile-time estimate of max threads per SM for launch bounds.
-// Families: 1024, 1536, 2048 threads/SM.
-#ifndef VLLM_MAX_THREADS_PER_SM
-  #ifdef __CUDA_ARCH__
-
-    /* 1024 thr/SM: Turing (sm_75) */
-    #if (__CUDA_ARCH__ == 750)
-      #define VLLM_MAX_THREADS_PER_SM 1024
-
-    /* 1536 thr/SM: Ampere GA10x (sm_86/87), Ada (sm_89),
-        GB20x consumer (sm_120/121), Thor (sm_101 or sm_110) */
-    #elif (__CUDA_ARCH__ == 860) || (__CUDA_ARCH__ == 870) || \
-        (__CUDA_ARCH__ == 890) || (__CUDA_ARCH__ == 1010) ||  \
-        (__CUDA_ARCH__ == 1100) || (__CUDA_ARCH__ == 1200) || \
-        (__CUDA_ARCH__ == 1210)
-      #define VLLM_MAX_THREADS_PER_SM 1536
-
-    /* 2048 thr/SM: Volta (sm_70/72), Ampere GA100 (sm_80),
-        Hopper (sm_90), Blackwell (sm_100/103) */
-    #elif (__CUDA_ARCH__ == 700) || (__CUDA_ARCH__ == 720) || \
-        (__CUDA_ARCH__ == 800) || (__CUDA_ARCH__ == 900) ||   \
-        (__CUDA_ARCH__ == 1000) || (__CUDA_ARCH__ == 1030)
-      #define VLLM_MAX_THREADS_PER_SM 2048
-
-    /* Fallback: use 2048 for unknown future CCs */
-    #else
-      #define VLLM_MAX_THREADS_PER_SM 2048
-    #endif
-
-  #else
-  /* Host pass (no __CUDA_ARCH__): neutral default */
-    #define VLLM_MAX_THREADS_PER_SM 2048
-  #endif
-#endif
-
-// compute the number of blocks per SM to request in __launch_bounds__
-#define VLLM_BLOCKS_DIV(VAL) (VLLM_MAX_THREADS_PER_SM / (VAL))
-#define VLLM_CLAMP_BLOCKS_PER_SM(VAL) \
-  (((VAL) <= 0)                       \
-       ? 1                            \
-       : (((VAL) < VLLM_LAUNCH_BLOCKS_CAP) ? (VAL) : VLLM_LAUNCH_BLOCKS_CAP))
-#define VLLM_BLOCKS_PER_SM(BLOCK_THREADS) \
-  VLLM_CLAMP_BLOCKS_PER_SM(VLLM_BLOCKS_DIV(BLOCK_THREADS))
-
-// runtime-time helper to compute blocks/SM
-static inline int vllm_runtime_blocks_per_sm(int block_threads) {
-  int device = -1;
-  cudaGetDevice(&device);
-  int max_threads_per_sm = VLLM_MAX_THREADS_PER_SM;
-  cudaDeviceGetAttribute(&max_threads_per_sm,
-                         cudaDevAttrMaxThreadsPerMultiProcessor, device);
-  int blocks = (block_threads > 0) ? (max_threads_per_sm / block_threads) : 1;
-  return VLLM_CLAMP_BLOCKS_PER_SM(blocks);
-}
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@ -1,11 +1,15 @@
 #include "type_convert.cuh"
 #include "dispatch_utils.h"
-#include "cub_helpers.h"
-#include "core/batch_invariant.hpp"

 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>

+#ifndef USE_ROCM
+  #include <cub/cub.cuh>
+#else
+  #include <hipcub/hipcub.hpp>
+#endif
+
 namespace vllm {

 // TODO(woosuk): Further optimize this kernel.
@ -26,7 +30,7 @@ __global__ void rms_norm_kernel(

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
@ -81,7 +85,7 @@ fused_add_rms_norm_kernel(

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
@ -122,7 +126,7 @@ fused_add_rms_norm_kernel(

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
@ -414,9 +418,7 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
                          wt_ptr % req_alignment_bytes == 0;
  bool offsets_are_multiple_of_vector_width =
      hidden_size % vector_width == 0 && input_stride % vector_width == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
-  if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
-      !batch_invariant_launch) {
+  if (ptrs_are_aligned && offsets_are_multiple_of_vector_width) {
    LAUNCH_FUSED_ADD_RMS_NORM(8);
  } else {
    LAUNCH_FUSED_ADD_RMS_NORM(0);
@ -462,8 +464,7 @@ void poly_norm(torch::Tensor& out,     // [..., hidden_size]
  auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
  auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
  bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
-  if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
+  if (ptrs_are_aligned && hidden_size % 8 == 0) {
    LAUNCH_FUSED_POLY_NORM(8);
  } else {
    LAUNCH_FUSED_POLY_NORM(0);
--- a/csrc/layernorm_quant_kernels.cu
+++ b/csrc/layernorm_quant_kernels.cu
@ -8,12 +8,16 @@
 #include "type_convert.cuh"
 #include "quantization/fp8/common.cuh"
 #include "dispatch_utils.h"
-#include "cub_helpers.h"
-#include "core/batch_invariant.hpp"

 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>

+#ifndef USE_ROCM
+  #include <cub/cub.cuh>
+#else
+  #include <hipcub/hipcub.hpp>
+#endif
+
 namespace vllm {

 // TODO(woosuk): Further optimize this kernel.
@ -35,7 +39,7 @@ __global__ void rms_norm_static_fp8_quant_kernel(

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
@ -96,7 +100,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
@ -145,7 +149,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
@ -241,9 +245,7 @@ void fused_add_rms_norm_static_fp8_quant(
  auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
  bool ptrs_are_aligned =
      inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
-  if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
-      !batch_invariant_launch) {
+  if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0) {
    LAUNCH_FUSED_ADD_RMS_NORM(8);
  } else {
    LAUNCH_FUSED_ADD_RMS_NORM(0);
--- a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
+++ b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
@ -1,156 +0,0 @@
-#include <ATen/ATen.h>
-#include <ATen/Parallel.h>
-#include <torch/all.h>
-
-// _dyn_quant_matmul_4bit is only available on AArch64.
-#if defined(__aarch64__)
-  #include <ATen/ops/_dyn_quant_matmul_4bit.h>
-#endif
-
-inline torch::Tensor mm(const torch::Tensor& a, const torch::Tensor& packed_w,
-                        int64_t group_size_eff, int64_t in_features,
-                        int64_t out_features) {
-#if defined(__aarch64__)
-  return at::_ops::_dyn_quant_matmul_4bit::call(a, packed_w, group_size_eff,
-                                                in_features, out_features);
-#else
-  TORCH_CHECK(false,
-              "dynamic 4-bit int MoE path requires AArch64 (ARM64); "
-              "_dyn_quant_matmul_4bit is unavailable on this architecture");
-  return {};
-#endif
-}
-
-enum ActivationKind : int64_t {
-  SwiGLU_Gu = 0,  // act = SiLU(g) * u
-  SwiGLUOAI = 1,  // act = SiLU(u) * g
-  SiLU = 2        // SiLU
-};
-
-torch::Tensor dynamic_4bit_int_moe_cpu(
-    torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
-    torch::Tensor w13_packed, torch::Tensor w2_packed, int64_t H, int64_t I,
-    int64_t I2, int64_t group_size, bool apply_router_weight_on_input,
-    int64_t activation_kind) {
-  TORCH_CHECK(x.dim() == 2, "x must be 2D");
-  TORCH_CHECK(topk_ids.dim() == 2 && topk_weights.dim() == 2,
-              "topk tensors must be [T, K]");
-  TORCH_CHECK(
-      w13_packed.size(0) == w2_packed.size(0),
-      "w13_packed and w2_packed must have same number of experts in dim 0");
-  TORCH_CHECK(I2 == 2 * I, "I2 must equal 2*I");
-
-  const int64_t T = x.size(0);
-  const int64_t K = topk_ids.size(1);
-  const int64_t E = w13_packed.size(0);
-  const int64_t N = T * K;
-
-  auto x_c = x.contiguous();
-  auto ids_c = topk_ids.contiguous();
-  auto gates_c = topk_weights.to(at::kFloat).contiguous();
-
-  // bucketing tokens -> experts
-  c10::SmallVector<int64_t, 64> counts(
-      E, 0);  // Small vector uses stack allocation
-  {
-    const auto* ids_ptr = ids_c.data_ptr<int64_t>();
-    for (int64_t i = 0; i < N; ++i) {
-      const int64_t e_id = ids_ptr[i];
-      TORCH_CHECK(0 <= e_id && e_id < E, "expert id out of range");
-      counts[e_id]++;
-    }
-  }
-  c10::SmallVector<int64_t, 65> offsets(E + 1, 0);  // ( E +1 )
-  for (int64_t e = 0; e < E; ++e) offsets[e + 1] = offsets[e] + counts[e];
-
-  auto expert_tokens = at::empty({offsets[E]}, ids_c.options());
-  auto expert_gates = at::empty({offsets[E]}, gates_c.options());
-  {
-    c10::SmallVector<int64_t, 64> cursor(E, 0);
-    const auto* ids_ptr = ids_c.data_ptr<int64_t>();
-    const auto* gts_ptr = gates_c.data_ptr<float>();
-    auto* tok_ptr = expert_tokens.data_ptr<int64_t>();
-    auto* gate_ptr = expert_gates.data_ptr<float>();
-
-    for (int64_t t = 0; t < T; ++t) {
-      const int64_t base = t * K;
-      for (int64_t k = 0; k < K; ++k) {
-        const int64_t idx = base + k;
-        const int64_t e = ids_ptr[idx];
-        const int64_t p = offsets[e] + (cursor[e]++);
-        tok_ptr[p] = t;
-        gate_ptr[p] = gts_ptr[idx];
-      }
-    }
-  }
-
-  const int64_t g_eff_13 = (group_size != -1) ? group_size : H;
-  const int64_t g_eff_2 = (group_size != -1) ? group_size : I;
-
-  // Per-expert outputs filled in parallel
-  std::vector<torch::Tensor> y_list(E);
-  y_list.resize(E);
-
-  at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) {
-    for (int64_t e = e_begin; e < e_end; ++e) {
-      const int64_t te = counts[e];
-      if (te == 0) {
-        y_list[e] = at::empty({0, H}, x_c.options());
-        continue;
-      }
-
-      const int64_t start = offsets[e];
-
-      auto sel_tokens =
-          expert_tokens.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
-      auto gates_e =
-          expert_gates.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
-
-      auto x_e = x_c.index_select(/*dim=*/0, sel_tokens);
-
-      if (apply_router_weight_on_input) {
-        x_e = x_e.mul(gates_e.unsqueeze(1));
-      }
-
-      auto w13_e = w13_packed.select(/*dim=*/0, e);
-      auto w2_e = w2_packed.select(/*dim=*/0, e);
-
-      // W13
-      auto y13 =
-          mm(x_e, w13_e, g_eff_13, /*in_features=*/H, /*out_features=*/I2);
-
-      auto g_part = y13.narrow(/*dim=*/1, /*start=*/0, /*length=*/I);
-      auto u_part = y13.narrow(/*dim=*/1, /*start=*/I, /*length=*/I);
-
-      torch::Tensor act;
-      if (activation_kind == ActivationKind::SwiGLUOAI) {  // SwiGLUOAI
-        constexpr double kAlpha = 1.702;                   // GPT-OSS default
-        constexpr double kLimit = 7.0;                     // GPT-OSS default
-        auto gate_c = at::clamp_max(g_part, kLimit);
-        auto up_c = at::clamp(u_part, -kLimit, kLimit);
-        auto glu = gate_c.mul(at::sigmoid(gate_c.mul(kAlpha)));
-        act = up_c.add(1.0).mul(glu);
-      } else {  // SiLU , SwiGLU_GU, vLLM maps silu to SiluAndMul()
-        act = at::silu(g_part).mul(u_part);
-      }
-
-      // W2
-      auto y = mm(act, w2_e, g_eff_2, /*in_features=*/I, /*out_features=*/H);
-
-      if (!apply_router_weight_on_input) {
-        y = y.mul(gates_e.unsqueeze(1));
-      }
-
-      // Store per-expert result
-      y_list[e] = y;
-    }
-  });
-
-  // Concatenate all expert outputs to match expert_tokens order
-  auto Y_all = at::cat(y_list, /*dim=*/0);
-  auto out = at::zeros({T, H}, x.options());
-  out =
-      at::index_add(out, /*dim=*/0, /*index=*/expert_tokens, /*source=*/Y_all);
-
-  return out;
-}
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@ -21,7 +21,6 @@
 #include <torch/all.h>
 #include <cuda_fp16.h>
 #include <cuda_bf16.h>
-#include <cuda/std/limits>
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 namespace cg = cooperative_groups;
@ -29,6 +28,7 @@ namespace cg = cooperative_groups;
 namespace vllm {
 namespace moe {

+constexpr float kNegInfinity = INFINITY * -1;
 constexpr unsigned FULL_WARP_MASK = 0xffffffff;
 constexpr int32_t WARP_SIZE = 32;
 constexpr int32_t BLOCK_SIZE = 512;
@ -411,30 +411,14 @@ __device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
  return __bfloat162float(val);
 }

-template <typename T>
-__device__ inline T neg_inf() {
-  // cuda::std::numeric_limits<T>::infinity() returns `0` for [T=bf16 or fp16]
-  // so we need to cast from fp32
-  return cuda_cast<T, float>(-cuda::std::numeric_limits<float>::infinity());
-}
-
-template <typename T>
-__device__ inline bool is_finite(const T val) {
-#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
-  return cuda::std::isfinite(val);
-#else
-  return isfinite(cuda_cast<float, T>(val));
-#endif
-}
-
 template <typename T>
 __device__ void topk_with_k2(T* output, T const* input,
                             cg::thread_block_tile<32> const& tile,
                             int32_t const lane_id,
                             int const num_experts_per_group) {
  // Get the top2 per thread
-  T largest = neg_inf<T>();
-  T second_largest = neg_inf<T>();
+  T largest = -INFINITY;
+  T second_largest = -INFINITY;

  if (num_experts_per_group > WARP_SIZE) {
    for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
@ -529,8 +513,8 @@ __global__ void group_idx_and_topk_idx_kernel(
      warp_id * topk;
  s_topk_idx += warp_id * topk;

-  T value = neg_inf<T>();
-  T topk_group_value = neg_inf<T>();
+  T value = kNegInfinity;
+  T topk_group_value = kNegInfinity;
  int32_t num_equalto_topkth_group;

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
@ -541,8 +525,11 @@ __global__ void group_idx_and_topk_idx_kernel(
  if (case_id < num_tokens) {
    // calculate group_idx
    int32_t target_num_min = WARP_SIZE - n_group + topk_group;
-    // The check is necessary to avoid abnormal input
-    if (lane_id < n_group && is_finite(group_scores[lane_id])) {
+    if (lane_id < n_group &&
+        (isfinite(cuda_cast<float, T>(
+            group_scores[lane_id]))))  // The check is necessary to avoid
+                                       // abnormal input
+    {
      value = group_scores[lane_id];
    }

@ -553,11 +540,11 @@ __global__ void group_idx_and_topk_idx_kernel(
      __syncwarp();  // Ensure all threads have valid data before reduction
      topk_group_value = cg::reduce(tile, value, cg::greater<T>());
      if (value == topk_group_value) {
-        value = neg_inf<T>();
+        value = kNegInfinity;
      }
      pre_count_equal_to_top_value = count_equal_to_top_value;
-      count_equal_to_top_value =
-          __popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>())));
+      count_equal_to_top_value = __popc(__ballot_sync(
+          FULL_WARP_MASK, (value == cuda_cast<T, float>(kNegInfinity))));
    }
    num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
  }
@ -565,10 +552,11 @@ __global__ void group_idx_and_topk_idx_kernel(

  warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
                        /* is_stable */ true>
-      queue((int32_t)topk, neg_inf<T>());
+      queue((int32_t)topk, -INFINITY);

  int count_equalto_topkth_group = 0;
-  bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
+  bool if_proceed_next_topk =
+      (topk_group_value != cuda_cast<T, float>(kNegInfinity));
  if (case_id < num_tokens && if_proceed_next_topk) {
    for (int i_group = 0; i_group < n_group; i_group++) {
      if ((group_scores[i_group] > topk_group_value) ||
@ -577,10 +565,11 @@ __global__ void group_idx_and_topk_idx_kernel(
        int32_t offset = i_group * num_experts_per_group;
        for (int32_t i = lane_id; i < align_num_experts_per_group;
             i += WARP_SIZE) {
-          T candidates = (i < num_experts_per_group) &&
-                                 is_finite(scores_with_bias[offset + i])
-                             ? scores_with_bias[offset + i]
-                             : neg_inf<T>();
+          T candidates =
+              (i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
+                                                 scores_with_bias[offset + i]))
+                  ? scores_with_bias[offset + i]
+                  : cuda_cast<T, float>(kNegInfinity);
          queue.add(candidates, offset + i);
        }
        if (group_scores[i_group] == topk_group_value) {
@ -609,8 +598,7 @@ __global__ void group_idx_and_topk_idx_kernel(
      if (i < topk) {
        s_topk_value[i] = value;
      }
-      topk_sum +=
-          cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
+      topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
    }
  }

--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@ -17,30 +17,25 @@ FILE_HEAD = """
 namespace MARLIN_NAMESPACE_NAME {
 """.strip()

-TEMPLATE = (
-    "template __global__ void Marlin<"
-    "{{scalar_t}}, "
-    "{{w_type_id}}, "
-    "{{s_type_id}}, "
-    "{{threads}}, "
-    "{{thread_m_blocks}}, "
-    "{{thread_n_blocks}}, "
-    "{{thread_k_blocks}}, "
-    "{{'true' if m_block_size_8 else 'false'}}, "
-    "{{stages}}, "
-    "{{group_blocks}}, "
-    "{{'true' if is_zp_float else 'false'}}>"
-    "( MARLIN_KERNEL_PARAMS );"
-)
+TEMPLATE = ("template __global__ void Marlin<"
+            "{{scalar_t}}, "
+            "{{w_type_id}}, "
+            "{{s_type_id}}, "
+            "{{threads}}, "
+            "{{thread_m_blocks}}, "
+            "{{thread_n_blocks}}, "
+            "{{thread_k_blocks}}, "
+            "{{'true' if m_block_size_8 else 'false'}}, "
+            "{{stages}}, "
+            "{{group_blocks}}, "
+            "{{'true' if is_zp_float else 'false'}}>"
+            "( MARLIN_KERNEL_PARAMS );")

 # int8 with zero point case (vllm::kU8) is also supported,
 # we don't add it to reduce wheel size.
 SCALAR_TYPES = [
-    "vllm::kU4",
-    "vllm::kU4B8",
-    "vllm::kU8B128",
-    "vllm::kFE4M3fn",
-    "vllm::kFE2M1f",
+    "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
+    "vllm::kFE2M1f"
 ]
 THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]

@ -63,12 +58,11 @@ def generate_new_kernels():
        all_template_str_list = []

        for group_blocks, m_blocks, thread_configs in itertools.product(
-            GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
-        ):
+                GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
+
            # act order case only support gptq-int4 and gptq-int8
            if group_blocks == 0 and scalar_type not in [
-                "vllm::kU4B8",
-                "vllm::kU8B128",
+                    "vllm::kU4B8", "vllm::kU8B128"
            ]:
                continue
            if thread_configs[2] == 256:
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@ -44,9 +44,6 @@ __global__ void moe_align_block_size_kernel(

  for (size_t i = tid; i < numel; i += stride) {
    int expert_id = topk_ids[i];
-    if (expert_id >= num_experts) {
-      continue;
-    }
    int warp_idx = expert_id / experts_per_warp;
    int expert_offset = expert_id % experts_per_warp;
    atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
@ -98,15 +95,12 @@ template <typename scalar_t>
 __global__ void count_and_sort_expert_tokens_kernel(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
-    size_t numel, int32_t num_experts) {
+    size_t numel) {
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = blockDim.x * gridDim.x;

  for (size_t i = tid; i < numel; i += stride) {
    int32_t expert_id = topk_ids[i];
-    if (expert_id >= num_experts) {
-      continue;
-    }
    int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
    sorted_token_ids[rank_post_pad] = i;
  }
@ -275,7 +269,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
          sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
-              cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel(), num_experts);
+              cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
        }
      });
 }
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@ -20,8 +20,17 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include "../cuda_compat.h"
-#include "../cub_helpers.h"
-#include "../core/batch_invariant.hpp"
+
+#ifndef USE_ROCM
+    #include <cub/util_type.cuh>
+    #include <cub/cub.cuh>
+    #include <cuda/std/functional>
+    using AddOp = cuda::std::plus<float>;
+#else
+    #include <hipcub/util_type.hpp>
+    #include <hipcub/hipcub.hpp>
+    using AddOp = cub::Sum; 
+#endif

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -70,7 +79,7 @@ __launch_bounds__(TPB) __global__
        threadData = max(static_cast<float>(input[idx]), threadData);
    }

-    const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
+    const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
    if (threadIdx.x == 0)
    {
        float_max = maxElem;
@ -85,7 +94,7 @@ __launch_bounds__(TPB) __global__
        threadData += exp((static_cast<float>(input[idx]) - float_max));
    }

-    const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());
+    const auto Z = BlockReduce(tmpStorage).Reduce(threadData, AddOp());

    if (threadIdx.x == 0)
    {
@ -406,8 +415,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
    static constexpr int VPT = Constants::VPT;
    static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
-    const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
-    const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
+    const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
    const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;

    dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -122,6 +122,12 @@ void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                      std::optional<torch::Tensor> key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox);

+void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
+                              std::optional<torch::Tensor> key,
+                              int64_t head_size, torch::Tensor& cos_sin_cache,
+                              bool is_neox, int64_t rot_dim,
+                              torch::Tensor& cos_sin_cache_offsets);
+
 void silu_and_mul(torch::Tensor& out, torch::Tensor& input);

 void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
@ -133,12 +139,6 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& out,
                              torch::Tensor& input,
                              torch::Tensor& input_global_scale);
 #endif
-void silu_mul_fp8_quant_deep_gemm_cuda(
-    const at::Tensor& input,   // (E, T, 2*H)
-    const at::Tensor& counts,  // (E)
-    at::Tensor& y_q,           // (E, T, H) [OUT]
-    at::Tensor& y_s,           // (E, T, H//group_size) [OUT]
-    int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens);

 void mul_and_silu(torch::Tensor& out, torch::Tensor& input);

@ -328,12 +328,6 @@ void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
                        const std::optional<torch::Tensor>& has_initial_state,
                        const torch::Tensor& ssm_states, int64_t pad_slot_id);

-torch::Tensor dynamic_4bit_int_moe_cpu(
-    torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
-    torch::Tensor w13_packed, torch::Tensor w2_packed, int64_t H, int64_t I,
-    int64_t I2, int64_t group_size, bool apply_router_weight_on_input,
-    int64_t activation_kind);
-
 using fptr_t = int64_t;
 fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
                      torch::Tensor& rank_data, int64_t rank,
@ -353,8 +347,6 @@ std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle(
 int64_t open_mem_handle(torch::Tensor& mem_handle);
 void free_shared_buffer(int64_t buffer);

-torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace);
-
 #ifdef USE_ROCM
 fptr_t init_custom_qr(int64_t rank, int64_t world_size,
                      std::optional<int64_t> qr_max_size = std::nullopt);
--- a/csrc/pos_encoding_kernels.cu
+++ b/csrc/pos_encoding_kernels.cu
@ -99,6 +99,35 @@ __global__ void rotary_embedding_kernel(
      token_idx, query_stride, key_stride, head_stride);
 }

+template <typename scalar_t, bool IS_NEOX>
+__global__ void batched_rotary_embedding_kernel(
+    const int64_t* __restrict__ positions,  // [batch_size, seq_len] or
+                                            // [num_tokens]
+    scalar_t* __restrict__ query,           // [batch_size, seq_len, num_heads,
+                                   // head_size] or [num_tokens, num_heads,
+                                   // head_size]
+    scalar_t* __restrict__ key,  // nullptr or
+                                 // [batch_size, seq_len, num_kv_heads,
+                                 // head_size] or [num_tokens, num_kv_heads,
+                                 // head_size]
+    const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
+                                                 // 2]
+    const int64_t* __restrict__ cos_sin_cache_offsets,  // [batch_size, seq_len]
+    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
+    const int64_t head_stride, const int num_heads, const int num_kv_heads,
+    const int head_size) {
+  // Each thread block is responsible for one token.
+  const int token_idx = blockIdx.x;
+  int64_t pos = positions[token_idx];
+  int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx];
+  const scalar_t* cache_ptr =
+      cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim;
+
+  apply_rotary_embedding<scalar_t, IS_NEOX>(
+      query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim,
+      token_idx, query_stride, key_stride, head_stride);
+}
+
 }  // namespace vllm

 void rotary_embedding(
@ -182,3 +211,96 @@ void rotary_embedding(
    }
  });
 }
+
+/*
+Batched version of rotary embedding, pack multiple LoRAs together
+and process in batched manner.
+*/
+void batched_rotary_embedding(
+    torch::Tensor& positions,  // [batch_size, seq_len] or [num_tokens]
+    torch::Tensor& query,  // [batch_size, seq_len, num_heads * head_size] or
+                           // [num_tokens, num_heads * head_size] or
+                           // [batch_size, seq_len, num_heads, head_size] or
+                           // [num_tokens, num_heads, head_size]
+    std::optional<torch::Tensor>
+        key,  // null or
+              // [batch_size, seq_len, num_kv_heads * head_size] or
+              // [num_tokens, num_kv_heads * head_size] or
+              // [batch_size, seq_len, num_heads, head_size] or
+              // [num_tokens, num_heads, head_size]
+    int64_t head_size,
+    torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
+    bool is_neox, int64_t rot_dim,
+    torch::Tensor& cos_sin_cache_offsets  // [num_tokens] or [batch_size]
+) {
+  // num_tokens = batch_size * seq_len
+  int64_t num_tokens = cos_sin_cache_offsets.size(0);
+  TORCH_CHECK(
+      positions.size(0) == num_tokens || positions.numel() == num_tokens,
+      "positions must have the same num_tokens or batch_size as "
+      "cos_sin_cache_offsets");
+
+  int positions_ndim = positions.dim();
+  // Make sure num_tokens dim is consistent across positions, query, and key
+  TORCH_CHECK(
+      positions_ndim == 1 || positions_ndim == 2,
+      "positions must have shape [num_tokens] or [batch_size, seq_len]");
+  if (positions_ndim == 1) {
+    TORCH_CHECK(query.size(0) == positions.size(0) &&
+                    (!key.has_value() || key->size(0) == positions.size(0)),
+                "query, key and positions must have the same number of tokens");
+  }
+  if (positions_ndim == 2) {
+    TORCH_CHECK(
+        query.size(0) == positions.size(0) &&
+            (!key.has_value() || key->size(0) == positions.size(0)) &&
+            query.size(1) == positions.size(1) &&
+            (!key.has_value() || key->size(1) == positions.size(1)),
+        "query, key and positions must have the same batch_size and seq_len");
+  }
+
+  // Make sure head_size is valid for query and key
+  int query_hidden_size = query.numel() / num_tokens;
+  int key_hidden_size = key.has_value() ? key->numel() / num_tokens : 0;
+  TORCH_CHECK(query_hidden_size % head_size == 0);
+  TORCH_CHECK(key_hidden_size % head_size == 0);
+
+  // Make sure query and key have concistent number of heads
+  int num_heads = query_hidden_size / head_size;
+  int num_kv_heads = key.has_value() ? key_hidden_size / head_size : num_heads;
+  TORCH_CHECK(num_heads % num_kv_heads == 0);
+
+  int seq_dim_idx = positions_ndim - 1;
+  int64_t query_stride = query.stride(seq_dim_idx);
+  int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0;
+  // Determine head stride: for [*, heads, head_size] use stride of last dim;
+  // for flat [*, heads*head_size], heads blocks are contiguous of size
+  // head_size
+  int query_ndim = query.dim();
+  int64_t head_stride =
+      (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size;
+
+  dim3 grid(num_tokens);
+  dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
+    if (is_neox) {
+      vllm::batched_rotary_embedding_kernel<scalar_t, true>
+          <<<grid, block, 0, stream>>>(
+              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
+              key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
+              cos_sin_cache.data_ptr<scalar_t>(),
+              cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride,
+              key_stride, head_stride, num_heads, num_kv_heads, head_size);
+    } else {
+      vllm::batched_rotary_embedding_kernel<scalar_t, false>
+          <<<grid, block, 0, stream>>>(
+              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
+              key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
+              cos_sin_cache.data_ptr<scalar_t>(),
+              cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride,
+              key_stride, head_stride, num_heads, num_kv_heads, head_size);
+    }
+  });
+}
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@ -9,31 +9,6 @@

 #include "quantization/fp8/common.cuh"

-#include <c10/util/Float8_e4m3fn.h>
-
-#ifndef USE_ROCM
-  #include <cuda_bf16.h>
-  #include <cuda_fp16.h>
-  #include <cuda_fp8.h>
-#else
-  #include <hip/hip_bf16.h>
-  #include <hip/hip_fp16.h>
-  #include <hip/hip_fp8.h>
-
-typedef __hip_bfloat162 __nv_bfloat162;
-typedef __hip_bfloat16 __nv_bfloat16;
-typedef __hip_bfloat16_raw __nv_bfloat16_raw;
-  #if defined(HIP_FP8_TYPE_OCP)
-typedef __hip_fp8_e4m3 __nv_fp8_e4m3;
-typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3;
-  #else
-// ROCm 6.2 fallback: only *_fnuz types exist
-typedef __hip_fp8_e4m3_fnuz __nv_fp8_e4m3;
-typedef __hip_fp8x4_e4m3_fnuz __nv_fp8x4_e4m3;
-  #endif
-#endif
-
-#include "core/registration.h"
 namespace vllm {

 template <typename T>
@ -112,336 +87,6 @@ __global__ void act_and_mul_quant_kernel(
    }
  }
 }
-
-__device__ __forceinline__ float silu(float x) {
-  return (__fdividef(x, (1.f + expf(-x))));
-}
-
-__device__ __forceinline__ float2 silu2(float2 x) {
-  return make_float2(silu(x.x), silu(x.y));
-}
-
-#ifndef USE_ROCM
-__device__ __forceinline__ float warp_max(float v) {
-  static constexpr unsigned FULL_MASK = 0xffffffffu;
-  for (int offset = 1; offset < WARP_SIZE; offset *= 2) {
-    v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, offset));
-  }
-  return v;
-}
-
-__device__ __forceinline__ __nv_bfloat16 warp_max(__nv_bfloat16 v) {
-  static constexpr unsigned FULL_MASK = 0xffffffffu;
-  for (int offset = 1; offset < WARP_SIZE; offset *= 2) {
-    v = __hmax(v, __shfl_xor_sync(FULL_MASK, v, offset));
-  }
-  return v;
-}
-#endif
-
-template <typename T, typename U>
-__device__ __forceinline__ void cp_async4(T* _smem_ptr, const U* _glob_ptr) {
-#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
-  auto smem_ptr = reinterpret_cast<void*>(_smem_ptr);
-  auto glob_ptr = reinterpret_cast<const void*>(_glob_ptr);
-  const int BYTES = 16;
-  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
-  asm volatile(
-      "{\n"
-      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
-      "}\n" ::"r"(smem),
-      "l"(glob_ptr), "n"(BYTES));
-#else
-  _smem_ptr[0] = _glob_ptr[0];
-#endif
-}
-
-__device__ __forceinline__ void cp_async_fence() {
-#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
-  asm volatile("cp.async.commit_group;\n" ::);
-#else
-#endif
-}
-
-template <int N>
-__device__ __forceinline__ void cp_async_wait() {
-#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
-  asm volatile("cp.async.wait_group %0;\n" ::"n"(N));
-#else
-#endif
-}
-
-template <>
-__device__ __forceinline__ void cp_async_wait<0>() {
-#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
-  asm volatile("cp.async.wait_all;\n" ::);
-#else
-#endif
-}
-
-__device__ __forceinline__ float clip(float v, float mmin, float mmax) {
-#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
-  return fminf(mmax, fmaxf(v, mmin));
-#else
-#endif
-}
-
-__device__ __forceinline__ __nv_bfloat16 clip(__nv_bfloat16 v,
-                                              __nv_bfloat16 mmin,
-                                              __nv_bfloat16 mmax) {
-  return __hmin(mmax, __hmax(v, mmin));
-}
-
-__device__ __forceinline__ __nv_bfloat162 clip(__nv_bfloat162 v,
-                                               __nv_bfloat162 mmin,
-                                               __nv_bfloat162 mmax) {
-  return __hmin2(mmax, __hmax2(v, mmin));
-}
-
-// We use the following values for fp8 min/max:
-//  __nv_fp8_e4m3 = (-448, +448)
-//  __nv_fp8_e4m3uz = (-240.0, +240.0)
-// It is currently assumed that only
-template <class T>
-constexpr __nv_bfloat16 get_fp8_max() {
-  static_assert(std::is_same_v<T, c10::Float8_e4m3fn> ||
-                std::is_same_v<T, c10::Float8_e4m3fnuz>);
-  if constexpr (std::is_same_v<T, c10::Float8_e4m3fn>) {
-    return __nv_bfloat16(__nv_bfloat16_raw{.x = 17376});
-  } else {
-    return __nv_bfloat16(__nv_bfloat16_raw{.x = 17264});
-  }
-}
-
-template <class T>
-constexpr __nv_bfloat16 get_fp8_min() {
-  static_assert(std::is_same_v<T, c10::Float8_e4m3fn> ||
-                std::is_same_v<T, c10::Float8_e4m3fnuz>);
-  if constexpr (std::is_same_v<T, c10::Float8_e4m3fn>) {
-    return __nv_bfloat16(__nv_bfloat16_raw{.x = 50144});
-  } else {
-    return __nv_bfloat16(__nv_bfloat16_raw{.x = 50032});
-  }
-}
-#ifndef USE_ROCM
-template <typename fp8_type, int32_t NUM_WARPS, typename Idx_t,
-          int NUM_PARALLEL_TOKENS, bool USE_UE8M0, int GROUP_SIZE = 128,
-          int NUM_STAGES = 3>
-__global__ void silu_mul_fp8_quant_deep_gemm_kernel(
-    const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q,
-    float* __restrict__ _y_s, const int32_t* __restrict__ counts,
-
-    // sizes
-    int H, int G,
-
-    // strides (in elements)
-    Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e,
-    Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t,
-    Idx_t stride_ys_g, Idx_t stride_counts_e) {
-  static constexpr __nv_bfloat16 fp8_min = get_fp8_min<fp8_type>();
-  static constexpr __nv_bfloat16 fp8_max = get_fp8_max<fp8_type>();
-  // We assign EPS with its 16-bit unsigned counterpart to allow constexpr.
-  static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996});
-
-  // We pack 8 16-bit bfloat16 values into a 128-bit __int128_t.
-  static constexpr int32_t BFLOAT16_PER_GROUP = 8;
-
-  // We split the shared memory in half, corresponding to gate and up matrices:
-  // [...gate_i, ...up_i]  where 0 <= i < stages.
-  static constexpr int32_t S_NUM_128 =
-      2u * (GROUP_SIZE / BFLOAT16_PER_GROUP) * NUM_WARPS * NUM_STAGES;
-  static constexpr auto THREAD_COUNT = NUM_WARPS * WARP_SIZE;
-  static constexpr int HALF_THREAD_COUNT = THREAD_COUNT / 2;
-  static constexpr int32_t S_NUM_64 = S_NUM_128 * 2;
-  __shared__ __int128_t __align__(16) s_buff_128[S_NUM_128];
-
-  const int32_t tid = threadIdx.x;
-  const int32_t warp_id = tid / WARP_SIZE;
-  const int32_t lane_id = tid % WARP_SIZE;
-
-  auto s_buff_compute_32 = reinterpret_cast<__nv_bfloat162*>(s_buff_128);
-
-  // block handles one (expert e, group g)
-  int32_t pid = blockIdx.x;
-  int32_t e = pid / G;
-  int32_t g = pid % G;
-
-  const int32_t n_tokens = counts[e * stride_counts_e];
-
-  if (!n_tokens) {
-    return;  // Exit ASAP.
-  }
-
-  const Idx_t stride_i_t_128 = stride_i_t / 8u;
-
-  int32_t n_tokens_lower, n_tokens_upper;
-
-  // Each block i iterates over tokens of a slice of n_tokens =
-  // expert_counts[i], with the size of chunk being
-  // (n_tokens / NUM_PARALLEL_TOKENS) + residual, instead of
-  // updiv(n_tokens, NUM_PARALLEL_TOKENS) for better scheduling.
-  if (n_tokens < NUM_PARALLEL_TOKENS && blockIdx.y < n_tokens) {
-    // Specialize this, but can be likely fused.
-    if (blockIdx.y >= NUM_PARALLEL_TOKENS) {
-      return;
-    }
-    n_tokens_lower = blockIdx.y;
-    n_tokens_upper = blockIdx.y + 1;
-  } else {
-    auto chunk_size = n_tokens / NUM_PARALLEL_TOKENS;
-    auto residual = n_tokens - chunk_size * NUM_PARALLEL_TOKENS;
-    auto calc_id = [&](int32_t id) {
-      if (id < residual) {
-        return min(n_tokens, id * (chunk_size + 1));
-      } else {
-        return min(n_tokens, id * chunk_size + residual);
-      }
-    };
-    n_tokens_lower = calc_id(blockIdx.y);
-    n_tokens_upper = calc_id(blockIdx.y + 1);
-  }
-
-  if (n_tokens_lower >= n_tokens_upper) {
-    return;
-  }
-
-  // We do calculations here, using constexpr wherever possible.
-  const Idx_t base_i = e * stride_i_e + NUM_WARPS * g * GROUP_SIZE * stride_i_h;
-  const Idx_t base_ys = e * stride_ys_e + NUM_WARPS * g * stride_ys_g;
-  const Idx_t base_yq =
-      e * stride_yq_e + NUM_WARPS * g * GROUP_SIZE * stride_yq_h;
-  Idx_t gate_off_128 = (base_i / static_cast<Idx_t>(8u));
-  auto input_128_ptr = reinterpret_cast<const __int128_t*>(_input);
-  auto gate_128_ptr = input_128_ptr + gate_off_128 + (tid % HALF_THREAD_COUNT) +
-                      stride_i_t_128 * n_tokens_lower;
-  auto up_128_ptr = gate_128_ptr + (H * stride_i_h) / 8u;
-  auto y_s_ptr =
-      _y_s + base_ys + warp_id * stride_ys_g + n_tokens_lower * stride_ys_t;
-  auto y_q_ptr = _y_q + base_yq + warp_id * GROUP_SIZE +
-                 stride_yq_t * n_tokens_lower + 4 * lane_id;
-  int32_t t_load = n_tokens_lower, load_stage_id = 0;
-  auto s_buff_gate_load_128 = s_buff_128 + (tid % HALF_THREAD_COUNT);
-  auto s_buff_up_load_128 = s_buff_gate_load_128 + S_NUM_128 / 2u;
-  int32_t stage_offset{};
-
-  static constexpr int32_t LOAD_STAGE_SIZE = (NUM_WARPS * WARP_SIZE / 2);
-  static constexpr int32_t LOAD_STAGE_MOD =
-      NUM_STAGES * (NUM_WARPS * WARP_SIZE / 2);
-
-  // Two halves of all threads in a block conduct global loads for gate and up,
-  // repsectively.
-  auto load_and_advance_y_pred = [&] {
-    if (t_load < n_tokens_upper) {
-      auto s_gate_stage_128_staged_ptr = s_buff_gate_load_128 + stage_offset;
-      auto s_up_stage_128_staged_ptr = s_buff_up_load_128 + stage_offset;
-
-      // It is very important that LOAD_STAGE_SIZE is constexpr to avoid
-      // unnecessary ALU ops.
-      stage_offset += LOAD_STAGE_SIZE;
-      stage_offset %= LOAD_STAGE_MOD;
-
-      if (tid < HALF_THREAD_COUNT) {
-        cp_async4(s_gate_stage_128_staged_ptr, gate_128_ptr);
-        gate_128_ptr += stride_i_t_128;
-      } else {
-        cp_async4(s_up_stage_128_staged_ptr, up_128_ptr);
-        up_128_ptr += stride_i_t_128;
-      }
-      ++t_load;
-      ++load_stage_id;
-    }
-    // We fence even if there is nothing to load to simplify pipelining.
-    cp_async_fence();
-  };
-
-  #pragma unroll
-  for (int i = 0; i < NUM_STAGES - 1; i++) {
-    load_and_advance_y_pred();
-  }
-
-  __int64_t* s_gate_ptr = reinterpret_cast<__int64_t*>(
-                              s_buff_compute_32 + warp_id * (GROUP_SIZE / 2)) +
-                          lane_id;
-  __int64_t* s_up_ptr = s_gate_ptr + S_NUM_64 / 2;
-
-  static constexpr int32_t STAGE_SIZE = (GROUP_SIZE * NUM_WARPS) / 4u;
-  static constexpr int32_t STAGE_MOD = STAGE_SIZE * NUM_STAGES;
-
-  int32_t compute_pipeline_offset_64 = 0;
-
-  for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
-    __nv_bfloat162 results_bf162[2];
-
-    cp_async_wait<NUM_STAGES - 2>();
-    __syncthreads();
-
-    // We double-buffer pipelined loads so that the next load will
-    // concurrently run with compute without overwrites.
-    load_and_advance_y_pred();
-
-    auto s_gate_compute_64 = s_gate_ptr + compute_pipeline_offset_64;
-    auto s_up_compute_64 = s_up_ptr + compute_pipeline_offset_64;
-
-    // STAGE_SIZE must also be constexpr!
-    compute_pipeline_offset_64 += STAGE_SIZE;
-    compute_pipeline_offset_64 %= STAGE_MOD;
-
-    // Each thread loads (gate/up) 2X 4X bfloat16 values into registers.
-    __int64_t gate64 = *s_gate_compute_64;
-    __nv_bfloat162* s_gate_compute_32 =
-        reinterpret_cast<__nv_bfloat162*>(&gate64);
-
-    __int64_t up64 = *s_up_compute_64;
-    __nv_bfloat162* s_up_compute_32 = reinterpret_cast<__nv_bfloat162*>(&up64);
-
-  #pragma unroll
-    for (int i = 0; i < 2; i++) {
-      // For silu, we make sure that div is emitted.
-      float2 gate = silu2(__bfloat1622float2(s_gate_compute_32[i]));
-      results_bf162[i] = __float22bfloat162_rn(gate);
-    }
-
-  #pragma unroll
-    for (int i = 0; i < 2; i++) {
-      results_bf162[i] = __hmul2(results_bf162[i], s_up_compute_32[i]);
-    }
-
-    auto _y_max2 =
-        __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
-
-    __nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));
-
-    // An entire group is assigned to a single warp, so a simple warp reduce
-    // is used.
-    __nv_bfloat16 y_s = warp_max(y_max_bf16) / fp8_max;
-
-    if constexpr (USE_UE8M0) {
-      y_s = hexp2(hceil(hlog2(y_s)));
-    }
-
-    auto inv_y = __float2bfloat16_rn(1.f) / y_s;
-
-    auto y_s2 = make_bfloat162(inv_y, inv_y);
-
-  #pragma unroll
-    for (int32_t i = 0; i < 2; ++i) {
-      results_bf162[i] =
-          clip(__hmul2(results_bf162[i], y_s2), __bfloat162bfloat162(fp8_min),
-               __bfloat162bfloat162(fp8_max));
-    }
-
-    auto fp8x4 = __nv_fp8x4_e4m3(results_bf162[0], results_bf162[1]);
-    *reinterpret_cast<__nv_fp8x4_e4m3*>(y_q_ptr) = fp8x4;
-    y_q_ptr += stride_yq_t;
-
-    if (lane_id == 0) {
-      *y_s_ptr = y_s;
-      y_s_ptr += stride_ys_t;
-    }
-  }
-}
-#endif
-
 }  // namespace vllm

 // Launch activation, gating, and quantize kernel.
@ -474,117 +119,3 @@ void silu_and_mul_quant(torch::Tensor& out,    // [..., d]
  TORCH_CHECK(input.size(-1) % 2 == 0);
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
 }
-
-void silu_mul_fp8_quant_deep_gemm_cuda(
-    const at::Tensor& input,   // (E, T, 2*H)
-    const at::Tensor& counts,  // (E)
-    at::Tensor& y_q,           // (E, T, H) [OUT]
-    at::Tensor& y_s,           // (E, T, H//group_size) [OUT]
-    int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens) {
-#ifndef USE_ROCM
-  // This kernel relies heavily on cp.async and fp8 support.
-  // This kernel currently only supports H % 128 == 0 and assumes a
-  // fixed GROUP_SIZE of 128.
-  TORCH_CHECK(input.dtype() == torch::kBFloat16);
-  TORCH_CHECK(y_q.dtype() == torch::kFloat8_e4m3fn ||
-              y_q.dtype() == torch::kFloat8_e4m3fnuz);
-  TORCH_CHECK(y_s.dtype() == torch::kFloat32);
-  TORCH_CHECK(input.size(-1) % 256 == 0);
-
-  // Check that num_parallel_tokens is of power of 2 and between 1 and 64.
-  TORCH_CHECK(1 <= num_parallel_tokens && num_parallel_tokens <= 64);
-  TORCH_CHECK(!(num_parallel_tokens & (num_parallel_tokens - 1)));
-
-  using Idx_t = int64_t;
-
-  Idx_t E = input.size(0);
-  Idx_t T = input.size(1);
-  Idx_t H = input.size(2) / 2;
-  Idx_t stride_i_e = input.stride(0);
-  Idx_t stride_i_t = input.stride(1);
-  Idx_t stride_i_h = input.stride(2);
-  Idx_t stride_yq_e = y_q.stride(0);
-  Idx_t stride_yq_t = y_q.stride(1);
-  Idx_t stride_yq_h = y_q.stride(2);
-  Idx_t stride_ys_e = y_s.stride(0);
-  Idx_t stride_ys_t = y_s.stride(1);
-  Idx_t stride_ys_g = y_s.stride(2);
-
-  Idx_t stride_counts_e = counts.stride(0);
-
-  static constexpr int GROUP_SIZE = 128;
-
-  #define KERNEL_FN                                                         \
-    if (use_ue8m0) {                                                        \
-      vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t,    \
-                                                NUM_PARALLEL_TOKENS, true>  \
-          <<<grid, block, 0, stream>>>(                                     \
-              reinterpret_cast<__nv_bfloat16*>(input.data_ptr()),           \
-              (fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(),                \
-              reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G,     \
-              stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
-              stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g,           \
-              stride_counts_e);                                             \
-    } else {                                                                \
-      vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t,    \
-                                                NUM_PARALLEL_TOKENS, false> \
-          <<<grid, block, 0, stream>>>(                                     \
-              reinterpret_cast<__nv_bfloat16*>(input.data_ptr()),           \
-              (fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(),                \
-              reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G,     \
-              stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
-              stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g,           \
-              stride_counts_e);                                             \
-    }
-
-  #define KERNEL_CALL_H                                       \
-    if (H % (4 * GROUP_SIZE) == 0) {                          \
-      static constexpr int NUM_WARPS = 4;                     \
-      populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
-      KERNEL_FN                                               \
-    } else {                                                  \
-      static constexpr int NUM_WARPS = 1;                     \
-      populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
-      KERNEL_FN                                               \
-    }
-
-  #define KERNEL_CALL_TOP_LEVEL                      \
-    if (num_parallel_tokens == 1) {                  \
-      static constexpr int NUM_PARALLEL_TOKENS = 1;  \
-      KERNEL_CALL_H                                  \
-    } else if (num_parallel_tokens == 2) {           \
-      static constexpr int NUM_PARALLEL_TOKENS = 2;  \
-      KERNEL_CALL_H                                  \
-    } else if (num_parallel_tokens == 4) {           \
-      static constexpr int NUM_PARALLEL_TOKENS = 4;  \
-      KERNEL_CALL_H                                  \
-    } else if (num_parallel_tokens == 8) {           \
-      static constexpr int NUM_PARALLEL_TOKENS = 8;  \
-      KERNEL_CALL_H                                  \
-    } else if (num_parallel_tokens == 16) {          \
-      static constexpr int NUM_PARALLEL_TOKENS = 16; \
-      KERNEL_CALL_H                                  \
-    } else if (num_parallel_tokens == 32) {          \
-      static constexpr int NUM_PARALLEL_TOKENS = 32; \
-      KERNEL_CALL_H                                  \
-    } else if (num_parallel_tokens == 64) {          \
-      static constexpr int NUM_PARALLEL_TOKENS = 64; \
-      KERNEL_CALL_H                                  \
-    }
-
-  Idx_t G;
-  dim3 block, grid;
-  auto populate_launch_params = [&](int num_warps, int _num_parallel_tokens) {
-    G = H / Idx_t(group_size * num_warps);
-    grid = dim3(E * G, _num_parallel_tokens);
-    block = dim3(num_warps * WARP_SIZE);
-  };
-
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
-  VLLM_DISPATCH_FP8_TYPES(y_q.scalar_type(),
-                          "silu_mul_fp8_quant_deep_gemm_kernel",
-                          [&] { KERNEL_CALL_TOP_LEVEL });
-
-#endif
-}
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@ -7,10 +7,17 @@

 #include <cmath>

-#include "../../cub_helpers.h"
 #include "../../dispatch_utils.h"
 #include "../vectorization_utils.cuh"

+#ifndef USE_ROCM
+  #include <cub/cub.cuh>
+  #include <cub/util_type.cuh>
+#else
+  #include <hipcub/hipcub.hpp>
+  #include <hipcub/util_type.hpp>
+#endif
+
 static inline __device__ int8_t float_to_int8_rn(float x) {
 #ifdef USE_ROCM
  static constexpr auto i8_min =
@ -166,7 +173,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(
      });
  using BlockReduce = cub::BlockReduce<float, 256>;
  __shared__ typename BlockReduce::TempStorage tmp;
-  float block_max = BlockReduce(tmp).Reduce(thread_max, CubMaxOp{}, blockDim.x);
+  float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
  __shared__ float absmax;
  if (tid == 0) {
    absmax = block_max;
--- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
@ -25,8 +25,6 @@
 #include "cutlass_extensions/common.hpp"
 #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"

-#include <cuda_runtime.h>
-
 namespace vllm::cutlass_w4a8 {

 using namespace cute;
@ -395,71 +393,6 @@ torch::Tensor pack_scale_fp8(torch::Tensor const& scales) {
  return packed_scales;
 }

-/*
-  GPU-accelerated implementation of cutlass::unified_encode_int4b.
-  Constructs a lookup table in constant memory to map 8 bits
-  (two 4-bit values) at a time. Assumes memory is contiguous
-  and pointers are 16-byte aligned.
-*/
-__constant__ uint8_t kNibbleLUT[256];
-
-__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
-                                            size_t nbytes) {
-  constexpr size_t V = sizeof(uint4);  // 16 bytes
-  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const size_t nthreads = size_t(gridDim.x) * blockDim.x;
-  const size_t nvec = nbytes / V;
-
-  // 1-D grid-stride loop over 16-byte chunks
-  for (size_t vec = tid; vec < nvec; vec += nthreads) {
-    uint4 v = reinterpret_cast<const uint4*>(in)[vec];
-    uint8_t* b = reinterpret_cast<uint8_t*>(&v);
-#pragma unroll
-    for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
-    reinterpret_cast<uint4*>(out)[vec] = v;
-  }
-}
-
-static bool upload_lut() {
-  std::array<uint8_t, 256> lut{};
-  auto map_nib = [](uint8_t v) -> uint8_t {
-    // 1..7 -> (8 - v); keep 0 and 8..15
-    return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
-  };
-  for (int b = 0; b < 256; ++b) {
-    uint8_t lo = b & 0xF;
-    uint8_t hi = (b >> 4) & 0xF;
-    lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo));
-  }
-  cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(),
-                                     /*offset=*/0, cudaMemcpyHostToDevice);
-
-  return (e == cudaSuccess);
-}
-
-static bool unified_encode_int4b(cutlass::int4b_t const* in,
-                                 cutlass::int4b_t* out, size_t num_int4_elems) {
-  // Build/upload LUT
-  if (!upload_lut()) return false;
-
-  static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
-                "int4 storage must be 1 byte");
-  const size_t nbytes = num_int4_elems >> 1;
-
-  auto* in_bytes = reinterpret_cast<uint8_t const*>(in);
-  auto* out_bytes = reinterpret_cast<uint8_t*>(out);
-
-  // kernel launch params
-  constexpr int block = 256;
-  const size_t nvec = nbytes / sizeof(uint4);  // # of 16B vectors
-  int grid = int((nvec + block - 1) / block);
-  if (grid == 0) grid = 1;  // ensure we still cover the tail in the kernel
-
-  unified_encode_int4b_device<<<grid, block>>>(in_bytes, out_bytes, nbytes);
-  cudaError_t err = cudaGetLastError();
-  return (err == cudaSuccess);
-}
-
 torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
  TORCH_CHECK(B.dtype() == torch::kInt32);
  TORCH_CHECK(B.dim() == 2);
@ -468,7 +401,6 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {

  int k = B.size(0) * PackFactor;  // logical k
  int n = B.size(1);
-  TORCH_CHECK((n * k) % 32 == 0, "need multiples of 32 int4s for 16B chunks");

  auto B_ptr = static_cast<QuantType const*>(B.const_data_ptr());
  auto B_packed_ptr = static_cast<QuantType*>(B_packed.data_ptr());
@ -477,9 +409,7 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
  LayoutB_Reordered layout_B_reordered =
      cute::tile_to_shape(LayoutAtomQuant{}, shape_B);

-  bool ok =
-      vllm::cutlass_w4a8::unified_encode_int4b(B_ptr, B_packed_ptr, n * k);
-  TORCH_CHECK(ok, "unified_encode_int4b failed");
+  cutlass::unified_encode_int4b(B_ptr, B_packed_ptr, n * k);
  cutlass::reorder_tensor(B_packed_ptr, layout_B, layout_B_reordered);

  return B_packed;
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
@ -146,7 +146,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,

  using ElementAB = typename Gemm::ElementAB;
  using ElementD = typename Gemm::ElementD;
-  using ElementBlockScale = typename Gemm::ElementBlockScale;

  int32_t m = a.size(0), n = b.size(1), k = a.size(1);

@ -167,29 +166,26 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
      ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));

-  auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
-  auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
-  auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr());
-  auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr());
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
+  auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());

-  typename GemmKernel::MainloopArguments mainloop_args{};
-  mainloop_args.layout_SFA = layout_SFA;
-  mainloop_args.layout_SFB = layout_SFB;
-  if (swap_ab) {
-    mainloop_args.ptr_A = b_ptr;
-    mainloop_args.dA = b_stride;
-    mainloop_args.ptr_B = a_ptr;
-    mainloop_args.dB = a_stride;
-    mainloop_args.ptr_SFA = b_scales_ptr;
-    mainloop_args.ptr_SFB = a_scales_ptr;
-  } else {
-    mainloop_args.ptr_A = a_ptr;
-    mainloop_args.dA = a_stride;
-    mainloop_args.ptr_B = b_ptr;
-    mainloop_args.dB = b_stride;
-    mainloop_args.ptr_SFA = a_scales_ptr;
-    mainloop_args.ptr_SFB = b_scales_ptr;
-  }
+  auto mainloop_args = [&](){
+    // layout_SFA and layout_SFB cannot be swapped since they are deduced.
+    if (swap_ab) {
+      return typename GemmKernel::MainloopArguments{
+          b_ptr,        b_stride,   a_ptr,        a_stride,
+          b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
+      };
+    }
+    else {
+      return typename GemmKernel::MainloopArguments{
+          a_ptr,        a_stride,   b_ptr,        b_stride,
+          a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
+      };
+    }
+  }();
  auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);

  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
@ -231,7 +227,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
      } else {
        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
            OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
+            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
            out, a, b, a_scales, b_scales);
      }
@ -245,7 +241,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
      } else {
        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
            OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
+            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
            out, a, b, a_scales, b_scales);
      }
@ -259,7 +255,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
      } else {
          cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
              OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_2, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized2Sm,
+            Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
            out, a, b, a_scales, b_scales);
      }
@ -271,10 +267,10 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
    // TMA epilogue isn't compatible with Swap A/B
    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
        OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
-        Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
+        Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
        out, a, b, a_scales, b_scales);
  }
 }

-}  // namespace vllm
+}  // namespace vllm
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
@ -125,7 +125,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,

  using ElementAB = typename Gemm::ElementAB;
  using ElementD = typename Gemm::ElementD;
-  using ElementBlockScale = typename Gemm::ElementBlockScale;

  int32_t m = a.size(0), n = b.size(1), k = a.size(1);

@ -144,20 +143,17 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
  LayoutSFB layout_SFB = 
      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));

-  auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
-  auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
-  auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr());
-  auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr());
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
+  auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());

-  typename GemmKernel::MainloopArguments mainloop_args{};
-  mainloop_args.ptr_A = a_ptr;
-  mainloop_args.dA = a_stride;
-  mainloop_args.ptr_B = b_ptr;
-  mainloop_args.dB = b_stride;
-  mainloop_args.ptr_SFA = a_scales_ptr;
-  mainloop_args.layout_SFA = layout_SFA;
-  mainloop_args.ptr_SFB = b_scales_ptr;
-  mainloop_args.layout_SFB = layout_SFB;
+  auto mainloop_args = [&](){
+    return typename GemmKernel::MainloopArguments{
+        a_ptr,        a_stride,   b_ptr,        b_stride,
+        a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
+    };
+  }();
  auto prob_shape = cute::make_shape(m, n, k, 1);

  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
@ -115,7 +115,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,

  using ElementAB = typename Gemm::ElementAB;
  using ElementD = typename Gemm::ElementD;
-  using ElementBlockScale = typename Gemm::ElementBlockScale;

  int32_t m = a.size(0), n = b.size(1), k = a.size(1);

@ -136,20 +135,17 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
  LayoutSFB layout_SFB = 
      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));

-  auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
-  auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
-  auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr());
-  auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr());
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
+  auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());

-  typename GemmKernel::MainloopArguments mainloop_args{};
-  mainloop_args.ptr_A = a_ptr;
-  mainloop_args.dA = a_stride;
-  mainloop_args.ptr_B = b_ptr;
-  mainloop_args.dB = b_stride;
-  mainloop_args.ptr_SFA = a_scales_ptr;
-  mainloop_args.layout_SFA = layout_SFA;
-  mainloop_args.ptr_SFB = b_scales_ptr;
-  mainloop_args.layout_SFB = layout_SFB;
+  auto mainloop_args = [&](){
+    return typename GemmKernel::MainloopArguments{
+        a_ptr,        a_stride,   b_ptr,        b_stride,
+        a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
+    };
+  }();
  auto prob_shape = cute::make_shape(m, n, k, 1);

  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
@ -25,10 +25,7 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
      if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) {
        int8_func(c, a, b, a_scales, b_scales, bias);
      } else {
-        int32_t version_num = get_sm_version_num();
-        TORCH_CHECK(
-            false, "Int8 not supported on SM", version_num,
-            ". Use FP8 quantization instead, or run on older arch (SM < 100).");
+        TORCH_CHECK(false, "Int8 not supported for this architecture");
      }
    }
  } else {
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@ -133,4 +133,4 @@ void cutlass_scaled_mm_sm100_fp8_epilogue(torch::Tensor& out,
  }
 }

-}  // namespace vllm
+}  // namespace vllm
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@ -67,9 +67,8 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
                             std::optional<torch::Tensor> const& bias);
 #endif

-#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 ||   \
-    defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \
-    defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120
+#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \
+    defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100
 void get_cutlass_moe_mm_data_caller(
    const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@ -254,7 +253,7 @@ void cutlass_moe_mm(
    bool per_act_token, bool per_out_ch) {
  int32_t version_num = get_sm_version_num();
 #if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
-  if (version_num >= 100 && version_num < 110) {
+  if (version_num >= 100) {
    cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                         expert_offsets, problem_sizes, a_strides, b_strides,
                         c_strides, per_act_token, per_out_ch);
@ -262,7 +261,7 @@ void cutlass_moe_mm(
  }
 #endif
 #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
-  if (version_num >= 90 && version_num < 100) {
+  if (version_num >= 90) {
    cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                        expert_offsets, problem_sizes, a_strides, b_strides,
                        c_strides, per_act_token, per_out_ch);
--- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
+++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
@ -26,46 +26,113 @@
 #include "dispatch_utils.h"

 #include "cuda_utils.h"
-#include "launch_bounds_utils.h"
 #include "nvfp4_utils.cuh"

 namespace vllm {

-// silu in float32
-__device__ __forceinline__ float silu(float x) {
-  return __fdividef(x, (1.f + __expf(-x)));
-}
-
-__device__ __forceinline__ float2 silu2(float2 x) {
-  return make_float2(silu(x.x), silu(x.y));
-}
-
 template <class Type>
-__inline__ __device__ PackedVec<Type> compute_silu_mul(PackedVec<Type>& vec,
-                                                       PackedVec<Type>& vec2) {
+__inline__ __device__ PackedVec<Type> compute_silu(PackedVec<Type>& vec,
+                                                   PackedVec<Type>& vec2) {
  PackedVec<Type> result;
-  using packed_type = typename TypeConverter<Type>::Type;
-
 #pragma unroll
  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) {
-    // silu_mul in float32
    if constexpr (std::is_same_v<Type, half>) {
-      float2 silu_vec = silu2(__half22float2(vec.elts[i]));
-      result.elts[i] =
-          __float22half2_rn(__fmul2_rn(silu_vec, __half22float2(vec2.elts[i])));
+      half2 val(0.5f, 0.5f);
+      half2 t0 = __hmul2(vec.elts[i], val);
+      half2 t1 = __hfma2(h2tanh(t0), val, val);
+      half2 t2 = __hmul2(vec.elts[i], t1);
+      result.elts[i] = __hmul2(t2, vec2.elts[i]);
    } else {
-      float2 silu_vec = silu2(__bfloat1622float2(vec.elts[i]));
-      result.elts[i] = __float22bfloat162_rn(
-          __fmul2_rn(silu_vec, __bfloat1622float2(vec2.elts[i])));
+      __nv_bfloat162 val(0.5f, 0.5f);
+      __nv_bfloat162 t0 = __hmul2(vec.elts[i], val);
+      __nv_bfloat162 t1 = __hfma2(h2tanh(t0), val, val);
+      __nv_bfloat162 t2 = __hmul2(vec.elts[i], t1);
+      result.elts[i] = __hmul2(t2, vec2.elts[i]);
    }
  }
  return result;
 }

+// Quantizes the provided PackedVec into the uint32_t output
+template <class Type, bool UE8M0_SF = false>
+__device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec<Type>& vec,
+                                                  PackedVec<Type>& vec2,
+                                                  float SFScaleVal,
+                                                  uint8_t* SFout) {
+  PackedVec<Type> out_silu = compute_silu(vec, vec2);
+  // Get absolute maximum values among the local 8 values.
+  auto localMax = __habs2(out_silu.elts[0]);
+
+// Local maximum value.
+#pragma unroll
+  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    localMax = __hmax2(localMax, __habs2(out_silu.elts[i]));
+  }
+
+  // Get the absolute maximum among all 16 values (two threads).
+  localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
+  // Get the final absolute maximum values.
+  float vecMax = float(__hmax(localMax.x, localMax.y));
+
+  // Get the SF (max value of the vector / max value of e2m1).
+  // maximum value of e2m1 = 6.0.
+  // TODO: use half as compute data type.
+  float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
+  // 8 bits representation of the SF.
+  uint8_t fp8SFVal;
+  // Write the SF to global memory (STG.8).
+  if constexpr (UE8M0_SF) {
+    // Extract the 8 exponent bits from float32.
+    // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
+    uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
+    fp8SFVal = tmp & 0xff;
+    // Convert back to fp32.
+    reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
+  } else {
+    // Here SFValue is always positive, so E4M3 is the same as UE4M3.
+    __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
+    reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
+    // Convert back to fp32.
+    SFValue = float(tmp);
+  }
+  // Get the output scale.
+  // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
+  //                       reciprocal(SFScaleVal))
+  float outputScale =
+      SFValue != 0 ? reciprocal_approximate_ftz(
+                         SFValue * reciprocal_approximate_ftz(SFScaleVal))
+                   : 0.0f;
+
+  if (SFout) {
+    // Write the SF to global memory (STG.8).
+    *SFout = fp8SFVal;
+  }
+
+  // Convert the input to float.
+  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
+
+#pragma unroll
+  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    if constexpr (std::is_same_v<Type, half>) {
+      fp2Vals[i] = __half22float2(out_silu.elts[i]);
+    } else {
+      fp2Vals[i] = __bfloat1622float2(out_silu.elts[i]);
+    }
+    fp2Vals[i].x *= outputScale;
+    fp2Vals[i].y *= outputScale;
+  }
+
+  // Convert to e2m1 values.
+  uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
+
+  // Write the e2m1 values to global memory.
+  return e2m1Vec;
+}
+
 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
-__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
-    silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
+__global__ void __launch_bounds__(1024, 4)
+    silu_and_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
                             float const* SFScale, uint32_t* out,
                             uint32_t* SFout) {
  using PackedVec = PackedVec<Type>;
@ -93,18 +160,16 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
      // Get the output tensor offset.
      // Same as inOffset because 8 elements are packed into one uint32_t.
      int64_t outOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
+      ;
      auto& out_pos = out[outOffset];

-      // Compute silu and mul
-      PackedVec out_silu_mul = compute_silu_mul(in_vec, in_vec2);
-
      auto sf_out =
          cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                             CVT_FP4_NUM_THREADS_PER_SF>(
              rowIdx, colIdx, numCols, SFout);

-      out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal,
-                                                     sf_out);
+      out_pos = silu_and_cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(
+          in_vec, in_vec2, SFScaleVal, sf_out);
    }
  }
 }
@ -132,15 +197,14 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,  // [..., d]
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
  dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
-  int const numBlocksPerSM =
-      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
+  int const numBlocksPerSM = 2048 / block.x;
  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));

  VLLM_DISPATCH_HALF_TYPES(
      input.scalar_type(), "silu_and_mul_nvfp4_quant_kernel", [&] {
        using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
        auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
-        vllm::silu_mul_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
+        vllm::silu_and_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
            m, n, input_ptr, input_sf_ptr,
            reinterpret_cast<uint32_t*>(output_ptr),
            reinterpret_cast<uint32_t*>(sf_out));
--- a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
+++ b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
@ -14,8 +14,6 @@
 * limitations under the License.
 */

-#include "core/registration.h"
-
 #include <torch/all.h>
 #include <cutlass/arch/arch.h>

@ -420,7 +418,3 @@ void cutlass_fp4_group_mm(
      "12.8 or above.");
 #endif
 }
-
-TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
-  m.impl("cutlass_fp4_group_mm", &cutlass_fp4_group_mm);
-}
--- a/csrc/quantization/fp4/nvfp4_experts_quant.cu
+++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu
@ -26,13 +26,12 @@
 #include "dispatch_utils.h"

 #include "nvfp4_utils.cuh"
-#include "launch_bounds_utils.h"

 namespace vllm {

 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
-__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
+__global__ void __launch_bounds__(512, 4)
    cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
                    float const* SFScale, uint32_t* out, uint32_t* SFout,
                    uint32_t* input_offset_by_experts,
@ -130,7 +129,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))

 // Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
 template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
-__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
+__global__ void __launch_bounds__(1024, 4)
    cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
                    float const* SFScale, uint32_t* out, uint32_t* SFout,
                    uint32_t* input_offset_by_experts,
@ -234,9 +233,8 @@ void quant_impl(void* output, void* output_scale, void* input,
  int const workSizePerRow = k / ELTS_PER_THREAD;
  int const totalWorkSize = m_topk * workSizePerRow;
  dim3 block(std::min(workSizePerRow, 512));
-  // Get number of blocks per SM
-  int const numBlocksPerSM =
-      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
+  // Get number of blocks per SM (assume we can fully utilize the SM).
+  int const numBlocksPerSM = 2048 / block.x;
  dim3 grid(std::min(static_cast<int>((totalWorkSize + block.x - 1) / block.x),
                     multiProcessorCount * numBlocksPerSM));
  while (grid.x <= multiProcessorCount && block.x > 64) {
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@ -26,14 +26,13 @@
 #include "dispatch_utils.h"

 #include "cuda_utils.h"
-#include "launch_bounds_utils.h"
 #include "nvfp4_utils.cuh"

 namespace vllm {

 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
-__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
+__global__ void __launch_bounds__(512, 4)
    cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
                    float const* SFScale, uint32_t* out, uint32_t* SFout) {
  using PackedVec = PackedVec<Type>;
@ -76,9 +75,8 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale,
  // Grid, Block size.
  // Each thread converts 8 values.
  dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
-  // Get number of blocks per SM
-  int const numBlocksPerSM =
-      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
+  // Get number of blocks per SM (assume we can fully utilize the SM).
+  int const numBlocksPerSM = 2048 / block.x;
  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));

  // Launch the cvt kernel.
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Simon Mo	01efc7ef78	[ci] fix wheel names for arm wheels (#24898 ) Signed-off-by: simon-mo <simon.mo@hey.com>	2025-10-07 13:40:13 -07:00
Michael Goin	26b999c71a	[CI Failure] Fix test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe (#24750 ) Signed-off-by: mgoin <mgoin64@gmail.com>	2025-09-13 09:30:00 -07:00
Wentao Ye	da3fa78dc9	[Compilation Bug] Fix Inductor Graph Output with Shape Issue (#24772 ) Signed-off-by: yewentao256 <zhyanwentao@126.com>	2025-09-12 23:03:56 -07:00
Maximilien de Bayser	bbb70036cb	Enable conversion of multimodal models to pooling tasks (#24451 ) Signed-off-by: Max de Bayser <mbayser@br.ibm.com>	2025-09-12 23:02:15 -07:00
Tao He	89da8d9d09	[Qwen3Next] Fixes the cuda graph capture conditions under large batch sizes (#24660 ) (#24667 ) Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>	2025-09-12 23:01:49 -07:00
Elvir Crnčević	01085b134d	[Qwen3-Next] MoE configs for H100 TP=1,2 and TP2/EP (#24739 ) Signed-off-by: elvircrn <elvircrn@gmail.com>	2025-09-12 23:01:25 -07:00
Nick Hill	66160a9943	[BugFix] Fix Qwen3-Next PP (#24709 ) Signed-off-by: Nick Hill <nhill@redhat.com>	2025-09-12 23:00:28 -07:00
Jee Jee Li	eaca762c18	[Qwen3-Next] MoE configs for H20 TP=1,2,4,8 (#24707 ) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>	2025-09-12 23:00:09 -07:00