Compare commits


1 Commit

Commit 728c365e4d: Use uv to install python in Dockerfile (2025-10-02 11:05:47 -04:00)
Signed-off-by: mgoin <mgoin64@gmail.com>
200 changed files with 1809 additions and 3951 deletions
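Most of the hunks below toggle between the two equivalent ways of launching vLLM's OpenAI-compatible server: the `vllm serve` CLI and the module entrypoint `python3 -m vllm.entrypoints.openai.api_server`. A minimal side-by-side sketch of the two forms; the model name and flag values are illustrative and not taken from any single hunk:

```bash
# CLI form: the model is a positional argument
vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \
    --port 8000 \
    -tp 1

# Module-entrypoint form: the model is passed with --model
python3 -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --port 8000 \
    -tp 1
```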

View File

@ -181,14 +181,18 @@ launch_vllm_server() {
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="vllm serve $model \
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="vllm serve $model \
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
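This branch keys off two `jq` calls. A small self-contained illustration of their behavior; the JSON document here is made up for the example:

```bash
common_params='{"fp8": true, "neuralmagic_quantized_model": "nm/some-fp8-model"}'

# -e sets jq's exit status from the result, so the test works directly in `if`.
echo "$common_params" | jq -e 'has("fp8")' >/dev/null && echo "fp8 requested"

# -r prints the raw string value without surrounding quotes.
echo "$common_params" | jq -r '.neuralmagic_quantized_model'   # nm/some-fp8-model
```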

View File

@ -365,7 +365,8 @@ run_serving_tests() {
continue
fi
server_command="$server_envs vllm serve \
server_command="$server_envs python3 \
-m vllm.entrypoints.openai.api_server \
$server_args"
# run the server

View File

@ -1,191 +0,0 @@
#!/bin/bash
# This script builds the Ascend NPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
# Base ubuntu image with basic ascend development libraries and python installed
VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
TEST_RUN_CONFIG_FILE="vllm_test.cfg"
VLLM_ASCEND_TMP_DIR=
# Get the test run configuration file from the vllm-ascend repository
fetch_vllm_test_cfg() {
VLLM_ASCEND_TMP_DIR=$(mktemp -d)
# Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
cleanup() {
rm -rf "${VLLM_ASCEND_TMP_DIR}"
}
trap cleanup EXIT
GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
exit 1
fi
# If the file already exists locally, just overwrite it
cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
# The trap registered above will be overwritten later. By this point its cleanup-on-failure duty
# has been fulfilled, so the temporary directory is removed manually here.
rm -rf "${VLLM_ASCEND_TMP_DIR}"
trap - EXIT
}
# Loads the test run configuration file (fetched above) into the current script environment.
get_config() {
if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
exit 1
fi
source "${TEST_RUN_CONFIG_FILE}"
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
return 0
}
# Get the test run configuration.
fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script.
if [ $? -ne 0 ]; then
exit 1
fi
image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p ${builder_cache_dir}
# Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest=${builder_cache_dir},mode=max \
--progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
# Install pytest and modelscope first so that this docker build cache layer stays valid
RUN --mount=type=cache,target=/root/.cache/pip \
pip install pytest>=6.0 modelscope
WORKDIR /workspace/vllm
# Install vLLM dependencies in advance so that this docker cache layer stays valid as long as common.txt is unchanged.
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements/common.txt
COPY . .
# Install vLLM
RUN --mount=type=cache,target=/root/.cache/pip \
VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton
# Install vllm-ascend
WORKDIR /workspace
ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
ARG VLLM_ASCEND_TAG=main
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
# Install vllm-ascend dependencies in advance so that this docker cache layer stays valid as long as requirements.txt is unchanged.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r /workspace/vllm-ascend/requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV VLLM_USE_MODELSCOPE=True
WORKDIR /workspace/vllm-ascend
CMD ["/bin/bash"]
EOF
# Setup cleanup
remove_docker_container() {
docker rm -f "${container_name}" || true;
docker image rm -f "${image_name}" || true;
docker system prune -f || true;
}
trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1st agent on the atlas-a2-001 host, and it has 2 NPU cards.
# returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() {
local input="$1"
local index cards_num
if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
index="${BASH_REMATCH[1]}"
cards_num="${BASH_REMATCH[2]}"
else
echo "parse error" >&2
return 1
fi
local devices=""
local i=0
while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i ))
devices="$devices --device /dev/davinci${dev_idx}"
((i++))
done
# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output the device args so the caller can capture them.
printf '%s' "$devices"
}
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p ${model_cache_dir}
docker run \
${devices} \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v ${model_cache_dir}:/root/.cache/modelscope \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \
bash -c '
set -e
pytest -v -s tests/e2e/vllm_interface/
'
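As a reference for the deleted script above, here is a sketch of what its agent-name parsing yields for the example name given in its own comments (`atlas-a2-001-1-2cards`); everything below is derived from the logic shown in the script, not from additional sources:

```bash
# BUILDKITE_AGENT_NAME format: {hostname}-{agent_idx}-{npu_card_num}cards
BUILDKITE_AGENT_NAME="atlas-a2-001-1-2cards"

# agent_idx is the second-to-last dash-separated field -> "1"
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "${agent_idx}"   # 1

# parse_and_gen_devices extracts index=1 and cards_num=2, then maps them to
# device indices (index-1)*cards_num + i for i in 0..cards_num-1, i.e. 0 and 1:
#   --device /dev/davinci0 --device /dev/davinci1
```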

View File

@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
vllm serve meta-llama/Llama-2-7b-chat-hf &
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

.github/mergify.yml (vendored, 30 changed lines)
View File

@ -2,7 +2,6 @@ pull_request_rules:
- name: label-documentation
description: Automatically apply documentation label
conditions:
- label != stale
- or:
- files~=^[^/]+\.md$
- files~=^docs/
@ -15,7 +14,6 @@ pull_request_rules:
- name: label-ci-build
description: Automatically apply ci/build label
conditions:
- label != stale
- or:
- files~=^\.github/
- files~=\.buildkite/
@ -32,7 +30,6 @@ pull_request_rules:
- name: label-deepseek
description: Automatically apply deepseek label
conditions:
- label != stale
- or:
- files~=^examples/.*deepseek.*\.py
- files~=^tests/.*deepseek.*\.py
@ -49,7 +46,6 @@ pull_request_rules:
- name: label-frontend
description: Automatically apply frontend label
conditions:
- label != stale
- files~=^vllm/entrypoints/
actions:
label:
@ -59,7 +55,6 @@ pull_request_rules:
- name: label-llama
description: Automatically apply llama label
conditions:
- label != stale
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py
@ -75,7 +70,6 @@ pull_request_rules:
- name: label-multi-modality
description: Automatically apply multi-modality label
conditions:
- label != stale
- or:
- files~=^vllm/multimodal/
- files~=^tests/multimodal/
@ -89,7 +83,6 @@ pull_request_rules:
- name: label-new-model
description: Automatically apply new-model label
conditions:
- label != stale
- and:
- files~=^vllm/model_executor/models/
- files=vllm/model_executor/models/registry.py
@ -101,7 +94,6 @@ pull_request_rules:
- name: label-performance
description: Automatically apply performance label
conditions:
- label != stale
- or:
- files~=^benchmarks/
- files~=^vllm/benchmarks/
@ -115,7 +107,6 @@ pull_request_rules:
- name: label-qwen
description: Automatically apply qwen label
conditions:
- label != stale
- or:
- files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py
@ -130,7 +121,6 @@ pull_request_rules:
- name: label-gpt-oss
description: Automatically apply gpt-oss label
conditions:
- label != stale
- or:
- files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py
@ -152,7 +142,6 @@ pull_request_rules:
- name: label-rocm
description: Automatically apply rocm label
conditions:
- label != stale
- or:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm
@ -173,7 +162,6 @@ pull_request_rules:
- name: label-structured-output
description: Automatically apply structured-output label
conditions:
- label != stale
- or:
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py
@ -193,7 +181,6 @@ pull_request_rules:
- name: label-speculative-decoding
description: Automatically apply speculative-decoding label
conditions:
- label != stale
- or:
- files~=^vllm/v1/spec_decode/
- files~=^tests/v1/spec_decode/
@ -209,7 +196,6 @@ pull_request_rules:
- name: label-v1
description: Automatically apply v1 label
conditions:
- label != stale
- or:
- files~=^vllm/v1/
- files~=^tests/v1/
@ -222,7 +208,6 @@ pull_request_rules:
description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions
conditions:
- label != stale
- or:
- files~=tpu.py
- files~=_tpu
@ -238,7 +223,6 @@ pull_request_rules:
description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions
conditions:
- label != stale
- and:
- -files~=tpu.py
- -files~=_tpu
@ -253,7 +237,6 @@ pull_request_rules:
- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
- label != stale
- or:
- files~=^tests/tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
@ -272,9 +255,8 @@ pull_request_rules:
- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- label != stale
- conflict
- -closed
- conflict
- -closed
actions:
label:
add:
@ -288,8 +270,6 @@ pull_request_rules:
- name: assign reviewer for tensorizer changes
conditions:
- label != stale
- or:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
@ -301,7 +281,6 @@ pull_request_rules:
- name: assign reviewer for modelopt changes
conditions:
- label != stale
- or:
- files~=^vllm/model_executor/layers/quantization/modelopt\.py$
- files~=^vllm/model_executor/layers/quantization/__init__\.py$
@ -316,8 +295,8 @@ pull_request_rules:
- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- -conflict
- -closed
- -conflict
- -closed
actions:
label:
remove:
@ -326,7 +305,6 @@ pull_request_rules:
- name: label-kv-connector
description: Automatically apply kv-connector label
conditions:
- label != stale
- or:
- files~=^examples/online_serving/disaggregated[^/]*/.*
- files~=^examples/offline_inference/disaggregated[^/]*/.*

View File

@ -667,7 +667,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
endif()

View File

@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
w8a8_triton_block_scaled_mm,
w8a8_block_fp8_matmul,
)
from vllm.utils import FlexibleArgumentParser, cdiv
@ -158,7 +158,7 @@ def bench_fp8(
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
),
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
),
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(

View File

@ -55,7 +55,9 @@ benchmark() {
output_len=$2
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
@ -63,7 +65,9 @@ benchmark() {
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \

View File

@ -38,12 +38,16 @@ wait_for_server() {
launch_chunked_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \
--max-model-len 10000 \
--enable-chunked-prefill \
--gpu-memory-utilization 0.6 &
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \
--max-model-len 10000 \
--enable-chunked-prefill \
@ -58,14 +62,18 @@ launch_chunked_prefill() {
launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
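The script also defines a `wait_for_server` helper, visible only as context in the hunk header above. A minimal sketch of such a readiness check, assuming the OpenAI-compatible server exposes the standard `/health` endpoint on the given port (the timeout value is illustrative):

```bash
wait_for_server() {
  local port=$1
  # Poll /health until the server answers or the timeout expires.
  timeout 1200 bash -c "
    until curl -sf http://localhost:${port}/health > /dev/null; do
      sleep 1
    done"
}
```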

View File

@ -1,174 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random
import time
import torch
from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random,
)
logger = init_logger(__name__)
@torch.inference_mode()
def run_benchmark(
num_tokens: int,
num_heads: int,
head_size: int,
block_size: int,
num_blocks: int,
dtype: torch.dtype,
kv_cache_dtype: str,
num_iters: int,
benchmark_mode: str,
device: str = "cuda",
) -> float:
"""Return latency (seconds) for given num_tokens."""
if kv_cache_dtype == "fp8" and head_size % 16:
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
current_platform.seed_everything(42)
torch.set_default_device(device)
# create random key / value tensors [T, H, D].
key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
value = torch.randn_like(key)
# prepare the slot mapping.
# each token is assigned a unique slot in the KV-cache.
num_slots = block_size * num_blocks
if num_tokens > num_slots:
raise ValueError("num_tokens cannot exceed the total number of cache slots")
slot_mapping_lst = random.sample(range(num_slots), num_tokens)
slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
key_caches, value_caches = create_kv_caches_with_random(
num_blocks,
block_size,
1, # num_layers
num_heads,
head_size,
kv_cache_dtype,
dtype,
device=device,
)
key_cache, value_cache = key_caches[0], value_caches[0]
# to free unused memory
del key_caches, value_caches
# compute per-kernel scaling factors for fp8 conversion (if used).
k_scale = (key.amax() / 64.0).to(torch.float32)
v_scale = (value.amax() / 64.0).to(torch.float32)
function_under_test = lambda: ops.reshape_and_cache(
key, # noqa: F821
value, # noqa: F821
key_cache, # noqa: F821
value_cache, # noqa: F821
slot_mapping, # noqa: F821
kv_cache_dtype,
k_scale,
v_scale,
)
if benchmark_mode == "cudagraph":
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.cuda.synchronize()
function_under_test = lambda: g.replay()
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
function_under_test()
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
# warm-up
run_cuda_benchmark(3)
lat = run_cuda_benchmark(num_iters)
# free tensors to mitigate OOM when sweeping
del key, value, key_cache, value_cache, slot_mapping
torch.cuda.empty_cache()
return lat
def main(args):
rows = []
for exp in range(1, 17):
n_tok = 2**exp
lat = run_benchmark(
num_tokens=n_tok,
num_heads=args.num_heads,
head_size=args.head_size,
block_size=args.block_size,
num_blocks=args.num_blocks,
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
kv_cache_dtype=args.kv_cache_dtype,
num_iters=args.iters,
benchmark_mode=args.mode,
device="cuda",
)
rows.append([n_tok, lat * 1e6]) # convert to microseconds
print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser.add_argument("--num-heads", type=int, default=128)
parser.add_argument(
"--head-size",
type=int,
choices=[64, 80, 96, 112, 120, 128, 192, 256],
default=128,
)
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
parser.add_argument("--num-blocks", type=int, default=128 * 128)
parser.add_argument(
"--dtype",
type=str,
choices=["half", "bfloat16", "float"],
default="bfloat16",
)
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=["auto", "fp8"],
default="auto",
)
parser.add_argument("--iters", type=int, default=200)
parser.add_argument(
"--mode",
type=str,
choices=["cudagraph", "no_graph"],
default="cudagraph",
)
args = parser.parse_args()
main(args)

View File

@ -9,7 +9,7 @@ import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8,
w8a8_triton_block_scaled_mm,
w8a8_block_fp8_matmul,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import (
@ -63,7 +63,7 @@ def benchmark_shape(m: int,
# === vLLM Triton Implementation ===
def vllm_triton_gemm():
return w8a8_triton_block_scaled_mm(A_vllm,
return w8a8_block_fp8_matmul(A_vllm,
B_vllm,
A_scale_vllm,
B_scale_vllm,

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -17,6 +17,8 @@
#include <algorithm>
#include <cassert>
#include <cfloat> // FLT_MIN
#include <map>
#include <vector>
#ifdef USE_ROCM
#include <hip/hip_bf16.h>
@ -208,20 +210,6 @@ void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
namespace vllm {
// Used to copy/convert one element
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
struct CopyWithScaleOp {
float scale;
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst = static_cast<OutT>(src);
} else {
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
}
}
};
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void reshape_and_cache_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
@ -237,51 +225,59 @@ __global__ void reshape_and_cache_kernel(
const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx];
if (slot_idx < 0) {
// Padding token that should be ignored.
return;
}
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
const int h_block_count = head_size / x; // head_size//x
const int h_block_idx = threadIdx.x;
if (h_block_idx >= num_heads * h_block_count) {
return;
}
const int n = num_heads * head_size;
for (int i = threadIdx.x; i < n; i += blockDim.x) {
const int64_t src_key_idx = token_idx * key_stride + i;
const int64_t src_value_idx = token_idx * value_stride + i;
const int head_idx = h_block_idx / h_block_count;
const int h_block = h_block_idx % h_block_count;
const int head_idx = i / head_size;
const int head_offset = i % head_size;
const int x_idx = head_offset / x;
const int x_offset = head_offset % x;
const scalar_t* __restrict__ key_src =
key + token_idx * key_stride + head_idx * head_size + h_block * x;
const int64_t src_value_start =
token_idx * value_stride + head_idx * head_size + h_block * x;
cache_t* __restrict__ key_dst =
key_cache + block_idx * num_heads * h_block_count * block_size * x +
head_idx * h_block_count * block_size * x + h_block * block_size * x +
block_offset * x;
const int64_t tgt_value_start =
block_idx * num_heads * h_block_count * x * block_size +
head_idx * h_block_count * x * block_size + h_block * x * block_size +
block_offset;
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op);
const scalar_t* __restrict__ value_src = value + src_value_start;
cache_t* __restrict__ value_dst = value_cache + tgt_value_start;
#pragma unroll
for (int i = 0; i < x; i++) {
v_op(value_dst[i * block_size], value_src[i]);
const int64_t tgt_key_idx =
block_idx * num_heads * (head_size / x) * block_size * x +
head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
block_offset * x + x_offset;
const int64_t tgt_value_idx =
block_idx * num_heads * head_size * block_size +
head_idx * head_size * block_size + head_offset * block_size +
block_offset;
scalar_t tgt_key = key[src_key_idx];
scalar_t tgt_value = value[src_value_idx];
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
key_cache[tgt_key_idx] = tgt_key;
value_cache[tgt_value_idx] = tgt_value;
} else {
key_cache[tgt_key_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
value_cache[tgt_value_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
}
}
}
// Used by vectorization_utils to copy/convert one element
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
struct CopyWithScaleOp {
float scale;
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst = static_cast<OutT>(src);
} else {
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
}
}
};
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void reshape_and_cache_flash_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
@ -428,80 +424,84 @@ __global__ void concat_and_cache_ds_mla_kernel(
const int64_t dst_idx_start =
block_idx * block_stride + block_offset * entry_stride;
// For the NoPE part, each tile of 128 elements is handled by half of one warp
// (16 threads). There are 4 total tiles, so 2 warps (64 threads).
// Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
// The RoPE part (last 64 elements) is handled by another 1 warp (32 threads).
// So in total, we use 3 warps (96 threads) per block.
// Create 4 tile scales in shared memory
__shared__ float smem[20];
float* shard_abs_max = smem;
float* tile_scales = smem + 16;
// For the NoPE part, each tile of 128 elements is handled by 4 warps
// (128 threads). There are 4 total tiles, so 16 warps (512 threads).
// The first thread of the first warp in each tile writes the scale
// value for the tile. The RoPE part (last 64 elements) is handled
// by another 2 warps (64 threads).
// So in total, we use 18 warps (576 threads) per block.
// Cast kv_cache to 16_bit for RoPE values
scalar_t* kv_cache_16bit =
reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);
// The last warp handles the RoPE part
if (threadIdx.x >= 64) {
// Each thread handles two elements of RoPE
const int8_t pe_idx_start = (threadIdx.x - 64) * 2;
const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start;
// Vectorized load of two 16-bit values, performed as one 32-bit load
const int32_t vals = *reinterpret_cast<const int32_t*>(&k_pe[src_idx]);
// The last 64 threads handle the RoPE part
if (threadIdx.x >= kv_lora_rank) {
const int8_t pe_idx = threadIdx.x - kv_lora_rank;
const int64_t src_idx = token_idx * k_pe_stride + pe_idx;
// RoPE values start after the packed 8-bit NoPE values and the
// 32-bit scales
const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start;
// Vectorized store of two 16-bit values, performed as one 32-bit store
*reinterpret_cast<int32_t*>(&kv_cache_16bit[dst_idx]) = vals;
const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx;
kv_cache_16bit[dst_idx] = k_pe[src_idx];
return;
}
// The first two warps handle the NoPE part
const int8_t warp_idx = threadIdx.x >> 5;
const int8_t lane_idx = threadIdx.x & 31;
const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4);
// Determine the scale for each chunk of NoPE
const int16_t tile_idx = threadIdx.x >> 7;
const int16_t warp_idx = (threadIdx.x & 127) >> 5;
const int16_t lane_idx = threadIdx.x & 31;
// Each thread handles 8 elements of NoPE
// Load the NoPE elements for this thread into registers
const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8);
// Vectorized load of eight 16-bit values, performed as an int4 load
const int4 vals_i4 = *reinterpret_cast<const int4*>(&kv_c[src_idx_start]);
const scalar_t* vals = reinterpret_cast<const scalar_t*>(&vals_i4);
// Load the NoPE element for this thread into registers
const int64_t src_idx = token_idx * kv_c_stride + threadIdx.x;
const scalar_t src_val = kv_c[src_idx];
// Max absolute value of this thread's elements
float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])),
fmaxf(fabsf(vals[2]), fabsf(vals[3]))),
fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])),
fmaxf(fabsf(vals[6]), fabsf(vals[7]))));
// Warp-level reduction to find the max absolute value in each half-warp
// Warp-level reduction to find the max absolute value in the warp
float max_abs = fabsf(src_val);
#pragma unroll
for (int offset = 8; offset > 0; offset /= 2) {
max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16));
for (int offset = 16; offset > 0; offset /= 2) {
#ifdef USE_ROCM
max_abs = fmaxf(max_abs, __shfl_down_sync(UINT64_MAX, max_abs, offset));
#else
max_abs = fmaxf(max_abs, __shfl_down_sync(0xFFFFFFFF, max_abs, offset));
#endif
}
// Compute the scale for the tile
float tile_scale = max_abs / 448.f;
// The first lane of each warp in each tile writes the max_abs of this part
// of the tile to shared memory
if (lane_idx == 0) {
shard_abs_max[tile_idx * 4 + warp_idx] = max_abs;
}
__syncthreads();
// The first lane of each half-warp writes the scale to kv_cache
if ((lane_idx == 0) || (lane_idx == 16)) {
// The first lane of the first warp in each tile computes the scale for the
// tile and writes it to shared memory and to kv_cache
if (warp_idx == 0 && lane_idx == 0) {
float4 shard_abs_max_vec =
reinterpret_cast<float4*>(shard_abs_max)[tile_idx];
float tile_scale = fmaxf(fmaxf(shard_abs_max_vec.x, shard_abs_max_vec.y),
fmaxf(shard_abs_max_vec.z, shard_abs_max_vec.w)) /
448.f;
// Avoid division by zero in `scaled_convert`
tile_scales[tile_idx] = fmaxf(tile_scale, FLT_MIN);
float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
kv_cache_32bit[dst_idx] = tile_scale;
kv_cache_32bit[dst_idx] = tile_scales[tile_idx];
}
// Now all threads in the block scale and write their elements
// NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes)
const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8);
__syncthreads();
uint8_t result[8];
#pragma unroll
for (int i = 0; i < 8; i++) {
result[i] =
fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
vals[i], tile_scale);
}
// Store as aligned 64-bit writes
*reinterpret_cast<uint64_t*>(&kv_cache[dst_idx_base]) =
*reinterpret_cast<const uint64_t*>(result);
// Now all threads in the block scale and write their element
const float scale_val = tile_scales[tile_idx];
const int64_t dst_idx = dst_idx_start + threadIdx.x;
kv_cache[dst_idx] =
fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
src_val, scale_val);
}
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
@ -606,10 +606,9 @@ void reshape_and_cache(
int key_stride = key.stride(0);
int value_stride = value.stride(0);
int head_div_x = head_size / x;
dim3 grid(num_tokens);
dim3 block(std::min(num_heads * head_div_x, 512));
dim3 block(std::min(num_heads * head_size, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -742,12 +741,13 @@ void concat_and_cache_mla(
if (kv_cache_dtype == "fp8_ds_mla") {
dim3 grid(num_tokens);
// For the NoPE part, each tile of 128 elements is handled by half of one
// warp (16 threads). There are 4 total tiles, so 2 warps (64 threads).
// Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
// The RoPE part (last 64 elements) is handled by another 1 warp (32
// threads). So in total, we use 3 warps (96 threads) per block.
dim3 block(96);
// For the NoPE part, each tile of 128 elements is handled by 4 warps
// (128 threads). There are 4 total tiles, so 16 warps (512 threads).
// The first thread of the first warp in each tile writes the scale
// value for the tile. The RoPE part (last 64 elements) is handled
// by another 2 warps (64 threads).
// So in total, we use 18 warps (576 threads) per block.
dim3 block(576);
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
CALL_CONCAT_AND_CACHE_DS_MLA);
} else {

View File

@ -254,7 +254,7 @@ void cutlass_moe_mm(
bool per_act_token, bool per_out_ch) {
int32_t version_num = get_sm_version_num();
#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
if (version_num >= 100 && version_num < 110) {
if (version_num >= 100) {
cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, per_act_token, per_out_ch);
@ -262,7 +262,7 @@ void cutlass_moe_mm(
}
#endif
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
if (version_num >= 90 && version_num < 100) {
if (version_num >= 90) {
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, per_act_token, per_out_ch);

View File

@ -14,8 +14,6 @@
* limitations under the License.
*/
#include "core/registration.h"
#include <torch/all.h>
#include <cutlass/arch/arch.h>
@ -420,7 +418,3 @@ void cutlass_fp4_group_mm(
"12.8 or above.");
#endif
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("cutlass_fp4_group_mm", &cutlass_fp4_group_mm);
}

View File

@ -397,7 +397,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor a_blockscale, Tensor b_blockscales, Tensor alphas,"
" Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()",
{stride_tag});
// conditionally compiled so impl registration is in source file
ops.impl("cutlass_fp4_group_mm", torch::kCUDA, &cutlass_fp4_group_mm);
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
// quantization, as well as bias

View File

@ -13,13 +13,8 @@ ARG PYTHON_VERSION=3.12
# private registries that use a different repository naming conventions.
#
# Example:
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# Important: We build with an old version of Ubuntu to maintain broad
# compatibility with other Linux OSes. The main reason for this is that the
# glibc version is baked into the distro, and binaries built with one glibc
# version are not backwards compatible with OSes that use an earlier version.
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# TODO: Restore to base image after FlashInfer AOT wheel fixed
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
@ -80,20 +75,15 @@ ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ENV DEBIAN_FRONTEND=noninteractive
ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL
# Install system dependencies and uv, then create Python virtual environment
# Install minimal dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo python3-pip \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
&& ln -s /opt/venv/bin/python3 /usr/bin/python3 \
&& ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \
&& ln -s /opt/venv/bin/pip /usr/bin/pip \
&& python3 --version && python3 -m pip --version
&& apt-get install -y ccache software-properties-common git curl sudo
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@ -101,9 +91,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Activate virtual environment and add uv to PATH
ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"
# Install uv and Python
COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
RUN uv python install ${PYTHON_VERSION} --default --verbose
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@ -132,7 +122,7 @@ WORKDIR /workspace
COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
uv pip install --system -r requirements/cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# cuda arch list used by torch
@ -162,7 +152,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
ENV UV_LINK_MODE=copy
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
uv pip install --system -r requirements/build.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
COPY . .
@ -259,7 +249,7 @@ COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
uv pip install --system -r requirements/dev.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### DEV IMAGE ####################
@ -286,32 +276,12 @@ ARG GET_PIP_URL
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies
# Install minimal dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
mkdir -p -m 0755 /etc/apt/keyrings ; \
curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
fi ; \
else \
for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done ; \
fi \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
&& apt-get install -y ccache software-properties-common git curl wget sudo vim \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@ -319,9 +289,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# Install uv and Python
COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
RUN uv python install ${PYTHON_VERSION} --default --verbose
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@ -555,5 +525,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"]
FROM vllm-openai-base AS vllm-openai
ENTRYPOINT ["vllm", "serve"]
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
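The Dockerfile hunks above switch between provisioning Python through a uv-managed virtual environment and installing it with the standalone uv binaries. As a hedged summary, the uv-based flow shown in these hunks boils down to the following shell steps; the Python version is the Dockerfile's `PYTHON_VERSION` build arg (3.12 by default), and the requirements file is one of those copied in the hunks:

```bash
# Get the uv binaries; the Dockerfile instead copies them from the
# ghcr.io/astral-sh/uv:0.8.22 image.
curl -LsSf https://astral.sh/uv/install.sh | sh

# Install a uv-managed CPython and make it the default interpreter.
uv python install 3.12 --default

# Install requirements into that interpreter; the Dockerfile variant also
# passes a CUDA-specific --extra-index-url derived from CUDA_VERSION.
uv pip install --system -r requirements/cuda.txt
```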

View File

@ -177,4 +177,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl
ENTRYPOINT ["vllm", "serve"]
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -314,4 +314,4 @@ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["vllm", "serve"]
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -309,4 +309,4 @@ USER 2000
WORKDIR /home/vllm
# Set the default entrypoint
ENTRYPOINT ["vllm", "serve"]
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -69,4 +69,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
ENTRYPOINT ["vllm", "serve"]
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

Binary image file changed (not shown): 119 KiB before, 127 KiB after.

View File

@ -661,7 +661,8 @@ Benchmark the performance of multi-modal requests in vLLM.
Start vLLM:
```bash
vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"image": 1}' \
--allowed-local-media-path /path/to/sharegpt4v/images
@ -687,7 +688,8 @@ vllm bench serve \
Start vLLM:
```bash
vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"video": 1}' \
--allowed-local-media-path /path/to/sharegpt4video/videos

View File

@ -258,21 +258,17 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
target_width, target_height = \
self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None
return {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images,
overrides=image_overrides)
num_images=num_images)
}
```
@ -442,20 +438,16 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict:
target_width, target_height = \
self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None
return {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images,
overrides=image_overrides)
num_images=num_images)
}
```

View File

@ -39,7 +39,8 @@ Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example
```bash
VLLM_TORCH_PROFILER_DIR=./vllm_profile \
vllm serve meta-llama/Meta-Llama-3-70B
python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Meta-Llama-3-70B
```
vllm bench command:

View File

@ -19,7 +19,8 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
1. Start the vLLM server with the supported chat completion model, e.g.
```bash
vllm serve mistralai/Mistral-7B-Instruct-v0.2
python -m vllm.entrypoints.openai.api_server \
--model mistralai/Mistral-7B-Instruct-v0.2
```
1. Call it with AutoGen:

View File

@ -20,7 +20,7 @@ To get started with Open WebUI using vLLM, follow these steps:
For example:
```console
vllm serve <model> --host 0.0.0.0 --port 8000
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000
```
3. Start the Open WebUI Docker container:

View File

@ -32,7 +32,6 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
ports: 8081 # Expose to internet traffic.
envs:
PYTHONUNBUFFERED: 1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
@ -48,8 +47,9 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
run: |
conda activate vllm
echo 'Starting vllm api server...'
vllm serve $MODEL_NAME \
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log &
@ -131,7 +131,6 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
ports: 8081 # Expose to internet traffic.
envs:
PYTHONUNBUFFERED: 1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
@ -147,8 +146,9 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
run: |
conda activate vllm
echo 'Starting vllm api server...'
vllm serve $MODEL_NAME \
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log
@ -243,7 +243,6 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
ports: 8081 # Expose to internet traffic.
envs:
PYTHONUNBUFFERED: 1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
@ -259,8 +258,9 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
run: |
conda activate vllm
echo 'Starting vllm api server...'
vllm serve $MODEL_NAME \
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log

View File

@ -69,11 +69,6 @@ Sometimes you may see the API server entrypoint used directly instead of via the
python -m vllm.entrypoints.openai.api_server --model <model>
```
!!! warning
`python -m vllm.entrypoints.openai.api_server` is deprecated
and may become unsupported in a future release.
That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.

View File

@ -8,9 +8,6 @@ This page teaches you how to pass multi-modal inputs to [multi-modal models][sup
!!! tip
When serving multi-modal models, consider setting `--allowed-media-domains` to restrict the domains that vLLM can access, preventing it from reaching arbitrary endpoints that may be vulnerable to Server-Side Request Forgery (SSRF) attacks. You can provide a list of domains for this arg. For example: `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`
Also, consider setting `VLLM_MEDIA_URL_ALLOW_REDIRECTS=0` to prevent HTTP redirects from being followed to bypass domain restrictions.
This restriction is especially important if you run vLLM in a containerized environment where the vLLM pods may have unrestricted access to internal networks.
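Combining the two mitigations from this tip in a single launch might look like the following; the model placeholder follows the convention used elsewhere on this page:

```bash
VLLM_MEDIA_URL_ALLOW_REDIRECTS=0 vllm serve <model> \
    --allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com
```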
## Offline Inference

View File

@ -64,7 +64,8 @@ To enable sleep mode in a vLLM server you need to initialize it with the flag `V
When using the flag `VLLM_SERVER_DEV_MODE=1` you enable development endpoints, and these endpoints should not be exposed to users.
```bash
VLLM_SERVER_DEV_MODE=1 vllm serve Qwen/Qwen3-0.6B \
VLLM_SERVER_DEV_MODE=1 python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen3-0.6B \
--enable-sleep-mode \
--port 8000
```
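Once the server is up with `VLLM_SERVER_DEV_MODE=1`, the sleep-mode development endpoints can be exercised with plain HTTP calls. A hedged sketch, assuming the endpoints are `POST /sleep` and `POST /wake_up` as in recent vLLM releases; verify the exact paths against your version:

```bash
# Put the engine to sleep (level 1 offloads weights to CPU memory).
curl -X POST 'http://localhost:8000/sleep?level=1'

# Wake the engine up again before sending further inference requests.
curl -X POST 'http://localhost:8000/wake_up'
```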

View File

@ -48,9 +48,10 @@ The following code configures vLLM in an offline mode to use speculative decodin
To perform the same with an online mode launch the server:
```bash
vllm serve facebook/opt-6.7b \
python -m vllm.entrypoints.openai.api_server \
--host 0.0.0.0 \
--port 8000 \
--model facebook/opt-6.7b \
--seed 42 \
-tp 1 \
--gpu_memory_utilization 0.8 \

View File

@ -67,7 +67,8 @@ docker run -it \
XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. **Pipeline parallel** is supported on a single node with `mp` as the backend. A reference invocation looks like the following:
```bash
vllm serve facebook/opt-13b \
python -m vllm.entrypoints.openai.api_server \
--model=facebook/opt-13b \
--dtype=bfloat16 \
--max_model_len=1024 \
--distributed-executor-backend=mp \

View File

@ -17,12 +17,12 @@ These models are what we list in [supported-text-models][supported-text-models]
### Transformers
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <1% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
Currently, the Transformers backend works for the following:
- Modalities: embedding models, language models and vision-language models*
- Architectures: encoder-only, decoder-only, mixture-of-experts
- Architectures: encoder-only, decoder-only
- Attention types: full attention and/or sliding attention
_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
@ -31,7 +31,6 @@ If the Transformers model implementation follows all the steps in [writing a cus
- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
- Any combination of the following vLLM parallelisation schemes:
- Data parallel
- Pipeline parallel
- Tensor parallel
@ -677,7 +676,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ |

View File

@ -66,9 +66,6 @@ Restrict domains that vLLM can access for media URLs by setting
`--allowed-media-domains` to prevent Server-Side Request Forgery (SSRF) attacks.
(e.g. `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`)
Also, consider setting `VLLM_MEDIA_URL_ALLOW_REDIRECTS=0` to prevent HTTP
redirects from being followed to bypass domain restrictions.
## Security and Firewalls: Protecting Exposed vLLM Systems
While vLLM is designed to allow unsafe network services to be isolated to

View File

@ -576,7 +576,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
# Intern-S1
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
model_name = "internlm/Intern-S1-mini"
model_name = "internlm/Intern-S1"
engine_args = EngineArgs(
model=model_name,

View File

@ -309,7 +309,7 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "internlm/Intern-S1-mini"
model_name = "internlm/Intern-S1"
engine_args = EngineArgs(
model=model_name,

View File

@ -21,4 +21,4 @@ while IFS='=' read -r key value; do
done < <(env | grep "^${PREFIX}")
# Pass the collected arguments to the main entrypoint
exec vllm serve "${ARGS[@]}"
exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}"

View File

@ -1,2 +1,2 @@
lmcache
nixl >= 0.6.0 # Required for disaggregated prefill
nixl >= 0.5.1 # Required for disaggregated prefill

View File

@ -11,8 +11,8 @@ import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from tests.v1.attention.utils import _Backend
from vllm import LLM, SamplingParams
from vllm.attention.backends.registry import _Backend
from vllm.attention.selector import global_force_attn_backend_context_manager
from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
PassConfig)

View File

@ -8,11 +8,11 @@ import torch._dynamo
from tests.compile.backend import LazyInitPass, TestBackend
from tests.models.utils import check_outputs_equal
from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata)
from vllm import LLM, SamplingParams
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
from vllm.attention import Attention, AttentionMetadata
from vllm.attention.backends.registry import _Backend
from vllm.attention.selector import global_force_attn_backend_context_manager
from vllm.compilation.fusion import QUANT_OPS
from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass

View File

@ -756,7 +756,7 @@ class VllmRunner:
def get_inputs(
self,
prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
prompts: Union[list[str], list[torch.Tensor], list[int]],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,

View File

@ -86,16 +86,3 @@ def test_max_model_len():
# It can be less if generation finishes due to other reasons (e.g., EOS)
# before reaching the absolute model length limit.
assert num_total_tokens <= max_model_len
def test_log_stats():
llm = LLM(
model=MODEL_NAME,
disable_log_stats=False,
gpu_memory_utilization=0.10,
enforce_eager=True, # reduce test time
)
outputs = llm.generate(PROMPTS, sampling_params=None)
# disable_log_stats is False, every output should have metrics
assert all(output.metrics is not None for output in outputs)

View File

@ -122,9 +122,6 @@ def mock_serving_setup():
models,
request_logger=None)
serving_completion._process_inputs = AsyncMock(return_value=(MagicMock(
name="engine_request"), {}))
return mock_engine, serving_completion

View File

@ -698,22 +698,6 @@ async def test_function_calling_required(client: OpenAI, model_name: str):
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_system_message_with_tools(client: OpenAI, model_name: str):
from vllm.entrypoints.harmony_utils import get_system_message
# Test with custom tools enabled - commentary channel should be available
sys_msg = get_system_message(with_custom_tools=True)
valid_channels = sys_msg.content[0].channel_config.valid_channels
assert "commentary" in valid_channels
# Test with custom tools disabled - commentary channel should be removed
sys_msg = get_system_message(with_custom_tools=False)
valid_channels = sys_msg.content[0].channel_config.valid_channels
assert "commentary" not in valid_channels
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_full_history(client: OpenAI, model_name: str):


@ -7,7 +7,7 @@ import asyncio
from contextlib import suppress
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Optional
from unittest.mock import AsyncMock, MagicMock
from unittest.mock import MagicMock
import pytest
import pytest_asyncio
@ -230,7 +230,6 @@ class MockHFConfig:
@dataclass
class MockModelConfig:
task = "generate"
runner_type = "generate"
tokenizer = MODEL_NAME
trust_remote_code = False
tokenizer_mode = "auto"
@ -245,33 +244,11 @@ class MockModelConfig:
encoder_config = None
generation_config: str = "auto"
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
skip_tokenizer_init = False
def get_diff_sampling_param(self):
return self.diff_sampling_param or {}
def _build_serving_chat(engine: AsyncLLM,
model_config: MockModelConfig) -> OpenAIServingChat:
models = OpenAIServingModels(engine_client=engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=model_config)
serving_chat = OpenAIServingChat(engine,
model_config,
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
async def _fake_process_inputs(request_id, engine_prompt, sampling_params,
*, lora_request, trace_headers, priority):
return dict(engine_prompt), {}
serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
return serving_chat
@dataclass
class MockEngine:
@ -305,7 +282,16 @@ async def test_serving_chat_returns_correct_model_name():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
serving_chat = _build_serving_chat(mock_engine, MockModelConfig())
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=MockModelConfig())
serving_chat = OpenAIServingChat(mock_engine,
MockModelConfig(),
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
messages = [{"role": "user", "content": "what is 1+1?"}]
async def return_model_name(*args):
@ -332,7 +318,16 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
serving_chat = _build_serving_chat(mock_engine, MockModelConfig())
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=MockModelConfig())
serving_chat = OpenAIServingChat(mock_engine,
MockModelConfig(),
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
req = ChatCompletionRequest(
model=MODEL_NAME,
@ -366,7 +361,16 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.errored = False
# Initialize the serving chat
serving_chat = _build_serving_chat(mock_engine, mock_model_config)
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=mock_model_config)
serving_chat = OpenAIServingChat(mock_engine,
mock_model_config,
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
# Test Case 1: No max_tokens specified in request
req = ChatCompletionRequest(
@ -411,7 +415,16 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.errored = False
# Initialize the serving chat
serving_chat = _build_serving_chat(mock_engine, mock_model_config)
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=mock_model_config)
serving_chat = OpenAIServingChat(mock_engine,
mock_model_config,
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
# Test case 1: No max_tokens specified, defaults to context_window
req = ChatCompletionRequest(
@ -458,7 +471,16 @@ async def test_serving_chat_could_load_correct_generation_config():
mock_engine.errored = False
# Initialize the serving chat
serving_chat = _build_serving_chat(mock_engine, mock_model_config)
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=mock_model_config)
serving_chat = OpenAIServingChat(mock_engine,
mock_model_config,
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
req = ChatCompletionRequest(
model=MODEL_NAME,
@ -503,7 +525,17 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
serving_chat = _build_serving_chat(mock_engine, mock_model_config)
# Initialize the serving chat
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=mock_model_config)
serving_chat = OpenAIServingChat(mock_engine,
mock_model_config,
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
# Test cache_salt
req = ChatCompletionRequest(
@ -517,12 +549,10 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
# By default, cache_salt in the engine prompt is not set
with suppress(Exception):
await serving_chat.create_chat_completion(req)
engine_prompt = serving_chat._process_inputs.await_args_list[0].args[1]
assert "cache_salt" not in engine_prompt
assert "cache_salt" not in mock_engine.generate.call_args.args[0]
# Test with certain cache_salt
req.cache_salt = "test_salt"
with suppress(Exception):
await serving_chat.create_chat_completion(req)
engine_prompt = serving_chat._process_inputs.await_args_list[1].args[1]
assert engine_prompt.get("cache_salt") == "test_salt"
assert mock_engine.generate.call_args.args[0]["cache_salt"] == "test_salt"


@ -1,129 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from contextlib import AsyncExitStack
from unittest.mock import AsyncMock, MagicMock
import pytest
import pytest_asyncio
from vllm.entrypoints.context import ConversationContext
from vllm.entrypoints.openai.protocol import ResponsesRequest
from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses
from vllm.entrypoints.tool_server import ToolServer
class MockConversationContext(ConversationContext):
"""Mock conversation context for testing"""
def __init__(self):
self.init_tool_sessions_called = False
self.init_tool_sessions_args = None
self.init_tool_sessions_kwargs = None
def append_output(self, output) -> None:
pass
async def call_tool(self):
return []
def need_builtin_tool_call(self) -> bool:
return False
def render_for_completion(self):
return []
async def init_tool_sessions(self, tool_server, exit_stack, request_id,
mcp_tools):
self.init_tool_sessions_called = True
self.init_tool_sessions_args = (tool_server, exit_stack, request_id,
mcp_tools)
async def cleanup_session(self) -> None:
pass
@pytest.fixture
def mock_serving_responses():
"""Create a mock OpenAIServingResponses instance"""
serving_responses = MagicMock(spec=OpenAIServingResponses)
serving_responses.tool_server = MagicMock(spec=ToolServer)
return serving_responses
@pytest.fixture
def mock_context():
"""Create a mock conversation context"""
return MockConversationContext()
@pytest.fixture
def mock_exit_stack():
"""Create a mock async exit stack"""
return MagicMock(spec=AsyncExitStack)
class TestInitializeToolSessions:
"""Test class for _initialize_tool_sessions method"""
@pytest_asyncio.fixture
async def serving_responses_instance(self):
"""Create a real OpenAIServingResponses instance for testing"""
# Create minimal mocks for required dependencies
engine_client = MagicMock()
engine_client.get_model_config = AsyncMock()
model_config = MagicMock()
model_config.hf_config.model_type = "test"
model_config.get_diff_sampling_param.return_value = {}
models = MagicMock()
tool_server = MagicMock(spec=ToolServer)
# Create the actual instance
instance = OpenAIServingResponses(
engine_client=engine_client,
model_config=model_config,
models=models,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
tool_server=tool_server,
)
return instance
@pytest.mark.asyncio
async def test_initialize_tool_sessions(self, serving_responses_instance,
mock_context, mock_exit_stack):
"""Test that method works correctly with only MCP tools"""
request = ResponsesRequest(input="test input", tools=[])
# Call the method
await serving_responses_instance._initialize_tool_sessions(
request, mock_context, mock_exit_stack)
assert mock_context.init_tool_sessions_called is False
# Create only MCP tools
tools = [
{
"type": "web_search_preview"
},
{
"type": "code_interpreter",
"container": {
"type": "auto"
}
},
]
request = ResponsesRequest(input="test input", tools=tools)
# Call the method
await serving_responses_instance._initialize_tool_sessions(
request, mock_context, mock_exit_stack)
# Verify that init_tool_sessions was called
assert mock_context.init_tool_sessions_called


@ -10,9 +10,8 @@ from unittest.mock import patch
import pytest
import torch
from vllm.attention.backends.registry import _Backend
from vllm.attention.layer import MultiHeadAttention
from vllm.attention.selector import _cached_get_attn_backend
from vllm.attention.selector import _Backend, _cached_get_attn_backend
from vllm.platforms import current_platform
from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform


@ -11,7 +11,7 @@ from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
native_w8a8_block_matmul)
from vllm.config import VllmConfig
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
cutlass_scaled_mm, per_token_group_quant_fp8, w8a8_triton_block_scaled_mm)
cutlass_scaled_mm, per_token_group_quant_fp8, w8a8_block_fp8_matmul)
from vllm.platforms import current_platform
from vllm.utils import has_deep_gemm
from vllm.utils.deep_gemm import (fp8_gemm_nt,
@ -91,8 +91,7 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
out_dtype)
out = w8a8_triton_block_scaled_mm(A_fp8, B_fp8, As, Bs, block_size,
out_dtype)
out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
rel_diff = (torch.mean(
torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /


@ -20,11 +20,9 @@ from vllm.platforms import current_platform
(8, 513, 64), # Non-divisible (native only)
])
@pytest.mark.parametrize("seed", [42])
@pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode()
def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
group_size: int, seed: int,
use_ue8m0: bool) -> None:
group_size: int, seed: int) -> None:
"""Test QuantFP8 group quantization with various configurations.
Tests both CUDA and native implementations, column-major scales,
@ -40,8 +38,7 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
group_shape = GroupShape(1, group_size)
quant_op = QuantFP8(static=False,
group_shape=group_shape,
column_major_scales=False,
use_ue8m0=use_ue8m0)
column_major_scales=False)
# 1. Test native implementation (always available)
x_quant_native, scales_native = quant_op.forward_native(x.clone())
@ -51,15 +48,9 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
# 2. Test column-major scales configuration
quant_op_col = QuantFP8(static=False,
group_shape=group_shape,
column_major_scales=True,
use_ue8m0=use_ue8m0)
column_major_scales=True)
_, scales_col = quant_op_col.forward_native(x.clone())
assert scales_col.shape == (batch_size, expected_num_groups)
assert scales_col.stride(0) == 1
assert scales_col.stride(1) == batch_size
# Test column-major scales consistency
assert torch.allclose(scales_col, scales_native, rtol=1e-9, atol=1e-8)
assert scales_col.shape == (expected_num_groups, batch_size)
# 3. Test CUDA implementation (only for divisible dimensions)
if is_divisible:
@ -77,23 +68,21 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
@pytest.mark.parametrize("seed", [42])
@pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode()
def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
def test_quantfp8_group_multidimensional(seed: int) -> None:
current_platform.seed_everything(seed)
group_size = 64
# Test with 3D input
batch1, batch2, hidden_dim = 4, 8, 1024
batch1, batch2, hidden_dim = 4, 8, 512
x_3d = torch.randn(
(batch1, batch2, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8
group_shape = GroupShape(1, group_size)
quant_op = QuantFP8(static=False,
group_shape=group_shape,
column_major_scales=False,
use_ue8m0=use_ue8m0)
column_major_scales=False)
x_quant, scales = quant_op.forward_native(x_3d.clone())
assert x_quant.shape == x_3d.shape
@ -102,10 +91,9 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
# Test column_major_scales with multi-dim
quant_op_col = QuantFP8(static=False,
group_shape=group_shape,
column_major_scales=True,
use_ue8m0=use_ue8m0)
column_major_scales=True)
_, scales_col = quant_op_col.forward_native(x_3d.clone())
assert scales_col.shape == (batch1, batch2, hidden_dim // group_size)
assert scales_col.shape == (batch1, hidden_dim // group_size, batch2)
# Test with 4D input
batch1, batch2, batch3, hidden_dim = 2, 3, 4, 256
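Note: the assertions in this file's tests revolve around how per-group FP8 scales are laid out. The following is a minimal sketch of the two layouts being compared, illustrative only and not the vLLM kernel; FP8_MAX = 448.0 is assumed here to be the e4m3 maximum.

import torch

FP8_MAX = 448.0  # assumed e4m3 max, used only for this illustration
batch_size, hidden_dim, group_size = 4, 256, 64
num_groups = hidden_dim // group_size

x = torch.randn(batch_size, hidden_dim, dtype=torch.float32)
# Dynamic per-group scales: amax over each group along the hidden dimension.
scales = x.abs().reshape(batch_size, num_groups, group_size).amax(dim=-1) / FP8_MAX
assert scales.shape == (batch_size, num_groups)   # row-major layout

# A column-major layout stores the same values with the leading dims swapped,
# which is what the differing shape/stride assertions above distinguish.
scales_col = scales.t().contiguous()
assert scales_col.shape == (num_groups, batch_size)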


@ -15,10 +15,10 @@ from torch._prims_common import TensorLikeType
from tests.kernels.quant_utils import native_w8a8_block_matmul
from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
from vllm.attention.backends.registry import _Backend
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe.utils import (
moe_kernel_quantize_input)
from vllm.platforms.interface import _Backend
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)


@ -17,6 +17,8 @@ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
from vllm.model_executor.layers.layernorm import (RMSNorm,
dispatch_rocm_rmsnorm_func,
fused_add_rms_norm, rms_norm)
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
cutlass_scaled_mm, dispatch_w8a8_blockscale_func, w8a8_block_fp8_matmul)
from vllm.platforms import current_platform
RMS_NORM_SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
@ -109,6 +111,34 @@ def test_enabled_ops_invalid(env: str):
RMSNorm(1024).enabled()
@pytest.mark.skipif(
not current_platform.is_rocm() or not current_platform.is_fp8_fnuz(),
reason="AITER is a feature exclusive for ROCm and FP8_FNUZ")
@pytest.mark.parametrize("use_cutlass", [True, False])
@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
@pytest.mark.parametrize("use_rocm_aiter_gemm_w8a8_blockscale", ["0", "1"])
def test_w8a8_blockscale_dispatch(use_cutlass: bool, use_rocm_aiter: str,
use_rocm_aiter_gemm_w8a8_blockscale: str,
monkeypatch):
monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR",
use_rocm_aiter_gemm_w8a8_blockscale)
use_aiter_and_is_supported = (bool(int(use_rocm_aiter)) and bool(
int(use_rocm_aiter_gemm_w8a8_blockscale)))
block_scale_func = dispatch_w8a8_blockscale_func(
use_cutlass, use_aiter_and_is_supported=use_aiter_and_is_supported)
if use_cutlass:
assert block_scale_func == cutlass_scaled_mm
elif current_platform.is_rocm() and int(use_rocm_aiter) and int(
use_rocm_aiter_gemm_w8a8_blockscale):
assert block_scale_func == (
torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale)
else:
assert block_scale_func == w8a8_block_fp8_matmul
@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
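Note: the assertions in test_w8a8_blockscale_dispatch above pin down the expected kernel selection order. A purely schematic restatement of that dispatch logic, returning names rather than the real callables from vLLM's fp8_utils:

def pick_w8a8_blockscale_kernel(use_cutlass: bool,
                                use_aiter_and_is_supported: bool) -> str:
    # Mirrors the test's expectations: CUTLASS first, then the ROCm AITER
    # block-scale GEMM, otherwise the Triton/native block-fp8 matmul.
    if use_cutlass:
        return "cutlass_scaled_mm"
    if use_aiter_and_is_supported:
        return "rocm_aiter_gemm_w8a8_blockscale"
    return "w8a8_block_fp8_matmul"

assert pick_w8a8_blockscale_kernel(True, False) == "cutlass_scaled_mm"
assert pick_w8a8_blockscale_kernel(False, True) == "rocm_aiter_gemm_w8a8_blockscale"
assert pick_w8a8_blockscale_kernel(False, False) == "w8a8_block_fp8_matmul"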


@ -240,12 +240,12 @@ def test_distributed_correctness(
num_logprobs: int,
) -> None:
with vllm_runner(model, tensor_parallel_size=1,
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
max_num_seqs=2) as vllm_model:
vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
with vllm_runner(model, tensor_parallel_size=2,
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
max_num_seqs=2) as vllm_model:
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)


@ -12,8 +12,6 @@ from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from vllm.config import ModelConfig
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
ImageDummyOptions, VideoDummyOptions)
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs
@ -114,26 +112,12 @@ def _test_processing_correctness(
processing_info = factories.info(ctx)
supported_mm_limits = processing_info.get_supported_mm_limits()
# Keep integer limits for local data generation
limit_mm_per_prompt_ints = {
limit_mm_per_prompt = {
modality: 3 if limit is None else limit
for modality, limit in supported_mm_limits.items()
}
def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
if modality == "video":
return VideoDummyOptions(count=count)
if modality == "image":
return ImageDummyOptions(count=count)
if modality == "audio":
return AudioDummyOptions(count=count)
return BaseDummyOptions(count=count)
# Assign normalized DummyOptions to the model config
model_config.get_multimodal_config().limit_per_prompt = {
modality: _to_dummy_options(modality, count)
for modality, count in limit_mm_per_prompt_ints.items()
}
model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
baseline_processor = factories.build_processor(ctx, cache=None)
cached_processor = factories.build_processor(ctx, cache=cache)
@ -166,7 +150,7 @@ def _test_processing_correctness(
k:
[(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
for _ in range(rng.randint(limit + 1))]
for k, limit in limit_mm_per_prompt_ints.items()
for k, limit in limit_mm_per_prompt.items()
}
mm_counts = {k: len(vs) for k, vs in mm_data.items()}


@ -17,23 +17,23 @@ def test_profiling(model_id: str, max_model_len: int):
model_config_kwargs = {
"max_model_len": max_model_len,
}
mm_counts = {"image": 1}
ctx = build_model_context(
model_id,
model_config_kwargs=model_config_kwargs,
limit_mm_per_prompt=mm_counts,
limit_mm_per_prompt={"image": 1},
)
mm_config = ctx.get_mm_config()
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
profiler = MultiModalProfiler(processor)
decoder_dummy_data = profiler.get_decoder_dummy_data(
max_model_len,
mm_counts=mm_counts,
mm_counts=mm_config.limit_per_prompt,
)
dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
max_model_len,
mm_counts=mm_counts,
mm_counts=mm_config.limit_per_prompt,
)
hf_config = ctx.get_hf_config(Llama4Config)
@ -58,7 +58,7 @@ def test_profiling(model_id: str, max_model_len: int):
profiled_tokens = profiler.get_mm_max_contiguous_tokens(
max_model_len,
mm_counts=mm_counts,
mm_counts=mm_config.limit_per_prompt,
)
assert total_tokens == profiled_tokens["image"]


@ -15,8 +15,6 @@ from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
ImageDummyOptions, VideoDummyOptions)
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel)
@ -238,20 +236,7 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
modality: 3 if limit is None else limit
for modality, limit in supported_mm_limits.items()
}
def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
if modality == "video":
return VideoDummyOptions(count=count)
if modality == "image":
return ImageDummyOptions(count=count)
if modality == "audio":
return AudioDummyOptions(count=count)
return BaseDummyOptions(count=count)
model_config.get_multimodal_config().limit_per_prompt = {
modality: _to_dummy_options(modality, count)
for modality, count in limit_mm_per_prompt.items()
}
model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
processor = factories.build_processor(ctx, cache=None)
with initialize_dummy_model(model_cls, model_config) as model:


@ -37,5 +37,4 @@ def test_multimodal_processor(model_id):
hf_processor_mm_kwargs={},
)
assert (str_processed_inputs["prompt_token_ids"]
== ids_processed_inputs["prompt_token_ids"])
assert str_processed_inputs["prompt"] == ids_processed_inputs["prompt"]


@ -661,10 +661,6 @@ _TRANSFORMERS_BACKEND_MODELS = {
"TransformersForSequenceClassification": _HfExamplesInfo("papluca/xlm-roberta-base-language-detection", min_transformers_version="4.57.0.dev0"), # noqa: E501
"TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501
"TransformersForMultimodalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
"TransformersMoEForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924", min_transformers_version="4.57.0.dev0"), # noqa: E501
"TransformersMoEForMultimodalLM": _HfExamplesInfo("Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="4.57.0.dev0"), # noqa: E501
"TransformersMoEEmbeddingModel": _HfExamplesInfo("Qwen/Qwen3-30B-A3B", min_transformers_version="4.57.0.dev0"), # noqa: E501
"TransformersMoEForSequenceClassification": _HfExamplesInfo("Qwen/Qwen3-30B-A3B", min_transformers_version="4.57.0.dev0"), # noqa: E501
}
_EXAMPLE_MODELS = {


@ -66,7 +66,6 @@ def check_implementation(
[
("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
("hmellor/Ilama-3.2-1B", "auto"), # CUSTOM CODE
("allenai/OLMoE-1B-7B-0924", "transformers"), # MoE
]) # trust_remote_code=True by default
def test_models(
hf_runner: type[HfRunner],
@ -75,14 +74,6 @@ def test_models(
model: str,
model_impl: str,
) -> None:
import transformers
from packaging.version import Version
installed = Version(transformers.__version__)
required = Version("4.57.0.dev0")
if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
pytest.skip("MoE models with the Transformers backend require "
f"transformers>={required}, but got {installed}")
check_implementation(hf_runner,
vllm_runner,
example_prompts,


@ -430,26 +430,17 @@ def dummy_hf_overrides(
update_dict = {
"num_layers": num_layers,
"num_experts": num_experts,
"num_experts_per_tok": 2,
"num_local_experts": num_experts,
# Otherwise there will not be any expert layers
"first_k_dense_replace": 0,
# To avoid OOM on DeepSeek-V3
"n_routed_experts": num_experts,
# For Gemma-3n
"num_kv_shared_layers": 1,
}
class DummyConfig:
hf_text_config = text_config
# Only set MoE related config when the model has MoE layers.
# Otherwise all models detected as MoE by _get_transformers_backend_cls.
if ModelConfig.get_num_experts(DummyConfig) > 0:
update_dict.update({
"num_experts": num_experts,
"num_experts_per_tok": 2,
"num_local_experts": num_experts,
# Otherwise there will not be any expert layers
"first_k_dense_replace": 0,
# To avoid OOM on DeepSeek-V3
"n_routed_experts": num_experts,
})
# Update num_hidden_layers for non-Longcat architectures
if model_arch != "LongcatFlashForCausalLM" \
and model_arch != "LongCatFlashMTPModel":


@ -3,7 +3,6 @@
import json
import os
from typing import Optional
import pytest
@ -21,10 +20,9 @@ os.environ["FLASHINFER_NVCC_THREADS"] = "16"
dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4}
def can_initialize(model: str, extra_args: Optional[list[str]] = None):
def can_initialize(model: str, extra_args: list[str]):
# Server arguments
extra_args = extra_args if extra_args is not None else []
server_args = [
"--max-model-len",
"2048",
@ -67,7 +65,7 @@ def test_llama4_fp8_tensor_moe_flashinfer_cutlass(
monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", [])
@pytest.mark.skip(reason="Works, but takes too long to run")
@ -75,21 +73,21 @@ def test_llama4_fp8_tensor_moe_flashinfer_trtllm(
monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", [])
@pytest.mark.skip(reason="Works, but takes too long to run")
def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", [])
@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", [])
## DeepSeekV3 ##
@ -97,37 +95,21 @@ def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
can_initialize("deepseek-ai/DeepSeek-V3.1")
@pytest.mark.skip(reason=("Known issue: lack of kernel support. "
"Expected failure: assert self.block_quant is None"))
def test_deepseek_fp8_block_moe_flashinfer_cutlass(
monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
can_initialize("deepseek-ai/DeepSeek-V3.1")
def test_deepseek_fp8_block_moe_flashinfer_trtllm(
monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
can_initialize("deepseek-ai/DeepSeek-V3.1")
can_initialize("deepseek-ai/DeepSeek-V3.1", [])
def test_deepseek_nvfp4_moe_flashinfer_cutlass(
monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", [])
@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", [])
## GPT-OSS ##
@ -135,16 +117,16 @@ def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
can_initialize("openai/gpt-oss-20b")
can_initialize("openai/gpt-oss-20b", [])
def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(
monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
can_initialize("openai/gpt-oss-20b")
can_initialize("openai/gpt-oss-20b", [])
def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(
monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
can_initialize("openai/gpt-oss-20b")
can_initialize("openai/gpt-oss-20b", [])


@ -18,9 +18,6 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
W8A8BlockFp8LinearOp)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
cutlass_fp4_supported)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
@ -745,35 +742,3 @@ def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt,
perplexity = llm.generate_prompt_perplexity([prompt])[0]
print(perplexity)
assert perplexity <= exp_perplexity
def test_compressed_tensors_fp8_block_enabled(vllm_runner):
model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
with vllm_runner(model_path) as llm:
fp8_dtype = current_platform.fp8_dtype()
def check_model(model):
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8)
assert isinstance(qkv_proj.scheme.w8a8_block_fp8_linear,
W8A8BlockFp8LinearOp)
assert qkv_proj.weight.dtype is fp8_dtype
assert qkv_proj.weight_scale.dtype is torch.float32
assert len(qkv_proj.weight.shape) == 2
assert len(qkv_proj.weight_scale.shape) == 2
input_quant_op = \
qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
assert isinstance(input_quant_op, QuantFP8)
assert input_quant_op._forward_method == input_quant_op.forward_cuda
llm.apply_model(check_model)
output = llm.generate_greedy("Hello my name is", max_tokens=20)
assert output


@ -786,43 +786,13 @@ def test_model_specification(parser_with_config, cli_config_file,
parser_with_config.parse_args(['serve', '--config', cli_config_file])
# Test using --model option raises error
# with pytest.raises(
# ValueError,
# match=
# ("With `vllm serve`, you should provide the model as a positional "
# "argument or in a config file instead of via the `--model` option."),
# ):
# parser_with_config.parse_args(['serve', '--model', 'my-model'])
# Test using --model option back-compatibility
# (when back-compatibility ends, the above test should be uncommented
# and the below test should be removed)
args = parser_with_config.parse_args([
'serve',
'--tensor-parallel-size',
'2',
'--model',
'my-model',
'--trust-remote-code',
'--port',
'8001',
])
assert args.model is None
assert args.tensor_parallel_size == 2
assert args.trust_remote_code is True
assert args.port == 8001
args = parser_with_config.parse_args([
'serve',
'--tensor-parallel-size=2',
'--model=my-model',
'--trust-remote-code',
'--port=8001',
])
assert args.model is None
assert args.tensor_parallel_size == 2
assert args.trust_remote_code is True
assert args.port == 8001
with pytest.raises(
ValueError,
match=
("With `vllm serve`, you should provide the model as a positional "
"argument or in a config file instead of via the `--model` option."),
):
parser_with_config.parse_args(['serve', '--model', 'my-model'])
# Test other config values are preserved
args = parser_with_config.parse_args([


@ -8,11 +8,11 @@ import pytest
import torch
from torch.nn.attention.flex_attention import create_block_mask, flex_attention
from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata,
create_standard_kv_cache_spec,
create_vllm_config,
get_attention_backend)
from vllm.attention.backends.registry import _Backend
from vllm.config import ModelConfig
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv, is_torch_equal_or_newer


@ -6,12 +6,12 @@ from typing import Optional, Union
import pytest
import torch
from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata,
create_standard_kv_cache_spec,
create_vllm_config,
get_attention_backend)
from vllm import _custom_ops as ops
from vllm.attention.backends.registry import _Backend
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec


@ -22,7 +22,6 @@ from vllm.utils import cdiv
from vllm.v1.attention.backends.mla.flashmla_sparse import (
FlashMLASparseBackend, FlashMLASparseDecodeAndContextMetadata,
FlashMLASparseImpl, FlashMLASparseMetadata)
from vllm.v1.attention.backends.mla.indexer import split_prefill_chunks
SPARSE_BACKEND_BATCH_SPECS = {
name: BATCH_SPECS[name]
@ -425,24 +424,3 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name,
sdpa_reference,
rtol=0.5,
atol=0.5)
@pytest.mark.parametrize(
"seq_lens,max_buf,start,expected",
[
# Basic split: totals per chunk ≤ max_buf
(torch.tensor([2, 3, 4, 2]), 5, 0, [(0, 2), (2, 3), (3, 4)]),
# Non-zero start index
(torch.tensor([2, 3, 4, 2]), 5, 1, [(1, 2), (2, 3), (3, 4)]),
# Exact fits should split between items when adding the next would
# overflow
(torch.tensor([5, 5, 5]), 5, 0, [(0, 1), (1, 2), (2, 3)]),
# All requests fit in a single chunk
(torch.tensor([1, 1, 1]), 10, 0, [(0, 3)]),
# Large buffer with non-zero start
(torch.tensor([4, 4, 4]), 100, 1, [(1, 3)]),
],
)
def test_split_prefill_chunks(seq_lens, max_buf, start, expected):
out = split_prefill_chunks(seq_lens, max_buf, start)
assert out == expected
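Note: the parametrized cases above fully determine the expected chunking behaviour of split_prefill_chunks. As an illustration only (the real implementation in vllm.v1.attention.backends.mla.indexer may differ), a greedy packing consistent with those expectations looks like this:

import torch

def split_prefill_chunks(seq_lens: torch.Tensor, max_buf: int,
                         start: int) -> list[tuple[int, int]]:
    # Greedily pack consecutive requests [i, j) so the sum of their sequence
    # lengths stays within max_buf; a single request is never split, so an
    # oversized request still forms its own chunk.
    chunks: list[tuple[int, int]] = []
    i, n = start, seq_lens.numel()
    while i < n:
        total, j = 0, i
        while j < n and (j == i or total + int(seq_lens[j]) <= max_buf):
            total += int(seq_lens[j])
            j += 1
        chunks.append((i, j))
        i = j
    return chunks

assert split_prefill_chunks(torch.tensor([2, 3, 4, 2]), 5, 0) == \
    [(0, 2), (2, 3), (3, 4)]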


@ -8,11 +8,10 @@ from typing import Optional, Union
import pytest
import torch
from vllm.attention.backends.registry import _Backend
from vllm.config import (CacheConfig, CompilationConfig, DeviceConfig,
LoadConfig, ModelConfig, ModelDType, ParallelConfig,
SchedulerConfig, VllmConfig)
from vllm.platforms import current_platform
from vllm.platforms import _Backend, current_platform
from vllm.utils import resolve_obj_by_qualname
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec


@ -1,91 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
end-to-end tests for context length corner cases of vLLM v1 model runner
versus HuggingFace's transformers.
This test verifies the following behavior: allow a prefill that fills the
model's maximum context length and then request a single new token.
Test strategy
- Build a textual prompt that tokenizes to exactly ``max_model_len`` tokens.
- Run vLLM generation requesting a single new token (max_tokens=1).
- Run HF generation on the same prompt requesting a single token too.
- Assert both return the same number of generated tokens and the same ids.
"""
import pytest
import torch
from transformers import AutoModelForCausalLM
from tests.models.utils import check_outputs_equal
from tests.utils import create_new_process_for_each_test
from vllm import LLM, SamplingParams
from vllm.inputs import TokensPrompt
@create_new_process_for_each_test()
@pytest.mark.parametrize("model", ["JackFram/llama-160m"])
@pytest.mark.parametrize("max_model_len", [2048])
@pytest.mark.parametrize("max_tokens", [1])
def test_prefill_max_context_length(
model: str,
max_model_len: int,
max_tokens: int,
) -> None:
"""Compare vLLM and HuggingFace when the prompt already fills the
model's maximum context length and we request a single new token.
The test ensures vLLM does not raise the "Sampled token IDs exceed the
max model length" assertion and that both vLLM and HF produce the same
single token when given the same inputs.
"""
# Construct a prompt of size max_model_len
prompt_ids = [[43] * max_model_len]
# Generate max_tokens new tokens deterministically.
sampling_params = [
SamplingParams(max_tokens=max_tokens, temperature=0.0, ignore_eos=True)
]
# --- vLLM generation ---
llm = LLM(
model=model,
tokenizer=model,
max_num_seqs=1,
tensor_parallel_size=1,
)
vllm_token_prompts = [TokensPrompt(prompt_token_ids=prompt_ids[0])]
vllm_results = llm.generate(vllm_token_prompts, sampling_params)
vllm_output_ids = vllm_results[0].outputs[0].token_ids
# --- HuggingFace generation ---
with torch.no_grad():
hf_model = AutoModelForCausalLM.from_pretrained(model)
# HF expects a tensor of input ids shaped (batch, seq_len).
hf_input_tokens = torch.tensor(prompt_ids[0]).unsqueeze(0)
# Generate max_tokens new tokens deterministically.
hf_generated = hf_model.generate(
hf_input_tokens,
do_sample=False,
min_new_tokens=max_tokens,
max_new_tokens=max_tokens,
)
# HF returns the prompt + generated tokens. Slice off the prompt.
hf_output_ids = hf_generated.cpu().tolist()[0][len(prompt_ids[0]):]
# check that vLLM outputs (token ids) match HF outputs
# Note: for simplicity don't pass detokenized string
check_outputs_equal(
outputs_0_lst=[(hf_output_ids, "")],
outputs_1_lst=[(vllm_output_ids, "")],
name_0="hf",
name_1="vllm",
)


@ -123,10 +123,13 @@ def test_structured_output(
if current_platform.is_tpu() and speculative_config:
pytest.skip("TPU does not support speculative decoding")
# Don't use eager execution on TPUs because we want to test for no
# recompilation at runtime
enforce_eager = bool(not current_platform.is_tpu())
# Use a single LLM instance for several scenarios to
# speed up the test suite.
llm = LLM(model=model_name,
enforce_eager=True,
enforce_eager=enforce_eager,
max_model_len=1024,
structured_outputs_config=dict(backend=backend,
disable_any_whitespace=backend


@ -57,26 +57,6 @@ def clear_kv_transfer():
ensure_kv_transfer_shutdown()
def get_default_xfer_telemetry(xferDurationS: float = 1,
postDurationS: float = 1,
totalBytes: int = 1,
descCount: int = 1) -> dict:
class AttributeDict(dict):
__slots__ = ()
__getattr__ = dict.__getitem__
__setattr__ = dict.__setitem__ # type: ignore[assignment]
# We can't instantiate nixlXferTelemetry because it's read only and
# ray env does not have NIXL, so we must fake it
return AttributeDict(
xferDuration=xferDurationS * 1e6, # in us
postDuration=postDurationS * 1e6, # in us
totalBytes=totalBytes,
descCount=descCount,
)
class FakeNixlWrapper:
"""Mock implementation of NixlWrapper for testing.
@ -152,9 +132,6 @@ class FakeNixlWrapper:
def transfer(self, handle: int) -> str:
return "PROC"
def get_xfer_telemetry(self, handle: int) -> dict:
return get_default_xfer_telemetry()
############################################################
# Follow are for changing the behavior during testing.
############################################################
@ -192,11 +169,6 @@ nixl_agent = FakeNixlWrapper
with open(os.path.join(pkg_root, "__init__.py"), "w") as f:
f.write(stub)
# Mock nixlXferTelemetry class
pkg_root2 = os.path.join(td, "nixl", "_bindings")
os.makedirs(pkg_root2, exist_ok=True)
with open(os.path.join(pkg_root2, "__init__.py"), "w") as f:
f.write("class nixlXferTelemetry: pass")
# touch parent package
open(os.path.join(td, "nixl", "__init__.py"), "w").close()
yield td
@ -603,7 +575,7 @@ def test_kv_connector_stats(dist_init):
# Verify stats values are recorded
assert not stats_after_transfer.is_empty()
assert stats_after_transfer.num_successful_transfers == 1
assert stats_after_transfer.data["num_successful_transfers"] == 1
# Verify stats are reset after retrieval
stats_after_reset = connector.get_kv_connector_stats()
@ -627,21 +599,16 @@ def test_kv_connector_stats_aggregation():
# Record different transfers on each worker
# Worker 1: 2 transfers
stats = get_default_xfer_telemetry()
worker1_stats.record_transfer(stats)
worker1_stats.record_transfer(stats)
worker1_stats.record_transfer()
worker1_stats.record_transfer()
# Worker 2: 1 transfer
worker2_stats.record_transfer(stats)
worker2_stats.record_transfer()
# Worker 3: 3 transfers
stats = get_default_xfer_telemetry(xferDurationS=2,
postDurationS=2,
totalBytes=2,
descCount=2)
worker3_stats.record_transfer(stats)
worker3_stats.record_transfer(stats)
worker3_stats.record_transfer(stats)
worker3_stats.record_transfer()
worker3_stats.record_transfer()
worker3_stats.record_transfer()
# Create ModelRunnerOutput instances for each worker
worker_outputs = []
@ -669,12 +636,7 @@ def test_kv_connector_stats_aggregation():
aggregated_output.kv_connector_output.kv_connector_stats
assert isinstance(kv_connector_stats, NixlKVConnectorStats)
# Number of total transfers across all workers.
assert kv_connector_stats.num_successful_transfers == 6
# Logging proc, call reduce() to get CLI-friendly stats.
cli_stats = kv_connector_stats.reduce()
assert cli_stats["Avg xfer time (ms)"] == 1500.0
assert cli_stats["Avg post time (ms)"] == 1500.0
assert cli_stats["Avg number of descriptors"] == 1.5
assert kv_connector_stats.data["num_successful_transfers"] == 6
def test_multi_kv_connector_stats_aggregation():
@ -687,7 +649,6 @@ def test_multi_kv_connector_stats_aggregation():
from dataclasses import dataclass
# Mock a KVConnectorStats class for testing aggregation over connectors.
@dataclass
class FooKVConnectorStats(KVConnectorStats):
@ -715,7 +676,7 @@ def test_multi_kv_connector_stats_aggregation():
if nixl_count > 0:
nixl_stats = NixlKVConnectorStats()
for _ in range(nixl_count):
nixl_stats.record_transfer(get_default_xfer_telemetry())
nixl_stats.record_transfer()
data["NixlConnector"] = nixl_stats
if foo_count > 0:
foo_stats = FooKVConnectorStats()
@ -751,10 +712,8 @@ def test_multi_kv_connector_stats_aggregation():
assert isinstance(kv_connector_stats, MultiKVConnectorStats)
# Validate per-connector totals across workers
assert isinstance(kv_connector_stats["NixlConnector"],
NixlKVConnectorStats)
assert kv_connector_stats["NixlConnector"].num_successful_transfers == 5
assert isinstance(kv_connector_stats["FooConnector"], FooKVConnectorStats)
assert kv_connector_stats["NixlConnector"].data[
"num_successful_transfers"] == 5
assert kv_connector_stats["FooConnector"].data["num_foo_transfers"] == 6
@ -796,8 +755,6 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
"working_dir": working_dir, # ship fake nixl package
"env_vars": {
"VLLM_NIXL_ABORT_REQUEST_TIMEOUT": str(timeout),
# TODO: for ray to carry over, remove once we set
"NIXL_TELEMETRY_ENABLE": "1",
},
}
ray.init(runtime_env=runtime_env)


@ -8,10 +8,10 @@ import pytest
import torch
from tests.utils import get_attn_backend_list_based_on_platform
from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata,
create_standard_kv_cache_spec,
get_attention_backend)
from vllm.attention.backends.registry import _Backend
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, SpeculativeConfig,
VllmConfig)


@ -6,10 +6,10 @@ from unittest import mock
import pytest
import torch
from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata,
create_standard_kv_cache_spec,
get_attention_backend)
from vllm.attention.backends.registry import _Backend
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, SpeculativeConfig,
VllmConfig)


@ -6,10 +6,9 @@ from typing import Optional
import torch
from tests.v1.attention.utils import (create_standard_kv_cache_spec,
from tests.v1.attention.utils import (_Backend, create_standard_kv_cache_spec,
create_vllm_config,
get_attention_backend)
from vllm.attention.backends.registry import _Backend
from vllm.config import ParallelConfig, SpeculativeConfig
from vllm.v1.attention.backends.utils import CommonAttentionMetadata


@ -1,27 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention backend registry"""
import enum
class _Backend(enum.Enum):
FLASH_ATTN = enum.auto()
TRITON_ATTN = enum.auto()
XFORMERS = enum.auto()
ROCM_FLASH = enum.auto()
ROCM_AITER_MLA = enum.auto()
ROCM_AITER_FA = enum.auto() # used for ViT attn backend
TORCH_SDPA = enum.auto()
FLASHINFER = enum.auto()
FLASHINFER_MLA = enum.auto()
TRITON_MLA = enum.auto()
CUTLASS_MLA = enum.auto()
FLASHMLA = enum.auto()
FLASH_ATTN_MLA = enum.auto()
PALLAS = enum.auto()
IPEX = enum.auto()
NO_ATTENTION = enum.auto()
FLEX_ATTENTION = enum.auto()
TREE_ATTN = enum.auto()
ROCM_ATTN = enum.auto()


@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer."""
from typing import Callable, List, Optional
from typing import List, Optional
import torch
import torch.nn as nn
@ -10,7 +10,6 @@ import torch.nn.functional as F
import vllm.envs as envs
from vllm.attention import AttentionType
from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.backends.registry import _Backend
from vllm.attention.selector import backend_name_to_enum, get_attn_backend
from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target
from vllm.config import CacheConfig, get_current_vllm_config
@ -27,7 +26,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape)
from vllm.model_executor.models.vision import get_vit_attn_backend
from vllm.platforms import current_platform
from vllm.platforms import _Backend, current_platform
from vllm.utils import GiB_bytes, direct_register_custom_op
logger = init_logger(__name__)
@ -68,39 +67,9 @@ def check_upstream_fa_availability(dtype: torch.dtype):
) and current_platform.has_device_capability(80):
from transformers.utils import is_flash_attn_2_available
return is_flash_attn_2_available()
if current_platform.is_rocm():
from importlib.util import find_spec
return find_spec("flash_attn") is not None
return False
def maybe_get_vit_flash_attn_backend(
attn_backend: _Backend,
use_upstream_fa: bool) -> tuple[_Backend, Callable]:
if attn_backend != _Backend.FLASH_ATTN and \
attn_backend != _Backend.ROCM_AITER_FA and \
check_upstream_fa_availability(torch.get_default_dtype()):
attn_backend = _Backend.FLASH_ATTN
use_upstream_fa = True
if current_platform.is_rocm() and \
attn_backend == _Backend.FLASH_ATTN:
use_upstream_fa = True
if (attn_backend in {_Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA}):
if attn_backend == _Backend.ROCM_AITER_FA:
from aiter import flash_attn_varlen_func
else:
if use_upstream_fa:
from flash_attn import flash_attn_varlen_func
else:
from vllm.vllm_flash_attn import flash_attn_varlen_func
else:
flash_attn_varlen_func = None
return attn_backend, flash_attn_varlen_func
class Attention(nn.Module, AttentionLayerBase):
"""Attention layer.
@ -440,9 +409,13 @@ class MultiHeadAttention(nn.Module):
# to upstream flash attention if available.
# If vllm native fa is selected, we use it directly.
use_upstream_fa = False
if backend != _Backend.FLASH_ATTN and check_upstream_fa_availability(
dtype):
backend = _Backend.FLASH_ATTN
use_upstream_fa = True
if current_platform.is_xpu():
# currently, only torch_sdpa is supported on xpu
if current_platform.is_rocm() or current_platform.is_xpu():
# currently, only torch_sdpa is supported on rocm/xpu
self.attn_backend = _Backend.TORCH_SDPA
else:
@ -454,25 +427,17 @@ class MultiHeadAttention(nn.Module):
_Backend.FLASH_ATTN,
} else _Backend.TORCH_SDPA
self.attn_backend, self._flash_attn_varlen_func \
= maybe_get_vit_flash_attn_backend(
self.attn_backend,
use_upstream_fa,
)
if (self.attn_backend == _Backend.XFORMERS
and not check_xformers_availability()):
self.attn_backend = _Backend.TORCH_SDPA
self.is_flash_attn_backend = self.attn_backend in {
_Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA
}
# this condition is just to make sure that the
# use_upstream_fa in the log is correct
if current_platform.is_rocm() \
and self.attn_backend == _Backend.FLASH_ATTN:
use_upstream_fa = True
if self.attn_backend == _Backend.FLASH_ATTN:
if use_upstream_fa:
from flash_attn import flash_attn_varlen_func
self._flash_attn_varlen_func = flash_attn_varlen_func
else:
from vllm.vllm_flash_attn import flash_attn_varlen_func
self._flash_attn_varlen_func = flash_attn_varlen_func
logger.info_once(
f"MultiHeadAttention attn_backend: {self.attn_backend}, "
@ -500,7 +465,7 @@ class MultiHeadAttention(nn.Module):
key = torch.repeat_interleave(key, num_repeat, dim=2)
value = torch.repeat_interleave(value, num_repeat, dim=2)
if self.is_flash_attn_backend:
if self.attn_backend == _Backend.FLASH_ATTN:
cu_seqlens_q = torch.arange(0, (bsz + 1) * q_len,
step=q_len,
dtype=torch.int32,
@ -541,6 +506,14 @@ class MultiHeadAttention(nn.Module):
from torch_xla.experimental.custom_kernel import flash_attention
out = flash_attention(query, key, value, sm_scale=self.scale)
out = out.transpose(1, 2)
elif self.attn_backend == _Backend.ROCM_AITER_FA:
from aiter import flash_attn_varlen_func
# ROCm Flash Attention expects (batch, seq, heads, head_dim)
out = flash_attn_varlen_func(query,
key,
value,
softmax_scale=self.scale)
else:
# ViT attention hasn't supported this backend yet
raise NotImplementedError(


@ -11,9 +11,8 @@ import torch
import vllm.envs as envs
from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.backends.registry import _Backend
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.platforms import _Backend, current_platform
from vllm.utils import STR_BACKEND_ENV_VAR, resolve_obj_by_qualname
logger = init_logger(__name__)


@ -235,22 +235,16 @@ class BenchmarkDataset(ABC):
if len(requests) < num_requests:
random.seed(self.random_seed)
needed = num_requests - len(requests)
additional = []
for i in range(needed):
req = deepcopy(random.choice(requests))
additional = deepcopy(
random.choices(requests, k=num_requests - len(requests))
)
for i in range(len(additional)):
req = additional[i]
req.request_id = request_id_prefix + str(len(requests) + i)
additional.append(req)
requests.extend(additional)
logger.info("Oversampled requests to reach %d total samples.",
num_requests)
ids = [req.request_id for req in requests]
if len(ids) != len(set(ids)):
raise ValueError("Duplicate request_id found in the sampled "
"requests. Please ensure that each request_id "
"is unique.")
# -----------------------------------------------------------------------------
# Utility Functions and Global Caches
@ -1157,12 +1151,6 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
help="Do not oversample if the dataset has " \
"fewer samples than num-prompts.",
)
parser.add_argument(
"--skip-chat-template",
action="store_true",
help=
"Skip applying chat template to prompt for datasets that support it.",
)
# group for dataset specific arguments
custom_group = parser.add_argument_group("custom dataset options")
@ -1173,6 +1161,12 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
help=
"Number of output tokens per request, used only for custom dataset.",
)
custom_group.add_argument(
"--custom-skip-chat-template",
action="store_true",
help=
"Skip applying chat template to prompt, used only for custom dataset.",
)
spec_bench_group = parser.add_argument_group("spec bench dataset options")
spec_bench_group.add_argument(
@ -1441,7 +1435,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
num_requests=args.num_prompts,
tokenizer=tokenizer,
output_len=args.custom_output_len,
skip_chat_template=args.skip_chat_template,
skip_chat_template=args.custom_skip_chat_template,
request_id_prefix=args.request_id_prefix,
no_oversample=args.no_oversample,
)
@ -1582,7 +1576,6 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
output_len=args.hf_output_len,
request_id_prefix=args.request_id_prefix,
no_oversample=args.no_oversample,
skip_chat_template=args.skip_chat_template,
**hf_kwargs
)
@ -1822,6 +1815,7 @@ class SpecBench(CustomDataset):
def sample(self, **kwargs) -> list:
# leverage CustomDataset sample
kwargs["skip_chat_template"] = False
return super().sample(**kwargs)
@ -2227,7 +2221,6 @@ class InstructCoderDataset(HuggingFaceDataset):
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
skip_chat_template: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs) -> list:
@ -2243,15 +2236,14 @@ class InstructCoderDataset(HuggingFaceDataset):
)
# apply template
if not skip_chat_template:
prompt = tokenizer.apply_chat_template(
[{
"role": "user",
"content": prompt
}],
add_generation_prompt=True,
tokenize=False,
)
prompt = tokenizer.apply_chat_template(
[{
"role": "user",
"content": prompt
}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
@ -2292,7 +2284,6 @@ class MTBenchDataset(HuggingFaceDataset):
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
skip_chat_template: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
@ -2307,15 +2298,14 @@ class MTBenchDataset(HuggingFaceDataset):
prompt = item["turns"][0]
# apply template
if not skip_chat_template:
prompt = tokenizer.apply_chat_template(
[{
"role": "user",
"content": prompt
}],
add_generation_prompt=True,
tokenize=False,
)
prompt = tokenizer.apply_chat_template(
[{
"role": "user",
"content": prompt
}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
@ -2359,7 +2349,6 @@ class BlazeditDataset(HuggingFaceDataset):
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
skip_chat_template: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
min_distance: float = 0.0,
@ -2383,7 +2372,7 @@ class BlazeditDataset(HuggingFaceDataset):
# template copied from
# https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501
prompt = f"""Given a code file, please apply the change requests and generate the new file.
instruction = f"""Given a code file, please apply the change requests and generate the new file.
Original file:
```python
@ -2396,15 +2385,14 @@ Change request:
Please generate the new code file in the "New file" section below.""" # noqa: E501
# apply template
if not skip_chat_template:
prompt = tokenizer.apply_chat_template(
[{
"role": "user",
"content": prompt
}],
add_generation_prompt=True,
tokenize=False,
)
prompt = tokenizer.apply_chat_template(
[{
"role": "user",
"content": instruction
}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)


@ -20,7 +20,7 @@ from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode,
MultiModalConfig)
from vllm.config.pooler import PoolerConfig
from vllm.config.scheduler import RunnerType
from vllm.config.utils import assert_hashable, config, getattr_iter
from vllm.config.utils import assert_hashable, config
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.transformers_utils.config import (
@ -276,9 +276,7 @@ class ModelConfig:
multimodal_config: Optional[MultiModalConfig] = None
"""Configuration for multimodal model. If `None`, this will be inferred
from the architecture of `self.model`."""
limit_mm_per_prompt: InitVar[Optional[dict[str, Union[int,
dict[str,
int]]]]] = None
limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None
media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None
mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None
mm_processor_cache_gb: InitVar[Optional[float]] = None
@ -669,8 +667,6 @@ class ModelConfig:
def _get_transformers_backend_cls(self) -> str:
"""Determine which Transformers backend class will be used if
`model_impl` is set to `transformers` or `auto`."""
prefix = "Transformers"
prefix += "MoE" if self.get_num_experts() > 1 else ""
# Check if the architecture we're wrapping has defaults
runner = None
convert = None
@ -689,15 +685,15 @@ class ModelConfig:
# Resolve Transformers backend pooling classes
if runner == "pooling":
if convert == "embed":
return prefix + "EmbeddingModel"
return "TransformersEmbeddingModel"
if convert == "classify":
return prefix + "ForSequenceClassification"
return "TransformersForSequenceClassification"
# Resolve Transformers backend generate classes
if self.hf_config != self.hf_text_config:
# If 'hf_text_config' is the same as 'hf_config'. If not, it is
# probably a composite config, i.e. multimodal
return prefix + "ForMultimodalLM"
return prefix + "ForCausalLM"
return "TransformersForMultimodalLM"
return "TransformersForCausalLM"
def using_transformers_backend(self) -> bool:
"""Check if the model is using the Transformers backend class."""
@ -1029,7 +1025,17 @@ class ModelConfig:
self.enforce_eager = True
def _verify_with_expert_parallelism(self) -> None:
num_experts = self.get_num_experts()
num_expert_names = [
"moe_num_experts", # Dbrx
"num_experts", # Jamba
"n_routed_experts", # DeepSeek
"num_local_experts", # Mixtral
]
num_experts = 0
for name in num_expert_names:
num_experts = getattr(self.hf_text_config, name, 0)
if num_experts > 0:
break
if num_experts < 1:
raise ValueError(
"Number of experts in the model must be greater than 0 "
@ -1214,21 +1220,6 @@ class ModelConfig:
num_heads = getattr(self.hf_text_config, "num_attention_heads", 0)
return num_heads // parallel_config.tensor_parallel_size
def get_num_experts(self) -> int:
"""Returns the number of experts in the model."""
num_expert_names = [
"num_experts", # Jamba
"moe_num_experts", # Dbrx
"n_routed_experts", # DeepSeek
"num_local_experts", # Mixtral
]
num_experts = getattr_iter(self.hf_text_config, num_expert_names, 0)
if isinstance(num_experts, list):
# Ernie VL's remote code uses list[int]...
# The values are always the same so we just take the first one.
return num_experts[0]
return num_experts
def get_layers_start_end_indices(
self, parallel_config: ParallelConfig) -> tuple[int, int]:
from vllm.distributed.utils import get_pp_indices
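Both sides of this hunk resolve the expert count by probing a list of architecture-specific attribute names on `hf_text_config` (one side through a `getattr_iter` helper, the other through an explicit loop). A hedged sketch of that probing pattern, using a stand-in config object rather than the real `hf_text_config`:

```python
from types import SimpleNamespace
from typing import Any

# Attribute names used by different architectures, as listed in the hunk.
NUM_EXPERT_NAMES = [
    "num_experts",        # Jamba
    "moe_num_experts",    # Dbrx
    "n_routed_experts",   # DeepSeek
    "num_local_experts",  # Mixtral
]

def get_num_experts(config: Any) -> int:
    """Return the first positive expert count found on ``config``, else 0."""
    for name in NUM_EXPERT_NAMES:
        value = getattr(config, name, 0)
        if isinstance(value, list):
            # Some remote configs store a per-layer list; the values are all
            # equal, so taking the first is enough (mirrors the hunk's comment).
            value = value[0] if value else 0
        if value and value > 0:
            return value
    return 0

# Stand-in for `hf_text_config`; only `n_routed_experts` is set here.
cfg = SimpleNamespace(n_routed_experts=64)
assert get_num_experts(cfg) == 64
assert get_num_experts(SimpleNamespace()) == 0
```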

View File

@ -4,45 +4,15 @@
import hashlib
from collections.abc import Mapping
from dataclasses import field
from typing import Any, Literal, Optional, Union
from typing import Any, Literal, Optional
from pydantic import ConfigDict, Field, field_validator
from pydantic.dataclasses import dataclass
import vllm.envs as envs
from vllm.config.utils import config
@dataclass
class BaseDummyOptions:
"""Base options for generating dummy data during profiling."""
count: int = Field(999, ge=0)
@dataclass(config=ConfigDict(extra="forbid"))
class VideoDummyOptions(BaseDummyOptions):
"""Options for generating dummy video data during profiling."""
num_frames: Optional[int] = Field(None, gt=0)
width: Optional[int] = Field(None, gt=0)
height: Optional[int] = Field(None, gt=0)
@dataclass(config=ConfigDict(extra="forbid"))
class ImageDummyOptions(BaseDummyOptions):
"""Options for generating dummy image data during profiling."""
width: Optional[int] = Field(None, gt=0)
height: Optional[int] = Field(None, gt=0)
@dataclass(config=ConfigDict(extra="forbid"))
class AudioDummyOptions(BaseDummyOptions):
"""Options for generating dummy audio data during profiling."""
length: Optional[int] = Field(None, gt=0)
MMEncoderTPMode = Literal["weights", "data"]
MMCacheType = Literal["shm", "lru"]
DummyOptions = Union[BaseDummyOptions, VideoDummyOptions, ImageDummyOptions,
AudioDummyOptions]
@config
@ -50,22 +20,12 @@ DummyOptions = Union[BaseDummyOptions, VideoDummyOptions, ImageDummyOptions,
class MultiModalConfig:
"""Controls the behavior of multimodal models."""
limit_per_prompt: dict[str, DummyOptions] = field(default_factory=dict)
"""The maximum number of input items and options allowed per
prompt for each modality.
Defaults to 999 for each modality.
limit_per_prompt: dict[str, int] = field(default_factory=dict)
"""The maximum number of input items allowed per prompt for each modality.
Defaults to 1 (V0) or 999 (V1) for each modality.
Legacy format (count only):
{"image": 16, "video": 2}
Configurable format (with options):
{"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512},
"image": {"count": 5, "width": 512, "height": 512}}
Mixed format (combining both):
{"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512,
"height": 512}}
"""
For example, to allow up to 16 images and 2 videos per prompt:
`{"image": 16, "video": 2}`"""
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
@ -124,27 +84,6 @@ class MultiModalConfig:
from each video to be pruned.
"""
@field_validator("limit_per_prompt", mode="before")
@classmethod
def _validate_limit_per_prompt(
cls, value: dict[str, Union[int,
dict[str,
int]]]) -> dict[str, DummyOptions]:
for k, v in value.items():
# Handle legacy format where only count is specified
if isinstance(v, int):
v = {"count": v}
# Convert to the appropriate DummyOptions subclass
if k == "video":
value[k] = VideoDummyOptions(**v)
elif k == "image":
value[k] = ImageDummyOptions(**v)
elif k == "audio":
value[k] = AudioDummyOptions(**v)
else:
value[k] = BaseDummyOptions(**v)
return value
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
@ -167,22 +106,12 @@ class MultiModalConfig:
def get_limit_per_prompt(self, modality: str) -> int:
"""
Get the maximum number of input items allowed per prompt
for the given modality (backward compatible).
for the given modality.
"""
limit_data = self.limit_per_prompt.get(modality)
if limit_data is None:
# Unspecified modality is set to 999 by default
return 999
return limit_data.count
def get_dummy_options(self, modality: str) -> Optional[BaseDummyOptions]:
"""
Get the configurable dummy data options for a modality.
Returns None if no options are configured for this modality.
"""
# All values are now DummyOptions after normalization
return self.limit_per_prompt.get(modality)
return self.limit_per_prompt.get(
modality,
999 if envs.VLLM_USE_V1 else 1,
)
def merge_mm_processor_kwargs(
self,
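One side of this file accepts both a plain count and a per-modality options dict for `limit_per_prompt`, normalizing everything into `*DummyOptions` objects inside `_validate_limit_per_prompt`. A self-contained sketch of that normalization, using plain dataclasses instead of the pydantic dataclasses in the diff:

```python
from dataclasses import dataclass
from typing import Optional, Union

@dataclass
class BaseDummyOptions:
    count: int = 999

@dataclass
class ImageDummyOptions(BaseDummyOptions):
    width: Optional[int] = None
    height: Optional[int] = None

@dataclass
class VideoDummyOptions(BaseDummyOptions):
    num_frames: Optional[int] = None
    width: Optional[int] = None
    height: Optional[int] = None

_OPTION_CLS = {"image": ImageDummyOptions, "video": VideoDummyOptions}

def normalize_limits(value: dict[str, Union[int, dict]]) -> dict[str, BaseDummyOptions]:
    """Accept {"image": 16} and {"video": {"count": 1, "num_frames": 32}} alike."""
    out: dict[str, BaseDummyOptions] = {}
    for modality, v in value.items():
        if isinstance(v, int):          # legacy count-only form
            v = {"count": v}
        cls = _OPTION_CLS.get(modality, BaseDummyOptions)
        out[modality] = cls(**v)
    return out

limits = normalize_limits({"image": 16, "video": {"count": 1, "num_frames": 32}})
assert limits["image"].count == 16 and limits["video"].num_frames == 32
```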

View File

@ -516,23 +516,6 @@ class VllmConfig:
" by VLLM_DEBUG_DUMP_PATH to %s", env_path)
self.compilation_config.debug_dump_path = env_path
def has_blocked_weights():
if self.quant_config is not None:
if hasattr(self.quant_config, "weight_block_size"):
return self.quant_config.weight_block_size is not None
elif hasattr(self.quant_config, "has_blocked_weights"):
return self.quant_config.has_blocked_weights()
return False
# Enable quant_fp8 CUDA ops (TODO disable in follow up)
# On H100 the CUDA kernel is faster than
# native implementation
# https://github.com/vllm-project/vllm/issues/25094
if has_blocked_weights():
custom_ops = self.compilation_config.custom_ops
if "none" not in custom_ops and "-quant_fp8" not in custom_ops:
custom_ops.append("+quant_fp8")
def update_sizes_for_sequence_parallelism(self,
possible_sizes: list) -> list:
# remove the sizes that are not a multiple of tp_size when
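The `has_blocked_weights` helper in this hunk decides whether to force the `+quant_fp8` custom op by duck-typing the quant config. A hedged sketch of that check with stand-in config classes (the real ones live in `vllm.config` and the quantization backends):

```python
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class BlockQuantConfig:              # stand-in for e.g. an FP8 quant config
    weight_block_size: Optional[list[int]] = None

@dataclass
class CompilationConfig:             # stand-in; only the field used here
    custom_ops: list[str] = field(default_factory=list)

def has_blocked_weights(quant_config) -> bool:
    if quant_config is None:
        return False
    if hasattr(quant_config, "weight_block_size"):
        return quant_config.weight_block_size is not None
    if hasattr(quant_config, "has_blocked_weights"):
        return quant_config.has_blocked_weights()
    return False

compilation = CompilationConfig()
if has_blocked_weights(BlockQuantConfig(weight_block_size=[128, 128])):
    # Only append when custom ops are not disabled outright.
    if "none" not in compilation.custom_ops and "-quant_fp8" not in compilation.custom_ops:
        compilation.custom_ops.append("+quant_fp8")
assert compilation.custom_ops == ["+quant_fp8"]
```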

View File

@ -54,7 +54,6 @@ class HTTPConnection:
stream: bool = False,
timeout: Optional[float] = None,
extra_headers: Optional[Mapping[str, str]] = None,
allow_redirects: bool = True,
):
self._validate_http_url(url)
@ -64,8 +63,7 @@ class HTTPConnection:
return client.get(url,
headers=self._headers(**extra_headers),
stream=stream,
timeout=timeout,
allow_redirects=allow_redirects)
timeout=timeout)
async def get_async_response(
self,
@ -73,7 +71,6 @@ class HTTPConnection:
*,
timeout: Optional[float] = None,
extra_headers: Optional[Mapping[str, str]] = None,
allow_redirects: bool = True,
):
self._validate_http_url(url)
@ -82,17 +79,10 @@ class HTTPConnection:
return client.get(url,
headers=self._headers(**extra_headers),
timeout=timeout,
allow_redirects=allow_redirects)
timeout=timeout)
def get_bytes(self,
url: str,
*,
timeout: Optional[float] = None,
allow_redirects: bool = True) -> bytes:
with self.get_response(url,
timeout=timeout,
allow_redirects=allow_redirects) as r:
def get_bytes(self, url: str, *, timeout: Optional[float] = None) -> bytes:
with self.get_response(url, timeout=timeout) as r:
r.raise_for_status()
return r.content
@ -102,10 +92,8 @@ class HTTPConnection:
url: str,
*,
timeout: Optional[float] = None,
allow_redirects: bool = True,
) -> bytes:
async with await self.get_async_response(
url, timeout=timeout, allow_redirects=allow_redirects) as r:
async with await self.get_async_response(url, timeout=timeout) as r:
r.raise_for_status()
return await r.read()
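This hunk threads an `allow_redirects` flag through the synchronous and async `get_*` helpers. A minimal sketch of the synchronous path using the `requests` library; the environment-variable name mirrors the `VLLM_MEDIA_URL_ALLOW_REDIRECTS` toggle added in `envs.py` further down, and the URL is only illustrative:

```python
import os
import requests

# Default-on toggle, parsed the same way as in envs.py below.
ALLOW_REDIRECTS = bool(int(os.getenv("VLLM_MEDIA_URL_ALLOW_REDIRECTS", "1")))

def get_bytes(url: str, *, timeout: float = 5.0,
              allow_redirects: bool = ALLOW_REDIRECTS) -> bytes:
    """Fetch raw bytes, optionally refusing to follow HTTP redirects."""
    with requests.get(url, stream=True, timeout=timeout,
                      allow_redirects=allow_redirects) as r:
        r.raise_for_status()
        return r.content

# data = get_bytes("https://example.com/image.png")  # illustrative URL
```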

View File

@ -4,7 +4,6 @@ import contextlib
import copy
import logging
import math
import os
import queue
import threading
import time
@ -21,7 +20,6 @@ import torch
import zmq
from vllm import envs
from vllm.attention.backends.registry import _Backend
from vllm.attention.selector import backend_name_to_enum, get_attn_backend
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
@ -34,7 +32,7 @@ from vllm.distributed.parallel_state import (
from vllm.distributed.utils import divide
from vllm.forward_context import ForwardContext
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.platforms import _Backend, current_platform
from vllm.utils import make_zmq_path, make_zmq_socket
from vllm.v1.attention.backends.utils import get_kv_cache_layout
from vllm.v1.core.sched.output import SchedulerOutput
@ -55,12 +53,10 @@ logger = init_logger(__name__)
# Lazy import nixl_wrapper to avoid loading nixl_bindings if nixl is not used
try:
from nixl._api import nixl_agent as NixlWrapper
from nixl._bindings import nixlXferTelemetry
logger.info("NIXL is available")
except ImportError:
logger.warning("NIXL is not available")
NixlWrapper = None
nixlXferTelemetry = None
try:
from nixl._api import nixl_agent_config
@ -479,9 +475,6 @@ class NixlConnectorWorker:
self.nixl_backends = \
vllm_config.kv_transfer_config.get_from_extra_config(
"backends", ["UCX"])
# TODO temporary, once nixl allows for telemetry flag in config
# (next release), we can remove this env var.
os.environ["NIXL_TELEMETRY_ENABLE"] = "1"
# Agent.
non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"]
if nixl_agent_config is None:
@ -1181,10 +1174,9 @@ class NixlConnectorWorker:
for handle, _xfer_stime in handles:
xfer_state = self.nixl_wrapper.check_xfer_state(handle)
if xfer_state == "DONE":
# Get telemetry from NIXL
res = self.nixl_wrapper.get_xfer_telemetry(handle)
self.xfer_stats.record_transfer(res)
self.nixl_wrapper.release_xfer_handle(handle)
# TODO (NickLucche) Get from NIXL telemetry once integrated
self.xfer_stats.record_transfer()
elif xfer_state == "PROC":
in_progress = True
continue
@ -1456,25 +1448,15 @@ class NixlKVConnectorStats(KVConnectorStats):
"""Container for transfer performance metrics"""
def __post_init__(self):
if not self.data:
# Empty container init, no data is passed in.
self.reset()
if "num_successful_transfers" not in self.data:
self.data["num_successful_transfers"] = 0
def reset(self):
# Must be serializable
self.data: dict[str, list[float]] = {
"transfer_duration": [],
"post_duration": [],
"bytes_transferred": [],
"num_descriptors": [],
}
self.data = {"num_successful_transfers": 0}
def record_transfer(self, res: nixlXferTelemetry):
# Keep metrics units consistent with rest of the code: time us->s
self.data["transfer_duration"].append(res.xferDuration / 1e6)
self.data["post_duration"].append(res.postDuration / 1e6)
self.data["bytes_transferred"].append(res.totalBytes)
self.data["num_descriptors"].append(res.descCount)
def record_transfer(self):
# TODO: record actual transfer stats when available
self.data["num_successful_transfers"] += 1
def clone_and_reset(self) -> "NixlKVConnectorStats":
old = copy.copy(self)
@ -1482,55 +1464,16 @@ class NixlKVConnectorStats(KVConnectorStats):
return old
def is_empty(self) -> bool:
return self.num_successful_transfers == 0
return self.data["num_successful_transfers"] == 0
def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
if not other.is_empty():
for k, v in other.data.items():
accumulator = self.data[k]
assert isinstance(accumulator, list)
accumulator.extend(v)
self.data["num_successful_transfers"] += other.data[
"num_successful_transfers"]
return self
def reduce(self) -> dict[str, Union[int, float]]:
# Compute compact representative stats suitable for CLI logging
if self.is_empty():
return {
"Num successful transfers": 0,
"Avg xfer time (ms)": 0,
"P90 xfer time (ms)": 0,
"Avg post time (ms)": 0,
"P90 post time (ms)": 0,
"Avg MB per transfer": 0,
"Throughput (MB/s)": 0,
"Avg number of descriptors": 0,
}
xfer_time = np.asarray(self.data["transfer_duration"])
post_time = np.asarray(self.data["post_duration"])
# Convert to MB for CLI logging.
mb = np.asarray(self.data["bytes_transferred"]) / 2**20
descs = np.asarray(self.data["num_descriptors"], dtype=np.uint32)
n = len(descs)
assert n == self.num_successful_transfers
total_mb = mb.sum()
avg_mb = total_mb / n
total_time_seconds = xfer_time.sum()
throughput_mb_s = total_mb / total_time_seconds
# TODO: reduce stats to a single value, calculate latency/throughput
return {
"Num successful transfers": n,
"Avg xfer time (ms)": round(xfer_time.mean() * 1e3, 3),
"P90 xfer time (ms)": round(np.percentile(xfer_time, 90) * 1e3, 3),
"Avg post time (ms)": round(post_time.mean() * 1e3, 3),
"P90 post time (ms)": round(np.percentile(post_time, 90) * 1e3, 3),
"Avg MB per transfer": round(avg_mb, 3),
"Throughput (MB/s)": round(throughput_mb_s, 3),
"Avg number of descriptors": round(descs.mean(), 1),
"num_successful_transfers": self.data["num_successful_transfers"]
}
@property
def num_successful_transfers(self) -> int:
return len(self.data["transfer_duration"])
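The richer side of this hunk aggregates per-transfer NIXL telemetry and reduces it to a compact dict for CLI logging. A hedged sketch of that reduce step over locally recorded numbers (the field names mirror the hunk; the sample values are made up):

```python
import numpy as np

# Sample telemetry, one entry per transfer: durations in seconds, sizes in bytes.
data = {
    "transfer_duration": [0.012, 0.030, 0.021],
    "post_duration": [0.001, 0.002, 0.001],
    "bytes_transferred": [8 << 20, 16 << 20, 12 << 20],
    "num_descriptors": [64, 128, 96],
}

xfer_time = np.asarray(data["transfer_duration"])
post_time = np.asarray(data["post_duration"])
mb = np.asarray(data["bytes_transferred"]) / 2**20   # bytes -> MB
descs = np.asarray(data["num_descriptors"])

summary = {
    "Num successful transfers": len(descs),
    "Avg xfer time (ms)": round(xfer_time.mean() * 1e3, 3),
    "P90 xfer time (ms)": round(np.percentile(xfer_time, 90) * 1e3, 3),
    "Avg post time (ms)": round(post_time.mean() * 1e3, 3),
    "Avg MB per transfer": round(mb.sum() / len(descs), 3),
    "Throughput (MB/s)": round(mb.sum() / xfer_time.sum(), 3),
    "Avg number of descriptors": round(descs.mean(), 1),
}
print(summary)
```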

View File

@ -376,7 +376,7 @@ class EngineArgs:
quantization: Optional[QuantizationMethods] = ModelConfig.quantization
enforce_eager: bool = ModelConfig.enforce_eager
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
limit_mm_per_prompt: dict[str, Union[int, dict[str, int]]] = \
limit_mm_per_prompt: dict[str, int] = \
get_field(MultiModalConfig, "limit_per_prompt")
interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
media_io_kwargs: dict[str, dict[str,
@ -1131,10 +1131,6 @@ class EngineArgs:
device_config = DeviceConfig(
device=cast(Device, current_platform.device_type))
model_config = self.create_model_config()
self.model = model_config.model
self.tokenizer = model_config.tokenizer
(self.model, self.tokenizer,
self.speculative_config) = maybe_override_with_speculators(
model=self.model,
@ -1143,6 +1139,7 @@ class EngineArgs:
trust_remote_code=self.trust_remote_code,
vllm_speculative_config=self.speculative_config,
)
model_config = self.create_model_config()
# * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
# and fall back to V0 for experimental or unsupported features.

View File

@ -19,7 +19,6 @@ from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.tasks import SupportedTask
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import Device, collect_from_async_generator, random_uuid
from vllm.v1.engine import EngineCoreRequest
logger = init_logger(__name__)
@ -50,16 +49,12 @@ class EngineClient(ABC):
@abstractmethod
def generate(
self,
prompt: Union[EngineCoreRequest, PromptType],
prompt: PromptType,
sampling_params: SamplingParams,
request_id: str,
*,
prompt_text: Optional[str] = None,
lora_request: Optional[LoRARequest] = None,
tokenization_kwargs: Optional[dict[str, Any]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
) -> AsyncGenerator[RequestOutput, None]:
"""Generate outputs for a request."""
...
@ -99,15 +94,10 @@ class EngineClient(ABC):
# this happens again in generation, so the double expansion causes
# a mismatch.
# TODO - would be ideal to handle this more gracefully.
if isinstance(prompt, str):
prompt_text = prompt
prompt_token_ids = []
multi_modal_data = None
else:
prompt_text = prompt.get("prompt")
prompt_token_ids = prompt.get("prompt_token_ids", [])
multi_modal_data = prompt.get("multi_modal_data")
prompt_token_ids = prompt.get("prompt_token_ids")
multi_modal_data = prompt.get("multi_modal_data")
prompt_text = processed_inputs.get("prompt")
mm_processor_kwargs = processed_inputs.get("mm_processor_kwargs")
tokenized_length = len(prompt_token_ids)

View File

@ -37,7 +37,6 @@ from vllm.entrypoints.utils import (_validate_truncation_size,
log_non_default_args)
from vllm.inputs import (DataPrompt, PromptType, SingletonPrompt, TextPrompt,
TokensPrompt)
from vllm.inputs.parse import get_prompt_components
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.quantization import QuantizationMethods
@ -50,13 +49,10 @@ from vllm.sampling_params import (BeamSearchParams, RequestOutputKind,
SamplingParams)
from vllm.tasks import PoolingTask
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
get_cached_tokenizer,
init_tokenizer_from_configs)
get_cached_tokenizer)
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter, Device, as_iter, is_list_of
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.llm_engine import LLMEngine
from vllm.v1.engine.processor import Processor
from vllm.v1.sample.logits_processor import LogitsProcessor
if TYPE_CHECKING:
@ -316,10 +312,6 @@ class LLM:
self.io_processor = get_io_processor(self.llm_engine.vllm_config,
io_processor_plugin)
@property
def model_config(self):
return self.llm_engine.model_config
def get_tokenizer(self) -> AnyTokenizer:
return self.llm_engine.get_tokenizer()
@ -332,16 +324,6 @@ class LLM:
else:
self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer)
def _get_processor(self) -> Processor:
if not hasattr(self, "_processor"):
vllm_config = self.llm_engine.vllm_config
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_configs(self.model_config)
self._processor = Processor(vllm_config, tokenizer)
return self._processor
def get_default_sampling_params(self) -> SamplingParams:
if self.default_sampling_params is None:
self.default_sampling_params = (
@ -1515,6 +1497,8 @@ class LLM:
tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
it = tqdm_func(it, desc="Adding requests")
model_config = self.llm_engine.model_config
for i, prompt in enumerate(it):
if isinstance(prompt, dict):
@ -1522,9 +1506,17 @@ class LLM:
prompt.get("multi_modal_data"),
prompt.get("multi_modal_uuids"))
param = params[i] if isinstance(params, Sequence) else params
tokenization_kwargs: dict[str, Any] = {}
_validate_truncation_size(model_config.max_model_len,
param.truncate_prompt_tokens,
tokenization_kwargs)
self._add_request(
prompt,
params[i] if isinstance(params, Sequence) else params,
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request[i] if isinstance(
lora_request, Sequence) else lora_request,
priority=priority[i] if priority else 0,
@ -1565,58 +1557,22 @@ class LLM:
raise ValueError(f"Multi-modal data for {modality} is None"
f" but UUID is not provided")
def _process_inputs(
self,
request_id: str,
engine_prompt: PromptType,
params: Union[SamplingParams, PoolingParams],
*,
lora_request: Optional[LoRARequest],
priority: int,
) -> tuple[EngineCoreRequest, dict[str, Any]]:
"""Use the Processor to process inputs for LLMEngine."""
tokenization_kwargs: dict[str, Any] = {}
_validate_truncation_size(self.model_config.max_model_len,
params.truncate_prompt_tokens,
tokenization_kwargs)
processor = self._get_processor()
engine_request = processor.process_inputs(
request_id,
engine_prompt,
params,
lora_request=lora_request,
tokenization_kwargs=tokenization_kwargs,
priority=priority,
)
return engine_request, tokenization_kwargs
def _add_request(
self,
prompt: PromptType,
params: Union[SamplingParams, PoolingParams],
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
priority: int = 0,
) -> None:
prompt_text, _, _ = get_prompt_components(prompt)
request_id = str(next(self.request_counter))
engine_request, tokenization_kwargs = self._process_inputs(
self.llm_engine.add_request(
request_id,
prompt,
params,
lora_request=lora_request,
priority=priority,
)
self.llm_engine.add_request(
request_id,
engine_request,
params,
lora_request=lora_request,
tokenization_kwargs=tokenization_kwargs,
priority=priority,
prompt_text=prompt_text,
)
def _run_engine(

View File

@ -274,8 +274,7 @@ class OpenAIServingChat(OpenAIServing):
generators: list[AsyncGenerator[RequestOutput, None]] = []
try:
for i, engine_prompt in enumerate(engine_prompts):
prompt_text, _, _ = (self._get_prompt_components(
request_prompts[i]))
sampling_params: Union[SamplingParams, BeamSearchParams]
if self.default_sampling_params is None:
self.default_sampling_params = {}
@ -286,7 +285,6 @@ class OpenAIServingChat(OpenAIServing):
input_length=len(engine_prompt["prompt_token_ids"]),
default_sampling_params=self.default_sampling_params)
sampling_params: Union[SamplingParams, BeamSearchParams]
if request.use_beam_search:
sampling_params = request.to_beam_search_params(
max_tokens, self.default_sampling_params)
@ -311,25 +309,13 @@ class OpenAIServingChat(OpenAIServing):
lora_request=lora_request,
)
else:
engine_request, tokenization_kwargs = (
await self._process_inputs(
request_id,
engine_prompt,
sampling_params,
lora_request=lora_request,
trace_headers=trace_headers,
priority=request.priority,
))
generator = self.engine_client.generate(
engine_request,
engine_prompt,
sampling_params,
request_id,
lora_request=lora_request,
trace_headers=trace_headers,
priority=request.priority,
prompt_text=prompt_text,
tokenization_kwargs=tokenization_kwargs,
)
generators.append(generator)
@ -691,13 +677,11 @@ class OpenAIServingChat(OpenAIServing):
if self.use_harmony:
harmony_parser = harmony_parsers[i]
prev_recipient = harmony_parser.current_recipient
delta_text = ""
for token_id in output.token_ids:
harmony_parser.process(token_id)
delta_text += (harmony_parser.last_content_delta
or "")
cur_channel = harmony_parser.current_channel
cur_recipient = harmony_parser.current_recipient
delta_text = harmony_parser.last_content_delta or ""
else:
delta_text = output.text
@ -1591,9 +1575,7 @@ class OpenAIServingChat(OpenAIServing):
sys_msg = get_system_message(
reasoning_effort=request.reasoning_effort,
browser_description=None,
python_description=None,
with_custom_tools=request.tools is not None
)
python_description=None)
messages.append(sys_msg)
# Add developer message.

View File

@ -9,6 +9,7 @@ from typing import Optional, Union, cast
import jinja2
from fastapi import Request
from typing_extensions import assert_never
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
@ -31,7 +32,8 @@ from vllm.entrypoints.openai.serving_engine import (OpenAIServing,
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig
from vllm.entrypoints.utils import get_max_tokens
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
from vllm.inputs.data import (EmbedsPrompt, TokensPrompt, is_embeds_prompt,
is_tokens_prompt)
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.outputs import RequestOutput
@ -155,16 +157,23 @@ class OpenAIServingCompletion(OpenAIServing):
generators: list[AsyncGenerator[RequestOutput, None]] = []
try:
for i, engine_prompt in enumerate(engine_prompts):
prompt_text, prompt_token_ids, prompt_embeds = (
self._get_prompt_components(engine_prompt))
input_length = None
if prompt_token_ids is not None:
input_length = len(prompt_token_ids)
elif prompt_embeds is not None:
input_length = len(prompt_embeds)
sampling_params: Union[SamplingParams, BeamSearchParams]
# Mypy does not infer that engine_prompt will have only one of
# "prompt_token_ids" or "prompt_embeds" defined, and both of
# these as Union[object, the expected type], where it infers
# object if engine_prompt is a subclass of one of the
# typeddicts that defines both keys. Worse, because of
# https://github.com/python/mypy/issues/8586, mypy does not
# infer the type of engine_prompt correctly because of the
# enumerate. So we need an unnecessary cast here.
engine_prompt = cast(Union[EmbedsPrompt, TokensPrompt],
engine_prompt)
if is_embeds_prompt(engine_prompt):
input_length = len(engine_prompt["prompt_embeds"])
elif is_tokens_prompt(engine_prompt):
input_length = len(engine_prompt["prompt_token_ids"])
else:
raise NotImplementedError
assert_never(engine_prompt)
if self.default_sampling_params is None:
self.default_sampling_params = {}
@ -176,7 +185,6 @@ class OpenAIServingCompletion(OpenAIServing):
default_sampling_params=self.default_sampling_params,
)
sampling_params: Union[SamplingParams, BeamSearchParams]
if request.use_beam_search:
sampling_params = request.to_beam_search_params(
max_tokens, self.default_sampling_params)
@ -212,25 +220,13 @@ class OpenAIServingCompletion(OpenAIServing):
lora_request=lora_request,
)
else:
engine_request, tokenization_kwargs = (
await self._process_inputs(
request_id_item,
engine_prompt,
sampling_params,
lora_request=lora_request,
trace_headers=trace_headers,
priority=request.priority,
))
generator = self.engine_client.generate(
engine_request,
engine_prompt,
sampling_params,
request_id_item,
lora_request=lora_request,
trace_headers=trace_headers,
priority=request.priority,
prompt_text=prompt_text,
tokenization_kwargs=tokenization_kwargs,
)
generators.append(generator)

View File

@ -15,11 +15,6 @@ from pydantic import BaseModel, ConfigDict, Field
from starlette.datastructures import Headers
from typing_extensions import TypeIs
from vllm.entrypoints.utils import _validate_truncation_size
from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.processor import Processor
if sys.version_info >= (3, 12):
from typing import TypedDict
else:
@ -68,7 +63,6 @@ from vllm.entrypoints.renderer import (BaseRenderer, CompletionRenderer,
# yapf: enable
from vllm.inputs.data import PromptType
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.parse import PromptComponents, get_prompt_components
from vllm.logger import init_logger
from vllm.logprobs import Logprob, PromptLogprobs
from vllm.lora.request import LoRARequest
@ -245,16 +239,6 @@ class OpenAIServing:
AsyncMicrobatchTokenizer] = {}
self.log_error_stack = log_error_stack
async def _get_processor(self) -> Processor:
if not hasattr(self, "_processor"):
vllm_config = await self.engine_client.get_vllm_config()
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_configs(self.model_config)
self._processor = Processor(vllm_config, tokenizer)
return self._processor
def _get_renderer(self, tokenizer: Optional[AnyTokenizer]) -> BaseRenderer:
"""
Get a Renderer instance with the provided tokenizer.
@ -866,34 +850,6 @@ class OpenAIServing:
return conversation, [request_prompt], [engine_prompt]
async def _process_inputs(
self,
request_id: str,
engine_prompt: PromptType,
params: Union[SamplingParams, PoolingParams],
*,
lora_request: Optional[LoRARequest],
trace_headers: Optional[Mapping[str, str]],
priority: int,
) -> tuple[EngineCoreRequest, dict[str, Any]]:
"""Use the Processor to process inputs for AsyncLLM."""
tokenization_kwargs: dict[str, Any] = {}
_validate_truncation_size(self.max_model_len,
params.truncate_prompt_tokens,
tokenization_kwargs)
processor = await self._get_processor()
engine_request = processor.process_inputs(
request_id,
engine_prompt,
params,
lora_request=lora_request,
tokenization_kwargs=tokenization_kwargs,
trace_headers=trace_headers,
priority=priority,
)
return engine_request, tokenization_kwargs
async def _generate_with_builtin_tools(
self,
request_id: str,
@ -905,7 +861,6 @@ class OpenAIServing:
priority: int = 0,
**kwargs,
):
prompt_text, _, _ = self._get_prompt_components(request_prompt)
orig_priority = priority
while True:
self._log_inputs(
@ -914,27 +869,14 @@ class OpenAIServing:
params=sampling_params,
lora_request=lora_request,
)
trace_headers = kwargs.get("trace_headers")
engine_request, tokenization_kwargs = (await self._process_inputs(
request_id,
generator = self.engine_client.generate(
engine_prompt,
sampling_params,
lora_request=lora_request,
trace_headers=trace_headers,
priority=priority,
))
generator = self.engine_client.generate(
engine_request,
sampling_params,
request_id,
lora_request=lora_request,
priority=priority,
prompt_text=prompt_text,
tokenization_kwargs=tokenization_kwargs,
**kwargs,
)
async for res in generator:
context.append_output(res)
# NOTE(woosuk): The stop condition is handled by the engine.
@ -963,15 +905,6 @@ class OpenAIServing:
# OPTIMIZATION
priority = orig_priority - 1
def _get_prompt_components(
self,
prompt: Union[RequestPrompt, PromptType],
) -> PromptComponents:
if isinstance(prompt, list):
return PromptComponents(token_ids=prompt)
return get_prompt_components(prompt) # type: ignore[arg-type]
def _log_inputs(
self,
request_id: str,
@ -982,9 +915,14 @@ class OpenAIServing:
) -> None:
if self.request_logger is None:
return
prompt, prompt_token_ids, prompt_embeds = (
self._get_prompt_components(inputs))
prompt, prompt_token_ids, prompt_embeds = None, None, None
if isinstance(inputs, str):
prompt = inputs
elif isinstance(inputs, list):
prompt_token_ids = inputs
else:
prompt = getattr(inputs, 'prompt', None)
prompt_token_ids = getattr(inputs, 'prompt_token_ids', None)
self.request_logger.log_inputs(
request_id,

View File

@ -445,19 +445,6 @@ class OpenAIServingResponses(OpenAIServing):
return messages, [prompt_token_ids], [engine_prompt]
async def _initialize_tool_sessions(self, request: ResponsesRequest,
context: ConversationContext,
exit_stack: AsyncExitStack):
# we should only initialize the tool session if the request needs tools
if len(request.tools) == 0:
return
mcp_tools = {
tool.server_label: tool
for tool in request.tools if tool.type == "mcp"
}
await context.init_tool_sessions(self.tool_server, exit_stack,
request.request_id, mcp_tools)
async def responses_full_generator(
self,
request: ResponsesRequest,
@ -474,8 +461,12 @@ class OpenAIServingResponses(OpenAIServing):
async with AsyncExitStack() as exit_stack:
try:
await self._initialize_tool_sessions(request, context,
exit_stack)
mcp_tools = {
tool.server_label: tool
for tool in request.tools if tool.type == "mcp"
}
await context.init_tool_sessions(self.tool_server, exit_stack,
request.request_id, mcp_tools)
async for _ in result_generator:
pass
except asyncio.CancelledError:
@ -1659,10 +1650,12 @@ class OpenAIServingResponses(OpenAIServing):
async with AsyncExitStack() as exit_stack:
processer = None
if self.use_harmony:
# TODO: in streaming, we noticed this bug:
# https://github.com/vllm-project/vllm/issues/25697
await self._initialize_tool_sessions(request, context,
exit_stack)
mcp_tools = {
tool.server_label: tool
for tool in request.tools if tool.type == "mcp"
}
await context.init_tool_sessions(self.tool_server, exit_stack,
request.request_id, mcp_tools)
processer = self._process_harmony_streaming_events
else:
processer = self._process_simple_streaming_events

View File

@ -68,7 +68,6 @@ if TYPE_CHECKING:
VLLM_IMAGE_FETCH_TIMEOUT: int = 5
VLLM_VIDEO_FETCH_TIMEOUT: int = 30
VLLM_AUDIO_FETCH_TIMEOUT: int = 10
VLLM_MEDIA_URL_ALLOW_REDIRECTS: bool = True
VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8
VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
@ -619,9 +618,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
# All possible options loaded dynamically from _Backend enum
"VLLM_ATTENTION_BACKEND":
env_with_choices("VLLM_ATTENTION_BACKEND", None,
lambda: list(__import__(
'vllm.attention.backends.registry',
fromlist=['_Backend'])._Backend.__members__.keys())),
lambda: list(__import__('vllm.platforms.interface', \
fromlist=['_Backend'])._Backend.__members__.keys())),
# If set, vllm will use flashinfer sampler
"VLLM_USE_FLASHINFER_SAMPLER":
@ -735,11 +733,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_AUDIO_FETCH_TIMEOUT":
lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),
# Whether to allow HTTP redirects when fetching from media URLs.
# Default to True
"VLLM_MEDIA_URL_ALLOW_REDIRECTS":
lambda: bool(int(os.getenv("VLLM_MEDIA_URL_ALLOW_REDIRECTS", "1"))),
# Max number of workers for the thread pool handling
# media bytes loading. Set to 1 to disable parallel processing.
# Default is 8
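The `VLLM_ATTENTION_BACKEND` entry passes a lambda so the list of valid choices is resolved only when the variable is actually read, avoiding a heavy import at module load time. A hedged sketch of that pattern with a hypothetical `env_with_choices`-style helper (the real helper lives in vLLM's `envs.py`; this is not its exact signature, and the backend names are illustrative):

```python
import os
from typing import Callable, Optional, Union

def env_with_choices(
    name: str,
    default: Optional[str],
    choices: Union[list[str], Callable[[], list[str]]],
) -> Callable[[], Optional[str]]:
    """Return a reader that validates the env var against lazily resolved choices."""
    def read() -> Optional[str]:
        value = os.getenv(name, default)
        if value is None:
            return None
        valid = choices() if callable(choices) else choices
        if value not in valid:
            raise ValueError(f"{name}={value!r} is not one of {valid}")
        return value
    return read

# The callable defers resolving the choices until the first read.
read_backend = env_with_choices(
    "VLLM_ATTENTION_BACKEND",
    None,
    lambda: ["FLASH_ATTN", "FLASHINFER", "TORCH_SDPA"],  # illustrative choices
)
print(read_backend())
```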

View File

@ -205,6 +205,11 @@ class TokenInputs(TypedDict):
prompt_token_ids: list[int]
"""The token IDs of the prompt."""
prompt: NotRequired[str]
"""
The original prompt text corresponding to the token IDs, if available.
"""
cache_salt: NotRequired[str]
"""
Optional cache salt to be used for prefix caching.
@ -213,12 +218,15 @@ class TokenInputs(TypedDict):
def token_inputs(
prompt_token_ids: list[int],
prompt: Optional[str] = None,
cache_salt: Optional[str] = None,
) -> TokenInputs:
"""Construct [`TokenInputs`][vllm.inputs.data.TokenInputs] from optional
values."""
inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
if prompt is not None:
inputs["prompt"] = prompt
if cache_salt is not None:
inputs["cache_salt"] = cache_salt

View File

@ -1,8 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from typing import (TYPE_CHECKING, Literal, NamedTuple, Optional, TypedDict,
Union, cast, overload)
from typing import Literal, Optional, TypedDict, Union, cast, overload
from typing_extensions import TypeIs
@ -12,9 +11,6 @@ from .data import (EmbedsPrompt, ExplicitEncoderDecoderPrompt, ProcessorInputs,
PromptType, SingletonInputs, SingletonPrompt, TextPrompt,
TokensPrompt)
if TYPE_CHECKING:
import torch
class ParsedText(TypedDict):
content: str
@ -153,23 +149,3 @@ def split_enc_dec_inputs(
)
return None, inputs
class PromptComponents(NamedTuple):
text: Optional[str] = None
token_ids: Optional[list[int]] = None
embeds: Optional["torch.Tensor"] = None
def get_prompt_components(prompt: PromptType) -> PromptComponents:
if isinstance(prompt, str):
return PromptComponents(text=prompt)
if (encoder_prompt := prompt.get("encoder_prompt")):
return get_prompt_components(encoder_prompt) # type: ignore[arg-type]
return PromptComponents(
text=prompt.get("prompt"), # type: ignore[arg-type]
token_ids=prompt.get("prompt_token_ids"), # type: ignore[arg-type]
embeds=prompt.get("prompt_embeds"),
)
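The `get_prompt_components` helper in this hunk collapses the prompt flavors (plain string, tokens/embeds dicts, and encoder-decoder wrappers) into one NamedTuple. A hedged usage sketch of that shape, re-declared locally so it runs without vLLM installed; the untyped `prompt` argument stands in for the real `PromptType`:

```python
from typing import Any, NamedTuple, Optional

class PromptComponents(NamedTuple):
    text: Optional[str] = None
    token_ids: Optional[list[int]] = None
    embeds: Optional[Any] = None   # torch.Tensor in the real code

def get_prompt_components(prompt) -> PromptComponents:
    if isinstance(prompt, str):
        return PromptComponents(text=prompt)
    if (encoder_prompt := prompt.get("encoder_prompt")) is not None:
        # Encoder-decoder prompts delegate to their encoder part.
        return get_prompt_components(encoder_prompt)
    return PromptComponents(
        text=prompt.get("prompt"),
        token_ids=prompt.get("prompt_token_ids"),
        embeds=prompt.get("prompt_embeds"),
    )

assert get_prompt_components("hello").text == "hello"
assert get_prompt_components({"prompt_token_ids": [1, 2]}).token_ids == [1, 2]
assert get_prompt_components(
    {"encoder_prompt": {"prompt": "enc"}, "decoder_prompt": None}).text == "enc"
```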

View File

@ -16,10 +16,9 @@ from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.transformers_utils.tokenizer import AnyTokenizer
from .data import (DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt,
EncoderDecoderInputs, ExplicitEncoderDecoderPrompt,
ProcessorInputs, PromptType, SingletonInputs,
SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt,
embeds_inputs, token_inputs)
EncoderDecoderInputs, ProcessorInputs, PromptType,
SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs,
TokensPrompt, embeds_inputs, token_inputs)
from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
logger = init_logger(__name__)
@ -323,7 +322,7 @@ class InputPreprocessor:
mm_uuids=mm_uuids,
)
else:
inputs = token_inputs(prompt_token_ids)
inputs = token_inputs(prompt_token_ids=prompt_token_ids)
if cache_salt := parsed_content.get("cache_salt"):
inputs["cache_salt"] = cache_salt
@ -353,7 +352,10 @@ class InputPreprocessor:
prompt_text,
tokenization_kwargs=tokenization_kwargs,
)
inputs = token_inputs(prompt_token_ids)
inputs = token_inputs(
prompt=prompt_text,
prompt_token_ids=prompt_token_ids,
)
if cache_salt := parsed_content.get("cache_salt"):
inputs["cache_salt"] = cache_salt
@ -471,17 +473,22 @@ class InputPreprocessor:
decoder_inputs: SingletonInputs
if inputs["type"] == "multimodal": # Multimodal data inputs
if "encoder_prompt_token_ids" not in inputs:
if not ("encoder_prompt" in inputs
and "encoder_prompt_token_ids" in inputs):
raise RuntimeError("You should register an encoder-decoder "
"multi-modal processor for encoder-decoder "
"models.")
inputs = cast(MultiModalEncDecInputs, inputs)
encoder_inputs = token_inputs(inputs["encoder_prompt_token_ids"])
encoder_inputs = token_inputs(
prompt=inputs["encoder_prompt"],
prompt_token_ids=inputs["encoder_prompt_token_ids"],
)
decoder_prompt_inputs = decoder_inputs_to_override or inputs
decoder_inputs = MultiModalInputs(
type="multimodal",
prompt=decoder_prompt_inputs.get("prompt", ""),
prompt_token_ids=decoder_prompt_inputs["prompt_token_ids"],
mm_kwargs=inputs["mm_kwargs"],
mm_hashes=inputs["mm_hashes"],
@ -491,7 +498,7 @@ class InputPreprocessor:
decoder_inputs["cache_salt"] = cache_salt
elif inputs["type"] == "token": # Text-only inputs
encoder_inputs = token_inputs(prompt_token_ids=[])
encoder_inputs = token_inputs(prompt="", prompt_token_ids=[])
decoder_inputs = decoder_inputs_to_override or inputs
else:
assert_never(inputs) # type: ignore[arg-type]
@ -542,14 +549,12 @@ class InputPreprocessor:
decoder_inputs: Optional[SingletonInputs]
if is_explicit_encoder_decoder_prompt(prompt):
# `cast` is needed for mypy, but not pyright
prompt_ = cast(ExplicitEncoderDecoderPrompt, prompt)
encoder_inputs = self._prompt_to_llm_inputs(
prompt_["encoder_prompt"],
prompt["encoder_prompt"],
tokenization_kwargs=tokenization_kwargs,
mm_uuids=mm_uuids,
)
if (decoder_input := prompt_["decoder_prompt"]) is None:
if (decoder_input := prompt["decoder_prompt"]) is None:
decoder_inputs = None
else:
decoder_inputs = self._prompt_to_llm_inputs(decoder_input)
@ -560,9 +565,8 @@ class InputPreprocessor:
self._split_enc_dec_mm_inputs(encoder_inputs,
decoder_inputs))
else:
# `cast` is needed for mypy, but not pyright
inputs = self._prompt_to_llm_inputs(
cast(SingletonPrompt, prompt),
prompt,
tokenization_kwargs=tokenization_kwargs,
mm_uuids=mm_uuids,
)
@ -637,9 +641,8 @@ class InputPreprocessor:
"to decoder-only models")
# Decoder-only operation
# `cast` is needed for mypy, but not pyright
return self._process_decoder_only_prompt(
cast(SingletonPrompt, prompt),
prompt,
tokenization_kwargs=tokenization_kwargs,
mm_uuids=mm_uuids,
)

View File

@ -355,7 +355,7 @@ def batched_triton_kernel(
def invoke_moe_batched_triton_kernel(
A: torch.Tensor, # [E, max_tokens, K]
B: torch.Tensor, # [E, N, K]
B: torch.Tensor, # [E, K, N]
C: torch.Tensor, # [E, max_tokens, N]
expert_num_tokens: torch.Tensor, # [E]
compute_type: tl.dtype,

View File

@ -4,6 +4,9 @@
import functools
import json
import os
# torch.compile needs typing.List. It will fail torch.library.infer_schema
# otherwise
from typing import List # noqa: UP035
from typing import Any, Callable, Optional, Union
import torch
@ -1226,7 +1229,7 @@ def inplace_fused_experts(
w2_zp: Optional[torch.Tensor] = None,
a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None,
block_shape: Optional[list[int]] = None,
block_shape: Optional[List[int]] = None, #noqa: UP006
w1_bias: Optional[torch.Tensor] = None,
w2_bias: Optional[torch.Tensor] = None,
) -> None:
@ -1260,7 +1263,7 @@ def inplace_fused_experts_fake(
w2_zp: Optional[torch.Tensor] = None,
a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None,
block_shape: Optional[list[int]] = None,
block_shape: Optional[List[int]] = None, #noqa: UP006
w1_bias: Optional[torch.Tensor] = None,
w2_bias: Optional[torch.Tensor] = None,
) -> None:
@ -1299,7 +1302,7 @@ def outplace_fused_experts(
w2_zp: Optional[torch.Tensor] = None,
a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None,
block_shape: Optional[list[int]] = None,
block_shape: Optional[List[int]] = None, #noqa: UP006
w1_bias: Optional[torch.Tensor] = None,
w2_bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:

View File

@ -960,7 +960,6 @@ class FusedMoE(CustomOp):
is_sequence_parallel=False,
zero_expert_num: Optional[int] = 0,
zero_expert_type: Optional[str] = None,
expert_mapping: Optional[list[tuple[str, str, int, str]]] = None,
):
super().__init__()
if params_dtype is None:
@ -997,9 +996,6 @@ class FusedMoE(CustomOp):
self.zero_expert_num = zero_expert_num
self.zero_expert_type = zero_expert_type
# Expert mapping used in self.load_weights
self.expert_mapping = expert_mapping
# Round up hidden size if needed.
hidden_size = maybe_roundup_hidden_size(hidden_size, moe_in_dtype,
quant_config,
@ -1621,33 +1617,6 @@ class FusedMoE(CustomOp):
return False if return_success else None
def load_weights(
self, weights: Iterable[tuple[str,
torch.Tensor]]) -> Iterable[str]:
if (expert_mapping := self.expert_mapping) is None:
raise ValueError("`self.expert_mapping` must be provided to "
"load weights using `self.load_weights`.")
for expert_name, loaded_weight in weights:
qual_name = f"{self.layer_name}.{expert_name}"
for param_name, weight_name, expert_id, shard_id in expert_mapping:
if weight_name not in qual_name:
continue
weight_name = qual_name.replace(weight_name, param_name)
param_name = weight_name.removeprefix(f"{self.layer_name}.")
param = getattr(self, param_name)
success = self.weight_loader(
param=param,
loaded_weight=loaded_weight,
weight_name=weight_name,
shard_id=shard_id,
expert_id=expert_id,
return_success=True,
)
if success:
logger.debug("Loaded %s for expert %d into %s", param_name,
expert_id, self.layer_name)
yield param_name
def get_expert_weights(self) -> Iterable[torch.Tensor]:
weights = list(self.named_parameters())
assert all(weight.is_contiguous() for _, weight in weights)
@ -1930,15 +1899,6 @@ class FusedMoE(CustomOp):
staged_hidden_states.copy_(hidden_states, non_blocking=True)
staged_router_logits.copy_(router_logits, non_blocking=True)
# If there are shared experts but we are not using a modular kernel,
# the shared experts must be called here
if (not isinstance(self.quant_method.fused_experts,
FusedMoEModularKernel)
and self.shared_experts is not None):
shared_output = self.shared_experts(staged_hidden_states)
else:
shared_output = None
# Matrix multiply.
final_hidden_states = self.quant_method.apply(
layer=self,
@ -1962,13 +1922,8 @@ class FusedMoE(CustomOp):
logical_replica_count=self.logical_replica_count,
)
if shared_output is not None:
assert not isinstance(final_hidden_states, tuple)
assert self.shared_experts is not None
final_hidden_states = (
shared_output,
final_hidden_states,
)
assert self.shared_experts is None or isinstance(
final_hidden_states, tuple)
if self.zero_expert_num is not None and self.zero_expert_num > 0:
assert isinstance(final_hidden_states, tuple)
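The `load_weights` method in this hunk routes checkpoint expert tensors through an `expert_mapping` of `(param_name, weight_name, expert_id, shard_id)` tuples. A hedged sketch of just the name-matching part, with illustrative mapping entries and string stand-ins for tensors (the real loader calls `self.weight_loader` on actual parameters):

```python
# Illustrative mapping entries; real ones come from the model's weight mapping.
expert_mapping = [
    ("w13_weight", "experts.0.gate_proj.weight", 0, "w1"),
    ("w13_weight", "experts.0.up_proj.weight", 0, "w3"),
    ("w2_weight", "experts.0.down_proj.weight", 0, "w2"),
]

layer_name = "model.layers.0.mlp"
weights = {
    f"{layer_name}.experts.0.up_proj.weight": "tensor-up",
    f"{layer_name}.experts.0.down_proj.weight": "tensor-down",
}

loaded = []
for qual_name, tensor in weights.items():
    for param_name, weight_name, expert_id, shard_id in expert_mapping:
        if weight_name not in qual_name:
            continue
        # Target parameter name, relative to the layer (as in the hunk).
        target = qual_name.replace(weight_name, param_name)
        target = target.removeprefix(f"{layer_name}.")
        loaded.append((target, expert_id, shard_id, tensor))

print(loaded)
# [('w13_weight', 0, 'w3', 'tensor-up'), ('w2_weight', 0, 'w2', 'tensor-down')]
```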

View File

@ -115,7 +115,7 @@ class ShortConv(MambaBase, CustomOp):
self_kv_cache = self.kv_cache[forward_context.virtual_engine]
conv_state = self_kv_cache[0].transpose(-1, -2)
state_indices_tensor = attn_metadata.state_indices_tensor
has_initial_states_p = attn_metadata.has_initial_states_p
has_initial_states_p = attn_metadata.has_initial_states
BCx, _ = self.in_proj(hidden_states)

View File

@ -644,14 +644,6 @@ class CompressedTensorsConfig(QuantizationConfig):
# If no matches, return None
return None
def has_blocked_weights(self) -> bool:
for scheme in self.target_scheme_map.values():
weight_quant = scheme.get("weights")
if (weight_quant is not None
and weight_quant.strategy == QuantizationStrategy.BLOCK):
return True
return False
@staticmethod
def supports_cutlass_24(
weight_quant: Optional[QuantizationArgs],

View File

@ -11,7 +11,7 @@ from torch.nn import Parameter
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsScheme)
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
W8A8BlockFp8LinearOp, check_aiter_fp8_linear_support,
apply_fp8_block_linear, check_aiter_fp8_linear_support,
create_fp8_input_scale, create_fp8_scale_parameter,
create_fp8_weight_parameter, maybe_post_process_fp8_weight_block,
process_fp8_weight_block_strategy, process_fp8_weight_channel_strategy,
@ -41,30 +41,16 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
self.strategy = weight_quant.strategy
self.out_dtype = torch.get_default_dtype()
self.is_static_input_scheme = is_static_input_scheme
self.act_q_group_shape = GroupShape.PER_TENSOR \
if is_static_input_scheme else GroupShape.PER_TOKEN
self.fp8_linear = Fp8LinearOp(
act_quant_static=self.is_static_input_scheme,
act_quant_group_shape=self.act_q_group_shape)
self.weight_block_size = self.weight_quant.block_structure
if self.weight_block_size is not None:
self.act_q_group_shape = GroupShape(1, self.weight_block_size[0])
else:
self.act_q_group_shape = GroupShape.PER_TENSOR \
if is_static_input_scheme else GroupShape.PER_TOKEN
self.cutlass_block_fp8_supported = cutlass_block_fp8_supported()
self.use_aiter_and_is_supported = check_aiter_fp8_linear_support()
if self.weight_block_size is not None:
assert not self.is_static_input_scheme
self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
weight_group_shape=GroupShape(*self.weight_block_size),
act_quant_group_shape=self.act_q_group_shape,
cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
use_aiter_and_is_supported=self.use_aiter_and_is_supported,
)
else:
self.fp8_linear = Fp8LinearOp(
act_quant_static=self.is_static_input_scheme,
act_quant_group_shape=self.act_q_group_shape)
@classmethod
def get_min_capability(cls) -> int:
# lovelace and up
@ -156,14 +142,13 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
if self.weight_block_size is not None:
return self.w8a8_block_fp8_linear.apply(
if layer.weight_block_size is not None:
return apply_fp8_block_linear(
layer,
input=x,
weight=layer.weight,
weight_scale=layer.weight_scale,
input_scale=layer.input_scale,
bias=bias,
)
cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
use_aiter_and_is_supported=self.use_aiter_and_is_supported)
return self.fp8_linear.apply(input=x,
weight=layer.weight,
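Both versions of `Fp8LinearMethod.__init__` (and the compressed-tensors scheme above) pick the activation-quantization group shape from the weight layout: blocked weights quantize activations in `(1, block)` groups, otherwise per-token when quantization is dynamic and CUTLASS is available, else per-tensor. A hedged sketch of that decision with a stand-in `GroupShape` (the real sentinels live in vLLM's quantization utils):

```python
from typing import NamedTuple, Optional

class GroupShape(NamedTuple):
    rows: int
    cols: int

PER_TENSOR = GroupShape(-1, -1)   # stand-in sentinels
PER_TOKEN = GroupShape(1, -1)

def pick_act_group_shape(
    weight_block_size: Optional[list[int]],
    act_q_static: bool,
    cutlass_fp8_supported: bool,
) -> GroupShape:
    if weight_block_size is not None:
        # Match the weight block granularity (same index as in the hunk).
        return GroupShape(1, weight_block_size[0])
    if not act_q_static and cutlass_fp8_supported:
        return PER_TOKEN
    return PER_TENSOR

assert pick_act_group_shape([128, 128], False, True) == GroupShape(1, 128)
assert pick_act_group_shape(None, False, True) == PER_TOKEN
assert pick_act_group_shape(None, True, True) == PER_TENSOR
```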

View File

@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights,
select_cutlass_fp8_gemm_impl, swap_w13_to_w31)
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
W8A8BlockFp8LinearOp, check_aiter_fp8_linear_support,
apply_fp8_block_linear, check_aiter_fp8_linear_support,
create_fp8_input_scale, create_fp8_scale_parameter,
create_fp8_weight_parameter, expert_weight_is_col_major,
maybe_post_process_fp8_weight_block, process_fp8_weight_block_strategy,
@ -242,28 +242,15 @@ class Fp8LinearMethod(LinearMethodBase):
self.weight_block_size = self.quant_config.weight_block_size
self.block_quant = self.weight_block_size is not None
self.act_q_static = self.quant_config.activation_scheme == "static"
if self.weight_block_size:
self.act_q_group_shape = GroupShape(1, self.weight_block_size[0])
# Use per-token quantization for better perf if dynamic and cutlass
if not self.act_q_static and cutlass_fp8_supported():
self.act_q_group_shape = GroupShape.PER_TOKEN
else:
# Use per-token quantization for better perf if dynamic and cutlass
if not self.act_q_static and cutlass_fp8_supported():
self.act_q_group_shape = GroupShape.PER_TOKEN
else:
self.act_q_group_shape = GroupShape.PER_TENSOR
self.act_q_group_shape = GroupShape.PER_TENSOR
if self.block_quant:
assert not self.act_q_static
assert self.weight_block_size is not None
self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
weight_group_shape=GroupShape(*self.weight_block_size),
act_quant_group_shape=self.act_q_group_shape,
cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
use_aiter_and_is_supported=self.use_aiter_and_is_supported,
)
else:
self.fp8_linear = Fp8LinearOp(
act_quant_static=self.act_q_static,
act_quant_group_shape=self.act_q_group_shape)
self.fp8_linear = Fp8LinearOp(
act_quant_static=self.act_q_static,
act_quant_group_shape=self.act_q_group_shape)
def create_weights(
self,
@ -412,15 +399,12 @@ class Fp8LinearMethod(LinearMethodBase):
bias=bias)
if self.block_quant:
assert self.weight_block_size is not None
return self.w8a8_block_fp8_linear.apply(
return apply_fp8_block_linear(
layer,
input=x,
weight=layer.weight,
weight_scale=layer.weight_scale,
input_scale=layer.input_scale,
bias=bias,
)
cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
use_aiter_and_is_supported=self.use_aiter_and_is_supported)
return self.fp8_linear.apply(input=x,
weight=layer.weight,

Some files were not shown because too many files have changed in this diff.