Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)

Compare commits: v0.11.0rc3...fix_hang (30 commits)
Commits (SHA1 only):

562107efb1, a5354b3ed2, f9df8b4ad7, ec152c8748, 7977e5027c, 3f5d902d2a,
27d7638b94, 176173989a, 23b8ee672d, 3939152069, cd87bfbf37, b3613e3ace,
d346ec695e, c242c98031, f1d53d150c, 92da847cf5, 3958b96bf5, 8bf8f45822,
6f5c0931c1, 4e33a7ea85, dc48ba0c75, 2a548e8ef1, 4778b42660, c70ac4b8ff,
a0256414e9, cf89202855, f075693da7, f708bd4904, 0002b7f0d1, 11aafd9886
@@ -76,7 +76,7 @@ steps:
  queue: arm64_cpu_queue_postmerge
  commands:
  - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
  - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

  # Add job to create multi-arch manifest
@@ -44,7 +44,6 @@ docker run \
  pytest -v -s v1/structured_output
  pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
  pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
+ pytest -v -s v1/test_metrics
  pytest -v -s v1/test_serial_utils.py
- pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_metrics_reader.py
  '
@@ -159,10 +159,7 @@ steps:
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
- - tests/v1/test_async_llm_dp.py
+ - tests/v1/distributed
- - tests/v1/test_external_lb_dp.py
- - tests/v1/test_internal_lb_dp.py
- - tests/v1/test_hybrid_lb_dp.py
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
@@ -180,10 +177,10 @@ steps:
  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
@@ -300,12 +297,9 @@ steps:
  - pytest -v -s v1/spec_decode
  - pytest -v -s v1/kv_connector/unit
  - pytest -v -s v1/metrics
- - pytest -v -s v1/test_kv_sharing.py
- - pytest -v -s v1/test_metrics_reader.py
  - pytest -v -s v1/test_oracle.py
  - pytest -v -s v1/test_request.py
  - pytest -v -s v1/test_serial_utils.py
- - pytest -v -s v1/test_utils.py
  # Integration test for streaming correctness (requires special branch).
  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -465,29 +459,18 @@ steps:
  commands:
  - pytest -v -s kernels/mamba

- - label: Tensorizer Test # 14min
+ - label: Model Executor Test # 23min
- timeout_in_minutes: 25
+ timeout_in_minutes: 35
- mirror_hardwares: [amdexperimental]
- source_file_dependencies:
- - vllm/model_executor/model_loader
- - tests/tensorizer_loader
- - tests/entrypoints/openai/test_tensorizer_entrypoint.py
- commands:
- - apt-get update && apt-get install -y curl libsodium23
- - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -v -s tensorizer_loader
- - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
-
- - label: Model Executor Test # 7min
- timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor
  - tests/model_executor
+ - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
  - apt-get update && apt-get install -y curl libsodium23
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s model_executor
+ - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

  - label: Benchmarks # 11min
  timeout_in_minutes: 20
@@ -522,7 +505,7 @@ steps:
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
- - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+ - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/

  - label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
@@ -830,6 +813,23 @@ steps:
  - uv pip install --system 'gpt-oss[eval]==0.0.5'
  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'

+ - label: Blackwell Quantized MoE Test
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - tests/quantization/test_blackwell_moe.py
+ - vllm/model_executor/models/deepseek_v2.py
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/models/llama4.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization/compressed_tensors
+ - vllm/model_executor/layers/quantization/modelopt.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - pytest -s -v tests/quantization/test_blackwell_moe.py
+
  ##### 1 GPU test #####
  ##### multi gpus test #####

@@ -889,14 +889,13 @@ steps:
  - tests/compile/test_wrapper.py
  - tests/distributed/
  - tests/entrypoints/llm/test_collective_rpc.py
- - tests/v1/test_async_llm_dp.py
+ - tests/v1/distributed
- - tests/v1/test_external_lb_dp.py
  - tests/v1/entrypoints/openai/test_multi_api_servers.py
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
- - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py

.github/CODEOWNERS (vendored, 10 lines changed)
@@ -12,8 +12,6 @@
  /vllm/model_executor/layers/mamba @tdoublep
  /vllm/model_executor/model_loader @22quinn
  /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
- /vllm/v1/attention @LucasWilkinson
- /vllm/v1/sample @22quinn @houseroad
  /vllm/vllm_flash_attn @LucasWilkinson
  /vllm/lora @jeejeelee
  /vllm/reasoning @aarnphm @chaunceyjiang
@@ -28,11 +26,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson

  # vLLM V1
  /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
- /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
+ /vllm/v1/attention @LucasWilkinson
- /vllm/v1/spec_decode @benchislett @luccafong
  /vllm/v1/attention/backends/flashinfer.py @mgoin
  /vllm/v1/attention/backends/triton_attn.py @tdoublep
  /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+ /vllm/v1/sample @22quinn @houseroad @njhill
+ /vllm/v1/spec_decode @benchislett @luccafong
+ /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
  /vllm/v1/kv_cache_interface.py @heheda12345
  /vllm/v1/offloading @ApostaC

@@ -54,7 +54,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
  /tests/weight_loading @mgoin @youkaichao @yewentao256
  /tests/lora @jeejeelee
  /tests/models/language/generation/test_hybrid.py @tdoublep
  /tests/v1/kv_connector/nixl_integration @NickLucche
  /tests/v1/kv_connector @ApostaC
  /tests/v1/offloading @ApostaC

.github/mergify.yml (vendored, 2 lines changed)
@@ -274,7 +274,7 @@ pull_request_rules:
  - files~=^vllm/model_executor/model_loader/tensorizer.py
  - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
  - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- - files~=^tests/tensorizer_loader/
+ - files~=^tests/model_executor/model_loader/tensorizer_loader/
  actions:
  assign:
  users:
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio

  *Latest News* 🔥

+ - [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
  - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
  - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
  - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).

csrc/core/batch_invariant.hpp (new file, 16 lines)
@@ -0,0 +1,16 @@
+ #pragma once
+ #include <cstdlib>
+ #include <string>
+ #include <cctype>
+
+ namespace vllm {
+
+ // vllm_kernel_override_batch_invariant(); returns true
+ // if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
+ inline bool vllm_kernel_override_batch_invariant() {
+   std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
+   const char* val = std::getenv(env_key.c_str());
+   return (val && std::atoi(val) != 0) ? 1 : 0;
+ }
+
+ } // namespace vllm
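Note: the following is a hedged usage sketch, not part of the diff above. It only reuses the environment variable name introduced by this header; the serving code around it is illustrative and the model name is a placeholder.

```python
import os

# The C++ helper reads this with std::getenv(), so set it before the engine
# (and its CUDA extensions) start.
os.environ["VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT"] = "1"

from vllm import LLM, SamplingParams  # imported after the variable is set

llm = LLM(model="facebook/opt-125m")  # placeholder model, for illustration only
out = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(out[0].outputs[0].text)
```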
@@ -1,6 +1,7 @@
  #include "type_convert.cuh"
  #include "dispatch_utils.h"
  #include "cub_helpers.h"
+ #include "core/batch_invariant.hpp"

  #include <torch/cuda.h>
  #include <c10/cuda/CUDAGuard.h>
@@ -413,7 +414,9 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
  wt_ptr % req_alignment_bytes == 0;
  bool offsets_are_multiple_of_vector_width =
  hidden_size % vector_width == 0 && input_stride % vector_width == 0;
- if (ptrs_are_aligned && offsets_are_multiple_of_vector_width) {
+ bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
+ if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
+     !batch_invariant_launch) {
  LAUNCH_FUSED_ADD_RMS_NORM(8);
  } else {
  LAUNCH_FUSED_ADD_RMS_NORM(0);
@@ -459,7 +462,8 @@ void poly_norm(torch::Tensor& out, // [..., hidden_size]
  auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
  auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
  bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
- if (ptrs_are_aligned && hidden_size % 8 == 0) {
+ bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
+ if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
  LAUNCH_FUSED_POLY_NORM(8);
  } else {
  LAUNCH_FUSED_POLY_NORM(0);
@@ -9,6 +9,7 @@
  #include "quantization/fp8/common.cuh"
  #include "dispatch_utils.h"
  #include "cub_helpers.h"
+ #include "core/batch_invariant.hpp"

  #include <torch/cuda.h>
  #include <c10/cuda/CUDAGuard.h>
@@ -240,7 +241,9 @@ void fused_add_rms_norm_static_fp8_quant(
  auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
  bool ptrs_are_aligned =
  inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
- if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0) {
+ bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
+ if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
+     !batch_invariant_launch) {
  LAUNCH_FUSED_ADD_RMS_NORM(8);
  } else {
  LAUNCH_FUSED_ADD_RMS_NORM(0);
@@ -21,6 +21,7 @@
  #include <c10/cuda/CUDAGuard.h>
  #include "../cuda_compat.h"
  #include "../cub_helpers.h"
+ #include "../core/batch_invariant.hpp"

  #define MAX(a, b) ((a) > (b) ? (a) : (b))
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -405,7 +406,8 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
  using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
  static constexpr int VPT = Constants::VPT;
  static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
- const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
+ const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
+ const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
  const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;

  dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
@@ -391,19 +391,32 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
  git clone --depth 1 --recursive --shallow-submodules \
  --branch ${FLASHINFER_GIT_REF} \
  ${FLASHINFER_GIT_REPO} flashinfer
+ # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
+ # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
+ if [[ "${CUDA_VERSION}" == 11.* ]]; then
+ FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
+ elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
+ FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
+ else
+ # CUDA 12.8+ supports 10.0a and 12.0
+ FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
+ fi
  pushd flashinfer
- if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
+ if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ]; then
- # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
+ # NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
- # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
+ echo "🏗️ Installing FlashInfer from pre-compiled wheel"
- if [[ "${CUDA_VERSION}" == 11.* ]]; then
+ uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
- FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
+ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
- elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
+ if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
- FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
+ # Download pre-compiled cubins
- else
+ TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
- # CUDA 12.8+ supports 10.0a and 12.0
+ python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
- FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
  fi
+ elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
  echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
+ export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
+ # HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
+ uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
  # Build AOT kernels
  TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
  python3 -m flashinfer.aot
@@ -533,7 +546,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
  else \
  BITSANDBYTES_VERSION="0.46.1"; \
  fi; \
- uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' boto3 runai-model-streamer runai-model-streamer[s3]
+ uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3]>=0.14.0'

  ENV VLLM_USAGE_SOURCE production-docker-image

@@ -2,6 +2,7 @@

  We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:

+ - [vLLM Toronto Meetup](https://luma.com/e80e0ymm), September 25th 2025. [[Slides]](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing)
  - [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ), August 30th 2025. [[Slides]](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA)
  - [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing)
  - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH)
@@ -66,35 +66,12 @@ Further update the model as follows:
  !!! important
      The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request.

- - Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
+ !!! note
+     By default, vLLM merges the multimodal embeddings into text embeddings depending on the information of their locations defined in
+     [PlaceholderRange][vllm.multimodal.inputs.PlaceholderRange] from input processing.
+     This logic can be found at [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings].

- ??? code
+     You may override this method if additional logic is required for your model when merging embeddings.
-
-     ```python
-     from .utils import merge_multimodal_embeddings
-
-     class YourModelForImage2Seq(nn.Module):
-         ...
-
-         def get_input_embeddings(
-             self,
-             input_ids: torch.Tensor,
-             multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-         ) -> torch.Tensor:
-
-             # `get_input_embeddings` should already be implemented for the language
-             # model as one of the requirements of basic vLLM model implementation.
-             inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-
-             if multimodal_embeddings is not None:
-                 inputs_embeds = merge_multimodal_embeddings(
-                     input_ids=input_ids,
-                     inputs_embeds=inputs_embeds,
-                     multimodal_embeddings=multimodal_embeddings,
-                     placeholder_token_id=self.config.image_token_index)
-
-             return inputs_embeds
-     ```

  - Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.

@@ -6,6 +6,10 @@ This page teaches you how to pass multi-modal inputs to [multi-modal models][sup
  We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes,
  and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.

+ !!! tip
+     When serving multi-modal models, consider setting `--allowed-media-domains` to restrict domain that vLLM can access to prevent it from accessing arbitrary endpoints that can potentially be vulnerable to Server-Side Request Forgery (SSRF) attacks. You can provide a list of domains for this arg. For example: `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`
+     This restriction is especially important if you run vLLM in a containerized environment where the vLLM pods may have unrestricted access to internal networks.
+
  ## Offline Inference

  To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
@@ -60,6 +60,12 @@ Key points from the PyTorch security guide:
  - Implement proper authentication and authorization for management interfaces
  - Follow the principle of least privilege for all system components

+ ### 4. **Restrict Domains Access for Media URLs:**
+
+ Restrict domains that vLLM can access for media URLs by setting
+ `--allowed-media-domains` to prevent Server-Side Request Forgery (SSRF) attacks.
+ (e.g. `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`)
+
  ## Security and Firewalls: Protecting Exposed vLLM Systems

  While vLLM is designed to allow unsafe network services to be isolated to
@@ -54,6 +54,7 @@ def parse_args():
  "--method",
  type=str,
  default="eagle",
+ choices=["ngram", "eagle", "eagle3", "mtp"],
  )
  parser.add_argument("--num-spec-tokens", type=int, default=2)
  parser.add_argument("--prompt-lookup-max", type=int, default=5)
@@ -118,9 +119,9 @@ def main(args):
  "prompt_lookup_max": args.prompt_lookup_max,
  "prompt_lookup_min": args.prompt_lookup_min,
  }
- elif args.method.endswith("mtp"):
+ elif args.method == "mtp":
  speculative_config = {
- "method": args.method,
+ "method": "mtp",
  "num_speculative_tokens": args.num_spec_tokens,
  }
  else:
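Note: a hedged sketch of how the normalized "mtp" method above is typically consumed, not part of the diff. The speculative_config dict mirrors the one built in this example; the model name is a placeholder and argument names may differ between vLLM versions.

```python
from vllm import LLM

# Mirrors the speculative_config constructed in the example above.
llm = LLM(
    model="your-org/your-mtp-capable-model",  # placeholder; must ship MTP weights
    speculative_config={
        "method": "mtp",
        "num_speculative_tokens": 2,  # matches the example's --num-spec-tokens default
    },
)
```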
@@ -38,11 +38,13 @@ client = OpenAI(
  base_url=openai_api_base,
  )

+ headers = {"User-Agent": "vLLM Example Client"}
+

  def encode_base64_content_from_url(content_url: str) -> str:
  """Encode a content retrieved from a remote url to base64 format."""

- with requests.get(content_url) as response:
+ with requests.get(content_url, headers=headers) as response:
  response.raise_for_status()
  result = base64.b64encode(response.content).decode("utf-8")

@@ -50,19 +52,19 @@ def encode_base64_content_from_url(content_url: str) -> str:


  # Text-only inference
- def run_text_only(model: str) -> None:
+ def run_text_only(model: str, max_completion_tokens: int) -> None:
  chat_completion = client.chat.completions.create(
  messages=[{"role": "user", "content": "What's the capital of France?"}],
  model=model,
- max_completion_tokens=64,
+ max_completion_tokens=max_completion_tokens,
  )

  result = chat_completion.choices[0].message.content
- print("Chat completion output:", result)
+ print("Chat completion output:\n", result)


  # Single-image input inference
- def run_single_image(model: str) -> None:
+ def run_single_image(model: str, max_completion_tokens: int) -> None:
  ## Use image url in the payload
  image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
  chat_completion_from_url = client.chat.completions.create(
@@ -79,11 +81,11 @@ def run_single_image(model: str) -> None:
  }
  ],
  model=model,
- max_completion_tokens=64,
+ max_completion_tokens=max_completion_tokens,
  )

  result = chat_completion_from_url.choices[0].message.content
- print("Chat completion output from image url:", result)
+ print("Chat completion output from image url:\n", result)

  ## Use base64 encoded image in the payload
  image_base64 = encode_base64_content_from_url(image_url)
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
model=model,
|
model=model,
|
||||||
max_completion_tokens=64,
|
max_completion_tokens=max_completion_tokens,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = chat_completion_from_base64.choices[0].message.content
|
result = chat_completion_from_base64.choices[0].message.content
|
||||||
@ -109,7 +111,7 @@ def run_single_image(model: str) -> None:
|
|||||||
|
|
||||||
|
|
||||||
# Multi-image input inference
|
# Multi-image input inference
|
||||||
def run_multi_image(model: str) -> None:
|
def run_multi_image(model: str, max_completion_tokens: int) -> None:
|
||||||
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
|
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
|
||||||
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
|
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
|
||||||
chat_completion_from_url = client.chat.completions.create(
|
chat_completion_from_url = client.chat.completions.create(
|
||||||
@ -130,15 +132,15 @@ def run_multi_image(model: str) -> None:
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
model=model,
|
model=model,
|
||||||
max_completion_tokens=64,
|
max_completion_tokens=max_completion_tokens,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = chat_completion_from_url.choices[0].message.content
|
result = chat_completion_from_url.choices[0].message.content
|
||||||
print("Chat completion output:", result)
|
print("Chat completion output:\n", result)
|
||||||
|
|
||||||
|
|
||||||
# Video input inference
|
# Video input inference
|
||||||
def run_video(model: str) -> None:
|
def run_video(model: str, max_completion_tokens: int) -> None:
|
||||||
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
|
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
|
||||||
video_base64 = encode_base64_content_from_url(video_url)
|
video_base64 = encode_base64_content_from_url(video_url)
|
||||||
|
|
||||||
@ -157,11 +159,11 @@ def run_video(model: str) -> None:
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
model=model,
|
model=model,
|
||||||
max_completion_tokens=64,
|
max_completion_tokens=max_completion_tokens,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = chat_completion_from_url.choices[0].message.content
|
result = chat_completion_from_url.choices[0].message.content
|
||||||
print("Chat completion output from image url:", result)
|
print("Chat completion output from video url:\n", result)
|
||||||
|
|
||||||
## Use base64 encoded video in the payload
|
## Use base64 encoded video in the payload
|
||||||
chat_completion_from_base64 = client.chat.completions.create(
|
chat_completion_from_base64 = client.chat.completions.create(
|
||||||
@ -178,15 +180,15 @@ def run_video(model: str) -> None:
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
model=model,
|
model=model,
|
||||||
max_completion_tokens=64,
|
max_completion_tokens=max_completion_tokens,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = chat_completion_from_base64.choices[0].message.content
|
result = chat_completion_from_base64.choices[0].message.content
|
||||||
print("Chat completion output from base64 encoded image:", result)
|
print("Chat completion output from base64 encoded video:\n", result)
|
||||||
|
|
||||||
|
|
||||||
# Audio input inference
|
# Audio input inference
|
||||||
def run_audio(model: str) -> None:
|
def run_audio(model: str, max_completion_tokens: int) -> None:
|
||||||
from vllm.assets.audio import AudioAsset
|
from vllm.assets.audio import AudioAsset
|
||||||
|
|
||||||
audio_url = AudioAsset("winning_call").url
|
audio_url = AudioAsset("winning_call").url
|
||||||
@ -211,11 +213,11 @@ def run_audio(model: str) -> None:
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
model=model,
|
model=model,
|
||||||
max_completion_tokens=64,
|
max_completion_tokens=max_completion_tokens,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = chat_completion_from_base64.choices[0].message.content
|
result = chat_completion_from_base64.choices[0].message.content
|
||||||
print("Chat completion output from input audio:", result)
|
print("Chat completion output from input audio:\n", result)
|
||||||
|
|
||||||
# HTTP URL
|
# HTTP URL
|
||||||
chat_completion_from_url = client.chat.completions.create(
|
chat_completion_from_url = client.chat.completions.create(
|
||||||
@ -235,11 +237,11 @@ def run_audio(model: str) -> None:
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
model=model,
|
model=model,
|
||||||
max_completion_tokens=64,
|
max_completion_tokens=max_completion_tokens,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = chat_completion_from_url.choices[0].message.content
|
result = chat_completion_from_url.choices[0].message.content
|
||||||
print("Chat completion output from audio url:", result)
|
print("Chat completion output from audio url:\n", result)
|
||||||
|
|
||||||
# base64 URL
|
# base64 URL
|
||||||
chat_completion_from_base64 = client.chat.completions.create(
|
chat_completion_from_base64 = client.chat.completions.create(
|
||||||
@ -259,14 +261,14 @@ def run_audio(model: str) -> None:
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
model=model,
|
model=model,
|
||||||
max_completion_tokens=64,
|
max_completion_tokens=max_completion_tokens,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = chat_completion_from_base64.choices[0].message.content
|
result = chat_completion_from_base64.choices[0].message.content
|
||||||
print("Chat completion output from base64 encoded audio:", result)
|
print("Chat completion output from base64 encoded audio:\n", result)
|
||||||
|
|
||||||
|
|
||||||
def run_multi_audio(model: str) -> None:
|
def run_multi_audio(model: str, max_completion_tokens: int) -> None:
|
||||||
from vllm.assets.audio import AudioAsset
|
from vllm.assets.audio import AudioAsset
|
||||||
|
|
||||||
# Two different audios to showcase batched inference.
|
# Two different audios to showcase batched inference.
|
||||||
@ -300,11 +302,11 @@ def run_multi_audio(model: str) -> None:
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
model=model,
|
model=model,
|
||||||
max_completion_tokens=64,
|
max_completion_tokens=max_completion_tokens,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = chat_completion_from_base64.choices[0].message.content
|
result = chat_completion_from_base64.choices[0].message.content
|
||||||
print("Chat completion output from input audio:", result)
|
print("Chat completion output from input audio:\n", result)
|
||||||
|
|
||||||
|
|
||||||
example_function_map = {
|
example_function_map = {
|
||||||
@ -330,13 +332,20 @@ def parse_args():
|
|||||||
choices=list(example_function_map.keys()),
|
choices=list(example_function_map.keys()),
|
||||||
help="Conversation type with multimodal data.",
|
help="Conversation type with multimodal data.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-completion-tokens",
|
||||||
|
"-n",
|
||||||
|
type=int,
|
||||||
|
default=128,
|
||||||
|
help="Maximum number of tokens to generate for each completion.",
|
||||||
|
)
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
def main(args) -> None:
|
def main(args) -> None:
|
||||||
chat_type = args.chat_type
|
chat_type = args.chat_type
|
||||||
model = get_first_model(client)
|
model = get_first_model(client)
|
||||||
example_function_map[chat_type](model)
|
example_function_map[chat_type](model, args.max_completion_tokens)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@@ -43,7 +43,6 @@ tritonclient==2.51.0
  numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
  numba == 0.61.2; python_version > '3.9'
  numpy
- runai-model-streamer==0.11.0
+ runai-model-streamer[s3]==0.14.0
- runai-model-streamer-s3==0.11.0
  fastsafetensors>=0.1.10
  pydantic>=2.10 # 2.9 leads to error on python 3.10
@@ -5,8 +5,6 @@ numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Req
  numba == 0.61.2; python_version > '3.9'

  # Dependencies for AMD GPUs
- boto3
- botocore
  datasets
  ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
  peft
@@ -15,7 +13,6 @@ tensorizer==2.10.1
  packaging>=24.2
  setuptools>=77.0.3,<80.0.0
  setuptools-scm>=8
- runai-model-streamer==0.11.0
+ runai-model-streamer[s3]==0.14.0
- runai-model-streamer-s3==0.11.0
  conch-triton-kernels==1.2.1
  timm>=1.0.17
@@ -51,8 +51,7 @@ tritonclient==2.51.0
  numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
  numba == 0.61.2; python_version > '3.9'
  numpy
- runai-model-streamer==0.11.0
+ runai-model-streamer[s3]==0.14.0
- runai-model-streamer-s3==0.11.0
  fastsafetensors>=0.1.10
  pydantic>=2.10 # 2.9 leads to error on python 3.10
  decord==0.6.0
@@ -72,7 +72,9 @@ blobfile==3.0.0
  bm25s==0.2.13
  # via mteb
  boto3==1.35.57
- # via tensorizer
+ # via
+ # runai-model-streamer-s3
+ # tensorizer
  botocore==1.35.57
  # via
  # boto3
@@ -925,10 +927,10 @@ rsa==4.9.1
  # via google-auth
  rtree==1.4.0
  # via torchgeo
- runai-model-streamer==0.11.0
+ runai-model-streamer==0.14.0
- # via -r requirements/test.in
- runai-model-streamer-s3==0.11.0
  # via -r requirements/test.in
+ runai-model-streamer-s3==0.14.0
+ # via runai-model-streamer
  s3transfer==0.10.3
  # via boto3
  sacrebleu==2.4.3

setup.py (5 lines changed)
@@ -654,10 +654,7 @@ setup(
  "bench": ["pandas", "datasets"],
  "tensorizer": ["tensorizer==2.10.1"],
  "fastsafetensors": ["fastsafetensors >= 0.1.10"],
- "runai": [
+ "runai": ["runai-model-streamer[s3,gcs] >= 0.14.0"],
- "runai-model-streamer >= 0.14.0", "runai-model-streamer-gcs",
- "google-cloud-storage", "runai-model-streamer-s3", "boto3"
- ],
  "audio": ["librosa", "soundfile",
  "mistral_common[audio]"], # Required for audio processing
  "video": [], # Kept for backwards compatibility
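Note: a hedged sketch of using the consolidated "runai" extra after `pip install vllm[runai]`, not part of the diff. The object-store path is a placeholder, and `load_format="runai_streamer"` follows vLLM's documented Run:ai Model Streamer support, which may vary by version.

```python
from vllm import LLM

llm = LLM(
    model="s3://my-bucket/my-model/",  # placeholder S3 location served via the streamer
    load_format="runai_streamer",
)
```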
@@ -3,12 +3,11 @@
  import contextlib
  import os
  import weakref
- from dataclasses import dataclass
- from typing import Optional

  import pytest

  from tests.utils import wait_for_gpu_memory_to_clear
+ from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
  from vllm import LLM, SamplingParams
  from vllm.config import CompilationConfig
  from vllm.platforms import current_platform
@@ -33,89 +32,6 @@ def temporary_environ(env_vars):
  os.environ[k] = v


- @dataclass
- class BackendConfig:
- name: str
- env_vars: dict
- comp_config: dict
- specific_gpu_arch: Optional[tuple] = None
-
-
- # Define all backend configurations of full cudagraph to be tested
- backend_configs = {
- # FA3 on Hopper
- "FA3":
- BackendConfig(name="FA3",
- env_vars={
- "VLLM_FLASH_ATTN_VERSION": "3",
- "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
- },
- comp_config={
- "cudagraph_mode": "FULL",
- },
- specific_gpu_arch=(9, 0)),
- # FlashMLA on Hopper
- "FlashMLA":
- BackendConfig(name="FlashMLA",
- env_vars={
- "VLLM_ATTENTION_BACKEND": "FLASHMLA",
- },
- comp_config={
- "cudagraph_mode": "FULL_AND_PIECEWISE",
- },
- specific_gpu_arch=(9, 0)),
- # FlashAttention MLA on Hopper
- "FlashAttentionMLA":
- BackendConfig(name="FlashAttentionMLA",
- env_vars={
- "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
- "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
- },
- comp_config={
- "cudagraph_mode": "FULL_DECODE_ONLY",
- },
- specific_gpu_arch=(9, 0)),
- # Cutlass MLA on Blackwell
- "CutlassMLA":
- BackendConfig(
- name="CutlassMLA",
- env_vars={
- "VLLM_USE_V1": "1",
- "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
- "FORCE_NUM_KV_SPLITS":
- "1", # TODO: remove this when hang issue is fixed
- },
- comp_config={
- "cudagraph_mode": "FULL_AND_PIECEWISE",
- "cudagraph_capture_sizes": [16, 32, 64, 128, 256, 512],
- },
- specific_gpu_arch=(10, 0)),
- # FA2
- "FA2":
- BackendConfig(name="FA2",
- env_vars={
- "VLLM_FLASH_ATTN_VERSION": "2",
- "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
- },
- comp_config={
- "cudagraph_mode": "FULL",
- }),
- # Triton Attention
- "TritonAttn":
- BackendConfig(name="TritonAttn",
- env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
- comp_config={
- "cudagraph_mode": "FULL",
- }),
- # FlashInfer
- "FlashInfer":
- BackendConfig(name="FlashInfer",
- env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
- comp_config={
- "cudagraph_mode": "FULL_AND_PIECEWISE",
- }),
- }
-
  test_params_full_cudagraph = []

  # deepseek-ai/DeepSeek-V2-Lite with MLA
@@ -4,7 +4,7 @@ import pytest

  import vllm
  from vllm.compilation.counter import compilation_counter
- from vllm.config import CompilationConfig, VllmConfig
+ from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
  from vllm.utils import _is_torch_equal_or_newer


@@ -106,7 +106,6 @@ def test_dynamo_as_is(vllm_runner, monkeypatch):
  def test_no_compilation(vllm_runner, monkeypatch):
  # Disable multiprocessing so that the counter is in the same process
  monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')
-
  with (
  compilation_counter.expect(num_graphs_seen=0,
  dynamo_as_is_count=0),
@@ -131,3 +130,67 @@ def test_enforce_eager(vllm_runner, monkeypatch):
  enforce_eager=True,
  gpu_memory_utilization=0.4) as _):
  pass
+
+
+ def test_splitting_ops_dynamic():
+ # Default config
+ config = VllmConfig()
+ assert config.compilation_config.cudagraph_mode == \
+ CUDAGraphMode.FULL_AND_PIECEWISE
+ assert config.compilation_config.splitting_ops_contain_attention()
+
+ # When use_inductor_graph_partition=True
+ if _is_torch_equal_or_newer('2.9.0.dev'):
+ # inductor graph partition is only available in PyTorch 2.9+.
+ # this is a fast config check so we are not using pytest.skip.
+ config = VllmConfig(compilation_config=CompilationConfig(
+ use_inductor_graph_partition=True,
+ splitting_ops=["silly_attention"]))
+ # should ignore splitting_ops
+ assert config.compilation_config.splitting_ops == []
+
+ # When attn_fusion pass enabled.
+ config = VllmConfig(compilation_config=CompilationConfig(
+ pass_config={
+ "enable_attn_fusion": True,
+ "enable_noop": True
+ },
+ custom_ops=["+quant_fp8"],
+ cudagraph_mode=CUDAGraphMode.PIECEWISE,
+ ))
+ assert config.compilation_config.splitting_ops == []
+ # cudagraph mode also fall back to FULL
+ assert config.compilation_config.cudagraph_mode == \
+ CUDAGraphMode.FULL
+
+ # splitting_ops can not contain attention ops when attn_fusion
+ # pass enabled.
+ with pytest.raises(AssertionError):
+ config = VllmConfig(compilation_config=CompilationConfig(
+ pass_config={
+ "enable_attn_fusion": True,
+ "enable_noop": True
+ },
+ custom_ops=["+quant_fp8"],
+ cudagraph_mode=CUDAGraphMode.PIECEWISE,
+ # work around for accessing all attntion ops
+ splitting_ops=CompilationConfig()._attention_ops,
+ ))
+
+ # When both use_inductor_graph_partition and attn_fusion pass enabled.
+ if _is_torch_equal_or_newer('2.9.0.dev'):
+ config = VllmConfig(compilation_config=CompilationConfig(
+ use_inductor_graph_partition=True,
+ pass_config={
+ "enable_attn_fusion": True,
+ "enable_noop": True
+ },
+ custom_ops=["+quant_fp8"],
+ cudagraph_mode=CUDAGraphMode.PIECEWISE,
+ ))
+ assert config.compilation_config.splitting_ops == []
+ # enable_attn_fusion is directly support under
+ # use_inductor_graph_partition=True, and cudagraph_mode
+ # is unchanged.
+ assert config.compilation_config.cudagraph_mode == \
+ CUDAGraphMode.PIECEWISE
@@ -45,6 +45,7 @@ class MockModelConfig:
     logits_processor_pattern: Optional[str] = None
     diff_sampling_param: Optional[dict] = None
     allowed_local_media_path: str = ""
+    allowed_media_domains: Optional[list[str]] = None
     encoder_config = None
     generation_config: str = "auto"
     skip_tokenizer_init: bool = False

@@ -240,6 +240,7 @@ class MockModelConfig:
     logits_processor_pattern = None
     diff_sampling_param: Optional[dict] = None
    allowed_local_media_path: str = ""
+    allowed_media_domains: Optional[list[str]] = None
     encoder_config = None
     generation_config: str = "auto"
     media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
@@ -19,6 +19,7 @@ from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
                                          parse_chat_messages,
                                          parse_chat_messages_futures,
                                          resolve_chat_template_content_format,
+                                         resolve_chat_template_kwargs,
                                          resolve_hf_chat_template)
 from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
 from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64,

@@ -37,6 +38,7 @@ QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
 QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
 QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
 QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
+QWEN3_MODEL_ID = "Qwen/Qwen3-8B"
 LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B"
 HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"
 MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

@@ -2255,6 +2257,89 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
     assert isinstance(chat_template, str)


+@pytest.mark.parametrize(
+    "model, expected_kwargs",
+    [
+        (
+            QWEN2VL_MODEL_ID,
+            {
+                "add_vision_id", "add_generation_prompt",
+                "continue_final_message", "tools"
+            },
+        ),
+        (
+            QWEN3_MODEL_ID,
+            {
+                "enable_thinking", "add_generation_prompt",
+                "continue_final_message", "tools"
+            },
+        ),
+    ],
+)
+def test_resolve_hf_chat_template_kwargs(sample_json_schema, model,
+                                         expected_kwargs):
+    """checks that chat_template is a dict type for HF models."""
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+
+    tools = ([{
+        "type": "function",
+        "function": {
+            "name": "dummy_function_name",
+            "description": "This is a dummy function",
+            "parameters": sample_json_schema,
+        },
+    }])
+
+    chat_template_kwargs = {
+        # both unused
+        "unsed_kwargs_1": 123,
+        "unsed_kwargs_2": "abc",
+        # should not appear
+        "chat_template": "{% Hello world! %}",
+        # used by tokenizer
+        "continue_final_message": True,
+        "tools": tools,
+        # both used by Qwen2-VL and Qwen3
+        "add_generation_prompt": True,
+        # only used by Qwen2-VL
+        "add_vision_id": True,
+        # only used by Qwen3
+        "enable_thinking": True,
+    }
+
+    model_config = ModelConfig(
+        model,
+        tokenizer=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        revision=model_info.revision,
+        trust_remote_code=model_info.trust_remote_code,
+        hf_overrides=model_info.hf_overrides,
+        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype)
+
+    # Build the tokenizer
+    tokenizer = get_tokenizer(
+        model,
+        trust_remote_code=model_config.trust_remote_code,
+    )
+
+    # Test detecting the tokenizer's chat_template
+    chat_template = resolve_hf_chat_template(
+        tokenizer,
+        chat_template=None,
+        tools=tools,
+        model_config=model_config,
+    )
+    resolved_chat_template_kwargs = resolve_chat_template_kwargs(
+        tokenizer,
+        chat_template=chat_template,
+        chat_template_kwargs=chat_template_kwargs,
+    )
+    assert set(resolved_chat_template_kwargs.keys()) == expected_kwargs
+
+
 # NOTE: Qwen2-Audio default chat template is specially defined inside
 # processor class instead of using `tokenizer_config.json`
 # yapf: disable
@@ -1,52 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-
-@pytest.fixture
-def sample_regex():
-    return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
-            r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
-
-
-@pytest.fixture
-def sample_json_schema():
-    return {
-        "type": "object",
-        "properties": {
-            "name": {
-                "type": "string"
-            },
-            "age": {
-                "type": "integer"
-            },
-            "skills": {
-                "type": "array",
-                "items": {
-                    "type": "string",
-                    "maxLength": 10
-                },
-                "minItems": 3
-            },
-            "work_history": {
-                "type": "array",
-                "items": {
-                    "type": "object",
-                    "properties": {
-                        "company": {
-                            "type": "string"
-                        },
-                        "duration": {
-                            "type": "number"
-                        },
-                        "position": {
-                            "type": "string"
-                        }
-                    },
-                    "required": ["company", "position"]
-                }
-            }
-        },
-        "required": ["name", "age", "skills", "work_history"]
-    }
@@ -14,6 +14,7 @@ import pytest
 import torch

 import vllm.model_executor.model_loader.tensorizer
+from tests.utils import VLLM_PATH, RemoteOpenAIServer
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 # yapf: disable

@@ -27,7 +28,6 @@ from vllm.model_executor.model_loader.tensorizer_loader import (
 # yapf: enable
 from vllm.utils import PlaceholderModule

-from ..utils import VLLM_PATH, RemoteOpenAIServer
 from .conftest import DummyExecutor, assert_from_collective_rpc

 try:
@@ -651,6 +651,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
     "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
                                     trust_remote_code=True,
                                     speculative_model="XiaomiMiMo/MiMo-7B-RL"),
+    "Eagle3Qwen2_5vlForCausalLM": _HfExamplesInfo(
+        "Qwen/Qwen2.5-VL-7B-Instruct",
+        speculative_model="Rayzl/qwen2.5-vl-7b-eagle3-sgl"),
     "Qwen3NextMTP": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
                                     min_transformers_version="4.56.3"),
 }
@@ -100,10 +100,9 @@ def test_distributed(
                  kwargs_test=kwargs)


-@pytest.mark.skipif(
-    current_platform.is_rocm(),
-    reason="bitsandbytes quantization is currently not supported in rocm.")
 @pytest.mark.parametrize("model, quantization_kwargs", [
+    ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {}),
+    ("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {}),
     (
         "meta-llama/Llama-3.2-1B-Instruct",
         {

@@ -121,6 +120,11 @@ def test_quantization(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
+    if (current_platform.is_rocm()
+            and quantization_kwargs.get("quantization", "") == "bitsandbytes"):
+        pytest.skip(
+            "bitsandbytes quantization is currently not supported in rocm.")
+
     with vllm_runner(
             model, model_impl="auto", enforce_eager=True,
             **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
@@ -66,7 +66,12 @@ async def test_fetch_image_http(image_url: str):
 @pytest.mark.parametrize("suffix", get_supported_suffixes())
 async def test_fetch_image_base64(url_images: dict[str, Image.Image],
                                   raw_image_url: str, suffix: str):
-    connector = MediaConnector()
+    connector = MediaConnector(
+        # Domain restriction should not apply to data URLs.
+        allowed_media_domains=[
+            "www.bogotobogo.com",
+            "github.com",
+        ])
     url_image = url_images[raw_image_url]

     try:

@@ -387,3 +392,29 @@ def test_argsort_mm_positions(case):
     modality_idxs = argsort_mm_positions(mm_positions)

     assert modality_idxs == expected_modality_idxs
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
+@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
+async def test_allowed_media_domains(video_url: str, num_frames: int):
+    connector = MediaConnector(
+        media_io_kwargs={"video": {
+            "num_frames": num_frames,
+        }},
+        allowed_media_domains=[
+            "www.bogotobogo.com",
+            "github.com",
+        ])
+
+    video_sync, metadata_sync = connector.fetch_video(video_url)
+    video_async, metadata_async = await connector.fetch_video_async(video_url)
+    assert np.array_equal(video_sync, video_async)
+    assert metadata_sync == metadata_async
+
+    disallowed_url = "https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png"
+    with pytest.raises(ValueError):
+        _, _ = connector.fetch_video(disallowed_url)
+
+    with pytest.raises(ValueError):
+        _, _ = await connector.fetch_video_async(disallowed_url)
tests/quantization/test_blackwell_moe.py (new file)
@@ -0,0 +1,132 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json
import os

import pytest

from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform

if not current_platform.is_device_capability(100):
    pytest.skip("This test only runs on Blackwell GPUs (SM100).",
                allow_module_level=True)

os.environ["FLASHINFER_NVCC_THREADS"] = "16"

# dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4,
#                       "text_config": {"num_layers": 4, "num_hidden_layers": 4}}
dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4}


def can_initialize(model: str, extra_args: list[str]):

    # Server arguments
    server_args = [
        "--max-model-len",
        "2048",
        "--max-num-batched-tokens",
        "256",
        "--load-format",
        "dummy",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
        json.dumps({"image": 0}),
        *extra_args,
    ]

    # Launch server and make a simple request
    with RemoteOpenAIServer(
            model,
            server_args,
            max_wait_seconds=1000,  # Due to FlashInfer compile
            override_hf_configs=dummy_hf_overrides) as server:
        client = server.get_client()
        # Make a simple request to verify the server works
        completion = client.completions.create(
            model=model,
            prompt=["Hello, World!"],
            temperature=0,
            max_tokens=2,
        )
        print(completion)
        assert completion.choices[0].text is not None


## Llama4 ##


@pytest.mark.skip(reason=(
    "RuntimeError: run_moe() Expected a value of type "
    "'Optional[List[Tensor]]' for argument '_9' but instead found type "
    "'list'."))
def test_llama4_fp8_tensor_moe_flashinfer_cutlass(
        monkeypatch: pytest.MonkeyPatch):
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", [])


@pytest.mark.skip(reason="Works, but takes too long to run")
def test_llama4_fp8_tensor_moe_flashinfer_trtllm(
        monkeypatch: pytest.MonkeyPatch):
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", [])


@pytest.mark.skip(reason="Works, but takes too long to run")
def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", [])


@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", [])


## DeepSeekV3 ##


def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
    monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
    can_initialize("deepseek-ai/DeepSeek-V3.1", [])


def test_deepseek_nvfp4_moe_flashinfer_cutlass(
        monkeypatch: pytest.MonkeyPatch):
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", [])


@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", [])


## GPT-OSS ##


def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
    can_initialize("openai/gpt-oss-20b", [])


def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(
        monkeypatch: pytest.MonkeyPatch):
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
    can_initialize("openai/gpt-oss-20b", [])


def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(
        monkeypatch: pytest.MonkeyPatch):
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
    can_initialize("openai/gpt-oss-20b", [])
tests/reasoning/test_glm4_moe_reasoning_parser.py (new file)
@@ -0,0 +1,203 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
from transformers import AutoTokenizer

from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager

parser_name = "glm45"
start_token = "<think>"
end_token = "</think>"

REASONING_MODEL_NAME = "zai-org/GLM-4.5"


@pytest.fixture(scope="module")
def glm45_tokenizer():
    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)


WITH_THINK = {
    "output": "<think>This is a reasoning section</think>This is the rest",
    "reasoning_content": "This is a reasoning section",
    "content": "This is the rest",
    "is_reasoning_end": True,
}

WITH_THINK_STREAM = {
    "output": "<think>This is a reasoning section</think>This is the rest",
    "reasoning_content": "This is a reasoning section",
    "content": "This is the rest",
    "is_reasoning_end": True,
}

WITHOUT_THINK = {
    "output": "This is the rest",
    "reasoning_content": None,
    "content": "This is the rest",
    "is_reasoning_end": False,
}

WITHOUT_THINK_STREAM = {
    "output": "This is the rest",
    "reasoning_content": None,
    "content": "This is the rest",
    "is_reasoning_end": False,
}

COMPLETE_REASONING = {
    "output": "<think>This is a reasoning section</think>",
    "reasoning_content": "This is a reasoning section",
    "content": None,
    "is_reasoning_end": True,
}
MULTILINE_REASONING = {
    "output":
    "<think>This is a reasoning\nsection</think>This is the rest\nThat",
    "reasoning_content": "This is a reasoning\nsection",
    "content": "This is the rest\nThat",
    "is_reasoning_end": True,
}
ONLY_OPEN_TAG = {
    "output": "<think>This is a reasoning section",
    "reasoning_content": None,
    "content": "<think>This is a reasoning section",
    "is_reasoning_end": False,
}

ONLY_OPEN_TAG_STREAM = {
    "output": "<think>This is a reasoning section",
    "reasoning_content": "This is a reasoning section",
    "content": None,
    "is_reasoning_end": False,
}

TEST_CASES = [
    pytest.param(
        False,
        WITH_THINK,
        id="with_think",
    ),
    pytest.param(
        True,
        WITH_THINK_STREAM,
        id="with_think_stream",
    ),
    pytest.param(
        False,
        WITHOUT_THINK,
        id="without_think",
    ),
    pytest.param(
        True,
        WITHOUT_THINK_STREAM,
        id="without_think_stream",
    ),
    pytest.param(
        False,
        COMPLETE_REASONING,
        id="complete_reasoning",
    ),
    pytest.param(
        True,
        COMPLETE_REASONING,
        id="complete_reasoning_stream",
    ),
    pytest.param(
        False,
        MULTILINE_REASONING,
        id="multiline_reasoning",
    ),
    pytest.param(
        True,
        MULTILINE_REASONING,
        id="multiline_reasoning_stream",
    ),
    pytest.param(
        False,
        ONLY_OPEN_TAG,
        id="only_open_tag",
    ),
    pytest.param(
        True,
        ONLY_OPEN_TAG_STREAM,
        id="only_open_tag_stream",
    ),
]

STILL_REASONING_PROMPT = """[gMASK]<sop><|system|>
You are a helpful assistant.<|user|>
What is the capital of France?<|assistant|>
<think>The user is asking for the capital of"""

DONE_REASONING_PROMPT = """[gMASK]<sop><|system|>
You are a helpful assistant.<|user|>
What is the capital of France?<|assistant|>
<think>The user is asking for the capital of France.</think>
The capital of France is Paris."""

MULTI_TURN_STILL_REASONING_PROMPT = """[gMASK]<sop><|system|>
You are a helpful assistant.<|user|>
What is the capital of France?<|assistant|>
<think></think>
The capital of France is Paris.<|user|>
What about Chile?<|assistant|>
<think>The user is asking for the capital of"""

MULTI_TURN_DONE_REASONING_PROMPT = """[gMASK]<sop><|system|>
You are a helpful assistant.<|user|>
What is the capital of France?<|assistant|>
<think></think>
The capital of France is Paris.<|user|>
What about Chile?<|assistant|>
<think>The user is asking for the capital of Chile.</think>
The capital of Chile is Santiago."""

REASONING_END_TEST_CASES = [
    pytest.param(STILL_REASONING_PROMPT, False, id="still_reasoning"),
    pytest.param(DONE_REASONING_PROMPT, True, id="done_reasoning"),
    pytest.param(MULTI_TURN_STILL_REASONING_PROMPT,
                 False,
                 id="multi_turn_still_reasoning"),
    pytest.param(MULTI_TURN_DONE_REASONING_PROMPT,
                 True,
                 id="multi_turn_done_reasoning")
]


@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    glm45_tokenizer,
):
    output = glm45_tokenizer.tokenize(param_dict["output"])
    output_tokens: list[str] = [
        glm45_tokenizer.convert_tokens_to_string([token]) for token in output
    ]
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
        parser_name)(glm45_tokenizer)

    reasoning, content = run_reasoning_extraction(parser,
                                                  output_tokens,
                                                  streaming=streaming)

    assert reasoning == param_dict["reasoning_content"]
    assert content == param_dict["content"]

    output_ids = glm45_tokenizer.convert_tokens_to_ids(output)
    is_reasoning_end = parser.is_reasoning_end(output_ids)
    assert is_reasoning_end == param_dict["is_reasoning_end"]


@pytest.mark.parametrize("prompt, is_reasoning_end", REASONING_END_TEST_CASES)
def test_is_reasoning_end_full_prompt(prompt: str, is_reasoning_end: bool,
                                      glm45_tokenizer):
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
        parser_name)(glm45_tokenizer)
    tokens = glm45_tokenizer.tokenize(prompt)
    token_ids = glm45_tokenizer.convert_tokens_to_ids(tokens)
    check_is_reasoning_end = parser.is_reasoning_end(token_ids)
    assert check_is_reasoning_end == is_reasoning_end
@@ -91,8 +91,10 @@ class RemoteOpenAIServer:
             env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
         if env_dict is not None:
             env.update(env_dict)
+        serve_cmd = ["vllm", "serve", model, *vllm_serve_args]
+        print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}")
         self.proc: subprocess.Popen = subprocess.Popen(
-            ["vllm", "serve", model, *vllm_serve_args],
+            serve_cmd,
             env=env,
             stdout=sys.stdout,
             stderr=sys.stderr,
@@ -3,7 +3,7 @@
 """Utility functions for attention-related v1 tests."""

 from dataclasses import dataclass
-from typing import Union
+from typing import Optional, Union

 import pytest
 import torch

@@ -260,3 +260,88 @@ def create_dummy_kv_cache(block_size: int,
                               dtype=dtype,
                               device=device)
     return kv_cache
+
+
+@dataclass
+class BackendConfig:
+    name: str
+    env_vars: dict
+    comp_config: dict  # compilation config
+    specific_gpu_arch: Optional[tuple] = None
+
+
+# Define all backend configurations of full cudagraph to be tested
+full_cg_backend_configs = {
+    # FA3 on Hopper
+    "FA3":
+    BackendConfig(name="FA3",
+                  env_vars={
+                      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
+                      "VLLM_FLASH_ATTN_VERSION": "3",
+                      "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+                  },
+                  comp_config={
+                      "cudagraph_mode": "FULL",
+                  },
+                  specific_gpu_arch=(9, 0)),
+    # FlashMLA on Hopper
+    "FlashMLA":
+    BackendConfig(name="FlashMLA",
+                  env_vars={
+                      "VLLM_ATTENTION_BACKEND": "FLASHMLA",
+                  },
+                  comp_config={
+                      "cudagraph_mode": "FULL_AND_PIECEWISE",
+                  },
+                  specific_gpu_arch=(9, 0)),
+    # Cutlass MLA on Blackwell
+    "CutlassMLA":
+    BackendConfig(
+        name="CutlassMLA",
+        env_vars={
+            "VLLM_USE_V1": "1",
+            "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
+            "FORCE_NUM_KV_SPLITS":
+            "1",  # TODO: remove this when hang issue is fixed
+        },
+        comp_config={
+            "cudagraph_mode": "FULL_AND_PIECEWISE",
+        },
+        specific_gpu_arch=(10, 0)),
+    # FlashAttention MLA on Hopper
+    "FlashAttentionMLA":
+    BackendConfig(name="FlashAttentionMLA",
+                  env_vars={
+                      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
+                      "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+                  },
+                  comp_config={
+                      "cudagraph_mode": "FULL_DECODE_ONLY",
+                  },
+                  specific_gpu_arch=(9, 0)),
+    # FA2
+    "FA2":
+    BackendConfig(name="FA2",
+                  env_vars={
+                      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
+                      "VLLM_FLASH_ATTN_VERSION": "2",
+                      "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+                  },
+                  comp_config={
+                      "cudagraph_mode": "FULL_AND_PIECEWISE",
+                  }),
+    # Triton Attention
+    "TritonAttn":
+    BackendConfig(name="TritonAttn",
+                  env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
+                  comp_config={
+                      "cudagraph_mode": "FULL_AND_PIECEWISE",
+                  }),
+    # FlashInfer
+    "FlashInfer":
+    BackendConfig(name="FlashInfer",
+                  env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
+                  comp_config={
+                      "cudagraph_mode": "FULL_AND_PIECEWISE",
+                  }),
+}
@@ -45,39 +45,22 @@ def _create_vllm_config(compilation_config: CompilationConfig,
 class TestCudagraphDispatcher:

     @pytest.mark.parametrize(
-        "params",
+        "case_id,cudagraph_mode_str,compilation_level",
         [
             # Test case 0: Full CG for mixed batches, no separate routine
-            {
-                "case_id": 0,
-                "cudagraph_mode": "FULL",
-                "compilation_level": CompilationLevel.NO_COMPILATION,
-            },
+            (0, "FULL", CompilationLevel.NO_COMPILATION),
             # Test case 1: Full CG for uniform batches, piecewise for mixed
-            {
-                "case_id": 1,
-                "cudagraph_mode": "FULL_AND_PIECEWISE",
-                "compilation_level": CompilationLevel.PIECEWISE,
-            },
+            (1, "FULL_AND_PIECEWISE", CompilationLevel.NO_COMPILATION),
             # Test case 2: Full CG for uniform batches, no CG for mixed
-            {
-                "case_id": 2,
-                "cudagraph_mode": "FULL_DECODE_ONLY",
-                "compilation_level": CompilationLevel.NO_COMPILATION,
-            },
+            (2, "FULL_DECODE_ONLY", CompilationLevel.NO_COMPILATION),
             # Test case 3: Piecewise for all
-            {
-                "case_id": 3,
-                "cudagraph_mode": "PIECEWISE",
-                "compilation_level": CompilationLevel.PIECEWISE,
-            },
+            (3, "PIECEWISE", CompilationLevel.PIECEWISE),
         ])
-    def test_dispatcher(self, params):
+    def test_dispatcher(self, cudagraph_mode_str, compilation_level):
         # Setup dispatcher
-        comp_config = CompilationConfig(
-            cudagraph_mode=params["cudagraph_mode"],
-            level=params["compilation_level"],
-            cudagraph_capture_sizes=[1, 8])
+        comp_config = CompilationConfig(cudagraph_mode=cudagraph_mode_str,
+                                        level=compilation_level,
+                                        cudagraph_capture_sizes=[1, 8])

         config = _create_vllm_config(comp_config, max_num_seqs=8)
         dispatcher = CudagraphDispatcher(config)

@@ -86,11 +69,11 @@ class TestCudagraphDispatcher:
                                    uniform_decode_query_len=1)

         # Verify the key is initialized correctly
-        if params["cudagraph_mode"] in ["FULL_AND_PIECEWISE", "PIECEWISE"]:
+        if cudagraph_mode_str in ["FULL_AND_PIECEWISE", "PIECEWISE"]:
             assert len(dispatcher.cudagraph_keys[CUDAGraphMode.PIECEWISE]) == 2
         else:
             assert len(dispatcher.cudagraph_keys[CUDAGraphMode.PIECEWISE]) == 0
-        if params["cudagraph_mode"] not in ["NONE", "PIECEWISE"]:
+        if cudagraph_mode_str not in ["NONE", "PIECEWISE"]:
             assert len(dispatcher.cudagraph_keys[CUDAGraphMode.FULL]) == 2
         else:
             assert len(dispatcher.cudagraph_keys[CUDAGraphMode.FULL]) == 0

@@ -99,10 +82,10 @@ class TestCudagraphDispatcher:
         # 1. non-uniform batch, size in cudagraph size list
         desc_full_exact = BatchDescriptor(num_tokens=8, uniform_decode=False)
         rt_mode, key = dispatcher.dispatch(desc_full_exact)
-        if params["cudagraph_mode"] == "FULL":
+        if cudagraph_mode_str == "FULL":
             assert rt_mode == CUDAGraphMode.FULL
             assert key == desc_full_exact
-        elif params["cudagraph_mode"] in ["FULL_AND_PIECEWISE", "PIECEWISE"]:
+        elif cudagraph_mode_str in ["FULL_AND_PIECEWISE", "PIECEWISE"]:
             assert rt_mode == CUDAGraphMode.PIECEWISE
             assert key == desc_full_exact
         else:

@@ -111,15 +94,13 @@ class TestCudagraphDispatcher:
         # 2. uniform decode batch, size in cudagraph size list
         desc_uniform_exact = BatchDescriptor(num_tokens=8, uniform_decode=True)
         rt_mode, key = dispatcher.dispatch(desc_uniform_exact)
-        if params["cudagraph_mode"] == "FULL":
+        if cudagraph_mode_str == "FULL":
             assert rt_mode == CUDAGraphMode.FULL
             assert key == desc_uniform_exact.non_uniform
-        elif params["cudagraph_mode"] in [
-                "FULL_DECODE_ONLY", "FULL_AND_PIECEWISE"
-        ]:
+        elif cudagraph_mode_str in ["FULL_DECODE_ONLY", "FULL_AND_PIECEWISE"]:
             assert rt_mode == CUDAGraphMode.FULL
             assert key == desc_uniform_exact
-        elif params["cudagraph_mode"] == "PIECEWISE":
+        elif cudagraph_mode_str == "PIECEWISE":
             assert rt_mode == CUDAGraphMode.PIECEWISE
             assert key == desc_uniform_exact.non_uniform
         else:

@@ -131,6 +112,16 @@ class TestCudagraphDispatcher:
             assert rt_mode == CUDAGraphMode.NONE
             assert key is None

+        # 4. Cascade attention should have a fall back mode
+        desc_full_exact = BatchDescriptor(num_tokens=8, uniform_decode=False)
+        rt_mode, key = dispatcher.dispatch(desc_full_exact,
+                                           use_cascade_attn=True)
+        if "PIECEWISE" in cudagraph_mode_str:  # string contains check
+            assert rt_mode == CUDAGraphMode.PIECEWISE
+            assert key == desc_full_exact.non_uniform
+        else:
+            assert rt_mode == CUDAGraphMode.NONE
+

 @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
 class TestCUDAGraphWrapper:
@@ -4,12 +4,11 @@ import contextlib
 import os
 import weakref
 from contextlib import ExitStack
-from dataclasses import dataclass
-from typing import Optional

 import pytest

 from tests.utils import wait_for_gpu_memory_to_clear
+from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
 from vllm import LLM
 from vllm.config import CompilationConfig
 from vllm.platforms import current_platform

@@ -34,74 +33,6 @@ def temporary_environ(env_vars):
             os.environ[k] = v


-@dataclass
-class BackendConfig:
-    name: str
-    env_vars: dict
-    comp_config: dict
-    specific_gpu_arch: Optional[tuple] = None
-
-
-# Define all backend configurations of full cudagraph to be tested
-backend_configs = {
-    # FA3 on Hopper
-    "FA3":
-    BackendConfig(name="FA3",
-                  env_vars={
-                      "VLLM_FLASH_ATTN_VERSION": "3",
-                      "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL",
-                  },
-                  specific_gpu_arch=(9, 0)),
-    # FlashMLA on Hopper
-    "FlashMLA":
-    BackendConfig(name="FlashMLA",
-                  env_vars={
-                      "VLLM_ATTENTION_BACKEND": "FLASHMLA",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL_AND_PIECEWISE",
-                  },
-                  specific_gpu_arch=(9, 0)),
-    # FlashAttention MLA on Hopper
-    "FlashAttentionMLA":
-    BackendConfig(name="FlashAttentionMLA",
-                  env_vars={
-                      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
-                      "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL_DECODE_ONLY",
-                  },
-                  specific_gpu_arch=(9, 0)),
-    # FA2
-    "FA2":
-    BackendConfig(name="FA2",
-                  env_vars={
-                      "VLLM_FLASH_ATTN_VERSION": "2",
-                      "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL_AND_PIECEWISE",
-                  }),
-    # Triton Attention
-    "TritonAttn":
-    BackendConfig(name="TritonAttn",
-                  env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
-                  comp_config={
-                      "cudagraph_mode": "FULL_AND_PIECEWISE",
-                  }),
-    # FlashInfer
-    "FlashInfer":
-    BackendConfig(name="FlashInfer",
-                  env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
-                  comp_config={
-                      "cudagraph_mode": "FULL_AND_PIECEWISE",
-                  }),
-}
-
 # test attention backend and cudagraph_mode combo
 # (backend_name, cudagraph_mode, supported)
 combo_cases_1 = [

@@ -114,9 +45,10 @@ combo_cases_1 = [
 ]


-@pytest.mark.parametrize("combo_case", combo_cases_1)
-def test_backend_and_cudagraph_mode_combo(combo_case):
-    backend_name, cudagraph_mode, supported = combo_case
+@pytest.mark.parametrize("backend_name, cudagraph_mode, supported",
+                         combo_cases_1)
+def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode,
+                                          supported):
     if backend_name == "FlashInfer":
         try:
             import flashinfer  # noqa: F401

@@ -142,7 +74,7 @@ def test_backend_and_cudagraph_mode_combo(combo_case):
              compilation_config=CompilationConfig(
                  level=3, cudagraph_mode=cudagraph_mode))
     llm.generate(["Hello, my name is"] * 10)
+    # when above code raises, `llm` may be undefined, so we need to catch that
     try:
         llm = weakref.proxy(llm)
         del llm

@@ -173,7 +105,8 @@ combo_cases_2 = [
 ]


-@pytest.mark.parametrize("combo_case", combo_cases_2)
+@pytest.mark.parametrize("backend_name,cudagraph_mode,compilation_level,"\
+                         "supported", combo_cases_2)
 def test_cudagraph_compilation_combo(combo_case):
     backend_name, cudagraph_mode, compilation_level, supported\
         = combo_case

@@ -192,6 +125,7 @@ def test_cudagraph_compilation_combo(combo_case):
              compilation_config=CompilationConfig(
                  level=compilation_level, cudagraph_mode=cudagraph_mode))
     llm.generate(["Hello, my name is"] * 10)
+    # when above code raises, `llm` may be undefined, so we need to catch that
     try:
         llm = weakref.proxy(llm)
         del llm
tests/v1/distributed/__init__.py (new empty file)
@@ -12,7 +12,7 @@ import pytest_asyncio
 import requests

 from tests.utils import RemoteOpenAIServer
-from tests.v1.test_utils import check_request_balancing
+from tests.v1.utils import check_request_balancing
 from vllm.platforms import current_platform

 MODEL_NAME = "ibm-research/PowerMoE-3b"

@@ -13,7 +13,7 @@ import pytest_asyncio
 import requests

 from tests.utils import RemoteOpenAIServer
-from tests.v1.test_utils import check_request_balancing
+from tests.v1.utils import check_request_balancing
 from vllm.platforms import current_platform

 MODEL_NAME = "ibm-research/PowerMoE-3b"
@@ -8,13 +8,15 @@ from typing import Any, Union
 import pytest
 import torch

-from tests.utils import get_attn_backend_list_based_on_platform
+from tests.utils import get_attn_backend_list_based_on_platform, large_gpu_mark
 from vllm import LLM, SamplingParams
 from vllm.assets.base import VLLM_S3_BUCKET_URL
 from vllm.assets.image import VLM_IMAGES_DIR
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.platforms import current_platform

+MTP_SIMILARITY_RATE = 0.8
+

 def get_test_prompts(mm_enabled: bool):
     prompt_types = ["repeat", "sentence"]

@@ -86,69 +88,71 @@ def test_ngram_correctness(
     Compare the outputs of an original LLM and a speculative LLM
     should be the same when using ngram speculative decoding.
     '''
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        test_prompts = get_test_prompts(mm_enabled=False)
+    test_prompts = get_test_prompts(mm_enabled=False)

    ref_llm = LLM(model=model_name, max_model_len=1024)
    ref_outputs = ref_llm.chat(test_prompts, sampling_config)
    del ref_llm
    torch.cuda.empty_cache()
    cleanup_dist_env_and_memory()

    spec_llm = LLM(
        model=model_name,
        speculative_config={
            "method": "ngram",
            "prompt_lookup_max": 5,
            "prompt_lookup_min": 3,
            "num_speculative_tokens": 3,
        },
        max_model_len=1024,
    )
    spec_outputs = spec_llm.chat(test_prompts, sampling_config)
    matches = 0
    misses = 0
    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
        if ref_output.outputs[0].text == spec_output.outputs[0].text:
            matches += 1
        else:
            misses += 1
            print(f"ref_output: {ref_output.outputs[0].text}")
            print(f"spec_output: {spec_output.outputs[0].text}")

    # Heuristic: expect at least 66% of the prompts to match exactly
    # Upon failure, inspect the outputs to check for inaccuracy.
    assert matches >= int(0.66 * len(ref_outputs))
    del spec_llm
    torch.cuda.empty_cache()
    cleanup_dist_env_and_memory()


-@pytest.mark.parametrize(["model_setup", "mm_enabled"], [
-    (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False),
-    (("eagle", "meta-llama/Llama-3.1-8B-Instruct",
-      "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False),
-    (("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
-      "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False),
-    pytest.param(
-        ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-         "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
-        False,
-        marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
-    pytest.param(
-        ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-         "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
-        True,
-        marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
-    (("eagle", "eagle618/deepseek-v3-random",
-      "eagle618/eagle-deepseek-v3-random", 1), False),
-],
-    ids=[
-        "qwen3_eagle3", "llama3_eagle", "llama3_eagle3",
-        "llama4_eagle", "llama4_eagle_mm",
-        "deepseek_eagle"
-    ])
+@pytest.mark.parametrize(
+    ["model_setup", "mm_enabled"],
+    [
+        (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False),
+        pytest.param(("eagle3", "Qwen/Qwen2.5-VL-7B-Instruct",
+                      "Rayzl/qwen2.5-vl-7b-eagle3-sgl", 1),
+                     False,
+                     marks=pytest.mark.skip(reason="Skipping due to its " \
+                        "head_dim not being a a multiple of 32")),
+        (("eagle", "meta-llama/Llama-3.1-8B-Instruct",
+          "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False),
+        (("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
+          "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False),
+        pytest.param(("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+                      "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
+                     False,
+                     marks=large_gpu_mark(min_gb=80)),  # works on 4x H100
+        pytest.param(("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+                      "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
+                     True,
+                     marks=large_gpu_mark(min_gb=80)),  # works on 4x H100
+        (("eagle", "eagle618/deepseek-v3-random",
+          "eagle618/eagle-deepseek-v3-random", 1), False),
+    ],
+    ids=[
+        "qwen3_eagle3", "qwen2_5_vl_eagle3", "llama3_eagle", "llama3_eagle3",
+        "llama4_eagle", "llama4_eagle_mm", "deepseek_eagle"
+    ])
 @pytest.mark.parametrize("attn_backend",
                          get_attn_backend_list_based_on_platform())
 def test_eagle_correctness(

@@ -172,9 +176,14 @@ def test_eagle_correctness(
     model_setup: (method, model_name, eagle_model_name, tp_size)
     '''
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        m.setenv("VLLM_MLA_DISABLE", "1")
-        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+        if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
+            # Scout requires default backend selection
+            # because vision encoder has head_dim 88 being incompatible
+            # with FLASH_ATTN and needs to fall back to Flex Attn
+            pass
+        else:
+            m.setenv("VLLM_MLA_DISABLE", "1")
+            m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

         if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
             pytest.skip("TRITON_ATTN does not support "

@@ -222,3 +231,66 @@ def test_eagle_correctness(
         del spec_llm
         torch.cuda.empty_cache()
         cleanup_dist_env_and_memory()
+
+
+@pytest.mark.parametrize(["model_setup", "mm_enabled"], [
+    (("mtp", "XiaomiMiMo/MiMo-7B-Base", 1), False),
+    (("mtp", "ZixiQi/DeepSeek-V3-4layers-MTP-FP8", 1), False),
+],
+                         ids=["mimo", "deepseek"])
+def test_mtp_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    sampling_config: SamplingParams,
+    model_setup: tuple[str, str, int],
+    mm_enabled: bool,
+):
+    # Generate test prompts inside the function instead of using fixture
+    test_prompts = get_test_prompts(mm_enabled)
+    '''
+    Compare the outputs of a original LLM and a speculative LLM
+    should be the same when using MTP speculative decoding.
+    model_setup: (method, model_name, tp_size)
+    '''
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        m.setenv("VLLM_MLA_DISABLE", "1")
+
+        method, model_name, tp_size = model_setup
+
+        ref_llm = LLM(model=model_name,
+                      max_model_len=2048,
+                      tensor_parallel_size=tp_size,
+                      trust_remote_code=True)
+        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
+        del ref_llm
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
+
+        spec_llm = LLM(
+            model=model_name,
+            trust_remote_code=True,
+            tensor_parallel_size=tp_size,
+            speculative_config={
+                "method": method,
+                "num_speculative_tokens": 1,
+                "max_model_len": 2048,
+            },
+            max_model_len=2048,
+        )
+        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
+        matches = 0
+        misses = 0
+        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+            if ref_output.outputs[0].text == spec_output.outputs[0].text:
+                matches += 1
+            else:
+                misses += 1
+                print(f"ref_output: {ref_output.outputs[0].text}")
+                print(f"spec_output: {spec_output.outputs[0].text}")
+
+        # Heuristic: expect at least 80% of the prompts to match exactly
+        # Upon failure, inspect the outputs to check for inaccuracy.
+        assert matches > int(MTP_SIMILARITY_RATE * len(ref_outputs))
+        del spec_llm
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
@@ -8,7 +8,7 @@ import pytest
 import pytest_asyncio

 from tests.utils import RemoteOpenAIServer
-from tests.v1.test_utils import check_request_balancing
+from tests.v1.utils import check_request_balancing

 MODEL_NAME = "ibm-research/PowerMoE-3b"

new file: tests/v1/generation/test_batch_invariance.py (290 lines)
@@ -0,0 +1,290 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
import os
import random
import string

import pytest
import torch

from vllm import LLM, SamplingParams


def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
    # Lightweight random prompt generator to vary prompt lengths and content.
    vocab = [
        "alpha",
        "bravo",
        "charlie",
        "delta",
        "echo",
        "foxtrot",
        "golf",
        "hotel",
        "india",
        "juliet",
        "kilo",
        "lima",
        "mike",
        "november",
        "oscar",
        "papa",
        "quebec",
        "romeo",
        "sierra",
        "tango",
        "uniform",
        "victor",
        "whiskey",
        "xray",
        "yankee",
        "zulu",
    ]
    n = random.randint(min_words, max_words)
    words = random.choices(vocab, k=n)

    # Add some noise and punctuation variability
    if random.random() < 0.5:
        words[0] = words[0].capitalize()
    if random.random() < 0.2:
        words.append("".join(random.choices(string.ascii_lowercase, k=5)))
    punct = random.choice([".", "?", "!", "...", ""])
    return " ".join(words) + punct


@pytest.mark.timeout(1000)
def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
    """
    Ensures that the same request (the 'needle' prompt) yields identical output
    whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64),
    using the high-level v1 LLM() API only (no manual batching).

    Strategy:
    - Create two LLM engines with identical config except max_num_seqs: 1 vs N.
    - Compute a baseline output for the needle prompt with the bs=1 engine.
    - For many trials, generate a batch (size N) where the needle appears at a
      random position among random filler prompts using the bs=N engine.
    - Track how many trials match vs mismatch, and report totals at the end.
      The test fails if any mismatches occur, but we still dump pass/fail
      counts.

    Notes:
    - Use seeded stochastic sampling with a fixed seed to test determinism.
    - Outputs are intentionally longer and sampled at higher temperature/top_p
      to produce a more random-sounding phrase, yet remain deterministic by
      seed.
    - Keep max_tokens and max_model_len bounded for speed and memory use.
    """
    random.seed(12345)

    # Allow overrides from environment (useful for CI tuning)
    # "facebook/opt-125m" is too small, doesn't reliably test determinism
    model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
    num_trials = int(os.getenv("VLLM_NEEDLE_TRIALS", "5"))
    batch_size = int(os.getenv("VLLM_NEEDLE_BATCH_SIZE", "64"))
    assert batch_size >= 2, "Batch size should be >= 2 to mix needle."

    # Keep GPU memory usage low to avoid startup allocation failures.
    gpu_mem_util = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.3"))
    max_model_len = int(os.getenv("VLLM_MAX_MODEL_LEN", "4096"))
    swap_space_gb = int(os.getenv("VLLM_SWAP_SPACE_GB", "4"))

    # Sampling parameters: longer outputs with a more random-sounding
    # continuation,but still deterministic due to fixed seed.
    temperature = float(os.getenv("VLLM_NEEDLE_TEMPERATURE", "0.0"))
    top_p = float(os.getenv("VLLM_NEEDLE_TOP_P", "0.95"))
    max_tokens = int(os.getenv("VLLM_NEEDLE_MAX_TOKENS", "128"))

    sampling = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        seed=20240919,
    )

    needle_prompt = ("There once was a ")

    llm_bs1 = None
    llm_bsN = None
    try:
        # Engine with bs=1 behavior
        llm_bs1 = LLM_with_max_seqs(
            model=model,
            max_num_seqs=1,
            gpu_memory_utilization=gpu_mem_util,
            max_model_len=max_model_len,
            swap_space=swap_space_gb,
        )

        # Baseline generation for the needle prompt alone.
        baseline_out = llm_bs1.generate([needle_prompt], sampling)
        assert len(baseline_out) == 1
        assert len(baseline_out[0].outputs) >= 1
        baseline_text = baseline_out[0].outputs[0].text

        # Engine with larger batch limit (e.g., 64)
        llm_bsN = LLM_with_max_seqs(
            model=model,
            max_num_seqs=batch_size,
            gpu_memory_utilization=gpu_mem_util,
            max_model_len=max_model_len,
            swap_space=swap_space_gb,
        )

        mismatches = 0

        for trial in range(num_trials):
            # Create a batch of size `batch_size` and insert the needle at
            # a random index
            prompts: list[str] = []
            needle_pos = random.randint(0, batch_size - 1)
            for i in range(batch_size):
                if i == needle_pos:
                    prompts.append(needle_prompt)
                else:
                    prompts.append(_random_prompt())

            # Generate with the larger-batch engine
            outputs = llm_bsN.generate(prompts, sampling)
            # Find the needle output by position
            needle_output = outputs[needle_pos]
            assert needle_output.prompt == needle_prompt
            assert len(needle_output.outputs) >= 1
            text = needle_output.outputs[0].text

            if text != baseline_text:
                mismatches += 1

        passes = num_trials - mismatches
        # Dump how many passed vs failed
        print(f"[determinism] total={num_trials}, passed={passes}, "
              f"failed={mismatches}, batch_size={batch_size}")

        if mismatches > 0:
            pytest.fail(
                f"Nondeterministic outputs detected: {mismatches} failed out "
                f"of {num_trials} trials (batch_size={batch_size}).")

    finally:
        # Ensure engines are shutdown to free GPU/VRAM across test sessions
        if llm_bs1 is not None:
            with contextlib.suppress(Exception):
                llm_bs1.shutdown()
        if llm_bsN is not None:
            with contextlib.suppress(Exception):
                llm_bsN.shutdown()


def _extract_step_logprobs(request_output):
    if getattr(request_output, "outputs", None):
        inner = request_output.outputs[0]
        if hasattr(inner, "logprobs") and inner.logprobs is not None:
            t = torch.tensor(
                [
                    inner.logprobs[i][tid].logprob
                    for i, tid in enumerate(inner.token_ids)
                ],
                dtype=torch.float32,
            )
            return t

    return None


@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="Requires CUDA to match production inference path.",
)
def test_logprobs_bitwise_batch_invariance_bs1_vs_bs2():

    #model_name = os.getenv("VLLM_TEST_MODEL", "facebook/opt-125m")
    model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
    tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1"))

    # Force float32 to avoid precision-induced differences.
    llm = LLM(
        model=model_name,
        tensor_parallel_size=tp_size,
        enforce_eager=True,  # helps reduce nondeterminism from some backends
    )

    prompts = [
        "The capital of France is",
        "The capital of Germany is",
    ]

    sp = SamplingParams(
        temperature=0.0,
        top_p=1.0,
        max_tokens=8,
        # Seed shouldn't matter at temperature=0, but keeping it stable anyway.
        seed=1234,
        logprobs=5,
    )

    # BS=1: run prompts individually and collect logprobs per step.
    bs1_logprobs_per_prompt = []
    for p in prompts:
        outs = llm.generate([p], sp, use_tqdm=False)
        assert len(outs) == 1
        step_logprobs = _extract_step_logprobs(outs[0])
        if step_logprobs is None:
            pytest.skip("Logits are not available on RequestOutput; "
                        "enable logprobs return to run this test.")
        bs1_logprobs_per_prompt.append(step_logprobs)

    # BS=2: run prompts in a batch and collect logprobs per step for each
    # prompt.
    outs_batched = llm.generate(prompts, sp, use_tqdm=False)
    assert len(outs_batched) == len(prompts)
    bs2_logprobs_per_prompt = []
    for o in outs_batched:
        step_logprobs = _extract_step_logprobs(o)
        if step_logprobs is None:
            pytest.skip("Logits are not available on RequestOutput; "
                        "enable logprobs return to run this test.")
        bs2_logprobs_per_prompt.append(step_logprobs)

    # Compare step-by-step logprobs for each prompt between BS=1 and BS=2 runs.
    for i, (logprobs_bs1, logprobs_bs2) in enumerate(
            zip(bs1_logprobs_per_prompt, bs2_logprobs_per_prompt)):
        assert len(logprobs_bs1) == len(logprobs_bs2), (
            f"Different number of generation steps for prompt index {i}: "
            f"{len(logprobs_bs1)} (BS=1) vs {len(logprobs_bs2)} (BS=2)")
        for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bs2)):
            assert a.shape == b.shape, (
                f"Logits shape mismatch at prompt {i}, step {t}: "
                f"{a.shape} vs {b.shape}")
            # Bitwise exact equality.
            assert torch.equal(
                a, b), (f"Bitwise logprobs mismatch at prompt {i}, step {t} "
                        f"(dtype={a.dtype}, shape={a.shape}).")


def LLM_with_max_seqs(
    model: str,
    max_num_seqs: int,
    gpu_memory_utilization: float,
    max_model_len: int,
    swap_space: int,
) -> LLM:
    """
    Helper to construct an LLM with a specific max_num_seqs (batch-size limit)
    using the high-level v1 LLM API, while constraining memory usage.
    """
    return LLM(
        model=model,
        max_num_seqs=max_num_seqs,
        # Constrain GPU memory pool so test can run even on busy GPUs.
        gpu_memory_utilization=gpu_memory_utilization,
        # Keep KV cache footprint small while allowing longer outputs.
        max_model_len=max_model_len,
        # Allow some CPU offload if needed.
        swap_space=swap_space,
        # Keep things lean and CI-friendly.
        dtype="float16",
        # Single-GPU by default; override externally if desired.
        tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
        trust_remote_code=os.getenv("VLLM_TRUST_REMOTE_CODE", "0") == "1",
    )
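Editor's note: the new determinism test is tuned entirely through environment variables, which makes it easy to shrink for local runs. A sketch of a CI-friendly invocation (the variable names come from the test; the values here are only illustrative):

    # Sketch: run the needle determinism test with smaller knobs.
    import os
    import pytest

    os.environ.setdefault("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
    os.environ.setdefault("VLLM_NEEDLE_TRIALS", "2")       # fewer trials
    os.environ.setdefault("VLLM_NEEDLE_BATCH_SIZE", "8")   # smaller mixed batch
    os.environ.setdefault("VLLM_GPU_MEMORY_UTILIZATION", "0.3")

    # Equivalent to: pytest -v -s tests/v1/generation/test_batch_invariance.py
    pytest.main(["-v", "-s", "tests/v1/generation/test_batch_invariance.py"])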
new file: tests/v1/spec_decode/test_mtp.py (195 lines)
@@ -0,0 +1,195 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from unittest import mock

import pytest
import torch

from tests.v1.attention.utils import (BatchSpec, _Backend,
                                      create_common_attn_metadata,
                                      create_standard_kv_cache_spec,
                                      get_attention_backend)
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
                         VllmConfig)
from vllm.config.load import LoadConfig
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.platforms import current_platform
from vllm.v1.spec_decode.eagle import EagleProposer

mimo_7b_dir = "XiaomiMiMo/MiMo-7B-Base"


def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
    """Create an MTP proposer with unified model configuration."""
    model_config = ModelConfig(model=mimo_7b_dir,
                               runner="generate",
                               max_model_len=100,
                               trust_remote_code=True)

    speculative_config = SpeculativeConfig(
        target_model_config=model_config,
        target_parallel_config=ParallelConfig(),
        model=mimo_7b_dir,
        method="mtp",
        num_speculative_tokens=num_speculative_tokens,
    )

    vllm_config = VllmConfig(
        model_config=model_config,
        cache_config=CacheConfig(),
        speculative_config=speculative_config,
        device_config=DeviceConfig(device=current_platform.device_type),
        parallel_config=ParallelConfig(),
        load_config=LoadConfig(),
        scheduler_config=SchedulerConfig())

    return EagleProposer(vllm_config=vllm_config,
                         device=current_platform.device_type)


@mock.patch('vllm.v1.spec_decode.eagle.get_pp_group')
@mock.patch('vllm.v1.spec_decode.eagle.get_layers_from_vllm_config')
@mock.patch('vllm.v1.spec_decode.eagle.get_model')
def test_mtp_load_model_unified(mock_get_model, mock_get_layers,
                                mock_get_pp_group):
    """Test MTP-specific model loading with unified model approach."""

    # Setup mocks
    mock_model = mock.MagicMock()
    mock_model.model.embed_tokens.weight.shape = (131072, 4096)
    mock_get_model.return_value = mock_model

    target_attn_layers = {"target_attn_1": mock.MagicMock()}
    all_attn_layers = {**target_attn_layers, "draft_attn_1": mock.MagicMock()}
    mock_get_layers.side_effect = [target_attn_layers, all_attn_layers]

    mock_pp_group = mock.MagicMock()
    mock_pp_group.world_size = 1
    mock_get_pp_group.return_value = mock_pp_group

    # Create target model
    class _TargetModelStub(LlamaForCausalLM):
        model: mock.MagicMock
        lm_head: mock.MagicMock

    target_model = mock.create_autospec(_TargetModelStub, instance=True)
    target_model.model = mock.MagicMock()
    target_model.model.embed_tokens.weight.shape = (131072, 4096)
    target_model.lm_head = mock.MagicMock()

    # Create MTP proposer
    proposer = _create_mtp_proposer(num_speculative_tokens=4)
    proposer.load_model(target_model)

    # Verify MTP-specific behavior:
    # Model is loaded
    mock_get_model.assert_called_once()
    # MTP shares lm_head with target model
    assert proposer.model.lm_head == target_model.lm_head
    # MTP shares embed_tokens with target model
    assert proposer.model.model.embed_tokens == target_model.model.embed_tokens


@pytest.mark.parametrize("num_speculative_tokens", [1])
def test_mtp_propose(num_speculative_tokens, monkeypatch):
    """Test that MTP's forward method returns hidden states directly"""

    device = torch.device(current_platform.device_type)
    batch_size = 2
    seq_lens = [5, 3]
    total_tokens = sum(seq_lens)
    vocab_size = 100

    proposer = _create_mtp_proposer(num_speculative_tokens)
    hidden_size = proposer.hidden_size

    # Mock the MTP model to verify it returns hidden states directly
    model_mock = mock.MagicMock()

    # MTP returns hidden states directly
    if num_speculative_tokens == 1:
        model_mock.return_value = torch.zeros(total_tokens,
                                              hidden_size,
                                              device=device)
    else:
        # Multiple forward passes for multi-token speculation
        forward_returns = []
        for i in range(num_speculative_tokens):
            if i == 0:
                h_states = torch.zeros(total_tokens,
                                       hidden_size,
                                       device=device)
            else:
                h_states = torch.zeros(batch_size, hidden_size, device=device)
            forward_returns.append(h_states)
        model_mock.side_effect = forward_returns

    # Mock compute_logits
    def create_deterministic_logits(batch_size, vocab_size, token_offset):
        logits = torch.full((batch_size, vocab_size), -100.0, device=device)
        logits[:, token_offset] = 100.0
        return logits

    if num_speculative_tokens == 1:
        model_mock.compute_logits.return_value = create_deterministic_logits(
            batch_size, vocab_size, 42)
    else:
        logits_returns = [
            create_deterministic_logits(batch_size, vocab_size, 42 + i)
            for i in range(num_speculative_tokens)
        ]
        model_mock.compute_logits.side_effect = logits_returns

    proposer.model = model_mock
    proposer.attn_layer_names = ["layer.0"]

    # Prepare inputs
    batch_spec = BatchSpec(seq_lens=seq_lens, query_lens=seq_lens)
    common_attn_metadata = create_common_attn_metadata(batch_spec,
                                                       block_size=16,
                                                       device=device)

    target_token_ids = torch.randint(0,
                                     vocab_size, (total_tokens, ),
                                     device=device)
    target_positions = torch.cat([
        torch.arange(seq_lens[0], device=device),
        torch.arange(seq_lens[1], device=device)
    ])
    target_hidden_states = torch.randn(total_tokens,
                                       hidden_size,
                                       device=device)
    next_token_ids = torch.randint(0,
                                   vocab_size, (batch_size, ),
                                   dtype=torch.int32,
                                   device=device)
    sampling_metadata = mock.MagicMock()

    # Setup attention metadata
    attn_metadata_builder_cls, _ = get_attention_backend(_Backend.FLASH_ATTN)

    attn_metadata_builder = attn_metadata_builder_cls(
        kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config),
        layer_names=proposer.attn_layer_names,
        vllm_config=proposer.vllm_config,
        device=device,
    )

    proposer.runner = mock.MagicMock()
    proposer.attn_metadata_builder = attn_metadata_builder

    # Run propose
    result = proposer.propose(target_token_ids=target_token_ids,
                              target_positions=target_positions,
                              target_hidden_states=target_hidden_states,
                              next_token_ids=next_token_ids,
                              last_token_indices=None,
                              common_attn_metadata=common_attn_metadata,
                              sampling_metadata=sampling_metadata)

    # Verify the model was called correctly
    assert model_mock.called
    # Verify output shape
    assert result.shape == (batch_size, num_speculative_tokens)
@@ -1,71 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest
 import regex as re
 import requests
-import torch

 from tests.utils import RemoteOpenAIServer
-from vllm.v1.worker.utils import bind_kv_cache
-
-
-def test_bind_kv_cache():
-    from vllm.attention import Attention
-
-    ctx = {
-        'layers.0.self_attn': Attention(32, 128, 0.1),
-        'layers.1.self_attn': Attention(32, 128, 0.1),
-        'layers.2.self_attn': Attention(32, 128, 0.1),
-        'layers.3.self_attn': Attention(32, 128, 0.1),
-    }
-    kv_cache = {
-        'layers.0.self_attn': torch.zeros((1, )),
-        'layers.1.self_attn': torch.zeros((1, )),
-        'layers.2.self_attn': torch.zeros((1, )),
-        'layers.3.self_attn': torch.zeros((1, )),
-    }
-    runner_kv_caches: list[torch.Tensor] = []
-    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
-    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[
-        'layers.0.self_attn']
-    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[
-        'layers.1.self_attn']
-    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[
-        'layers.2.self_attn']
-    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[
-        'layers.3.self_attn']
-
-    assert runner_kv_caches[0] is kv_cache['layers.0.self_attn']
-    assert runner_kv_caches[1] is kv_cache['layers.1.self_attn']
-    assert runner_kv_caches[2] is kv_cache['layers.2.self_attn']
-    assert runner_kv_caches[3] is kv_cache['layers.3.self_attn']
-
-
-def test_bind_kv_cache_non_attention():
-    from vllm.attention import Attention
-
-    # example from Jamba PP=2
-    ctx = {
-        'model.layers.20.attn': Attention(32, 128, 0.1),
-        'model.layers.28.attn': Attention(32, 128, 0.1),
-    }
-    kv_cache = {
-        'model.layers.20.attn': torch.zeros((1, )),
-        'model.layers.28.attn': torch.zeros((1, )),
-    }
-
-    runner_kv_caches: list[torch.Tensor] = []
-    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
-
-    assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[
-        'model.layers.20.attn']
-    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[
-        'model.layers.28.attn']
-
-    assert runner_kv_caches[0] is kv_cache['model.layers.20.attn']
-    assert runner_kv_caches[1] is kv_cache['model.layers.28.attn']
-
-
 # Prometheus metrics utilities for testing
new file: tests/v1/worker/test_utils.py (63 lines)
@@ -0,0 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import torch

from vllm.v1.worker.utils import bind_kv_cache


def test_bind_kv_cache():
    from vllm.attention import Attention

    ctx = {
        'layers.0.self_attn': Attention(32, 128, 0.1),
        'layers.1.self_attn': Attention(32, 128, 0.1),
        'layers.2.self_attn': Attention(32, 128, 0.1),
        'layers.3.self_attn': Attention(32, 128, 0.1),
    }
    kv_cache = {
        'layers.0.self_attn': torch.zeros((1, )),
        'layers.1.self_attn': torch.zeros((1, )),
        'layers.2.self_attn': torch.zeros((1, )),
        'layers.3.self_attn': torch.zeros((1, )),
    }
    runner_kv_caches: list[torch.Tensor] = []
    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[
        'layers.0.self_attn']
    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[
        'layers.1.self_attn']
    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[
        'layers.2.self_attn']
    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[
        'layers.3.self_attn']

    assert runner_kv_caches[0] is kv_cache['layers.0.self_attn']
    assert runner_kv_caches[1] is kv_cache['layers.1.self_attn']
    assert runner_kv_caches[2] is kv_cache['layers.2.self_attn']
    assert runner_kv_caches[3] is kv_cache['layers.3.self_attn']


def test_bind_kv_cache_non_attention():
    from vllm.attention import Attention

    # example from Jamba PP=2
    ctx = {
        'model.layers.20.attn': Attention(32, 128, 0.1),
        'model.layers.28.attn': Attention(32, 128, 0.1),
    }
    kv_cache = {
        'model.layers.20.attn': torch.zeros((1, )),
        'model.layers.28.attn': torch.zeros((1, )),
    }

    runner_kv_caches: list[torch.Tensor] = []
    bind_kv_cache(kv_cache, ctx, runner_kv_caches)

    assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[
        'model.layers.20.attn']
    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[
        'model.layers.28.attn']

    assert runner_kv_caches[0] is kv_cache['model.layers.20.attn']
    assert runner_kv_caches[1] is kv_cache['model.layers.28.attn']
new file: tools/flashinfer-build.sh (63 lines)
@@ -0,0 +1,63 @@
#!/usr/bin/env bash
# This script is used to build FlashInfer wheels with AOT kernels

set -ex

# FlashInfer configuration
FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
FLASHINFER_GIT_REF="${FLASHINFER_GIT_REF}"
CUDA_VERSION="${CUDA_VERSION}"
BUILD_WHEEL="${BUILD_WHEEL:-true}"

if [[ -z "${FLASHINFER_GIT_REF}" ]]; then
    echo "❌ FLASHINFER_GIT_REF must be specified" >&2
    exit 1
fi

if [[ -z "${CUDA_VERSION}" ]]; then
    echo "❌ CUDA_VERSION must be specified" >&2
    exit 1
fi

echo "🏗️ Building FlashInfer ${FLASHINFER_GIT_REF} for CUDA ${CUDA_VERSION}"

# Clone FlashInfer
git clone --depth 1 --recursive --shallow-submodules \
    --branch ${FLASHINFER_GIT_REF} \
    ${FLASHINFER_GIT_REPO} flashinfer

# Set CUDA arch list based on CUDA version
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
if [[ "${CUDA_VERSION}" == 11.* ]]; then
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
else
    # CUDA 12.8+ supports 10.0a and 12.0
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
fi

echo "🏗️ Building FlashInfer AOT for arches: ${FI_TORCH_CUDA_ARCH_LIST}"

pushd flashinfer
    # Make sure the wheel is built for the correct CUDA version
    export UV_TORCH_BACKEND=cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

    # Build AOT kernels
    export TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
    export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
    python3 -m flashinfer.aot

    if [[ "${BUILD_WHEEL}" == "true" ]]; then
        # Build wheel for distribution
        uv build --no-build-isolation --wheel --out-dir ../flashinfer-dist .
        echo "✅ FlashInfer wheel built successfully in flashinfer-dist/"
    else
        # Install directly (for Dockerfile)
        uv pip install --system --no-build-isolation --force-reinstall .
        echo "✅ FlashInfer installed successfully"
    fi
popd

# Cleanup
rm -rf flashinfer
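Editor's note: the script above is driven entirely by environment variables (FLASHINFER_GIT_REF and CUDA_VERSION are required, BUILD_WHEEL defaults to true). A hedged sketch of invoking it from a Python release helper; the git ref below is a placeholder, not a pinned version:

    # Sketch: drive tools/flashinfer-build.sh from Python.
    import os
    import subprocess

    env = dict(os.environ,
               FLASHINFER_GIT_REF="v0.3.1",   # placeholder tag; pick the real one
               CUDA_VERSION="12.9.1",
               BUILD_WHEEL="true")            # "false" installs in-place instead
    subprocess.run(["bash", "tools/flashinfer-build.sh"], env=env, check=True)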
@@ -1450,6 +1450,13 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
     ):
         dataset_class = MLPerfDataset
         args.hf_split = "train"
+    elif (
+        args.dataset_path in MMStarDataset.SUPPORTED_DATASET_PATHS
+        or args.hf_name in MMStarDataset.SUPPORTED_DATASET_PATHS
+    ):
+        dataset_class = MMStarDataset
+        args.hf_split = "val"
+        args.hf_subset = None
     else:
         supported_datasets = set([
             dataset_name for cls in HuggingFaceDataset.__subclasses__()
@@ -2721,3 +2728,76 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset):

         random.shuffle(requests)
         return requests
+
+
+# -----------------------------------------------------------------------------
+# MMStar Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class MMStarDataset(HuggingFaceDataset):
+    """
+    Lin-Chen/MMStar: https://huggingface.co/datasets/Lin-Chen/MMStar
+    refer to: https://github.com/sgl-project/SpecForge/pull/106
+    """
+    DEFAULT_OUTPUT_LEN = 128
+    SUPPORTED_DATASET_PATHS = {"Lin-Chen/MMStar"}
+    IS_MULTIMODAL = True
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        request_id_prefix: str = "",
+        no_oversample: bool = False,
+        **kwargs,
+    ) -> list[SampleRequest]:
+        # If --hf-output-len is not set, use the default output length.
+        output_len = (output_len
+                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        sampled_requests: list[SampleRequest] = []
+
+        for ind, item in enumerate(self.data):
+            if len(sampled_requests) >= num_requests:
+                break
+            # Split the question text from options
+            # (keep only the part before "Options:").
+            full_q: str = item.get("question", "")
+            question_text = full_q.split("Options:", 1)[0].strip()
+
+            # Multimodal image content.
+            mm_content = process_image(item["image"])
+
+            # Compute prompt token length (note: this is plain text length
+            # if enable_multimodal_chat is False).
+            prompt_len = len(tokenizer(question_text).input_ids)
+
+            if enable_multimodal_chat:
+                # If multimodal content should be embedded in the chat message,
+                # convert to [{"role":"user","content":[...]}]
+                prompt = self.apply_multimodal_chat_transformation(
+                    question_text, mm_content
+                )
+                mm_for_request = None  # Already embedded in chat content.
+            else:
+                # Default: prompt is plain text,
+                # image is in mm_content for the bench to assemble.
+                prompt = question_text
+                mm_for_request = mm_content
+
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                    multi_modal_data=mm_for_request,
+                    request_id=request_id_prefix + str(ind),
+                )
+            )
+
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
+        return sampled_requests
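Editor's note: the only MMStar-specific text handling is trimming the multiple-choice options off the question and falling back to the default output length. A tiny self-contained sketch of that per-item transformation (the raw_item dict is a stand-in for one dataset row; image handling goes through process_image() exactly as in the class):

    # Sketch of the question/output_len handling done by MMStarDataset.sample.
    raw_item = {
        "question": "What is shown in the image? Options: (A) a cat (B) a dog",
    }
    question_text = raw_item["question"].split("Options:", 1)[0].strip()
    assert question_text == "What is shown in the image?"

    DEFAULT_OUTPUT_LEN = 128
    output_len = None                      # i.e. --hf-output-len not given
    output_len = output_len if output_len is not None else DEFAULT_OUTPUT_LEN
    assert output_len == 128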
@@ -340,15 +340,15 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
                 num_graphs=len(self.compile_submod_names),
                 runtime_shape=None)
             # Lazy import here to avoid circular import
-            from .cuda_piecewise_backend import PiecewiseBackend
+            from .piecewise_backend import PiecewiseBackend

             piecewise_backend = PiecewiseBackend(
                 submod, self.vllm_config, index,
                 len(self.compile_submod_names), sym_shape_indices,
                 compiled_graph_for_dynamic_shape, self.vllm_backend)

-            if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-                    and
+            if (self.compilation_config.cudagraph_mode.\
+                    has_piecewise_cudagraphs() and
                     not self.compilation_config.use_inductor_graph_partition):
                 # We're using Dynamo-based piecewise splitting, so we wrap
                 # the whole subgraph with a static graph wrapper.

@@ -336,7 +336,7 @@ def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):
     from vllm.config import CUDAGraphMode

     compilation_config = vllm_config.compilation_config
-    if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+    if (compilation_config.cudagraph_mode.has_piecewise_cudagraphs()
             and compilation_config.use_inductor_graph_partition):
         from torch._inductor.utils import CUDAGraphWrapperMetadata

@@ -365,7 +365,7 @@ def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):

     yield

-    if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+    if (compilation_config.cudagraph_mode.has_piecewise_cudagraphs()
             and compilation_config.use_inductor_graph_partition):
         torch._inductor.utils.set_customized_partition_wrappers(None)

@@ -459,15 +459,22 @@ class VllmConfig:
                 "to True to enable.")
         current_platform.check_and_update_config(self)

-        # final check of cudagraph mode after platform-specific update
+        # Do this after all the updates to compilation_config.level
+        if envs.VLLM_USE_V1 and \
+            self.compilation_config.level == CompilationLevel.PIECEWISE:
+            self.compilation_config.set_splitting_ops_for_v1()
+
+        # final check of cudagraph mode after all possible updates
         if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
-            if self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL \
+            if self.compilation_config.cudagraph_mode.has_full_cudagraphs()\
                 and self.model_config is not None and \
-                not self.model_config.disable_cascade_attn:
-                logger.info("CUDAGraphMode.FULL is not supported with "
-                            "cascade attention currently. Disabling cascade"
-                            "attention.")
-                self.model_config.disable_cascade_attn = True
+                not self.model_config.disable_cascade_attn and\
+                not self.compilation_config.cudagraph_mode.\
+                has_piecewise_cudagraphs():
+                logger.warning_once(
+                    "No piecewise cudagraph for executing cascade attention."
+                    " Will fall back to eager execution if a batch runs "
+                    "into cascade attentions")

             if self.compilation_config.cudagraph_mode\
                     .requires_piecewise_compilation():

@@ -477,6 +484,12 @@ class VllmConfig:
                     "when cudagraph_mode piecewise cudagraphs is used, "\
                     f"cudagraph_mode={self.compilation_config.cudagraph_mode}"

+            # final migrate the deprecated flags
+            self.compilation_config.use_cudagraph = self.compilation_config.\
+                cudagraph_mode!= CUDAGraphMode.NONE
+            self.compilation_config.full_cuda_graph = self.compilation_config.\
+                cudagraph_mode.has_full_cudagraphs()
+
         if self.parallel_config.enable_dbo:
             a2a_backend = envs.VLLM_ALL2ALL_BACKEND
             assert a2a_backend in \

@@ -487,14 +500,14 @@ class VllmConfig:
                 "variable to deepep_low_latency or deepep_high_throughput and "\
                 "install the DeepEP kernels."

+            if not self.model_config.disable_cascade_attn:
+                self.model_config.disable_cascade_attn = True
+                logger.warning_once(
+                    "Disabling cascade attention when DBO is enabled.")
+
         if not self.instance_id:
             self.instance_id = random_uuid()[:5]

-        # Do this after all the updates to compilation_config.level
-        if envs.VLLM_USE_V1 and \
-            self.compilation_config.level == CompilationLevel.PIECEWISE:
-            self.compilation_config.set_splitting_ops_for_v1()
-
         if (envs.VLLM_USE_V1
                 and not self.scheduler_config.disable_hybrid_kv_cache_manager):
             # logger should only print warning message for hybrid models. As we
@@ -61,9 +61,17 @@ class CUDAGraphMode(enum.Enum):
     def has_full_cudagraphs(self) -> bool:
         return self.max_cudagraph_mode() == CUDAGraphMode.FULL

+    def has_piecewise_cudagraphs(self) -> bool:
+        return self.requires_piecewise_compilation()
+
     def separate_routine(self) -> bool:
         return isinstance(self.value, tuple)

+    def valid_runtime_modes(self) -> bool:
+        return self in [
+            CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL
+        ]
+

 @config
 @dataclass
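Editor's note: a rough behavioral sketch (an assumption distilled from the diff, not the real enum implementation) of which modes the new helpers select, useful as a mental model when reading the config checks above and below:

    # Assumed mapping of cudagraph modes to the new helper predicates.
    PIECEWISE_MODES = {"PIECEWISE", "FULL_AND_PIECEWISE"}
    FULL_MODES = {"FULL", "FULL_DECODE_ONLY", "FULL_AND_PIECEWISE"}
    RUNTIME_MODES = {"NONE", "PIECEWISE", "FULL"}   # valid_runtime_modes()

    def has_piecewise_cudagraphs(mode: str) -> bool:
        return mode in PIECEWISE_MODES

    def has_full_cudagraphs(mode: str) -> bool:
        return mode in FULL_MODES

    # The cascade-attention warning above fires for FULL / FULL_DECODE_ONLY,
    # i.e. full cudagraphs without a piecewise fallback.
    assert has_full_cudagraphs("FULL") and not has_piecewise_cudagraphs("FULL")
    assert has_piecewise_cudagraphs("FULL_AND_PIECEWISE")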
@@ -269,7 +277,8 @@ class CompilationConfig:
    Note that this is orthogonal to the cudagraph capture logic
    outside of compilation.
    Warning: This flag is deprecated and will be removed in the next major or
-    minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead.
+    minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=PIECEWISE
+    instead.
    """
    cudagraph_num_of_warmups: int = 0
    """Number of warmup runs for cudagraph.

@@ -294,7 +303,8 @@ class CompilationConfig:
    flag cannot be used together with splitting_ops. This may provide
    performance benefits for smaller models.
    Warning: This flag is deprecated and will be removed in the next major or
-    minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead.
+    minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=
+    FULL_AND_PIECEWISE instead.
    """

    use_inductor_graph_partition: bool = False

@@ -464,7 +474,8 @@ class CompilationConfig:
        if not self.use_cudagraph:
            logger.warning("use_cudagraph is deprecated, use "
                           "cudagraph_mode=NONE instead.")
-            if self.cudagraph_mode is not None:
+            if self.cudagraph_mode is not None and \
+                self.cudagraph_mode != CUDAGraphMode.NONE:
                raise ValueError(
                    "use_cudagraph and cudagraph_mode are mutually"
                    " exclusive, prefer cudagraph_mode since "

@@ -473,7 +484,8 @@ class CompilationConfig:
        if self.full_cuda_graph:
            logger.warning("full_cuda_graph is deprecated, use "
                           "cudagraph_mode=FULL instead.")
-            if self.cudagraph_mode is not None:
+            if self.cudagraph_mode is not None and \
+                not self.cudagraph_mode.has_full_cudagraphs():
                raise ValueError("full_cuda_graph and cudagraph_mode are "
                                 "mutually exclusive, prefer cudagraph_mode "
                                 "since full_cuda_graph is deprecated.")
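Editor's note: the two hunks above relax the deprecation checks so that the legacy boolean flags only conflict with a *contradictory* cudagraph_mode, not with an equivalent one. A simplified sketch of the new use_cudagraph rule (stand-alone, outside the dataclass; the mode is represented as a string here):

    # Sketch of the relaxed use_cudagraph deprecation check.
    def check_use_cudagraph(use_cudagraph: bool, cudagraph_mode):
        if not use_cudagraph and cudagraph_mode is not None \
                and cudagraph_mode != "NONE":
            raise ValueError("use_cudagraph and cudagraph_mode are mutually "
                             "exclusive, prefer cudagraph_mode")

    check_use_cudagraph(False, "NONE")   # equivalent settings: now accepted
    check_use_cudagraph(False, None)     # unset mode: accepted as before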
@@ -570,48 +582,75 @@ class CompilationConfig:
                 "set_splitting_ops_for_v1 should only be called when "
                 "level is CompilationLevel.PIECEWISE")

+        if self.use_inductor_graph_partition:
+            self.set_splitting_ops_for_inductor_graph_partition()
+            return
+
+        if self.pass_config.enable_attn_fusion:
+            # here use_inductor_graph_partition is False
+            self.set_splitting_ops_for_attn_fusion()
+            return
+
+        if self.splitting_ops is None:
+            # NOTE: When using full cudagraph, instead of setting an empty
+            # list and capture the full cudagraph inside the flattened fx
+            # graph, we keep the piecewise fx graph structure but capture
+            # the full cudagraph outside the fx graph. This reduces some
+            # cpu overhead when the runtime batch_size is not cudagraph
+            # captured. see https://github.com/vllm-project/vllm/pull/20059
+            # for details. Make a copy to avoid mutating the class-level
+            # list via reference.
+            self.splitting_ops = list(self._attention_ops)
+        elif len(self.splitting_ops) == 0:
+            logger.warning_once(
+                "Using piecewise compilation with empty splitting_ops")
+            if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
+                logger.warning_once(
+                    "Piecewise compilation with empty splitting_ops do not" \
+                    "contains piecewise cudagraph. Setting cudagraph_"
+                    "mode to NONE. Hint: If you are using attention backends "
+                    "that support cudagraph, consider manually setting "
+                    "cudagraph_mode to FULL or FULL_DECODE_ONLY to enable "
+                    "full cudagraphs.")
+                self.cudagraph_mode = CUDAGraphMode.NONE
+            elif self.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
+                logger.warning_once(
+                    "Piecewise compilation with empty splitting_ops do not "
+                    "contains piecewise cudagraph. Setting cudagraph_mode "
+                    "to FULL.")
+                self.cudagraph_mode = CUDAGraphMode.FULL
+            self.splitting_ops = []
+
+    def set_splitting_ops_for_inductor_graph_partition(self):
+        assert self.use_inductor_graph_partition
         use_inductor_graph_partition_msg = (
             "When use_inductor_graph_partition=True, splitting_ops "
             "are ignored and set to an empty list. Instead, "
             "\"tags=(torch._C.Tag.cudagraph_unsafe, ),\" is "
             "used to annotate custom ops for graph partition.")
-        if self.splitting_ops is None:
-            if self.use_inductor_graph_partition:
-                # When using inductor graph partition, we set splitting_ops
-                # to be empty and rely on torch._C.Tag.cudagraph_unsafe to
-                # annotate custom ops as splitting ops.
-                logger.warning_once(use_inductor_graph_partition_msg)
-                self.splitting_ops = []
-            else:
-                # NOTE: When using full cudagraph, instead of setting an empty
-                # list and capture the full cudagraph inside the flattened fx
-                # graph, we keep the piecewise fx graph structure but capture
-                # the full cudagraph outside the fx graph. This reduces some
-                # cpu overhead when the runtime batch_size is not cudagraph
-                # captured. see https://github.com/vllm-project/vllm/pull/20059
-                # for details. make a copy to avoid mutating the class-level
-                # list via reference.
-                self.splitting_ops = list(self._attention_ops)
-        elif len(self.splitting_ops) == 0:
-            logger.warning_once(
-                "Using piecewise compilation with empty "
-                "splitting_ops and use_inductor_graph_partition"
-                f"={self.use_inductor_graph_partition}.")
-            if (self.cudagraph_mode == CUDAGraphMode.PIECEWISE
-                    and not self.use_inductor_graph_partition):
-                logger.warning_once(
-                    "When compilation level is piecewise with empty "
-                    "splitting_ops, PIECEWISE cudagraph_mode will be "
-                    "treated as FULL cudagraph_mode. Please ensure you are "
-                    "using attention backends that support cudagraph or set "
-                    "cudagraph_mode to NONE explicitly if encountering "
-                    "any problems.")
-                self.cudagraph_mode = CUDAGraphMode.FULL
-            self.splitting_ops = []
-        elif self.use_inductor_graph_partition:
-            logger.warning_once(use_inductor_graph_partition_msg)
-            self.splitting_ops = []
+        if self.splitting_ops is not None and \
+            len(self.splitting_ops) > 0:
+            logger.warning_once(use_inductor_graph_partition_msg)
+        self.splitting_ops = []
+
+    def set_splitting_ops_for_attn_fusion(self):
+        assert self.pass_config.enable_attn_fusion
+        if self.splitting_ops is None:
+            self.splitting_ops = []
+            if self.cudagraph_mode.has_piecewise_cudagraphs():
+                logger.warning_once(
+                    "enable_attn_fusion is incompatible with piecewise "
+                    "cudagraph when use_inductor_graph_partition is off."
+                    "In this case, splitting_ops will be set to empty "
+                    "list, and cudagraph_mode will be set to FULL. "
+                    "Please ensure you are using attention backends that "
+                    "support cudagraph or set cudagraph_mode to NONE "
+                    "explicitly if encountering any problems.")
+                self.cudagraph_mode = CUDAGraphMode.FULL
+
+        assert not self.splitting_ops_contain_attention(), (
+            "attention ops should not be in splitting_ops "
+            "when enable_attn_fusion is True")

     def splitting_ops_contain_attention(self) -> bool:
         return self.splitting_ops is not None and all(
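Editor's note: the restructuring above replaces one large branch tree with an early-return dispatch. A simplified sketch of the new decision order (an assumption distilled from the diff: class and attribute names mirror it, bodies are reduced to the essence, and the attention op names are illustrative):

    # Sketch of the new set_splitting_ops_for_v1 dispatch order.
    class _Cfg:
        def __init__(self, use_inductor_graph_partition=False,
                     enable_attn_fusion=False, splitting_ops=None):
            self.use_inductor_graph_partition = use_inductor_graph_partition
            self.enable_attn_fusion = enable_attn_fusion
            self.splitting_ops = splitting_ops
            self._attention_ops = ["vllm.unified_attention",
                                   "vllm.unified_attention_with_output"]

        def set_splitting_ops_for_v1(self):
            if self.use_inductor_graph_partition:
                self.splitting_ops = []   # partitioning handled by inductor tags
            elif self.enable_attn_fusion:
                self.splitting_ops = []   # fused attention must stay in-graph
            elif self.splitting_ops is None:
                self.splitting_ops = list(self._attention_ops)

    cfg = _Cfg()
    cfg.set_splitting_ops_for_v1()
    print(cfg.splitting_ops)   # default: split the graph around attention ops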
@@ -137,6 +137,9 @@ class ModelConfig:
    """Allowing API requests to read local images or videos from directories
    specified by the server file system. This is a security risk. Should only
    be enabled in trusted environments."""
+    allowed_media_domains: Optional[list[str]] = None
+    """If set, only media URLs that belong to this domain can be used for
+    multi-modal inputs. """
    revision: Optional[str] = None
    """The specific model version to use. It can be a branch name, a tag name,
    or a commit id. If unspecified, will use the default version."""

@@ -506,9 +509,14 @@ class ModelConfig:
            else:  # task == "auto"
                pass
        else:
+            debug_info = {
+                "architectures": architectures,
+                "is_generative_model": is_generative_model,
+                "is_pooling_model": is_pooling_model,
+            }
            raise AssertionError("The model should be a generative or "
                                 "pooling model when task is set to "
-                                 f"{self.task!r}.")
+                                 f"{self.task!r}. Found: {debug_info}")

        self.runner = runner
        self.convert = convert
@@ -279,6 +279,24 @@ class ParallelConfig:
             assert last_exc is not None
             raise last_exc

+    # The all_reduce at the end of attention (during o_proj) means that
+    # inputs are replicated across each rank of the tensor parallel group.
+    # If using expert-parallelism with DeepEP All2All ops, replicated
+    # tokens results in useless duplicate computation and communication.
+    #
+    # In this case, ensure the input to the experts is sequence parallel
+    # to avoid the excess work.
+    #
+    # Not needed for pplx-kernels as it can handle duplicate input tokens.
+    @property
+    def use_sequence_parallel_moe(self) -> bool:
+        return (envs.VLLM_ALL2ALL_BACKEND
+                in ("allgather_reducescatter", "naive",
+                    "deepep_high_throughput", "deepep_low_latency")
+                and self.enable_expert_parallel
+                and self.tensor_parallel_size > 1
+                and self.data_parallel_size > 1)
+
     @staticmethod
     def has_unfinished_dp(dp_group: ProcessGroup,
                           has_unfinished: bool) -> bool:
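Editor's note: the new property is a pure predicate over the parallel configuration. A standalone restatement of the same condition (simplified, outside the dataclass), handy for reasoning about which deployments it affects:

    # Sketch of the use_sequence_parallel_moe condition added above.
    def use_sequence_parallel_moe(all2all_backend: str,
                                  enable_expert_parallel: bool,
                                  tensor_parallel_size: int,
                                  data_parallel_size: int) -> bool:
        return (all2all_backend in ("allgather_reducescatter", "naive",
                                    "deepep_high_throughput",
                                    "deepep_low_latency")
                and enable_expert_parallel
                and tensor_parallel_size > 1
                and data_parallel_size > 1)

    # pplx kernels can deduplicate tokens themselves, so the pplx backend is
    # deliberately excluded:
    assert not use_sequence_parallel_moe("pplx", True, 2, 2)
    assert use_sequence_parallel_moe("deepep_low_latency", True, 2, 2)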
@@ -32,7 +32,9 @@ logger = init_logger(__name__)
 SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa",
                             "mlp_speculator", "draft_model", "deepseek_mtp",
                             "ernie_mtp", "qwen3_next_mtp", "mimo_mtp",
-                            "longcat_flash_mtp"]
+                            "longcat_flash_mtp", "mtp"]
+MTP_MODEL_TYPES = ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp", "ernie_mtp",
+                   "qwen3_next_mtp", "longcat_flash_mtp")


 @config

@@ -207,11 +209,16 @@ class SpeculativeConfig:
         # can not be detected, it will be considered as the "draft_model" by
         # default.

+        if self.method in MTP_MODEL_TYPES:
+            logger.warning("method `%s` is deprecated and replaced with mtp.",
+                           self.method)
+            self.method = "mtp"
+
         if self.model is None and self.num_speculative_tokens is not None:
-            # TODO(Shangming): Refactor mtp configuration logic when supporting
-            if (self.target_model_config
-                    and self.target_model_config.hf_text_config.model_type
-                    in ("deepseek_v3", "mimo", "ernie4_5_moe", "qwen3_next")):
+            if self.method == "mtp":
+                assert (
+                    self.target_model_config
+                    is not None), "target_model_config must be present for mtp"
                 # use the draft model from the same model:
                 self.model = self.target_model_config.model
                 # Align the quantization of draft model for cases such as

@@ -281,6 +288,8 @@ class SpeculativeConfig:
                     trust_remote_code,
                     allowed_local_media_path=self.target_model_config.
                     allowed_local_media_path,
+                    allowed_media_domains=self.target_model_config.
+                    allowed_media_domains,
                     dtype=self.target_model_config.dtype,
                     seed=self.target_model_config.seed,
                     revision=self.revision,

@@ -312,31 +321,13 @@ class SpeculativeConfig:
                         "mlp_speculator"):
                     self.method = "mlp_speculator"
                 elif (self.draft_model_config.hf_config.model_type
-                      in ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp")):
-                    self.method = "deepseek_mtp"
+                      in MTP_MODEL_TYPES):
+                    self.method = "mtp"
                     if self.num_speculative_tokens > 1:
                         logger.warning(
-                                "All Deepseek MTP models only have " \
-                                "one layer. Might need some code changes " \
-                                "to support multiple layers."
-                            )
-                elif (self.draft_model_config.hf_config.model_type ==
-                      "ernie_mtp"):
-                    self.method = "ernie_mtp"
-                    if self.num_speculative_tokens > 1:
-                        logger.warning(
-                                "All Ernie MTP models only have " \
-                                "one layer. Might need some code changes " \
-                                "to support multiple layers."
-                            )
-                elif (self.draft_model_config.hf_config.model_type ==
-                      "qwen3_next_mtp"):
-                    self.method = "qwen3_next_mtp"
-                    if self.num_speculative_tokens > 1:
-                        logger.warning(
-                                "All Qwen3Next MTP models only have " \
-                                "one layer. Might need some code changes " \
-                                "to support multiple layers."
+                                "Enabling num_speculative_tokens > 1 will run" \
+                                "multiple times of forward on same MTP layer" \
+                                ",which may result in lower acceptance rate" \
                             )
                 elif (self.draft_model_config.hf_config.model_type
                       in ("longcat_flash_mtp")):

@@ -353,7 +344,7 @@ class SpeculativeConfig:
                         "Speculative decoding with draft model is not "
                         "supported yet. Please consider using other "
                         "speculative decoding methods such as ngram, medusa, "
-                        "eagle, or deepseek_mtp.")
+                        "eagle, or mtp.")

         # Replace hf_config for EAGLE draft_model
         if self.method in ("eagle", "eagle3"):

@@ -562,8 +553,7 @@ class SpeculativeConfig:
         return self.num_speculative_tokens

     def use_eagle(self) -> bool:
-        return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp",
-                               "qwen3_next_mtp", "longcat_flash_mtp")
+        return self.method in ("eagle", "eagle3", "mtp")

     def __repr__(self) -> str:
         method = self.method
|
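The hunks above fold the per-family MTP branches into a single `mtp` method keyed off `MTP_MODEL_TYPES`. A minimal illustrative sketch of the consolidated selection (not part of the diff; the tuple contents below are an assumption based on the branches the hunk removes, the authoritative list lives in the vLLM source):

```python
# Illustrative sketch only: consolidated MTP method selection.
MTP_MODEL_TYPES = ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp", "ernie_mtp",
                   "qwen3_next_mtp")  # assumed membership for illustration


def resolve_spec_method(draft_model_type: str) -> str:
    # Every MTP-style draft model now maps to the single "mtp" method
    # instead of a per-family method name.
    if draft_model_type in MTP_MODEL_TYPES:
        return "mtp"
    return draft_model_type


assert resolve_spec_method("deepseek_mtp") == "mtp"
assert resolve_spec_method("qwen3_next_mtp") == "mtp"
```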
@ -6,7 +6,7 @@ import torch
 import torch.distributed as dist

 import vllm.envs as envs
-from vllm.distributed import get_dp_group
+from vllm.distributed import get_dp_group, get_ep_group
 from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
 from vllm.utils import has_deep_ep, has_pplx
@ -34,41 +34,60 @@ class NaiveAll2AllManager(All2AllManagerBase):
         super().__init__(cpu_group)

     def naive_multicast(self, x: torch.Tensor,
-                        cu_tokens_across_dp_cpu: torch.Tensor):
+                        cu_tokens_across_sp_cpu: torch.Tensor,
+                        is_sequence_parallel: bool) -> torch.Tensor:
         assert (len(x.shape) == 2)
-        buffer = torch.empty((cu_tokens_across_dp_cpu[-1], x.size(1)),
+        buffer = torch.empty((cu_tokens_across_sp_cpu[-1], x.size(1)),
                              device=x.device,
                              dtype=x.dtype)

-        start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
-            self.dp_rank - 1]
-        end = cu_tokens_across_dp_cpu[self.dp_rank]
+        rank = self.rank if is_sequence_parallel else self.dp_rank
+        world_size = (self.world_size
+                      if is_sequence_parallel else self.dp_world_size)

+        start = 0 if rank == 0 else cu_tokens_across_sp_cpu[rank - 1]
+        end = cu_tokens_across_sp_cpu[rank]
         buffer[start:end, :].copy_(x)
-        for idx in range(self.dp_world_size):
-            start = 0 if idx == 0 else cu_tokens_across_dp_cpu[idx - 1]
-            end = cu_tokens_across_dp_cpu[idx]
-            self.dp_group.broadcast(buffer[start:end, :], idx)
+        for idx in range(world_size):
+            start = 0 if idx == 0 else cu_tokens_across_sp_cpu[idx - 1]
+            end = cu_tokens_across_sp_cpu[idx]
+            get_ep_group().broadcast(buffer[start:end, :], idx)

         return buffer

-    def dispatch(self, hidden_states: torch.Tensor,
-                 router_logits: torch.Tensor):
-        sizes = get_forward_context(
-        ).dp_metadata.get_chunk_sizes_across_dp_rank()
-        hidden_states, router_logits = get_dp_group().all_gatherv(
-            [hidden_states, router_logits],
-            dim=0,
-            sizes=sizes,
-        )
+    def dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        sp_size = self.tp_group.world_size if is_sequence_parallel else 1
+        dp_metadata = get_forward_context().dp_metadata
+        cu_tokens_across_sp_cpu = dp_metadata.cu_tokens_across_sp(sp_size)

+        hidden_states = self.naive_multicast(hidden_states,
+                                             cu_tokens_across_sp_cpu,
+                                             is_sequence_parallel)
+        router_logits = self.naive_multicast(router_logits,
+                                             cu_tokens_across_sp_cpu,
+                                             is_sequence_parallel)
         return hidden_states, router_logits

-    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        sizes = get_forward_context(
-        ).dp_metadata.get_chunk_sizes_across_dp_rank()
-        hidden_states = get_dp_group().reduce_scatterv(hidden_states,
-                                                       dim=0,
-                                                       sizes=sizes)
+    def combine(self,
+                hidden_states: torch.Tensor,
+                is_sequence_parallel: bool = False) -> torch.Tensor:
+        ep_rank = self.rank if is_sequence_parallel else self.dp_rank
+
+        dp_metadata = get_forward_context().dp_metadata
+        sp_size = self.tp_group.world_size if is_sequence_parallel else 1
+        cu_tokens_across_sp_cpu = dp_metadata.cu_tokens_across_sp(sp_size)
+
+        start = 0 if ep_rank == 0 else cu_tokens_across_sp_cpu[ep_rank - 1]
+        end = cu_tokens_across_sp_cpu[ep_rank]
+
+        all_hidden_states = get_ep_group().all_reduce(hidden_states)
+        hidden_states = all_hidden_states[start:end, :]
         return hidden_states

     def destroy(self):
@ -84,29 +103,40 @@ class AgRsAll2AllManager(All2AllManagerBase):
     def __init__(self, cpu_group):
         super().__init__(cpu_group)

-    def dispatch(self, hidden_states: torch.Tensor,
-                 router_logits: torch.Tensor):
+    def dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Gather hidden_states and router_logits from all dp ranks.
         """
         sizes = get_forward_context(
         ).dp_metadata.get_chunk_sizes_across_dp_rank()
-        hidden_states, router_logits = get_dp_group().all_gatherv(
+
+        dist_group = get_ep_group() if is_sequence_parallel else get_dp_group()
+        assert sizes[dist_group.rank_in_group] == hidden_states.shape[0]
+        hidden_states, router_logits = dist_group.all_gatherv(
             [hidden_states, router_logits],
             dim=0,
             sizes=sizes,
         )
         return hidden_states, router_logits

-    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    def combine(self,
+                hidden_states: torch.Tensor,
+                is_sequence_parallel: bool = False) -> torch.Tensor:
         """
         Reduce-scatter hidden_states across all dp ranks.
         """
         sizes = get_forward_context(
         ).dp_metadata.get_chunk_sizes_across_dp_rank()
-        hidden_states = get_dp_group().reduce_scatterv(hidden_states,
-                                                       dim=0,
-                                                       sizes=sizes)
+
+        dist_group = get_ep_group() if is_sequence_parallel else get_dp_group()
+        hidden_states = dist_group.reduce_scatterv(hidden_states,
+                                                   dim=0,
+                                                   sizes=sizes)
         return hidden_states

     def destroy(self):
@ -148,11 +178,17 @@ class PPLXAll2AllManager(All2AllManagerBase):
                        kwargs, pplx.AllToAll.internode
                        if self.internode else pplx.AllToAll.intranode)

-    def dispatch(self, hidden_states: torch.Tensor,
-                 router_logits: torch.Tensor):
+    def dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError

-    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    def combine(self,
+                hidden_states: torch.Tensor,
+                is_sequence_parallel: bool = False) -> torch.Tensor:
         raise NotImplementedError

     def destroy(self):
@ -184,11 +220,17 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase):
     def get_handle(self, kwargs):
         raise NotImplementedError

-    def dispatch(self, hidden_states: torch.Tensor,
-                 router_logits: torch.Tensor):
+    def dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError

-    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    def combine(self,
+                hidden_states: torch.Tensor,
+                is_sequence_parallel: bool = False) -> torch.Tensor:
         raise NotImplementedError

     def destroy(self):
@ -395,4 +437,4 @@ class FlashInferAllToAllManager(All2AllManagerBase):
         self.workspace_tensor = None
         self.prepare_workspace_tensor = None
         self.mapping = None
         self.initialized = False
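For reference, a minimal sketch of the token bookkeeping the rewritten `NaiveAll2AllManager` relies on: per-DP-rank token counts are ceil-divided by the sequence-parallel size, repeated once per SP rank, and prefix-summed so each rank can locate its slice of the gathered buffer. The standalone function below is illustrative; it mirrors the formula in `cu_tokens_across_sp` from the diff.

```python
import torch


def cu_tokens_across_sp(num_tokens_across_dp_cpu: torch.Tensor,
                        sp_size: int) -> torch.Tensor:
    # Ceil-divide each DP rank's token count across its SP ranks,
    # expand to one entry per SP rank, then take the prefix sum.
    per_sp = (num_tokens_across_dp_cpu - 1 + sp_size) // sp_size
    per_sp = per_sp.repeat_interleave(sp_size)
    return torch.cumsum(per_sp, dim=0)


# Two DP ranks with 5 and 8 tokens, sp_size=2:
# per-SP counts are [3, 3, 4, 4] and the cumulative sizes [3, 6, 10, 14].
print(cu_tokens_across_sp(torch.tensor([5, 8]), sp_size=2))
```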
@ -28,6 +28,8 @@ class Cache:


 class All2AllManagerBase:
+    rank: int
+    world_size: int

     def __init__(self, cpu_group):
         self.cpu_group = cpu_group
@ -40,6 +42,7 @@ class All2AllManagerBase:
         # all2all lives in ep group, which is merged from dp and tp group
         self.dp_group = get_dp_group()
         self.tp_group = get_tp_group()
+
         # no self.ep_group since self.ep_group is still in construction
         # when we create this object
         self.dp_rank = self.dp_group.rank_in_group
@ -60,17 +63,21 @@ class All2AllManagerBase:
         # and reuse it for the same config.
         raise NotImplementedError

+    def dispatch(self,
+                 hidden_states: torch.Tensor,
+                 router_logits: torch.Tensor,
+                 is_sequence_parallel: bool = False):
+        raise NotImplementedError
+
     def set_num_sms(self, num_sms: int):
         pass

     def max_sms_used(self) -> Optional[int]:
         return None  # None means it could use the whole GPU

-    def dispatch(self, hidden_states: torch.Tensor,
-                 router_logits: torch.Tensor):
-        raise NotImplementedError
-
-    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    def combine(self,
+                hidden_states: torch.Tensor,
+                is_sequence_parallel: bool = False):
         raise NotImplementedError

     def destroy(self):
@ -267,15 +274,20 @@ class DeviceCommunicatorBase:
             module.quant_method.init_prepare_finalize(module)

     def dispatch(
-        self, hidden_states: torch.Tensor,
-        router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Dispatch the hidden states and router logits to the appropriate device.
         This is a no-op in the base class.
         """
         return hidden_states, router_logits

-    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    def combine(self,
+                hidden_states: torch.Tensor,
+                is_sequence_parallel: bool = False) -> torch.Tensor:
         """
         Combine the hidden states and router logits from the appropriate device.
         This is a no-op in the base class.
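A minimal sketch of a manager that satisfies the widened base-class signature above. The subclass is hypothetical and purely illustrative; real managers also wire up the DP/EP process groups and perform the collective communication.

```python
import torch


class PassthroughAll2AllManager:
    """Hypothetical manager showing the new dispatch/combine signature."""

    def dispatch(self,
                 hidden_states: torch.Tensor,
                 router_logits: torch.Tensor,
                 is_sequence_parallel: bool = False):
        # A real manager would all-gather across DP (or across the EP group
        # when is_sequence_parallel is True); this sketch passes through.
        return hidden_states, router_logits

    def combine(self,
                hidden_states: torch.Tensor,
                is_sequence_parallel: bool = False) -> torch.Tensor:
        # A real manager would reduce back to the local token slice.
        return hidden_states
```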
@ -39,10 +39,6 @@ class CudaCommunicator(DeviceCommunicatorBase):
         use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
         use_torch_symm_mem = envs.VLLM_ALLREDUCE_USE_SYMM_MEM

-        # ep does not use pynccl
-        use_pynccl = "ep" not in unique_name

-        self.use_pynccl = use_pynccl
         self.use_custom_allreduce = use_custom_allreduce
         self.use_torch_symm_mem = use_torch_symm_mem

@ -57,7 +53,7 @@ class CudaCommunicator(DeviceCommunicatorBase):
                                          SymmMemCommunicator)

         self.pynccl_comm: Optional[PyNcclCommunicator] = None
-        if use_pynccl and self.world_size > 1:
+        if self.world_size > 1:
             self.pynccl_comm = PyNcclCommunicator(
                 group=self.cpu_group,
                 device=self.device,
@ -308,14 +304,20 @@ class CudaCommunicator(DeviceCommunicatorBase):
         return output_list

     def dispatch(
-        self, hidden_states: torch.Tensor,
-        router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         assert self.all2all_manager is not None
         hidden_states, router_logits = self.all2all_manager.dispatch(
-            hidden_states, router_logits)
+            hidden_states, router_logits, is_sequence_parallel)
         return hidden_states, router_logits

-    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    def combine(self,
+                hidden_states: torch.Tensor,
+                is_sequence_parallel: bool = False) -> torch.Tensor:
         assert self.all2all_manager is not None
-        hidden_states = self.all2all_manager.combine(hidden_states)
+        hidden_states = self.all2all_manager.combine(hidden_states,
+                                                     is_sequence_parallel)
         return hidden_states
@ -75,14 +75,20 @@ class XpuCommunicator(DeviceCommunicatorBase):
         dist.broadcast(input_, src=src, group=self.device_group)

     def dispatch(
-        self, hidden_states: torch.Tensor,
-        router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         assert self.all2all_manager is not None
         hidden_states, router_logits = self.all2all_manager.dispatch(
-            hidden_states, router_logits)
+            hidden_states, router_logits, is_sequence_parallel)
         return hidden_states, router_logits

-    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    def combine(self,
+                hidden_states: torch.Tensor,
+                is_sequence_parallel: bool = False) -> torch.Tensor:
         assert self.all2all_manager is not None
-        hidden_states = self.all2all_manager.combine(hidden_states)
+        hidden_states = self.all2all_manager.combine(hidden_states,
+                                                     is_sequence_parallel)
         return hidden_states
@ -871,17 +871,24 @@ class GroupCoordinator:
                                                              model)

     def dispatch(
-        self, hidden_states: torch.Tensor,
-        router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         if self.device_communicator is not None:
             return self.device_communicator.dispatch(hidden_states,
-                                                     router_logits)
+                                                     router_logits,
+                                                     is_sequence_parallel)
         else:
             return hidden_states, router_logits

-    def combine(self, hidden_states) -> torch.Tensor:
+    def combine(self,
+                hidden_states,
+                is_sequence_parallel: bool = False) -> torch.Tensor:
         if self.device_communicator is not None:
-            return self.device_communicator.combine(hidden_states)
+            return self.device_communicator.combine(hidden_states,
+                                                    is_sequence_parallel)
         else:
             return hidden_states
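The optional `is_sequence_parallel` flag is threaded through every layer of the stack unchanged. The mock classes below are a sketch of that plumbing only; they stand in for `GroupCoordinator` -> device communicator -> all2all manager and do no real communication.

```python
import torch


class _MockManager:
    def dispatch(self, h, r, is_sequence_parallel=False):
        return h, r

    def combine(self, h, is_sequence_parallel=False):
        return h


class _MockCommunicator:
    def __init__(self):
        self.all2all_manager = _MockManager()

    def dispatch(self, h, r, is_sequence_parallel=False):
        # The flag is simply forwarded, exactly as in the diff above.
        return self.all2all_manager.dispatch(h, r, is_sequence_parallel)

    def combine(self, h, is_sequence_parallel=False):
        return self.all2all_manager.combine(h, is_sequence_parallel)


comm = _MockCommunicator()
h, r = comm.dispatch(torch.zeros(4, 8), torch.zeros(4, 2),
                     is_sequence_parallel=True)
h = comm.combine(h, is_sequence_parallel=True)
```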
@ -297,6 +297,8 @@ class EngineArgs:
     tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
     trust_remote_code: bool = ModelConfig.trust_remote_code
     allowed_local_media_path: str = ModelConfig.allowed_local_media_path
+    allowed_media_domains: Optional[
+        list[str]] = ModelConfig.allowed_media_domains
     download_dir: Optional[str] = LoadConfig.download_dir
     safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy
     load_format: Union[str, LoadFormats] = LoadConfig.load_format
@ -531,6 +533,8 @@ class EngineArgs:
                                  **model_kwargs["hf_config_path"])
         model_group.add_argument("--allowed-local-media-path",
                                  **model_kwargs["allowed_local_media_path"])
+        model_group.add_argument("--allowed-media-domains",
+                                 **model_kwargs["allowed_media_domains"])
         model_group.add_argument("--revision", **model_kwargs["revision"])
         model_group.add_argument("--code-revision",
                                  **model_kwargs["code_revision"])
@ -997,6 +1001,7 @@ class EngineArgs:
             tokenizer_mode=self.tokenizer_mode,
             trust_remote_code=self.trust_remote_code,
             allowed_local_media_path=self.allowed_local_media_path,
+            allowed_media_domains=self.allowed_media_domains,
             dtype=self.dtype,
             seed=self.seed,
             revision=self.revision,
@ -1481,7 +1486,7 @@ class EngineArgs:
             raise NotImplementedError(
                 "Draft model speculative decoding is not supported yet. "
                 "Please consider using other speculative decoding methods "
-                "such as ngram, medusa, eagle, or deepseek_mtp.")
+                "such as ngram, medusa, eagle, or mtp.")

         V1_BACKENDS = [
             "FLASH_ATTN",
@ -11,7 +11,12 @@ from pathlib import Path
 from typing import (Any, Callable, Generic, Literal, Optional, TypeVar, Union,
                     cast)

+import jinja2
+import jinja2.ext
+import jinja2.meta
 import jinja2.nodes
+import jinja2.parser
+import jinja2.sandbox
 import transformers.utils.chat_template_utils as hf_chat_utils
 # yapf conflicts with isort for this block
 # yapf: disable
@ -50,7 +55,7 @@ from vllm.transformers_utils.chat_templates import (
 # yapf: enable
 from vllm.transformers_utils.processor import cached_get_processor
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import random_uuid
+from vllm.utils import random_uuid, supports_kw

 logger = init_logger(__name__)

@ -632,6 +637,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
     def allowed_local_media_path(self):
         return self._model_config.allowed_local_media_path

+    @property
+    def allowed_media_domains(self):
+        return self._model_config.allowed_media_domains
+
     @property
     def mm_registry(self):
         return MULTIMODAL_REGISTRY
@ -832,6 +841,7 @@ class MultiModalContentParser(BaseMultiModalContentParser):
         self._connector = MediaConnector(
             media_io_kwargs=media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
+            allowed_media_domains=tracker.allowed_media_domains,
         )

     def parse_image(
@ -916,6 +926,7 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         self._connector = MediaConnector(
             media_io_kwargs=media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
+            allowed_media_domains=tracker.allowed_media_domains,
         )

     def parse_image(
@ -1548,6 +1559,46 @@ def parse_chat_messages_futures(
     return conversation, mm_tracker.all_mm_data(), mm_tracker.all_mm_uuids()


+# adapted from https://github.com/huggingface/transformers/blob/v4.56.2/src/transformers/utils/chat_template_utils.py#L398-L412
+# only preserve the parse function used to resolve chat template kwargs
+class AssistantTracker(jinja2.ext.Extension):
+    tags = {"generation"}
+
+    def parse(self, parser: jinja2.parser.Parser) -> jinja2.nodes.CallBlock:
+        lineno = next(parser.stream).lineno
+        body = parser.parse_statements(["name:endgeneration"], drop_needle=True)
+        call = self.call_method("_generation_support")
+        call_block = jinja2.nodes.CallBlock(call, [], [], body)
+        return call_block.set_lineno(lineno)
+
+
+def resolve_chat_template_kwargs(
+    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    chat_template: str,
+    chat_template_kwargs: dict[str, Any],
+) -> dict[str, Any]:
+    fn_kw = {
+        k for k in chat_template_kwargs
+        if supports_kw(tokenizer.apply_chat_template, k, allow_var_kwargs=False)
+    }
+
+    env = jinja2.sandbox.ImmutableSandboxedEnvironment(
+        trim_blocks=True,
+        lstrip_blocks=True,
+        extensions=[AssistantTracker, jinja2.ext.loopcontrols],
+    )
+    parsed_content = env.parse(chat_template)
+    template_vars = jinja2.meta.find_undeclared_variables(parsed_content)
+
+    # We exclude chat_template from kwargs here, because
+    # chat template has been already resolved at this stage
+    unexpected_vars = {"chat_template"}
+    accept_vars = (fn_kw | template_vars) - unexpected_vars
+    return {
+        k: v for k, v in chat_template_kwargs.items() if k in accept_vars
+    }
+
+
 def apply_hf_chat_template(
     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
     conversation: list[ConversationMessage],
@ -1573,12 +1624,17 @@ def apply_hf_chat_template(
     )

     try:
+        resolved_kwargs = resolve_chat_template_kwargs(
+            tokenizer=tokenizer,
+            chat_template=hf_chat_template,
+            chat_template_kwargs=kwargs,
+        )
         return tokenizer.apply_chat_template(
             conversation=conversation,  # type: ignore[arg-type]
             tools=tools,  # type: ignore[arg-type]
             chat_template=hf_chat_template,
             tokenize=tokenize,
-            **kwargs,
+            **resolved_kwargs,
         )

     # External library exceptions can sometimes occur despite the framework's
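A hedged usage sketch of the new kwargs filtering: only kwargs that are either accepted by `tokenizer.apply_chat_template` or referenced as undeclared variables in the template survive, and `chat_template` itself is always dropped. The fake tokenizer and the re-statement below are illustrative only and do not use vLLM's actual helpers.

```python
import inspect

import jinja2.meta
import jinja2.sandbox


class _FakeTokenizer:
    # Hypothetical stand-in; only the signature of apply_chat_template matters.
    def apply_chat_template(self, conversation, add_generation_prompt=False):
        return conversation


def _resolve(tokenizer, chat_template: str, kwargs: dict) -> dict:
    # Re-statement of the filtering rule added above, for illustration only.
    sig = inspect.signature(tokenizer.apply_chat_template)
    fn_kw = {k for k in kwargs if k in sig.parameters}
    env = jinja2.sandbox.ImmutableSandboxedEnvironment()
    template_vars = jinja2.meta.find_undeclared_variables(
        env.parse(chat_template))
    accept = (fn_kw | template_vars) - {"chat_template"}
    return {k: v for k, v in kwargs.items() if k in accept}


print(_resolve(_FakeTokenizer(),
               "{% if enable_thinking %}think{% endif %}{{ messages }}",
               {"enable_thinking": True, "add_generation_prompt": True,
                "chat_template": "ignored", "unused_flag": 1}))
# -> {'enable_thinking': True, 'add_generation_prompt': True}
```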
@ -86,6 +86,8 @@ class LLM:
             or videos from directories specified by the server file system.
             This is a security risk. Should only be enabled in trusted
             environments.
+        allowed_media_domains: If set, only media URLs that belong to this
+            domain can be used for multi-modal inputs.
         tensor_parallel_size: The number of GPUs to use for distributed
             execution with tensor parallelism.
         dtype: The data type for the model weights and activations. Currently,
@ -169,6 +171,7 @@ class LLM:
         skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
         allowed_local_media_path: str = "",
+        allowed_media_domains: Optional[list[str]] = None,
         tensor_parallel_size: int = 1,
         dtype: ModelDType = "auto",
         quantization: Optional[QuantizationMethods] = None,
@ -264,6 +267,7 @@ class LLM:
             skip_tokenizer_init=skip_tokenizer_init,
             trust_remote_code=trust_remote_code,
             allowed_local_media_path=allowed_local_media_path,
+            allowed_media_domains=allowed_media_domains,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
             quantization=quantization,
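A hedged usage sketch of the new option from the offline API. The model name is illustrative, and constructing `LLM` will of course load real weights; the point is only where the argument goes.

```python
from vllm import LLM

# Restrict multi-modal URLs to a trusted host; URLs outside this domain
# would be rejected by the media connector. Model name is illustrative.
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    allowed_media_domains=["upload.wikimedia.org"],
)
```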
@ -3,12 +3,14 @@

 import asyncio
 import gc
+import hashlib
 import importlib
 import inspect
 import json
 import multiprocessing
 import multiprocessing.forkserver as forkserver
 import os
+import secrets
 import signal
 import socket
 import tempfile
@ -1252,7 +1254,7 @@ def load_log_config(log_config_file: Optional[str]) -> Optional[dict]:
 class AuthenticationMiddleware:
     """
     Pure ASGI middleware that authenticates each request by checking
-    if the Authorization header exists and equals "Bearer {api_key}".
+    if the Authorization Bearer token exists and equals anyof "{api_key}".

     Notes
     -----
@ -1263,7 +1265,26 @@ class AuthenticationMiddleware:

     def __init__(self, app: ASGIApp, tokens: list[str]) -> None:
         self.app = app
-        self.api_tokens = {f"Bearer {token}" for token in tokens}
+        self.api_tokens = [
+            hashlib.sha256(t.encode("utf-8")).digest() for t in tokens
+        ]
+
+    def verify_token(self, headers: Headers) -> bool:
+        authorization_header_value = headers.get("Authorization")
+        if not authorization_header_value:
+            return False
+
+        scheme, _, param = authorization_header_value.partition(" ")
+        if scheme.lower() != "bearer":
+            return False
+
+        param_hash = hashlib.sha256(param.encode("utf-8")).digest()
+
+        token_match = False
+        for token_hash in self.api_tokens:
+            token_match |= secrets.compare_digest(param_hash, token_hash)
+
+        return token_match

     def __call__(self, scope: Scope, receive: Receive,
                  send: Send) -> Awaitable[None]:
@ -1276,8 +1297,7 @@ class AuthenticationMiddleware:
             url_path = URL(scope=scope).path.removeprefix(root_path)
             headers = Headers(scope=scope)
             # Type narrow to satisfy mypy.
-            if url_path.startswith("/v1") and headers.get(
-                    "Authorization") not in self.api_tokens:
+            if url_path.startswith("/v1") and not self.verify_token(headers):
                 response = JSONResponse(content={"error": "Unauthorized"},
                                         status_code=401)
                 return response(scope, receive, send)
@ -1696,6 +1716,7 @@ async def init_app_state(
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
+        trust_request_chat_template=args.trust_request_chat_template,
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
         enable_auto_tools=args.enable_auto_tool_choice,
         exclude_tools_when_tool_choice_none=args.
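The middleware change above stores SHA-256 digests of the configured API keys and compares the hashed request token against every stored digest with `secrets.compare_digest`, so the comparison cost does not depend on which (if any) token matches. A standalone sketch of the same idea, outside the ASGI machinery:

```python
import hashlib
import secrets
from typing import Optional


def make_token_digests(tokens: list[str]) -> list[bytes]:
    return [hashlib.sha256(t.encode("utf-8")).digest() for t in tokens]


def verify_bearer(authorization: Optional[str], digests: list[bytes]) -> bool:
    if not authorization:
        return False
    scheme, _, param = authorization.partition(" ")
    if scheme.lower() != "bearer":
        return False
    param_hash = hashlib.sha256(param.encode("utf-8")).digest()
    match = False
    for digest in digests:
        # compare_digest on fixed-length hashes keeps the check constant-time.
        match |= secrets.compare_digest(param_hash, digest)
    return match


digests = make_token_digests(["token-abc", "token-xyz"])
assert verify_bearer("Bearer token-abc", digests)
assert not verify_bearer("Bearer wrong", digests)
assert not verify_bearer(None, digests)
```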
@ -103,9 +103,13 @@ class FrontendArgs:
     chat_template_content_format: ChatTemplateContentFormatOption = "auto"
     """The format to render message content within a chat template.

     * "string" will render the content as a string. Example: `"Hello World"`
-    * "openai" will render the content as a list of dictionaries, similar to OpenAI
-    schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
+    * "openai" will render the content as a list of dictionaries, similar to
+    OpenAI schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
+    trust_request_chat_template: bool = False
+    """Whether to trust the chat template provided in the request. If False,
+    the server will always use the chat template specified by `--chat-template`
+    or the ones from tokenizer."""
     response_role: str = "assistant"
     """The role name to return if `request.add_generation_prompt=true`."""
     ssl_keyfile: Optional[str] = None
@ -68,6 +68,7 @@ class OpenAIServingChat(OpenAIServing):
         request_logger: Optional[RequestLogger],
         chat_template: Optional[str],
         chat_template_content_format: ChatTemplateContentFormatOption,
+        trust_request_chat_template: bool = False,
         return_tokens_as_token_ids: bool = False,
         reasoning_parser: str = "",
         enable_auto_tools: bool = False,
@ -89,6 +90,7 @@ class OpenAIServingChat(OpenAIServing):
         self.response_role = response_role
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
+        self.trust_request_chat_template = trust_request_chat_template
         self.enable_log_outputs = enable_log_outputs

         # set up tool use
@ -220,6 +222,16 @@ class OpenAIServingChat(OpenAIServing):

         if not self.use_harmony:
             # Common case.
+            request_chat_template = request.chat_template
+            chat_template_kwargs = request.chat_template_kwargs
+            if not self.trust_request_chat_template and (
+                    request_chat_template is not None or
+                    (chat_template_kwargs and
+                     chat_template_kwargs.get("chat_template") is not None)):
+                return self.create_error_response(
+                    "Chat template is passed with request, but "
+                    "--trust-request-chat-template is not set. "
+                    "Refused request with untrusted chat template.")
             (
                 conversation,
                 request_prompts,
@ -228,7 +240,7 @@ class OpenAIServingChat(OpenAIServing):
                 request,
                 tokenizer,
                 request.messages,
-                chat_template=request.chat_template or self.chat_template,
+                chat_template=request_chat_template or self.chat_template,
                 chat_template_content_format=self.
                 chat_template_content_format,
                 add_generation_prompt=request.add_generation_prompt,
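A minimal sketch of the guard the serving path now applies before honoring a request-supplied template. This is a standalone re-statement of the condition added above, not the actual handler.

```python
from typing import Any, Optional


def request_template_allowed(
        trust_request_chat_template: bool,
        request_chat_template: Optional[str],
        chat_template_kwargs: Optional[dict[str, Any]]) -> bool:
    # Refuse when the request carries a chat template (directly or via
    # chat_template_kwargs) and the server was not started with
    # --trust-request-chat-template.
    if trust_request_chat_template:
        return True
    has_template = request_chat_template is not None or (
        chat_template_kwargs is not None
        and chat_template_kwargs.get("chat_template") is not None)
    return not has_template


assert request_template_allowed(False, None, None)
assert not request_template_allowed(False, "{{ messages }}", None)
assert request_template_allowed(True, "{{ messages }}", None)
```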
@ -49,16 +49,29 @@ class BatchDescriptor(NamedTuple):
         return BatchDescriptor(self.num_tokens, uniform_decode=False)


-def _compute_chunked_local_num_tokens(num_tokens_across_dp_cpu: list[int],
+def _compute_sp_num_tokens(num_tokens_across_dp_cpu: torch.Tensor,
+                           sequence_parallel_size: int) -> list[int]:
+    sp_tokens = ((num_tokens_across_dp_cpu + sequence_parallel_size - 1) //
+                 sequence_parallel_size)
+
+    sp_tokens = sp_tokens.repeat_interleave(sequence_parallel_size)
+    return sp_tokens.tolist()
+
+
+def _compute_chunked_local_num_tokens(num_tokens_across_dp_cpu: torch.Tensor,
+                                      sequence_parallel_size: int,
                                       max_num_tokens: int,
                                       chunk_idx: int) -> list[int]:
-    dp_size = len(num_tokens_across_dp_cpu)

-    local_size = [-1] * dp_size
-    for i in range(dp_size):
-        dp_tokens = num_tokens_across_dp_cpu[i]
+    sp_tokens = _compute_sp_num_tokens(num_tokens_across_dp_cpu,
+                                       sequence_parallel_size)
+    sp_size = len(sp_tokens)
+
+    local_size = [-1] * sp_size
+    for i in range(sp_size):
+        # Take into account sharding if MoE activation is sequence parallel.
         local_size[i] = min(max_num_tokens,
-                            dp_tokens - (max_num_tokens * chunk_idx))
+                            sp_tokens[i] - (max_num_tokens * chunk_idx))
         if local_size[i] <= 0:
             local_size[i] = 1  # ensure lockstep even if done
     return local_size
@ -67,7 +80,9 @@ def _compute_chunked_local_num_tokens(num_tokens_across_dp_cpu: list[int],
 @dataclass
 class DPMetadata:
     max_tokens_across_dp_cpu: torch.Tensor
-    cu_tokens_across_dp_cpu: torch.Tensor
+    num_tokens_across_dp_cpu: torch.Tensor
+
+    # NOTE: local_sizes should only be set by the chunked_sizes context manager
     local_sizes: Optional[list[int]] = None

     @staticmethod
@ -98,6 +113,17 @@ class DPMetadata:
         dist.all_reduce(num_tokens_tensor, group=group)
         return num_tokens_tensor.cpu()

+    # Get the cumulative tokens across sequence parallel ranks.
+    # In this case the input to the MoEs will be distributed w.r.t both
+    # DP and TP rank.
+    # When sp_size==1, this is just the cummulative num tokens across DP.
+    def cu_tokens_across_sp(self, sp_size: int) -> torch.Tensor:
+        num_tokens_across_sp_cpu = (
+            (self.num_tokens_across_dp_cpu - 1 + sp_size) // sp_size)
+        num_tokens_across_sp_cpu = (
+            num_tokens_across_sp_cpu.repeat_interleave(sp_size))
+        return torch.cumsum(num_tokens_across_sp_cpu, dim=0)
+
     @staticmethod
     def should_ubatch_across_dp(
             should_ubatch: bool, orig_num_tokens_per_ubatch: int,
@ -147,10 +173,10 @@ class DPMetadata:

     @staticmethod
     def make(
         parallel_config: ParallelConfig,
         attn_metadata: Any,
         num_tokens: int,
-        num_tokens_across_dp: Optional[torch.Tensor] = None
+        num_tokens_across_dp_cpu: Optional[torch.Tensor] = None
     ) -> "DPMetadata":

         assert parallel_config.data_parallel_size > 1
@ -167,18 +193,18 @@ class DPMetadata:

         # If num_tokens_across_dp is None, it will be computed by all_reduce
         # Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
-        assert (num_tokens_across_dp is None or num_tokens_across_dp[dp_rank]
-                == batchsize), f"{num_tokens_across_dp[dp_rank]} {batchsize}"
-        if num_tokens_across_dp is None:
-            num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
+        assert (num_tokens_across_dp_cpu is None
+                or num_tokens_across_dp_cpu[dp_rank] == batchsize
+                ), f"{num_tokens_across_dp_cpu[dp_rank]} {batchsize}"
+        if num_tokens_across_dp_cpu is None:
+            num_tokens_across_dp_cpu = DPMetadata.num_tokens_across_dp(
                 batchsize, dp_size, dp_rank)
-        max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp)
-        cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_across_dp, dim=0)
-        return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu,
-                          num_tokens_across_dp)
+        max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp_cpu)
+        return DPMetadata(max_tokens_across_dp_cpu, num_tokens_across_dp_cpu)

     @contextmanager
-    def chunked_sizes(self, max_chunk_size_per_rank: int, chunk_idx: int):
+    def chunked_sizes(self, sequence_parallel_size: int,
+                      max_chunk_size_per_rank: int, chunk_idx: int):
         """
         Context manager to compute and temporarily set the per-rank local token
         sizes for a specific chunk during chunked forward execution.
@ -192,31 +218,40 @@ class DPMetadata:
         `chunk_idx`, this context manager sets `self.local_sizes` to the number
         of tokens to process in that chunk on each rank.

-        It uses cumulative sizes (`cu_tokens_across_dp_cpu`) to derive the
-        number of tokens per rank, and calls `_compute_chunked_local_num_tokens`
-        to determine the chunk-wise split.
-
         `self.local_sizes` is only valid inside the context.

         Args:
+            sequence_parallel_size: When Attn is TP and MoE layers are EP,
+                                    we use SP between the layers to avoid
+                                    redundant ops. We need this value to
+                                    compute the chunked sizes.
             max_chunk_size_per_rank: The max number of tokens each rank is
                                      allowed to process in this chunk.
             chunk_idx: The index of the chunk to compute sizes for.
         """
-        cu_sizes = self.cu_tokens_across_dp_cpu
-        num_tokens_across_dp_cpu = [
-            (cu_sizes[i] -
-             cu_sizes[i - 1]).item() if i > 0 else cu_sizes[0].item()
-            for i in range(len(cu_sizes))
-        ]
         self.local_sizes = _compute_chunked_local_num_tokens(
-            num_tokens_across_dp_cpu, max_chunk_size_per_rank, chunk_idx)
+            self.num_tokens_across_dp_cpu, sequence_parallel_size,
+            max_chunk_size_per_rank, chunk_idx)
+        try:
+            yield self.local_sizes
+        finally:
+            self.local_sizes = None
+
+    @contextmanager
+    def sp_local_sizes(self, sequence_parallel_size: int):
+        """
+        Context mamager for setting self.local_sizes. Same as self.chunked_sizes
+        but without any chunking.
+        """
+        self.local_sizes = _compute_sp_num_tokens(
+            self.num_tokens_across_dp_cpu, sequence_parallel_size)
         try:
             yield self.local_sizes
         finally:
             self.local_sizes = None

     def get_chunk_sizes_across_dp_rank(self) -> Optional[list[int]]:
+        assert self.local_sizes is not None
         return self.local_sizes


@ -246,8 +281,7 @@ class ForwardContext:
     ubatch_slices: Optional[UBatchSlices] = None

     def __post_init__(self):
-        assert self.cudagraph_runtime_mode in [
-            CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL], \
+        assert self.cudagraph_runtime_mode.valid_runtime_modes(), \
             f"Invalid cudagraph runtime mode: {self.cudagraph_runtime_mode}"

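To make the new DPMetadata bookkeeping concrete, here is a small worked example of the sequence-parallel token split and the chunked split it feeds. It is a standalone re-statement of the formulas in the hunk above, not vLLM's own module.

```python
import torch


def compute_sp_num_tokens(num_tokens_across_dp_cpu: torch.Tensor,
                          sequence_parallel_size: int) -> list[int]:
    sp_tokens = ((num_tokens_across_dp_cpu + sequence_parallel_size - 1) //
                 sequence_parallel_size)
    return sp_tokens.repeat_interleave(sequence_parallel_size).tolist()


def chunked_local_num_tokens(num_tokens_across_dp_cpu: torch.Tensor,
                             sequence_parallel_size: int,
                             max_num_tokens: int, chunk_idx: int) -> list[int]:
    sp_tokens = compute_sp_num_tokens(num_tokens_across_dp_cpu,
                                      sequence_parallel_size)
    local_size = []
    for tokens in sp_tokens:
        size = min(max_num_tokens, tokens - max_num_tokens * chunk_idx)
        local_size.append(size if size > 0 else 1)  # keep ranks in lockstep
    return local_size


counts = torch.tensor([9, 4])                      # tokens on two DP ranks
print(compute_sp_num_tokens(counts, 2))            # [5, 5, 2, 2]
print(chunked_local_num_tokens(counts, 2, 4, 0))   # [4, 4, 2, 2]
print(chunked_local_num_tokens(counts, 2, 4, 1))   # [1, 1, 1, 1]
```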
 561  vllm/model_executor/layers/batch_invariant.py  (new file)
@ -0,0 +1,561 @@
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import contextlib
|
||||||
|
import os
|
||||||
|
from collections import namedtuple
|
||||||
|
from collections.abc import Callable
|
||||||
|
from typing import Any, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm.triton_utils import tl, triton
|
||||||
|
|
||||||
|
|
||||||
|
def _matmul_launch_metadata(grid: Callable[..., Any], kernel: Any,
|
||||||
|
args: dict[str, Any]) -> dict[str, Any]:
|
||||||
|
ret = {}
|
||||||
|
m, n, k = args["M"], args["N"], args["K"]
|
||||||
|
ret["name"] = f"{kernel.name} [M={m}, N={n}, K={k}]"
|
||||||
|
if "tiles_per_update" in args:
|
||||||
|
ret["name"] = (f"{kernel.name} [M={m}, N={n}, K={k}, "
|
||||||
|
f"tiles_per_update={args['tiles_per_update']:02}]")
|
||||||
|
if "c_ptr" in args:
|
||||||
|
bytes_per_elem = args["c_ptr"].element_size()
|
||||||
|
else:
|
||||||
|
bytes_per_elem = 1 if args["FP8_OUTPUT"] else 2
|
||||||
|
ret[f"flops{bytes_per_elem * 8}"] = 2.0 * m * n * k
|
||||||
|
ret["bytes"] = bytes_per_elem * (m * k + n * k + m * n)
|
||||||
|
return ret
|
||||||
|
|
||||||
|
|
||||||
|
@triton.jit
|
||||||
|
def _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS):
|
||||||
|
group_id = tile_id // num_pid_in_group
|
||||||
|
first_pid_m = group_id * GROUP_SIZE_M
|
||||||
|
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
|
||||||
|
pid_m = first_pid_m + (tile_id % group_size_m)
|
||||||
|
pid_n = (tile_id % num_pid_in_group) // group_size_m
|
||||||
|
return pid_m, pid_n
|
||||||
|
|
||||||
|
|
||||||
|
@triton.jit(launch_metadata=_matmul_launch_metadata)
|
||||||
|
def matmul_kernel_persistent(
|
||||||
|
a_ptr,
|
||||||
|
b_ptr,
|
||||||
|
c_ptr, #
|
||||||
|
bias_ptr,
|
||||||
|
M,
|
||||||
|
N,
|
||||||
|
K, #
|
||||||
|
stride_am,
|
||||||
|
stride_ak,
|
||||||
|
stride_bk,
|
||||||
|
stride_bn,
|
||||||
|
stride_cm,
|
||||||
|
stride_cn,
|
||||||
|
BLOCK_SIZE_M: tl.constexpr, #
|
||||||
|
BLOCK_SIZE_N: tl.constexpr, #
|
||||||
|
BLOCK_SIZE_K: tl.constexpr, #
|
||||||
|
GROUP_SIZE_M: tl.constexpr, #
|
||||||
|
NUM_SMS: tl.constexpr, #
|
||||||
|
A_LARGE: tl.constexpr,
|
||||||
|
B_LARGE: tl.constexpr,
|
||||||
|
C_LARGE: tl.constexpr,
|
||||||
|
HAS_BIAS: tl.constexpr,
|
||||||
|
):
|
||||||
|
start_pid = tl.program_id(axis=0)
|
||||||
|
num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
|
||||||
|
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
|
||||||
|
k_tiles = tl.cdiv(K, BLOCK_SIZE_K)
|
||||||
|
num_tiles = num_pid_m * num_pid_n
|
||||||
|
|
||||||
|
tile_id_c = start_pid - NUM_SMS
|
||||||
|
|
||||||
|
offs_k_for_mask = tl.arange(0, BLOCK_SIZE_K)
|
||||||
|
num_pid_in_group = GROUP_SIZE_M * num_pid_n
|
||||||
|
|
||||||
|
for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=True):
|
||||||
|
pid_m, pid_n = _compute_pid(tile_id, num_pid_in_group, num_pid_m,
|
||||||
|
GROUP_SIZE_M, NUM_SMS)
|
||||||
|
start_m = pid_m * BLOCK_SIZE_M
|
||||||
|
start_n = pid_n * BLOCK_SIZE_N
|
||||||
|
offs_am = start_m + tl.arange(0, BLOCK_SIZE_M)
|
||||||
|
offs_bn = start_n + tl.arange(0, BLOCK_SIZE_N)
|
||||||
|
if A_LARGE:
|
||||||
|
offs_am = offs_am.to(tl.int64)
|
||||||
|
if B_LARGE:
|
||||||
|
offs_bn = offs_bn.to(tl.int64)
|
||||||
|
offs_am = tl.where(offs_am < M, offs_am, 0)
|
||||||
|
offs_bn = tl.where(offs_bn < N, offs_bn, 0)
|
||||||
|
offs_am = tl.max_contiguous(tl.multiple_of(offs_am, BLOCK_SIZE_M),
|
||||||
|
BLOCK_SIZE_M)
|
||||||
|
offs_bn = tl.max_contiguous(tl.multiple_of(offs_bn, BLOCK_SIZE_N),
|
||||||
|
BLOCK_SIZE_N)
|
||||||
|
|
||||||
|
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
|
||||||
|
for ki in range(k_tiles):
|
||||||
|
if A_LARGE or B_LARGE:
|
||||||
|
offs_k = ki * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K).to(
|
||||||
|
tl.int64)
|
||||||
|
else:
|
||||||
|
offs_k = ki * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
|
||||||
|
a_ptrs = a_ptr + (offs_am[:, None] * stride_am +
|
||||||
|
offs_k[None, :] * stride_ak)
|
||||||
|
b_ptrs = b_ptr + (offs_k[:, None] * stride_bk +
|
||||||
|
offs_bn[None, :] * stride_bn)
|
||||||
|
|
||||||
|
a = tl.load(a_ptrs,
|
||||||
|
mask=offs_k_for_mask[None, :] < K - ki * BLOCK_SIZE_K,
|
||||||
|
other=0.0)
|
||||||
|
b = tl.load(b_ptrs,
|
||||||
|
mask=offs_k_for_mask[:, None] < K - ki * BLOCK_SIZE_K,
|
||||||
|
other=0.0)
|
||||||
|
accumulator = tl.dot(a, b, accumulator)
|
||||||
|
|
||||||
|
tile_id_c += NUM_SMS
|
||||||
|
pid_m, pid_n = _compute_pid(tile_id_c, num_pid_in_group, num_pid_m,
|
||||||
|
GROUP_SIZE_M, NUM_SMS)
|
||||||
|
offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
|
||||||
|
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
|
||||||
|
if C_LARGE:
|
||||||
|
offs_cm = offs_cm.to(tl.int64)
|
||||||
|
offs_cn = offs_cn.to(tl.int64)
|
||||||
|
c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[
|
||||||
|
None, :]
|
||||||
|
c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
|
||||||
|
if HAS_BIAS:
|
||||||
|
bias_ptrs = bias_ptr + offs_cn
|
||||||
|
bias = tl.load(bias_ptrs, mask=offs_cn < N,
|
||||||
|
other=0.0).to(tl.float32)
|
||||||
|
accumulator += bias
|
||||||
|
if c_ptr.dtype.element_ty == tl.float8e4nv:
|
||||||
|
c = accumulator.to(tl.float8e4nv)
|
||||||
|
else:
|
||||||
|
c = accumulator.to(tl.float16)
|
||||||
|
tl.store(c_ptrs, c, mask=c_mask)
|
||||||
|
|
||||||
|
|
||||||
|
def matmul_persistent(a: torch.Tensor,
|
||||||
|
b: torch.Tensor,
|
||||||
|
bias: Union[torch.Tensor, None] = None):
|
||||||
|
# Check constraints.
|
||||||
|
assert a.shape[1] == b.shape[0], "Incompatible dimensions"
|
||||||
|
assert a.dtype == b.dtype, "Incompatible dtypes"
|
||||||
|
assert bias is None or bias.dim() == 1, (
|
||||||
|
"Currently assuming bias is 1D, let Horace know if you run into this")
|
||||||
|
NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
|
||||||
|
M, K = a.shape
|
||||||
|
K, N = b.shape
|
||||||
|
dtype = a.dtype
|
||||||
|
# Allocates output.
|
||||||
|
c = torch.empty((M, N), device=a.device, dtype=dtype)
|
||||||
|
|
||||||
|
# 1D launch kernel where each block gets its own program.
|
||||||
|
def grid(META):
|
||||||
|
return (min(
|
||||||
|
NUM_SMS,
|
||||||
|
triton.cdiv(M, META["BLOCK_SIZE_M"]) *
|
||||||
|
triton.cdiv(N, META["BLOCK_SIZE_N"])), )
|
||||||
|
|
||||||
|
configs = {
|
||||||
|
torch.bfloat16: {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_stages": 3,
|
||||||
|
"num_warps": 8,
|
||||||
|
},
|
||||||
|
torch.float16: {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 256,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_stages": 3,
|
||||||
|
"num_warps": 8,
|
||||||
|
},
|
||||||
|
torch.float32: {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_stages": 3,
|
||||||
|
"num_warps": 8,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
# print(a.device, b.device, c.device)
|
||||||
|
matmul_kernel_persistent[grid](
|
||||||
|
a,
|
||||||
|
b,
|
||||||
|
c, #
|
||||||
|
bias,
|
||||||
|
M,
|
||||||
|
N,
|
||||||
|
K, #
|
||||||
|
a.stride(0),
|
||||||
|
a.stride(1), #
|
||||||
|
b.stride(0),
|
||||||
|
b.stride(1), #
|
||||||
|
c.stride(0),
|
||||||
|
c.stride(1), #
|
||||||
|
NUM_SMS=NUM_SMS, #
|
||||||
|
A_LARGE=a.numel() > 2**31,
|
||||||
|
B_LARGE=b.numel() > 2**31,
|
||||||
|
C_LARGE=c.numel() > 2**31,
|
||||||
|
HAS_BIAS=bias is not None,
|
||||||
|
**configs[dtype],
|
||||||
|
)
|
||||||
|
return c
|
||||||
|
|
||||||
|
|
||||||
|
@triton.jit
def _log_softmax_kernel(
    input_ptr,
    output_ptr,
    input_row_stride,
    output_row_stride,
    n_cols,
    BLOCK_SIZE: tl.constexpr,
):
    """
    Compute log_softmax along the last dimension of a 2D tensor.
    Each block handles one row of the input tensor.
    """
    # Get the row index for this block
    row_idx = tl.program_id(0).to(tl.int64)

    # Compute base pointers for input and output rows
    row_start_ptr = input_ptr + row_idx * input_row_stride
    output_row_start_ptr = output_ptr + row_idx * output_row_stride

    # Step 1: Find maximum value in the row for numerical stability
    max_val = -float("inf")
    for col_offset in range(0, n_cols, BLOCK_SIZE):
        col_idx = col_offset + tl.arange(0, BLOCK_SIZE)
        mask = col_idx < n_cols

        # Load values
        vals = tl.load(row_start_ptr + col_idx, mask=mask, other=-float("inf"))

        # Update maximum
        max_val = tl.max(tl.maximum(vals, max_val))

    # Step 2: Compute sum of exp(x - max_val)
    sum_exp = 0.0
    for col_offset in range(0, n_cols, BLOCK_SIZE):
        col_idx = col_offset + tl.arange(0, BLOCK_SIZE)
        mask = col_idx < n_cols

        # Load values
        vals = tl.load(row_start_ptr + col_idx, mask=mask, other=0.0)

        # Compute exp(x - max_val) and accumulate
        exp_vals = tl.exp(vals - max_val)
        sum_exp += tl.sum(tl.where(mask, exp_vals, 0.0))

    # Compute log(sum_exp)
    log_sum_exp = tl.log(sum_exp)

    # Step 3: Compute final log_softmax values: x - max_val - log_sum_exp
    for col_offset in range(0, n_cols, BLOCK_SIZE):
        col_idx = col_offset + tl.arange(0, BLOCK_SIZE)
        mask = col_idx < n_cols

        # Load values
        vals = tl.load(row_start_ptr + col_idx, mask=mask)

        # Compute log_softmax
        output = vals - max_val - log_sum_exp

        # Store results
        tl.store(output_row_start_ptr + col_idx, output, mask=mask)


def log_softmax(input: torch.Tensor, dim: int = -1) -> torch.Tensor:
    """
    Compute log_softmax using Triton kernel.

    Args:
        input: Input tensor
        dim: Dimension along which to compute log_softmax
            (only -1 or last dim supported)

    Returns:
        Tensor with log_softmax applied along the specified dimension
    """
    if dim != -1 and dim != input.ndim - 1:
        raise ValueError("This implementation only supports log_softmax along "
                         "the last dimension")

    # Flatten all dimensions except the last one
    original_shape = input.shape
    input_2d = input.reshape(-1, input.shape[-1])
    input_2d = input_2d.contiguous()

    n_rows, n_cols = input_2d.shape

    # Allocate output tensor
    output = torch.empty_like(input_2d)

    # Choose block size based on the number of columns
    BLOCK_SIZE = 1024

    # Launch kernel with one block per row
    grid = (n_rows, )
    _log_softmax_kernel[grid](
        input_2d,
        output,
        input_2d.stride(0),
        output.stride(0),
        n_cols,
        BLOCK_SIZE=BLOCK_SIZE,
    )
    # Reshape output back to original shape
    return output.reshape(original_shape)

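
# Illustrative usage sketch (an addition for this write-up, not part of the
# file above): compares the Triton log_softmax wrapper against the PyTorch
# reference on a CUDA tensor. Shapes and tolerances are arbitrary choices.
def _check_log_softmax_against_torch() -> None:
    x = torch.randn(8, 4096, device="cuda", dtype=torch.float32)
    torch.testing.assert_close(log_softmax(x, dim=-1),
                               torch.log_softmax(x, dim=-1),
                               atol=1e-5,
                               rtol=1e-5)
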
@triton.jit
def mean_kernel(
    input_ptr,
    output_ptr,
    input_stride0,
    input_stride1,
    input_stride2,
    output_stride0,
    output_stride1,
    M,  # size before reduction dim
    N,  # size of reduction dim
    K,  # size after reduction dim
    BLOCK_SIZE: tl.constexpr,
):
    """
    Kernel for computing mean along a single dimension.
    Input is viewed as (M, N, K) where N is the dimension being reduced.
    """
    # Program ID gives us which output element we're computing
    pid = tl.program_id(0)

    # Compute output indices
    m_idx = pid // K
    k_idx = pid % K

    # Bounds check
    if m_idx >= M or k_idx >= K:
        return

    # Accumulate sum across reduction dimension
    acc = 0.0
    for n_start in range(0, N, BLOCK_SIZE):
        n_offsets = n_start + tl.arange(0, BLOCK_SIZE)
        mask = n_offsets < N

        # Calculate input indices
        input_idx = m_idx * input_stride0 + n_offsets * input_stride1 \
            + k_idx * input_stride2

        # Load and accumulate
        vals = tl.load(input_ptr + input_idx, mask=mask, other=0.0)
        acc += tl.sum(vals)

    # Compute mean and store
    mean_val = acc / N
    output_idx = m_idx * output_stride0 + k_idx * output_stride1
    tl.store(output_ptr + output_idx, mean_val)


def mean_dim(input: torch.Tensor,
             dim: int,
             keepdim: bool = False,
             dtype: Union[torch.dtype, None] = None) -> torch.Tensor:
    """
    Triton implementation of torch.mean with single dimension reduction.

    Args:
        input: Input tensor
        dim: Single dimension along which to compute mean
        keepdim: Whether to keep the reduced dimension
        dtype: Output dtype. If None, uses input dtype
            (or float32 for integer inputs)

    Returns:
        Tensor with mean values along specified dimension
    """
    # Validate inputs
    assert input.is_cuda, "Input must be a CUDA tensor"
    assert -input.ndim <= dim < input.ndim, (
        f"Invalid dimension {dim} for tensor with {input.ndim} dimensions")

    # Handle negative dim
    if dim < 0:
        dim = dim + input.ndim

    # Handle dtype
    if dtype is None:
        if input.dtype in [torch.int8, torch.int16, torch.int32, torch.int64]:
            dtype = torch.float32
        else:
            dtype = input.dtype

    # Convert input to appropriate dtype if needed
    if input.dtype != dtype:
        input = input.to(dtype)

    # Get input shape and strides
    shape = list(input.shape)

    # Calculate dimensions for kernel
    M = 1
    for i in range(dim):
        M *= shape[i]

    N = shape[dim]

    K = 1
    for i in range(dim + 1, len(shape)):
        K *= shape[i]

    # Reshape input to 3D view (M, N, K)
    input_3d = input.reshape(M, N, K)

    # Create output shape
    if keepdim:
        output_shape = shape.copy()
        output_shape[dim] = 1
    else:
        output_shape = shape[:dim] + shape[dim + 1:]

    # Create output tensor
    output = torch.empty(output_shape, dtype=dtype, device=input.device)

    # Reshape output for kernel
    if keepdim:
        output_2d = output.reshape(M, 1, K).squeeze(1)
    else:
        output_2d = output.reshape(M, K)

    # Launch kernel
    grid = (M * K, )
    BLOCK_SIZE = 1024

    mean_kernel[grid](
        input_3d,
        output_2d,
        input_3d.stride(0),
        input_3d.stride(1),
        input_3d.stride(2),
        output_2d.stride(0),
        output_2d.stride(1) if output_2d.ndim > 1 else 0,
        M,
        N,
        K,
        BLOCK_SIZE,
    )

    return output

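
# Illustrative usage sketch (an addition for this write-up, not part of the
# file above): reduces the middle dimension of a 3D CUDA tensor with the
# Triton mean and checks the result against torch.mean.
def _check_mean_dim_against_torch() -> None:
    x = torch.randn(4, 1024, 16, device="cuda")
    torch.testing.assert_close(mean_dim(x, dim=1), x.mean(dim=1),
                               atol=1e-5, rtol=1e-5)
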
def mm_batch_invariant(a, b):
    return matmul_persistent(a, b)


def addmm_batch_invariant(bias, a, b):
    return matmul_persistent(a, b, bias=bias)


def _log_softmax_batch_invariant(input, dim, _half_to_float):
    assert not _half_to_float, "not implemented"
    return log_softmax(input, dim=dim)


def mean_batch_invariant(input,
                         dim,
                         keepdim=False,
                         dtype: Union[torch.dtype, None] = None):
    assert dtype is None or dtype == torch.float32, \
        f"unsupported dtype: {dtype}"

    result = input.to(torch.float32)

    # Sort dimensions to reduce from largest to smallest to handle shifting
    # dims during iterative reduction.
    sorted_dims = sorted([d % input.ndim for d in dim], reverse=True)

    # Iteratively apply a deterministic mean.
    for d in sorted_dims:
        result = mean_dim(result, dim=d, keepdim=True)

    if not keepdim:
        # Squeeze the reduced dimensions.
        for d in sorted_dims:
            result = result.squeeze(d)

    return result


_batch_invariant_MODE = False
_batch_invariant_LIB = None


def is_batch_invariant_mode_enabled():
    return _batch_invariant_MODE


def enable_batch_invariant_mode():
    global _batch_invariant_MODE, _batch_invariant_LIB
    if _batch_invariant_MODE:
        return

    _batch_invariant_MODE = True
    _batch_invariant_LIB = torch.library.Library("aten", "IMPL")
    _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA")
    _batch_invariant_LIB.impl("aten::addmm", addmm_batch_invariant, "CUDA")
    _batch_invariant_LIB.impl("aten::_log_softmax",
                              _log_softmax_batch_invariant, "CUDA")
    _batch_invariant_LIB.impl("aten::mean.dim", mean_batch_invariant, "CUDA")


def disable_batch_invariant_mode():
    global _batch_invariant_MODE, _batch_invariant_LIB
    if _batch_invariant_LIB is not None:
        _batch_invariant_LIB._destroy()
    _batch_invariant_MODE = False
    _batch_invariant_LIB = None


@contextlib.contextmanager
def set_batch_invariant_mode(enabled: bool = True):
    global _batch_invariant_MODE, _batch_invariant_LIB
    old_data = (_batch_invariant_MODE, _batch_invariant_LIB)
    if enabled:
        enable_batch_invariant_mode()
    else:
        disable_batch_invariant_mode()
    yield
    if _batch_invariant_LIB is not None:
        _batch_invariant_LIB._destroy()
    _batch_invariant_MODE, _batch_invariant_LIB = old_data


AttentionBlockSize = namedtuple("AttentionBlockSize", ["block_m", "block_n"])


def get_batch_invariant_attention_block_size() -> AttentionBlockSize:
    return AttentionBlockSize(block_m=16, block_n=16)


def vllm_kernel_override_batch_invariant():
    env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT"
    is_overridden = False
    val = os.getenv(env_key, "0")
    try:
        is_overridden = int(val) != 0
    except ValueError:
        is_overridden = False
    return is_overridden


def init_batch_invariance():
    # this will hit all the csrc overrides as well
    if vllm_kernel_override_batch_invariant():
        os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
        enable_batch_invariant_mode()

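
# Illustrative usage sketch (an addition for this write-up, not part of the
# file above): `model` and `batch` are placeholders for any CUDA torch code.
# Inside the context manager, aten::mm, aten::addmm, aten::_log_softmax and
# aten::mean.dim route to the deterministic Triton kernels registered above;
# on exit the default kernels are restored.
def _demo_batch_invariant_mode(model, batch):
    with set_batch_invariant_mode(True):
        return model(batch)
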
@@ -40,6 +40,8 @@ def flashinfer_fused_moe_blockscale_fp8(
     assert global_num_experts % 4 == 0
     assert top_k < (topk_group * global_num_experts / num_expert_group)
     assert block_shape == [128, 128]
+    # Routing kernel expects #experts <= #threads 256
+    assert global_num_experts <= 256
 
     a_q, a_sf = per_token_group_quant_fp8(x, block_shape[1])
     # NOTE: scales of hidden states have to be transposed!
@@ -3,6 +3,7 @@
 
 from abc import abstractmethod
 from collections.abc import Iterable
+from contextlib import nullcontext
 from enum import Enum
 from typing import Callable, Literal, Optional, Union, get_args, overload
 
@@ -983,8 +984,7 @@ class FusedMoE(CustomOp):
                         if dp_size is not None else get_dp_group().world_size)
 
         self.is_sequence_parallel = is_sequence_parallel
-        if self.is_sequence_parallel:
-            self.sp_size = tp_size_
+        self.sp_size = tp_size_ if is_sequence_parallel else 1
 
         self.moe_parallel_config: FusedMoEParallelConfig = (
             FusedMoEParallelConfig.make(
@@ -1966,7 +1966,8 @@ class FusedMoE(CustomOp):
             # clamp start and end
             chunk_start = min(chunk_start, num_tokens - 1)
             chunk_end = min(chunk_end, num_tokens)
-            with ctx.dp_metadata.chunked_sizes(moe_dp_chunk_size_per_rank,
+            with ctx.dp_metadata.chunked_sizes(self.sp_size,
+                                               moe_dp_chunk_size_per_rank,
                                                chunk_idx):
                 process_chunk(chunk_start,
                               chunk_end,
@@ -2011,65 +2012,73 @@ class FusedMoE(CustomOp):
         else:
            shared_output = None
 
-        if do_naive_dispatch_combine:
-            hidden_states, router_logits = get_ep_group().dispatch(
-                hidden_states, router_logits)
+        ctx = get_forward_context()
+        sp_ctx = ctx.dp_metadata.sp_local_sizes(
+            self.sp_size) if ctx.dp_metadata else nullcontext()
 
-        # Matrix multiply.
-        final_hidden_states = self.quant_method.apply(
-            layer=self,
-            x=hidden_states,
-            router_logits=router_logits,
-            top_k=self.top_k,
-            renormalize=self.renormalize,
-            use_grouped_topk=self.use_grouped_topk,
-            global_num_experts=self.global_num_experts,
-            expert_map=self.expert_map,
-            topk_group=self.topk_group,
-            num_expert_group=self.num_expert_group,
-            custom_routing_function=self.custom_routing_function,
-            scoring_func=self.scoring_func,
-            routed_scaling_factor=self.routed_scaling_factor,
-            e_score_correction_bias=self.e_score_correction_bias,
-            activation=self.activation,
-            apply_router_weight_on_input=self.apply_router_weight_on_input,
-            enable_eplb=self.enable_eplb,
-            expert_load_view=self.expert_load_view,
-            logical_to_physical_map=self.logical_to_physical_map,
-            logical_replica_count=self.logical_replica_count,
-        )
+        with sp_ctx:
+            if do_naive_dispatch_combine:
+                hidden_states, router_logits = get_ep_group().dispatch(
+                    hidden_states, router_logits, self.is_sequence_parallel)
 
-        if shared_output is not None:
-            assert not isinstance(final_hidden_states, tuple)
-            assert self.shared_experts is not None
-            final_hidden_states = (
-                shared_output,
-                final_hidden_states,
+            # Matrix multiply.
+            final_hidden_states = self.quant_method.apply(
+                layer=self,
+                x=hidden_states,
+                router_logits=router_logits,
+                top_k=self.top_k,
+                renormalize=self.renormalize,
+                use_grouped_topk=self.use_grouped_topk,
+                global_num_experts=self.global_num_experts,
+                expert_map=self.expert_map,
+                topk_group=self.topk_group,
+                num_expert_group=self.num_expert_group,
+                custom_routing_function=self.custom_routing_function,
+                scoring_func=self.scoring_func,
+                routed_scaling_factor=self.routed_scaling_factor,
+                e_score_correction_bias=self.e_score_correction_bias,
+                activation=self.activation,
+                apply_router_weight_on_input=self.apply_router_weight_on_input,
+                enable_eplb=self.enable_eplb,
+                expert_load_view=self.expert_load_view,
+                logical_to_physical_map=self.logical_to_physical_map,
+                logical_replica_count=self.logical_replica_count,
             )
-        elif self.zero_expert_num is not None and self.zero_expert_num > 0:
-            assert isinstance(final_hidden_states, tuple)
-            final_hidden_states, zero_expert_result = final_hidden_states
 
-        def reduce_output(states: torch.Tensor,
-                          do_combine: bool = True) -> torch.Tensor:
-            if do_naive_dispatch_combine and do_combine:
-                states = get_ep_group().combine(states)
+            if shared_output is not None:
+                assert not isinstance(final_hidden_states, tuple)
+                assert self.shared_experts is not None
+                final_hidden_states = (
+                    shared_output,
+                    final_hidden_states,
+                )
+            elif self.zero_expert_num is not None and self.zero_expert_num > 0:
+                assert isinstance(final_hidden_states, tuple)
+                final_hidden_states, zero_expert_result = final_hidden_states
 
-            if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
-                states = self.maybe_all_reduce_tensor_model_parallel(states)
+            def reduce_output(states: torch.Tensor,
+                              do_combine: bool = True) -> torch.Tensor:
+                if do_naive_dispatch_combine and do_combine:
+                    states = get_ep_group().combine(states,
+                                                    self.is_sequence_parallel)
 
-            return states
+                if (not self.is_sequence_parallel and self.reduce_results
+                        and (self.tp_size > 1 or self.ep_size > 1)):
+                    states = self.maybe_all_reduce_tensor_model_parallel(
+                        states)
 
-        if self.shared_experts is not None:
-            return (
-                reduce_output(final_hidden_states[0], do_combine=False),
-                reduce_output(final_hidden_states[1]),
-            )
-        elif self.zero_expert_num is not None and self.zero_expert_num > 0:
-            assert isinstance(final_hidden_states, torch.Tensor)
-            return reduce_output(final_hidden_states) + zero_expert_result
-        else:
-            return reduce_output(final_hidden_states)
+                return states
+
+            if self.shared_experts is not None:
+                return (
+                    reduce_output(final_hidden_states[0], do_combine=False),
+                    reduce_output(final_hidden_states[1]),
+                )
+            elif self.zero_expert_num is not None and self.zero_expert_num > 0:
+                assert isinstance(final_hidden_states, torch.Tensor)
+                return reduce_output(final_hidden_states) + zero_expert_result
+            else:
+                return reduce_output(final_hidden_states)
 
     @classmethod
     def make_expert_params_mapping(
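Note: the sequence-parallel MoE path above splits the replicated token batch across the tensor-parallel group before dispatch and restores it on combine. A minimal sketch of that chunking step, with explicit tp_size/tp_rank arguments standing in for the distributed-group queries used in vLLM (illustrative only, not part of the diff):

import torch
import torch.nn.functional as F


def chunk_for_sequence_parallel(x: torch.Tensor, tp_size: int,
                                tp_rank: int) -> torch.Tensor:
    # Pad the token dimension so it divides evenly across ranks, then take
    # this rank's contiguous slice; an all_gather of the slices restores the
    # original (padded) sequence.
    remainder = x.size(0) % tp_size
    if remainder != 0:
        x = F.pad(x, (0, 0, 0, tp_size - remainder))
    chunk = x.size(0) // tp_size
    return x.narrow(0, tp_rank * chunk, chunk)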
@@ -639,6 +639,19 @@ def runai_safetensors_weights_iterator(
             yield from tensor_iter
 
 
+def _init_loader(
+    pg: torch.distributed.ProcessGroup,
+    device: torch.device,
+    f_list: list[str],
+    *,
+    nogds: bool = False,
+):
+    loader = SafeTensorsFileLoader(pg, device, nogds=nogds)
+    rank_file_map = {i: [f] for i, f in enumerate(f_list)}
+    loader.add_filenames(rank_file_map)
+    return loader
+
+
 def fastsafetensors_weights_iterator(
     hf_weights_files: list[str],
     use_tqdm_on_load: bool,
@@ -656,17 +669,31 @@ def fastsafetensors_weights_iterator(
         for i in range(0, len(hf_weights_files), pg.size())
     ]
 
+    nogds = False
+
     for f_list in tqdm(
             weight_files_sub_lists,
             desc="Loading safetensors using Fastsafetensor loader",
             disable=not enable_tqdm(use_tqdm_on_load),
             bar_format=_BAR_FORMAT,
     ):
-        loader = SafeTensorsFileLoader(pg, device)
-        rank_file_map = {i: [f] for i, f in enumerate(f_list)}
-        loader.add_filenames(rank_file_map)
+        loader = _init_loader(pg, device, f_list, nogds=nogds)
         try:
-            fb = loader.copy_files_to_device()
+            try:
+                fb = loader.copy_files_to_device()
+            except RuntimeError as e:
+                if "gds" not in str(e):
+                    raise
+
+                loader.close()
+                nogds = True
+                logger.warning_once(
+                    "GDS not enabled, setting `nogds=True`.\n"
+                    "For more information, see: https://github.com/foundation-model-stack/fastsafetensors?tab=readme-ov-file#basic-api-usages"
+                )
+                loader = _init_loader(pg, device, f_list, nogds=nogds)
+                fb = loader.copy_files_to_device()
+
             try:
                 keys = list(fb.key_to_rank_lidx.keys())
                 for k in keys:
@@ -9,7 +9,7 @@ from transformers import AriaConfig, AriaTextConfig, BatchFeature
 from transformers.models.aria.modeling_aria import AriaCrossAttention
 from transformers.models.aria.processing_aria import AriaProcessor
 
-from vllm.config import CacheConfig, QuantizationConfig, VllmConfig
+from vllm.config import QuantizationConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.fused_moe import FusedMoE
@@ -38,8 +38,7 @@ from .idefics2_vision_model import (
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsQuant
 from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    is_pp_missing_parameter, maybe_prefix,
-                    merge_multimodal_embeddings)
+                    is_pp_missing_parameter, maybe_prefix)
 
 
 class AriaImagePixelInputs(TensorSchema):
@@ -298,14 +297,12 @@ class AriaTextDecoderLayer(LlamaDecoderLayer):
     Experts (MoE) Layer.
     """
 
-    def __init__(
-        self,
-        config: AriaTextConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
-        super().__init__(config, cache_config, quant_config, prefix)
+    def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
+        super().__init__(vllm_config, prefix)
+
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+
         self.mlp = AriaTextMoELayer(config,
                                     quant_config=quant_config,
                                     prefix=f"{prefix}.mlp")
@@ -605,19 +602,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
         multimodal_embeddings = self._process_image_input(image_input)
         return multimodal_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.config.image_token_index)
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -628,10 +612,11 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     ) -> Union[torch.Tensor, IntermediateTensors]:
         if inputs_embeds is None:
             multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
-            # always pass the input via `inputs_embeds`
-            # to make sure the computation graph is consistent
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      multimodal_embeddings)
+            inputs_embeds = self.get_input_embeddings(
+                input_ids,
+                multimodal_embeddings,
+                is_multimodal=input_ids == self.config.image_token_index,
+            )
             input_ids = None
 
         hidden_states = self.language_model(
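Note: the model changes in this diff replace per-model merge_multimodal_embeddings overrides with an `is_multimodal` boolean mask passed to `get_input_embeddings`. The underlying operation is a masked write of image embeddings into the token-embedding lookup; a self-contained sketch of that idea (names and shapes are illustrative assumptions, not vLLM's API):

import torch


def merge_by_mask(text_embeds: torch.Tensor, mm_embeds: torch.Tensor,
                  is_multimodal: torch.Tensor) -> torch.Tensor:
    # text_embeds: [num_tokens, hidden]; mm_embeds: [num_mm_tokens, hidden];
    # is_multimodal: [num_tokens] bool mask with num_mm_tokens True entries
    # (e.g. input_ids == image_token_id).
    out = text_embeds.clone()
    out[is_multimodal] = mm_embeds.to(out.dtype)
    return out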
@@ -33,8 +33,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+                    init_vllm_registered_model, maybe_prefix)
 
 
 class AyaVisionImagePixelInputs(TensorSchema):
@@ -417,23 +416,6 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         return self._process_image_input(image_input, **kwargs)
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids=input_ids,
-                inputs_embeds=inputs_embeds,
-                multimodal_embeddings=multimodal_embeddings,
-                placeholder_token_id=self.config.image_token_index,
-            )
-
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -449,8 +431,11 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
         # condition is for v0 compatibility.
         elif inputs_embeds is None:
             vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
+            inputs_embeds = self.get_input_embeddings(
+                input_ids,
+                vision_embeddings,
+                is_multimodal=input_ids == self.config.image_token_index,
+            )
             input_ids = None
 
         hidden_states = self.language_model.model(
@@ -348,6 +348,9 @@ class BertModel(nn.Module, SupportsQuant):
         self.encoder = BertEncoder(vllm_config=vllm_config,
                                    prefix=f"{prefix}.encoder")
 
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embeddings(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -457,6 +460,9 @@ class BertEmbeddingModel(nn.Module, SupportsQuant):
                                prefix=maybe_prefix(prefix, "model"))
         self.pooler = self._build_pooler(pooler_config)
 
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -588,6 +594,9 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding,
             ),
         })
 
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.bert.get_input_embeddings(input_ids)
+
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(self)
         loaded_params = loader.load_weights(weights)
@@ -637,6 +646,9 @@ class BertForTokenClassification(nn.Module):
             Pooler.for_encode(pooler_config),
         })
 
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.bert.get_input_embeddings(input_ids)
+
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(self)
         loaded_params = loader.load_weights(weights)
@@ -426,6 +426,9 @@ class BertWithRope(nn.Module, SupportsQuant):
                                    prefix=f"{prefix}.encoder")
         self.pooler = BertPooler(self.config) if add_pooling_layer else None
 
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embeddings(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -673,6 +676,9 @@ class GteNewForSequenceClassification(nn.Module, SupportsCrossEncoding):
         loaded_params = loader.load_weights(weights)
         return loaded_params
 
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.new.get_input_embeddings(input_ids)
+
     def forward(
         self,
         input_ids: Optional[torch.Tensor],
@@ -27,7 +27,7 @@ from .blip import BlipVisionModel
 from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP,
                          SupportsQuant)
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
-                    maybe_prefix, merge_multimodal_embeddings)
+                    maybe_prefix)
 
 # We use this internally as placeholders since there is no image token
 # defined on the HuggingFace repo
@@ -631,19 +631,6 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
         vision_embeddings = self._process_image_input(image_input)
         return vision_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                _IMAGE_TOKEN_ID)
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -689,8 +676,11 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
         # condition is for v0 compatibility.
         elif inputs_embeds is None:
             vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
+            inputs_embeds = self.get_input_embeddings(
+                input_ids,
+                vision_embeddings,
+                is_multimodal=input_ids == _IMAGE_TOKEN_ID,
+            )
             input_ids = None
 
         hidden_states = self.language_model.model(input_ids,
@@ -44,7 +44,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP,
                          SupportsQuant)
 from .utils import (flatten_bn, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix, merge_multimodal_embeddings)
+                    maybe_prefix)
 
 logger = init_logger(__name__)
 
@@ -1002,20 +1002,6 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
         vision_embeddings = self.model.get_input_embeddings(image_tokens)
         return vision_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-
-        inputs_embeds = self.model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.model.vocabulary_mapping.image_token_id)
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -1032,8 +1018,12 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
         # condition is for v0 compatibility.
         elif inputs_embeds is None:
             vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
+            image_token_id = self.model.vocabulary_mapping.image_token_id
+            inputs_embeds = self.get_input_embeddings(
+                input_ids,
+                vision_embeddings,
+                is_multimodal=input_ids == image_token_id,
+            )
             input_ids = None
 
         hidden_states = self.model(input_ids,
@@ -433,6 +433,9 @@ class ChatGLMBaseModel(nn.Module):
         self.make_empty_intermediate_tensors = (
             self.transformer.make_empty_intermediate_tensors)
 
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.transformer.get_input_embeddings(input_ids)
+
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
@@ -37,8 +37,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+                    init_vllm_registered_model, maybe_prefix)
 
 
 class Cohere2VisionImagePixelInputs(TensorSchema):
@@ -430,23 +429,6 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         return self._process_image_input(image_input, **kwargs)
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids=input_ids,
-                inputs_embeds=inputs_embeds,
-                multimodal_embeddings=multimodal_embeddings,
-                placeholder_token_id=self.config.image_token_id,
-            )
-
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -462,8 +444,11 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal,
         # condition is for v0 compatibility.
         elif inputs_embeds is None:
             vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
+            inputs_embeds = self.get_input_embeddings(
+                input_ids,
+                vision_embeddings,
+                is_multimodal=input_ids == self.config.image_token_id,
+            )
             input_ids = None
 
         hidden_states = self.language_model.model(
@@ -66,6 +66,9 @@ class DeepseekV2Model(nn.Module):
         self.norm = RMSNorm(self.config.hidden_size,
                             eps=self.config.rms_norm_eps)
 
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -205,6 +208,9 @@ class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM):
         self.logits_processor = LogitsProcessor(self.config.vocab_size,
                                                 scale=logit_scale)
 
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -101,6 +101,9 @@ class DeepSeekMultiTokenPredictor(nn.Module):
         )
         self.logits_processor = LogitsProcessor(config.vocab_size)
 
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -142,6 +145,9 @@ class DeepSeekMTP(nn.Module, SupportsPP):
                                            prefix=maybe_prefix(
                                                prefix, "model"))
 
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -32,7 +32,6 @@ import torch
 from torch import nn
 from transformers import DeepseekV2Config, DeepseekV3Config
 
-import vllm.envs as envs
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ParallelConfig, VllmConfig
@@ -56,8 +55,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
+from vllm.model_executor.models.utils import sequence_parallel_chunk
 from vllm.sequence import IntermediateTensors
-from vllm.utils import cdiv, direct_register_custom_op
 
 from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
 from .utils import (PPMissingLayer, is_pp_missing_parameter,
@@ -108,43 +107,6 @@ class DeepseekV2MLP(nn.Module):
         return x
 
 
-# Chunk x along the num_tokens axis for sequence parallelism
-# NOTE: This is wrapped in a torch custom op to work around the following issue:
-# The output tensor can have a sequence length 0 at small input sequence lengths
-# even though we explicitly pad to avoid this.
-def sequence_parallel_chunk(x: torch.Tensor) -> torch.Tensor:
-    tp_size = get_tensor_model_parallel_world_size()
-    tp_rank = get_tensor_model_parallel_rank()
-
-    # all_gather needs the sequence length to be divisible by tp_size
-    seq_len = x.size(0)
-    remainder = seq_len % tp_size
-    if remainder != 0:
-        pad_len = tp_size - remainder
-        x = nn.functional.pad(x, (0, 0, 0, pad_len))
-
-    chunk = x.shape[0] // tp_size
-    start = tp_rank * chunk
-    return torch.narrow(x, 0, start, chunk)
-
-
-def sequence_parallel_chunk_fake(x: torch.Tensor) -> torch.Tensor:
-    tp_size = get_tensor_model_parallel_world_size()
-    seq_len = cdiv(x.size(0), tp_size)
-    shape = list(x.shape)
-    shape[0] = seq_len
-    out = torch.empty(shape, dtype=x.dtype, device=x.device)
-    return out
-
-
-direct_register_custom_op(
-    op_name="sequence_parallel_chunk",
-    op_func=sequence_parallel_chunk,
-    fake_impl=sequence_parallel_chunk_fake,
-    tags=(torch.Tag.needs_fixed_stride_order, ),
-)
-
-
 class DeepseekV2MoE(nn.Module):
 
     def __init__(
@@ -166,20 +128,7 @@ class DeepseekV2MoE(nn.Module):
         self.n_routed_experts: int = config.n_routed_experts
         self.n_shared_experts: int = config.n_shared_experts
 
-        # The all_reduce at the end of attention (during o_proj) means that
-        # inputs are replicated across each rank of the tensor parallel group.
-        # If using expert-parallelism with DeepEP All2All ops, replicated
-        # tokens results in useless duplicate computation and communication.
-        #
-        # In this case, ensure the input to the experts is sequence parallel
-        # to avoid the excess work.
-        #
-        # Not needed for pplx-kernels as it can handle duplicate input tokens.
-        self.is_sequence_parallel = (envs.VLLM_ALL2ALL_BACKEND
-                                     in ("deepep_high_throughput",
-                                         "deepep_low_latency")
-                                     and parallel_config.enable_expert_parallel
-                                     and self.tp_size > 1)
+        self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
 
         if config.hidden_act != "silu":
             raise ValueError(f"Unsupported activation: {config.hidden_act}. "
@@ -278,8 +227,7 @@ class DeepseekV2MoE(nn.Module):
         # TODO: We can replace the all_reduce at the end of attn with a
         # reduce_scatter instead of chunking here.
         if self.is_sequence_parallel:
-            hidden_states = torch.ops.vllm.sequence_parallel_chunk(
-                hidden_states)
+            hidden_states = sequence_parallel_chunk(hidden_states)
 
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
@@ -41,8 +41,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+                    init_vllm_registered_model, maybe_prefix)
 
 # The image token id may be various
 _IMAGE_TOKEN = "<image>"
@@ -346,7 +345,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
         model_config = vllm_config.model_config
         tokenizer = cached_tokenizer_from_config(model_config)
-        self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
+        self.image_token_id: int = tokenizer.vocab[_IMAGE_TOKEN]
 
         self.vision = self._init_vision_module(self.vision_config,
                                                quant_config,
@@ -605,19 +604,6 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         vision_embeddings = self._process_image_input(image_input)
         return vision_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.image_token_id)
-        return inputs_embeds
-
     def forward(self,
                 input_ids: torch.Tensor,
                 positions: torch.Tensor,
@@ -632,8 +618,11 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         # condition is for v0 compatibility
         elif inputs_embeds is None:
             vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
+            inputs_embeds = self.get_input_embeddings(
+                input_ids,
+                vision_embeddings,
+                is_multimodal=input_ids == self.image_token_id,
+            )
             input_ids = None
 
         hidden_states = self.language_model(input_ids,
@@ -34,8 +34,7 @@ from vllm.model_executor.models.qwen2_vl import (Qwen2VLDummyInputsBuilder,
                                                  Qwen2VLProcessingInfo)
 from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper,
                                               init_vllm_registered_model,
-                                              maybe_prefix,
-                                              merge_multimodal_embeddings)
+                                              maybe_prefix)
 from vllm.model_executor.models.vision import get_vit_attn_backend
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalDataDict
@@ -796,33 +795,17 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(
-            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
+    def get_multimodal_embeddings(self,
+                                  **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return []
         vision_embeddings = self._process_image_input(image_input)
         return vision_embeddings
 
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                multimodal_embeddings,
-                self.config.image_token_id,
-            )
-
-        return inputs_embeds
-
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor,
         positions: torch.Tensor,
         intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
@@ -830,17 +813,14 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
     ) -> Union[torch.Tensor, IntermediateTensors]:
         if intermediate_tensors is not None:
             inputs_embeds = None
-        elif inputs_embeds is None and kwargs.get("pixel_values") is not None:
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            if image_input is None:
-                inputs_embeds = None
-            else:
-                assert input_ids is not None
-                inputs_embeds = self.get_multimodal_embeddings(
-                    input_ids,
-                    image_input=image_input,
-                )
-                input_ids = None
+        elif inputs_embeds is None:
+            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
+            inputs_embeds = self.get_input_embeddings(
+                input_ids,
+                vision_embeddings,
+                is_multimodal=input_ids == self.config.image_token_id,
+            )
+            input_ids = None
 
         hidden_states = self.language_model(
             input_ids=input_ids,
@@ -60,8 +60,7 @@ from vllm.sequence import IntermediateTensors
 from .ernie45_vl_moe import Ernie4_5_VLMoeForCausalLM
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
                          SupportsMultiModal, SupportsPP)
-from .utils import (AutoWeightsLoader, WeightsMapper, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
 from .vision import get_vit_attn_backend
 
 logger = init_logger(__name__)
@@ -1467,18 +1466,24 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
         self,
         input_ids: torch.Tensor,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        *,
+        is_multimodal: Optional[torch.Tensor] = None,
+        handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-
-        if multimodal_embeddings is None:
-            return inputs_embeds
-
-        self._set_visual_token_mask(input_ids)
-        inputs_embeds = merge_multimodal_embeddings(input_ids, inputs_embeds,
-                                                    multimodal_embeddings,
-                                                    [self.config.im_patch_id])
-        return inputs_embeds
+        if multimodal_embeddings is not None and len(
+                multimodal_embeddings) > 0:
+            self._set_visual_token_mask(input_ids)
+
+        # This is to satisfy the type checker for each overload
+        if multimodal_embeddings is None or is_multimodal is None:
+            return super().get_input_embeddings(input_ids)
+
+        return super().get_input_embeddings(
+            input_ids,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
 
     def forward(
         self,
Some files were not shown because too many files have changed in this diff.