Compare commits

...

22 Commits

Author SHA1 Message Date
a5dd03c1eb Revert "[V0 deprecation] Remove V0 CPU/XPU/TPU backends (#20412)"
This reverts commit e202dd2736bc575b11250b15311512d19d3225d5.
2025-07-06 14:02:36 -07:00
c18b3b8e8b [Bugfix] Add use_cross_encoder flag to use correct activation in ClassifierPooler (#20527)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-06 14:01:48 -07:00
9528e3a05e [BugFix][Spec Decode] Fix spec token ids in model runner (#20530)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-06 19:44:52 +00:00
9fb52e523a [V1] Support any head size for FlexAttention backend (#20467)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-06 09:54:36 -07:00
e202dd2736 [V0 deprecation] Remove V0 CPU/XPU/TPU backends (#20412)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2025-07-06 08:48:13 -07:00
43813e6361 [Misc] call the pre-defined func (#20518)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-06 10:25:29 +00:00
cede942b87 [Benchmark] Add support for multiple batch size benchmark through CLI in benchmark_moe.py (#20516)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-07-06 09:20:11 +00:00
fe1e924811 [Frontend] Support image object in llm.chat (#19635)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Signed-off-by: Flora Feng <4florafeng@gmail.com>
2025-07-06 06:47:13 +00:00
4548c03c50 [TPU][Bugfix] fix the MoE OOM issue (#20339)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-07-05 21:19:09 -07:00
40b86aa05e [BugFix] Fix: ImportError when building on hopper systems (#20513)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-07-06 12:17:30 +08:00
432870829d [Bugfix] Fix missing per_act_token parameter in compressed_tensors_moe (#20509)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-07-06 12:08:30 +08:00
f73d02aadc [BUG] Fix #20484. Support empty sequence in cuda penalty kernel (#20491)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
2025-07-05 19:38:02 -07:00
c5ebe040ac test_attention compat with coming xformers change (#20487)
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-05 19:37:59 -07:00
8d763cb891 [Misc] remove unused import (#20517)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-05 19:17:06 -07:00
cf4cd53982 [Misc] Add logger.exception for TPU information collection failures (#20510)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-05 07:24:32 -07:00
32c9be2200 [v1] Re-add fp32 support to v1 engine through FlexAttention (#19754)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-05 09:41:10 +00:00
8aeaa910a2 Fix unknown attribute of topk_indices_dtype in CompressedTensorsW8A8Fp8MoECutlassMethod (#20507)
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-07-05 14:03:20 +08:00
906e05d840 [Misc] Remove the unused LoRA test code (#20494)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-05 13:48:16 +08:00
ef9a2990ae [doc] small fix (#20506)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-04 20:56:39 -07:00
7e90870491 [Misc] Add security warning for development mode endpoints (#20508)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-04 20:52:13 -07:00
d3f05c9248 [Doc] fix mutltimodal_inputs.md gh examples link (#20497)
Signed-off-by: Guy Stone <guys@spotify.com>
2025-07-04 16:41:35 -07:00
c108781c85 [CI Bugfix] Fix pre-commit failures on main (#20502)
2025-07-04 14:17:30 -07:00
58 changed files with 659 additions and 228 deletions

View File

@ -107,10 +107,9 @@ fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/stest_attention_selector.py \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_blocksparse_attention.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \

.github/CODEOWNERS
View File

@ -16,7 +16,7 @@
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm
/vllm/entrypoints @aarnphm
CMakeLists.txt @tlrmchlsmth
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people

View File

@ -68,7 +68,7 @@ jobs:
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
- name: curl test
run: |

View File

@ -620,7 +620,7 @@ def main(args: argparse.Namespace):
4096,
]
else:
batch_sizes = [args.batch_size]
batch_sizes = args.batch_size
use_deep_gemm = bool(args.use_deep_gemm)
@ -728,7 +728,7 @@ if __name__ == "__main__":
)
parser.add_argument("--use-deep-gemm", action="store_true")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--batch-size", type=int, nargs="+", required=False)
parser.add_argument("--tune", action="store_true")
parser.add_argument("--trust-remote-code", action="store_true")
parser.add_argument("--model-prefix", type=str, required=False)

View File

@ -239,11 +239,6 @@ void cutlass_moe_mm(
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
bool per_act_token, bool per_out_ch);
void cutlass_blockwise_scaled_grouped_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& scales_a, const torch::Tensor& scales_b,
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets);
void cutlass_fp4_group_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,

View File

@ -1,3 +1,5 @@
#include "core/registration.h"
#include <torch/all.h>
#include <cutlass/arch/arch.h>
@ -364,4 +366,9 @@ void cutlass_blockwise_scaled_grouped_mm(
TORCH_CHECK(false, "Unsupported output tensor type");
}
#endif
}
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("cutlass_blockwise_scaled_grouped_mm",
&cutlass_blockwise_scaled_grouped_mm);
}

View File

@ -59,6 +59,8 @@ void apply_repetition_penalties_(
int vocab_size = logits.size(-1);
int num_seqs = logits.size(0);
if (num_seqs == 0) return;
// Get number of SMs on the current device
int sms = 0;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount,

View File

@ -399,8 +399,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"Tensor scales_a, Tensor scales_b, "
"Tensor problem_sizes, Tensor expert_offsets) -> ()",
{stride_tag});
ops.impl("cutlass_blockwise_scaled_grouped_mm", torch::kCUDA,
&cutlass_blockwise_scaled_grouped_mm);
// conditionally compiled so impl registration is in source file
// cutlass nvfp4 block scaled group GEMM
ops.def(

View File

@ -14,7 +14,7 @@ Before setting up the incremental build:
VLLM_USE_PRECOMPILED=1 uv pip install -U -e . --torch-backend=auto
```
2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu/cuda.inc.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
3. **Build Tools:** It is highly recommended to install `ccache` for fast rebuilds by caching compilation results (e.g., `sudo apt install ccache` or `conda install ccache`). Also, ensure the core build dependencies like `cmake` and `ninja` are installed. These are installable through `requirements/build.txt` or your system's package manager.

View File

@ -101,6 +101,49 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
```python
import torch

from vllm import LLM
from vllm.assets.image import ImageAsset
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image_url = "https://picsum.photos/id/32/512/512"
image_pil = ImageAsset('cherry_blossom').pil_image
image_embeds = torch.load(...)
conversation = [
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": "Hello"},
{"role": "assistant", "content": "Hello! How can I assist you today?"},
{
"role": "user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
},{
"type": "image_pil",
"image_pil": image_pil
}, {
"type": "image_embeds",
"image_embeds": image_embeds
}, {
"type": "text",
"text": "What's in these images?"
}],
},
]
# Perform inference and log output.
outputs = llm.chat(conversation)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
??? Code
@ -228,7 +271,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
For certain models, we provide alternative chat templates inside <gh-dir:vllm/examples>.
For certain models, we provide alternative chat templates inside <gh-dir:examples>.
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
### Image Inputs

View File

@ -626,9 +626,6 @@ Specified using `--task generate`.
!!! note
Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently.
!!! note
`h2oai/h2ovl-mississippi-2b` will be available in V1 once we support head size 80.
!!! note
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
@ -671,11 +668,8 @@ Specified using `--task generate`.
Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
!!! note
To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from source via
`pip install git+https://github.com/huggingface/transformers.git`.
Read audio from video pre-processing is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
`--mm-processor-kwargs '{"use_audio_in_video": true}'`.
For Qwen2.5-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`)
is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
#### Transcription

View File

@ -6,6 +6,7 @@ import argparse
from vllm import LLM
from vllm.sampling_params import SamplingParams
from vllm.assets.image import ImageAsset
# This script is an offline demo for running Mistral-Small-3.1
#
@ -71,14 +72,16 @@ def run_simple_demo(args: argparse.Namespace):
)
prompt = "Describe this image in one sentence."
image_url = "https://picsum.photos/id/237/200/300"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": image_url}},
{
"type": "image_pil",
"image_pil": ImageAsset("cherry_blossom").pil_image,
},
],
},
]

View File

@ -98,7 +98,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
prompts = [f"Question: {question} Answer:" for question in questions]
engine_args = EngineArgs(
model="Salesforce/blip2-opt-6.7b",
model="Salesforce/blip2-opt-2.7b",
limit_mm_per_prompt={modality: 1},
)
@ -971,7 +971,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
)
# Qwen
# Qwen-VL
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"

View File

@ -264,10 +264,8 @@ def test_parse_chat_messages_multiple_images(
"url": image_url
}
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
"type": "image_pil",
"image_pil": ImageAsset('cherry_blossom').pil_image
}, {
"type": "text",
"text": "What's in these images?"
@ -303,10 +301,8 @@ async def test_parse_chat_messages_multiple_images_async(
"url": image_url
}
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
"type": "image_pil",
"image_pil": ImageAsset('cherry_blossom').pil_image
}, {
"type": "text",
"text": "What's in these images?"

View File

@ -450,7 +450,8 @@ def test_multi_query_kv_attention(
start += seq_len
# xformers.AttentionBias to Tensor for use in reference impl.
alibi_bias = [
b.materialize(b.shape, device=device).squeeze() for b in attn_bias
b.materialize((1, num_query_heads, i, i), device=device).squeeze()
for b, i in zip(attn_bias, seq_lens)
]
else:
attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)

View File

@ -171,7 +171,7 @@ def test_env(
expected = "FLASHINFER_VLLM_V1" if use_v1 else name
assert backend.get_name() == expected
else:
backend = get_attn_backend(16,
backend = get_attn_backend(32,
torch.float16,
torch.float16,
block_size,
@ -180,6 +180,45 @@ def test_env(
expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
assert backend.get_name() == expected
if use_v1:
backend = get_attn_backend(16,
torch.float16,
torch.float16,
block_size,
False,
use_mla=use_mla)
assert backend.get_name() == "FLEX_ATTENTION", (
"Should fallback to FlexAttention if head size is "
"not supported by FlashAttention")
@pytest.mark.parametrize("device", ["cpu", "cuda"])
@pytest.mark.parametrize("use_v1", [True, False])
def test_fp32_fallback(
device: str,
use_v1: bool,
monkeypatch: pytest.MonkeyPatch,
):
"""Test attention backend selection with fp32."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
if device == "cpu":
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float32, torch.float32,
16, False)
assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
if use_v1 else "TORCH_SDPA")
elif device == "cuda":
with patch("vllm.attention.selector.current_platform",
CudaPlatform()):
backend = get_attn_backend(16, torch.float32, torch.float32,
16, False)
assert (backend.get_name() == "FLEX_ATTENTION"
if use_v1 else "XFORMERS")
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
"""Test FlashAttn validation."""

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# DeepGEMM Style Cutlass Grouped GEMM Test
# See https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py

View File

@ -75,3 +75,51 @@ def test_apply_repetition_penalties(
# Test the operator by applying the opcheck utility
opcheck(torch.ops._C.apply_repetition_penalties_,
(logits.clone(), prompt_mask, output_mask, repetition_penalties))
@pytest.mark.skipif(not current_platform.is_cuda(),
reason="This test for checking CUDA kernel")
@torch.inference_mode()
def test_apply_repetition_penalties_zero_seqs() -> None:
"""
Test the apply_repetition_penalties custom op with num_seqs=0
against a reference implementation.
"""
num_seqs = 0
vocab_size = 17
repetition_penalty = 1.05
dtype = torch.float32
seed = 0
current_platform.seed_everything(seed)
torch.set_default_device("cuda:0")
# Create test data
logits = torch.randn(num_seqs, vocab_size, dtype=dtype)
# Create masks with some random tokens marked as repeated
prompt_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
# No tokens to mark as repeated since num_seqs=0
# Create repetition penalties tensor
repetition_penalties = torch.full((num_seqs, ),
repetition_penalty,
dtype=dtype)
# Run all three implementations
logits_torch = logits.clone()
logits_cuda = logits.clone()
apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask,
repetition_penalties)
apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask,
repetition_penalties)
# Compare all outputs to reference
torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3)
# Test the operator by applying the opcheck utility
opcheck(torch.ops._C.apply_repetition_penalties_,
(logits.clone(), prompt_mask, output_mask, repetition_penalties))

View File

@ -249,23 +249,6 @@ def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
model_runner.model)
@pytest.fixture(params=[True, False])
def run_with_both_engines_lora(request, monkeypatch):
# Automatically runs tests twice, once with V1 and once without
use_v1 = request.param
# Tests decorated with `@skip_v1` are only run without v1
skip_v1 = request.node.get_closest_marker("skip_v1")
if use_v1:
if skip_v1:
pytest.skip("Skipping test on vllm V1")
monkeypatch.setenv('VLLM_USE_V1', '1')
else:
monkeypatch.setenv('VLLM_USE_V1', '0')
yield
@pytest.fixture
def reset_default_device():
"""

View File

@ -33,9 +33,6 @@ if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
REQUIRES_V0_MODELS = [
# V1 Test: no way to fall back for head_dim = 80
# https://github.com/vllm-project/vllm/issues/14524
"qwen_vl",
# V1 Test: not enough KV cache space in C1.
"fuyu",
]
@ -221,8 +218,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=32)],
),
"blip2": VLMTestInfo(
# TODO: Change back to 2.7b once head_dim = 80 is supported
models=["Salesforce/blip2-opt-6.7b"],
models=["Salesforce/blip2-opt-2.7b"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
img_idx_to_prompt=lambda idx: "",
@ -340,8 +336,7 @@ VLM_TEST_SETTINGS = {
"h2ovl": VLMTestInfo(
models = [
"h2oai/h2ovl-mississippi-800m",
# TODO: Re-enable once head_dim = 80 is supported
# "h2oai/h2ovl-mississippi-2b",
"h2oai/h2ovl-mississippi-2b",
],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501

View File

@ -83,7 +83,7 @@ MODELS = [
QWEN2_CONFIG,
PHI3_CONFIG,
GPT2_CONFIG,
# STABLELM_CONFIG, # enable this when v1 support head_size=80
STABLELM_CONFIG,
DOLPHIN_CONFIG,
# STARCODER_CONFIG, # broken
]

View File

@ -240,8 +240,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",
trust_remote_code=True),
"PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2", v0_only=True),
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
"Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
# Blocksparse attention not supported in V1 yet
"Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct",
trust_remote_code=True,
v0_only=True),
@ -258,10 +259,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
"Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"), # noqa: E501
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501
v0_only=True),
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t",
v0_only=True),
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"), # noqa: E501
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
"Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
"SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
"TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
@ -330,8 +329,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b", # noqa: E501
extras={"6b": "Salesforce/blip2-opt-6.7b"}, # noqa: E501
v0_only=True),
extras={"6b": "Salesforce/blip2-opt-6.7b"}), # noqa: E501
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501
"DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501
extras={"fork": "Isotr0py/deepseek-vl2-tiny"}, # noqa: E501
@ -359,8 +357,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True),
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
trust_remote_code=True,
v0_only=True),
trust_remote_code=True),
"Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
max_model_len=10240),
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",

View File

@ -22,7 +22,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
model_info.check_transformers_version(on_fail="skip")
# FIXME: Possible memory leak in the previous tests?
if model_arch == "GraniteSpeechForConditionalGeneration":
if model_arch in ("GraniteSpeechForConditionalGeneration",
"KimiVLForConditionalGeneration"):
pytest.skip("Avoid OOM")
# Avoid OOM and reduce initialization time by only using 1 layer

View File

@ -450,6 +450,7 @@ def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):
def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
error_msg = f"{layer_1} must come before the current layer"
@ -478,6 +479,7 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
invalid_layer = "model.layers.0.cross_attn.attn"
@ -506,6 +508,7 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
def test_init_kv_cache_with_kv_sharing_target_same_as_current():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
error_msg = f"{layer_1} cannot be the same as the current layer"
@ -534,6 +537,7 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():
def test_init_kv_cache_without_kv_sharing():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
vllm_config = get_vllm_config()
@ -601,6 +605,7 @@ def test_init_kv_cache_without_kv_sharing():
def test_init_kv_cache_with_kv_sharing_valid():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
vllm_config = get_vllm_config()

View File

@ -310,7 +310,8 @@ class MultiHeadAttention(nn.Module):
# currently, only torch_sdpa is supported on rocm
self.attn_backend = _Backend.TORCH_SDPA
else:
if backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
if backend in (_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1,
_Backend.FLEX_ATTENTION):
backend = _Backend.XFORMERS
self.attn_backend = backend if backend in {

View File

@ -4,7 +4,7 @@
import os
from contextlib import contextmanager
from functools import cache
from typing import Generator, Optional, Type
from typing import Generator, Optional, Union
import torch
@ -79,6 +79,33 @@ def get_global_forced_attn_backend() -> Optional[_Backend]:
return forced_attn_backend
def supports_head_size(
attn_backend: Union[str, type[AttentionBackend]],
head_size: int,
) -> bool:
if isinstance(attn_backend, str):
try:
attn_backend = resolve_obj_by_qualname(attn_backend)
except ImportError:
return False
assert isinstance(attn_backend, type)
# TODO: Update the interface once V0 is removed
if get_supported_head_sizes := getattr(attn_backend,
"get_supported_head_sizes", None):
return head_size in get_supported_head_sizes()
if validate_head_size := getattr(attn_backend, "validate_head_size", None):
try:
validate_head_size(head_size)
return True
except Exception:
return False
raise NotImplementedError(f"{attn_backend.__name__} does not support "
"head size validation")
def get_attn_backend(
head_size: int,
dtype: torch.dtype,
@ -87,7 +114,7 @@ def get_attn_backend(
is_attention_free: bool,
is_blocksparse: bool = False,
use_mla: bool = False,
) -> Type[AttentionBackend]:
) -> type[AttentionBackend]:
"""Selects which attention backend to use and lazily imports it."""
# Accessing envs.* behind an @lru_cache decorator can cause the wrong
# value to be returned from the cache if the value changes between calls.
@ -115,7 +142,7 @@ def _cached_get_attn_backend(
is_blocksparse: bool = False,
use_v1: bool = False,
use_mla: bool = False,
) -> Type[AttentionBackend]:
) -> type[AttentionBackend]:
if is_blocksparse:
logger.info("Using BlocksparseFlashAttention backend.")
from vllm.attention.backends.blocksparse_attn import (
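
A hedged usage sketch for the new `supports_head_size` helper; the backend path matches the constant introduced in the CUDA platform hunk further down, and `head_size=80` is just an illustrative value that FlashAttention's supported list does not include:

```python
# Query whether a backend (given as a qualified class path or a class) can
# handle a head size; unsupported sizes fall back to FlexAttention elsewhere.
from vllm.attention.selector import supports_head_size

FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"

if supports_head_size(FLASH_ATTN_V1, head_size=80):
    print("FlashAttention handles head_size=80")
else:
    print("head_size=80 not supported; FlexAttention accepts any head size")
```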

View File

@ -2319,7 +2319,7 @@ class SchedulerConfig:
if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
logger.warning(
"max_num_batched_tokens (%d) exceeds max_num_seqs"
"max_num_batched_tokens (%d) exceeds max_num_seqs "
"* max_model_len (%d). This may lead to unexpected behavior.",
self.max_num_batched_tokens,
self.max_num_seqs * self.max_model_len)

View File

@ -1393,13 +1393,6 @@ class EngineArgs:
recommend_to_remove=False)
return False
# Only Fp16 and Bf16 dtypes since we only support FA.
V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
if model_config.dtype not in V1_SUPPORTED_DTYPES:
_raise_or_fallback(feature_name=f"--dtype {model_config.dtype}",
recommend_to_remove=False)
return False
# No Mamba or Encoder-Decoder so far.
if not model_config.is_v1_compatible:
_raise_or_fallback(feature_name=model_config.architectures,

View File

@ -28,7 +28,8 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam,
ChatCompletionToolMessageParam)
from openai.types.chat.chat_completion_content_part_input_audio_param import (
InputAudio)
from pydantic import TypeAdapter
from PIL import Image
from pydantic import BaseModel, ConfigDict, TypeAdapter
# yapf: enable
from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
ProcessorMixin)
@ -91,6 +92,25 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
"""The type of the content part."""
class PILImage(BaseModel):
"""
A PIL.Image.Image object.
"""
image_pil: Image.Image
model_config = ConfigDict(arbitrary_types_allowed=True)
class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
"""A simpler version of the param that only accepts a PIL image.
Example:
{
"image_pil": ImageAsset('cherry_blossom').pil_image
}
"""
image_pil: Required[PILImage]
class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
"""A simpler version of the param that only accepts a plain image_url.
This is supported by OpenAI API, although it is not documented.
@ -129,6 +149,7 @@ ChatCompletionContentPartParam: TypeAlias = Union[
OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
ChatCompletionContentPartInputAudioParam,
ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
CustomChatCompletionContentPILImageParam,
CustomChatCompletionContentSimpleImageParam,
ChatCompletionContentPartImageEmbedsParam,
CustomChatCompletionContentSimpleAudioParam,
@ -631,6 +652,10 @@ class BaseMultiModalContentParser(ABC):
image_embeds: Union[str, dict[str, str]]) -> None:
raise NotImplementedError
@abstractmethod
def parse_image_pil(self, image_pil: Image.Image) -> None:
raise NotImplementedError
@abstractmethod
def parse_audio(self, audio_url: str) -> None:
raise NotImplementedError
@ -677,6 +702,10 @@ class MultiModalContentParser(BaseMultiModalContentParser):
self._add_placeholder(placeholder)
def parse_image_pil(self, image_pil: Image.Image) -> None:
placeholder = self._tracker.add("image", image_pil)
self._add_placeholder(placeholder)
def parse_audio(self, audio_url: str) -> None:
audio = self._connector.fetch_audio(audio_url)
@ -733,6 +762,13 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
placeholder = self._tracker.add("image_embeds", future)
self._add_placeholder(placeholder)
def parse_image_pil(self, image_pil: Image.Image) -> None:
future: asyncio.Future[Image.Image] = asyncio.Future()
future.set_result(image_pil)
placeholder = self._tracker.add("image", future)
self._add_placeholder(placeholder)
def parse_audio(self, audio_url: str) -> None:
audio_coro = self._connector.fetch_audio_async(audio_url)
@ -851,12 +887,13 @@ _TextParser = partial(cast, ChatCompletionContentPartTextParam)
_ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam)
_InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
_RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
_PILImageParser = partial(cast, CustomChatCompletionContentPILImageParam)
# Need to validate url objects
_ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
_AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
_VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio]
_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
# Define a mapping from part types to their corresponding parsing functions.
MM_PARSER_MAP: dict[
@ -869,6 +906,7 @@ MM_PARSER_MAP: dict[
lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
"image_embeds":
lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
"image_pil": lambda part: _PILImageParser(part).get("image_pil", None),
"audio_url":
lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
"input_audio":
@ -938,7 +976,7 @@ def _parse_chat_message_content_mm_part(
VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
"image_embeds",
"image_embeds", "image_pil",
"audio_url", "input_audio", "video_url")
@ -1009,6 +1047,10 @@ def _parse_chat_message_content_part(
else:
return str_content
if part_type == "image_pil":
image_content = cast(Image.Image, content)
mm_parser.parse_image_pil(image_content)
return {'type': 'image'} if wrap_dicts else None
if part_type == "image_url":
str_content = cast(str, content)
mm_parser.parse_image(str_content)

View File

@ -1204,7 +1204,7 @@ class LLM:
input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
pooling_params = PoolingParams()
pooling_params = PoolingParams(use_cross_encoder=True)
tokenization_kwargs: dict[str, Any] = {}
_validate_truncation_size(self.llm_engine.model_config.max_model_len,

View File

@ -910,6 +910,8 @@ TASK_HANDLERS: dict[str, dict[str, tuple]] = {
}
if envs.VLLM_SERVER_DEV_MODE:
logger.warning("SECURITY WARNING: Development endpoints are enabled! "
"This should NOT be used in production!")
@router.get("/server_info")
async def show_server_info(raw_request: Request):

View File

@ -1156,8 +1156,9 @@ class ScoreRequest(OpenAIBaseModel):
# --8<-- [end:score-extra-params]
def to_pooling_params(self):
return PoolingParams(additional_data=self.additional_data)
def to_pooling_params(self, *, use_cross_encoder: bool = False):
return PoolingParams(use_cross_encoder=use_cross_encoder,
additional_data=self.additional_data)
class RerankRequest(OpenAIBaseModel):
@ -1182,8 +1183,9 @@ class RerankRequest(OpenAIBaseModel):
# --8<-- [end:rerank-extra-params]
def to_pooling_params(self):
return PoolingParams(additional_data=self.additional_data)
def to_pooling_params(self, *, use_cross_encoder: bool = False):
return PoolingParams(use_cross_encoder=use_cross_encoder,
additional_data=self.additional_data)
class RerankDocument(BaseModel):

View File

@ -25,9 +25,7 @@ from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
PreTrainedTokenizer,
PreTrainedTokenizerFast)
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import make_async, merge_async_iterators
logger = init_logger(__name__)
@ -50,7 +48,7 @@ class ServingScores(OpenAIServing):
async def _embedding_score(
self,
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
tokenizer: AnyTokenizer,
texts_1: list[str],
texts_2: list[str],
request: Union[RerankRequest, ScoreRequest],
@ -141,7 +139,7 @@ class ServingScores(OpenAIServing):
async def _cross_encoding_score(
self,
tokenizer: Union[AnyTokenizer],
tokenizer: AnyTokenizer,
texts_1: list[str],
texts_2: list[str],
request: Union[RerankRequest, ScoreRequest],
@ -190,7 +188,7 @@ class ServingScores(OpenAIServing):
# Schedule the request and get the result generator.
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
pooling_params = request.to_pooling_params()
pooling_params = request.to_pooling_params(use_cross_encoder=True)
for i, engine_prompt in enumerate(engine_prompts):
request_id_item = f"{request_id}-{i}"

View File

@ -6,6 +6,7 @@ from typing import Union
import regex as re
from vllm.entrypoints.chat_utils import random_tool_call_id
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaFunctionCall, DeltaMessage,
DeltaToolCall,
@ -15,7 +16,6 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser, ToolParserManager)
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import random_uuid
logger = init_logger(__name__)
@ -267,7 +267,7 @@ class DeepSeekV3ToolParser(ToolParser):
DeltaToolCall(
index=self.current_tool_id,
type="function",
id=f"chatcmpl-tool-{random_uuid()}",
id=random_tool_call_id(),
function=DeltaFunctionCall(
name=function_name).model_dump(
exclude_none=True),

View File

@ -3,10 +3,11 @@
# ruff: noqa
import json
from collections.abc import Sequence
from typing import Any, Dict, List, Optional, Union
from typing import Any, Optional, Union
import regex as re
from vllm.entrypoints.chat_utils import random_tool_call_id
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaFunctionCall, DeltaMessage,
DeltaToolCall,
@ -225,7 +226,7 @@ class xLAMToolParser(ToolParser):
function_name = name_match.group(1)
# The test expects us to send just the name first
tool_id = f"chatcmpl-tool-{random_uuid()}"
tool_id = random_tool_call_id()
delta = DeltaMessage(tool_calls=[
DeltaToolCall(
index=0,

View File

@ -0,0 +1,147 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 5
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"3072": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
}
}

View File

@ -322,7 +322,7 @@ def cutlass_moe_fp8(
topk_ids: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
per_act_token: bool,
per_act_token: Optional[bool] = None,
activation: str = "silu",
a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None,
@ -366,6 +366,9 @@ def cutlass_moe_fp8(
Returns:
- torch.Tensor: The fp16 output tensor after applying the MoE layer.
"""
if per_act_token is None:
per_act_token = a1_scale.numel() != 1 if a1_scale is not None else (
a2_scale.numel() != 1 if a2_scale is not None else False)
per_out_ch = w1_scale.numel() != w1_q.size(0)
num_experts = global_num_experts if global_num_experts != -1 else w1_q.size(
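
The change above makes `per_act_token` optional and infers it from the activation scales when the caller omits it. A small standalone sketch of that inference (dummy tensors, not the real kernel inputs):

```python
# Per-token scales have one entry per token (numel != 1); per-tensor scales
# have a single element, so per_act_token defaults to False in that case.
import torch

def infer_per_act_token(per_act_token=None, a1_scale=None, a2_scale=None):
    if per_act_token is None:
        per_act_token = a1_scale.numel() != 1 if a1_scale is not None else (
            a2_scale.numel() != 1 if a2_scale is not None else False)
    return per_act_token

assert infer_per_act_token(a1_scale=torch.ones(128, 1)) is True
assert infer_per_act_token(a1_scale=torch.ones(1)) is False
assert infer_per_act_token() is False
```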

View File

@ -1320,8 +1320,13 @@ class FusedMoE(torch.nn.Module):
def forward(self, hidden_states: torch.Tensor,
router_logits: torch.Tensor):
return torch.ops.vllm.moe_forward(hidden_states, router_logits,
self.layer_name)
# TODO: Once the OOM issue for the TPU backend is resolved, we will
# switch to using the moe_forward custom op.
if current_platform.is_tpu():
return self.forward_impl(hidden_states, router_logits)
else:
return torch.ops.vllm.moe_forward(hidden_states, router_logits,
self.layer_name)
def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
full_router_logits: torch.Tensor):

View File

@ -15,6 +15,7 @@ from vllm.model_executor.pooling_metadata import ( # noqa: E501
from vllm.model_executor.pooling_metadata import PoolingTensors
from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput
from vllm.transformers_utils.config import (
get_classification_activation_function,
get_cross_encoder_activation_function)
from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata
@ -388,15 +389,14 @@ class ClassifierPooler(nn.Module):
self.classifier = classifier
self.pooler = pooler
if config.task == "score":
self.default_activation_function = \
get_cross_encoder_activation_function(config.hf_config)
elif config.task == "classify":
self.default_activation_function = nn.Sigmoid() \
if config.hf_config.num_labels == 1 else nn.Softmax()
else:
raise NotImplementedError(f"task={config.task!r} is not supported"
" with the classification pooler")
self.classification_act_fn = get_classification_activation_function(
config.hf_config)
self.cross_encoder_act_fn = get_cross_encoder_activation_function(
config.hf_config)
def _get_act_fn(self, use_cross_encoder: bool):
return (self.cross_encoder_act_fn
if use_cross_encoder else self.classification_act_fn)
def get_prompt_lens(
self,
@ -446,8 +446,28 @@ class ClassifierPooler(nn.Module):
# apply classifier once on the full batch if possible
pooled_output = self.classifier(pooled_output)
# shape: (batch_size, num_labels)
scores = self.default_activation_function(pooled_output)
if isinstance(pooling_metadata, V0PoolingMetadata):
use_cross_encoder_list = [
pooling_param.use_cross_encoder
for _, pooling_param in pooling_metadata.seq_groups
]
else:
use_cross_encoder_list = [
pooling_param.use_cross_encoder
for pooling_param in pooling_metadata.pooling_params
]
# shape of scores: (batch_size, num_labels)
if all(use_cross_encoder == use_cross_encoder_list[0]
for use_cross_encoder in use_cross_encoder_list):
act_fn = self._get_act_fn(use_cross_encoder_list[0])
scores = act_fn(pooled_output)
else:
scores = torch.stack([
self._get_act_fn(use_cross_encoder)(vecs)
for use_cross_encoder, vecs in zip(use_cross_encoder_list,
pooled_output)
])
pooled_outputs = [PoolingSequenceGroupOutput(data) for data in scores]
return PoolerOutput(outputs=pooled_outputs)
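
A hedged, self-contained sketch of the activation dispatch the hunk above introduces: score (cross-encoder) requests and plain classification requests in the same batch pick different activation functions, applied in one batched call when the flags agree and row by row otherwise. The activations below are placeholders for what `get_cross_encoder_activation_function` / `get_classification_activation_function` would return:

```python
import torch
import torch.nn as nn

cross_encoder_act = nn.Sigmoid()         # stand-in for the sentence-transformers fn
classification_act = nn.Softmax(dim=-1)  # stand-in for the classification fn

def apply_acts(pooled: torch.Tensor, use_cross_encoder: list[bool]) -> torch.Tensor:
    if all(flag == use_cross_encoder[0] for flag in use_cross_encoder):
        act = cross_encoder_act if use_cross_encoder[0] else classification_act
        return act(pooled)  # one batched call when all requests agree
    return torch.stack([
        (cross_encoder_act if flag else classification_act)(row)
        for flag, row in zip(use_cross_encoder, pooled)
    ])

scores = apply_acts(torch.randn(4, 2), [True, True, False, True])
print(scores.shape)  # torch.Size([4, 2])
```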

View File

@ -368,6 +368,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
"weights")
self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
"input_activations")
self.topk_indices_dtype = None
per_tensor = (self.weight_quant.strategy == QuantizationStrategy.TENSOR
and self.input_quant.strategy
@ -738,6 +739,7 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
from vllm.model_executor.layers.fused_moe.cutlass_moe import (
cutlass_moe_fp8)
self.topk_indices_dtype = None
self.fused_experts = cutlass_moe_fp8 # type: ignore
self.disable_expert_map = False

View File

@ -104,8 +104,12 @@ class TensorizerLoader(BaseModelLoader):
if is_vllm_tensorized(self.tensorizer_config):
tensorizer_config = self._patch_tensorizer_config(model_config)
model = init_tensorizer_model(tensorizer_config=tensorizer_config,
vllm_config=vllm_config)
device_config = vllm_config.device_config
with set_default_torch_dtype(model_config.dtype):
with torch.device(device_config.device):
model = init_tensorizer_model(
tensorizer_config=tensorizer_config,
vllm_config=vllm_config)
self.load_weights(model, model_config)
return model
return self._load_model_serialized_cpu(vllm_config=vllm_config)
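
The loader change above wraps model construction in dtype and device contexts so the tensorized model is materialized with the intended precision and placement. A hedged illustration of that pattern with a toy module (the context manager mirrors the name of vLLM's helper but is reimplemented here):

```python
import torch
from contextlib import contextmanager

@contextmanager
def set_default_torch_dtype(dtype):
    # simplified stand-in for vLLM's helper of the same name
    old = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    try:
        yield
    finally:
        torch.set_default_dtype(old)

with set_default_torch_dtype(torch.float16):
    with torch.device("cpu"):  # device_config.device in the real loader
        layer = torch.nn.Linear(8, 8)

assert layer.weight.dtype == torch.float16
```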

View File

@ -25,8 +25,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.sequence import IntermediateTensors, PoolerOutput
from vllm.transformers_utils.config import (
get_cross_encoder_activation_function)
from .interfaces import SupportsCrossEncoding, SupportsQuant, SupportsV0Only
from .utils import WeightsMapper, maybe_prefix
@ -462,9 +460,6 @@ class BertForSequenceClassification(nn.Module, SupportsV0Only,
super().__init__()
config = vllm_config.model_config.hf_config
self.default_activation_function = \
get_cross_encoder_activation_function(config)
self.num_labels = config.num_labels
self.bert = BertModel(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "bert"),

View File

@ -18,8 +18,6 @@ from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel
from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix
from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.sequence import IntermediateTensors, PoolerOutput
from vllm.transformers_utils.config import (
get_cross_encoder_activation_function)
from .bert_with_rope import BertWithRope, JinaRobertaModel
from .interfaces import SupportsCrossEncoding, SupportsV0Only
@ -178,9 +176,6 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding,
super().__init__()
config = vllm_config.model_config.hf_config
self.default_activation_function = \
get_cross_encoder_activation_function(config)
self.num_labels = config.num_labels
self.roberta = BertModel(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "bert"),

View File

@ -48,9 +48,10 @@ class MediaConnector:
media_io_kwargs: Additional args passed to process media
inputs, keyed by modalities. For example,
to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'`
`--media-io-kwargs '{"video":{"num_frames":40}}'`
connection: HTTP connection client to download media contents.
allowed_local_media_path: A local directory to load media files from.
allowed_local_media_path: A local directory to load media files
from.
"""
super().__init__()
@ -488,4 +489,4 @@ def fetch_video(
"video": video_io_kwargs
}
media_connector = MediaConnector(media_io_kwargs=media_io_kwargs)
return media_connector.fetch_video(video_url)
return media_connector.fetch_video(video_url)

View File

@ -234,31 +234,44 @@ class CudaPlatformBase(Platform):
return ("vllm.attention.backends."
"flashmla.FlashMLABackend")
if use_v1:
FLASHINFER_V1 = "vllm.v1.attention.backends.flashinfer.FlashInferBackend" # noqa: E501
FLEX_ATTENTION_V1 = "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501
TRITON_ATTN_VLLM_V1 = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501
FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501
if selected_backend == _Backend.FLASHINFER:
logger.info_once("Using FlashInfer backend on V1 engine.")
return "vllm.v1.attention.backends.flashinfer.FlashInferBackend"
return FLASHINFER_V1
elif selected_backend == _Backend.FLEX_ATTENTION:
logger.info("Using FlexAttenion backend on V1 engine.")
return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501
logger.info_once("Using FlexAttention backend on V1 engine.")
return FLEX_ATTENTION_V1
elif selected_backend == _Backend.TRITON_ATTN_VLLM_V1:
logger.info_once("Using Triton backend on V1 engine.")
return ("vllm.v1.attention.backends."
"triton_attn.TritonAttentionBackend")
return TRITON_ATTN_VLLM_V1
elif selected_backend == _Backend.FLASH_ATTN:
logger.info_once("Using Flash Attention backend on V1 engine.")
return ("vllm.v1.attention.backends."
"flash_attn.FlashAttentionBackend")
return FLASH_ATTN_V1
from vllm.attention.selector import supports_head_size
# Default backends for V1 engine
# FP32 is only supported by FlexAttention
if dtype not in (torch.float16, torch.bfloat16):
logger.info_once(
"Using FlexAttention backend for %s on V1 engine.",
dtype,
)
return FLEX_ATTENTION_V1
# Prefer FlashInfer for Blackwell GPUs if installed
if cls.is_device_capability(100):
if cls.is_device_capability(100) and \
supports_head_size(FLASHINFER_V1, head_size):
try:
import flashinfer # noqa: F401
logger.info_once(
"Using FlashInfer backend on V1 engine by default for "
"Blackwell (SM 10.0) GPUs.")
return ("vllm.v1.attention.backends."
"flashinfer.FlashInferBackend")
return FLASHINFER_V1
except ImportError:
logger.info_once(
"FlashInfer failed to import for V1 engine on "
@ -266,10 +279,13 @@ class CudaPlatformBase(Platform):
"install FlashInfer for better performance.")
pass
# FlashAttention is the default for SM 8.0+ GPUs
if cls.has_device_capability(80):
if cls.has_device_capability(80) and \
supports_head_size(FLASH_ATTN_V1, head_size):
logger.info_once("Using Flash Attention backend on V1 engine.")
return ("vllm.v1.attention.backends."
"flash_attn.FlashAttentionBackend")
return FLASH_ATTN_V1
logger.info_once("Using FlexAttention backend on V1 engine.")
return FLEX_ATTENTION_V1
# Backends for V0 engine
if selected_backend == _Backend.FLASHINFER:
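
Summarizing the V1 defaults after this hunk, as a hedged standalone sketch (not the real `get_attn_backend_cls`): non-fp16/bf16 dtypes go straight to FlexAttention, Blackwell prefers FlashInfer when it is installed and supports the head size, SM 8.0+ uses FlashAttention when the head size fits, and everything else falls back to FlexAttention:

```python
import torch

def pick_v1_backend(dtype, is_blackwell, has_flashinfer,
                    fi_ok_head, fa_ok_head, sm80_plus=True):
    # simplified decision order mirroring the diff above
    if dtype not in (torch.float16, torch.bfloat16):
        return "FLEX_ATTENTION"            # fp32 path
    if is_blackwell and has_flashinfer and fi_ok_head:
        return "FLASHINFER"
    if sm80_plus and fa_ok_head:
        return "FLASH_ATTN"
    return "FLEX_ATTENTION"                # e.g. unsupported head sizes

assert pick_v1_backend(torch.float32, False, False, True, True) == "FLEX_ATTENTION"
assert pick_v1_backend(torch.bfloat16, True, True, True, True) == "FLASHINFER"
assert pick_v1_backend(torch.float16, False, False, False, False) == "FLEX_ATTENTION"
```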

View File

@ -24,12 +24,14 @@ class PoolingParams(
"""
dimensions: Optional[int] = None
use_cross_encoder: bool = False
additional_data: Optional[Any] = None
output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY
def clone(self) -> "PoolingParams":
"""Returns a deep copy of the PoolingParams instance."""
return PoolingParams(dimensions=self.dimensions,
use_cross_encoder=self.use_cross_encoder,
additional_data=self.additional_data)
def verify(self, model_config: "ModelConfig") -> None:
@ -54,6 +56,7 @@ class PoolingParams(
def __repr__(self) -> str:
return (f"PoolingParams("
f"dimensions={self.dimensions}, "
f"use_cross_encoder={self.use_cross_encoder}, "
f"additional_metadata={self.additional_data})")
def __post_init__(self) -> None:

View File

@ -866,24 +866,26 @@ def try_get_generation_config(
return None
def get_classification_activation_function(config: PretrainedConfig):
return nn.Sigmoid() if config.num_labels == 1 else nn.Softmax()
def get_cross_encoder_activation_function(config: PretrainedConfig):
function_name: Optional[str] = None
if hasattr(config, "sentence_transformers") and "activation_fn" in \
config.sentence_transformers:
if (hasattr(config, "sentence_transformers")
and "activation_fn" in config.sentence_transformers):
function_name = config.sentence_transformers["activation_fn"]
elif (hasattr(config, "sbert_ce_default_activation_function")
and config.sbert_ce_default_activation_function is not None):
function_name = config.sbert_ce_default_activation_function
if function_name is not None:
assert function_name.startswith("torch.nn.modules."), \
"Loading of activation functions is restricted to " \
"torch.nn.modules for security reasons"
assert function_name.startswith("torch.nn.modules."), (
"Loading of activation functions is restricted to "
"torch.nn.modules for security reasons")
return resolve_obj_by_qualname(function_name)()
else:
return nn.Sigmoid() if config.num_labels == 1 else nn.Identity()
return nn.Sigmoid() if config.num_labels == 1 else nn.Identity()
def try_get_safetensors_metadata(

View File

@ -20,9 +20,12 @@ import torch
import vllm.envs as envs
from vllm.connections import global_http_connection
from vllm.logger import init_logger
from vllm.utils import cuda_device_count_stateless, cuda_get_device_properties
from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__)
_config_home = envs.VLLM_CONFIG_ROOT
_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "usage_stats.json")
_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home, "do_not_track")
@ -183,7 +186,7 @@ class UsageMessage:
self.gpu_memory_per_device = (
torch_xla.core.xla_model.get_memory_info()["bytes_limit"])
except Exception:
pass
logger.exception("Failed to collect TPU information")
self.provider = _detect_cloud_provider()
self.architecture = platform.machine()
self.platform = platform.platform()

View File

@ -3,7 +3,8 @@
import numpy as np
import torch
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.backends.abstract import (AttentionBackend,
AttentionMetadata)
from vllm.attention.backends.torch_sdpa import (TorchSDPABackendImpl,
TorchSDPAMetadata)
from vllm.attention.backends.utils import CommonAttentionState
@ -17,9 +18,24 @@ from vllm.v1.worker.cpu_model_runner import CPUModelRunner
from vllm.v1.worker.gpu_input_batch import InputBatch
class TorchSDPABackend:
class TorchSDPABackend(AttentionBackend):
accept_output_buffer: bool = False
@classmethod
def get_supported_head_sizes(cls) -> list[int]:
return PagedAttention.get_supported_head_sizes()
@classmethod
def validate_head_size(cls, head_size: int) -> None:
supported_head_sizes = cls.get_supported_head_sizes()
if head_size not in supported_head_sizes:
attn_type = cls.__name__.removesuffix("Backend")
raise ValueError(
f"Head size {head_size} is not supported by {attn_type}. "
f"Supported head sizes are: {supported_head_sizes}. "
"Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
"FlexAttention backend which supports all head sizes.")
@staticmethod
def get_name() -> str:
return "TORCH_SDPA_VLLM_V1"

View File

@ -44,10 +44,21 @@ class FlashAttentionBackend(AttentionBackend):
accept_output_buffer: bool = True
@staticmethod
def get_supported_head_sizes() -> list[int]:
@classmethod
def get_supported_head_sizes(cls) -> list[int]:
return [32, 64, 96, 128, 160, 192, 224, 256]
@classmethod
def validate_head_size(cls, head_size: int) -> None:
supported_head_sizes = cls.get_supported_head_sizes()
if head_size not in supported_head_sizes:
attn_type = cls.__name__.removesuffix("Backend")
raise ValueError(
f"Head size {head_size} is not supported by {attn_type}. "
f"Supported head sizes are: {supported_head_sizes}. "
"Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
"FlexAttention backend which supports all head sizes.")
@staticmethod
def get_name() -> str:
return "FLASH_ATTN_VLLM_V1"
@ -416,12 +427,7 @@ class FlashAttentionImpl(AttentionImpl):
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
support_head_sizes = FlashAttentionBackend.get_supported_head_sizes()
if head_size not in support_head_sizes:
raise ValueError(
f"Head size {head_size} is not supported by FlashAttention. "
f"Supported head sizes are: {support_head_sizes}. "
"Set VLLM_USE_V1=0 to use another attention backend.")
FlashAttentionBackend.validate_head_size(head_size)
if attn_type != AttentionType.DECODER:
raise NotImplementedError("Encoder self-attention and "

View File

@ -38,10 +38,22 @@ class FlashInferBackend(AttentionBackend):
accept_output_buffer: bool = True
@staticmethod
def get_supported_head_sizes() -> list[int]:
@classmethod
def get_supported_head_sizes(cls) -> list[int]:
# https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157
return [64, 128, 256]
@classmethod
def validate_head_size(cls, head_size: int) -> None:
supported_head_sizes = cls.get_supported_head_sizes()
if head_size not in supported_head_sizes:
attn_type = cls.__name__.removesuffix("Backend")
raise ValueError(
f"Head size {head_size} is not supported by {attn_type}. "
f"Supported head sizes are: {supported_head_sizes}. "
"Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
"FlexAttention backend which supports all head sizes.")
@staticmethod
def get_name() -> str:
return "FLASHINFER_VLLM_V1"
@ -207,14 +219,8 @@ class FlashInferMetadata:
return self.qo_indptr
def __post_init__(self):
# Refer to
# https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157
supported_head_sizes = FlashInferBackend.get_supported_head_sizes()
if self.head_dim is not None and self.head_dim \
not in supported_head_sizes:
raise ValueError(
f"Only {supported_head_sizes} are supported for head_dim,",
f" received {self.head_dim}.")
if self.head_dim is not None:
FlashInferBackend.validate_head_size(self.head_dim)
class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
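
The metadata dataclasses reuse the same hook from __post_init__ (here for FlashInferMetadata, and for MLACommonMetadata further down), guarding the optional head_dim instead of rebuilding the error string. A small sketch of that guard-and-delegate shape, with hypothetical names:

# Hypothetical sketch of the __post_init__ delegation shown above.
from dataclasses import dataclass
from typing import Optional


class ExampleBackend:

    @classmethod
    def validate_head_size(cls, head_size: int) -> None:
        if head_size not in (64, 128, 256):
            raise ValueError(f"Head size {head_size} is not supported.")


@dataclass
class ExampleMetadata:
    head_dim: Optional[int] = None

    def __post_init__(self):
        # head_dim may legitimately be unset; only validate when present.
        if self.head_dim is not None:
            ExampleBackend.validate_head_size(self.head_dim)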

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer with FlashAttention."""
from collections import defaultdict
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional
@@ -21,9 +21,6 @@ from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
from vllm.v1.kv_cache_interface import AttentionSpec
from vllm.v1.worker.block_table import BlockTable
if current_platform.is_cuda():
pass
logger = init_logger(__name__)
if TYPE_CHECKING:
@@ -45,9 +42,9 @@ def _offsets_to_doc_ids_tensor(offsets: torch.Tensor) -> torch.Tensor:
class FlexAttentionBackend(AttentionBackend):
accept_output_buffer: bool = True
@staticmethod
def get_supported_head_sizes() -> list[int]:
return [16, 32, 64, 96, 128, 160, 192, 224, 256]
@classmethod
def validate_head_size(cls, head_size: int) -> None:
return # FlexAttention supports any head size
@staticmethod
def get_name() -> str:
@@ -384,12 +381,8 @@ class FlexAttentionImpl(AttentionImpl):
raise NotImplementedError(
"FlexAttention does not support kv sharing yet.")
support_head_sizes = FlexAttentionBackend.get_supported_head_sizes()
if head_size not in support_head_sizes:
raise ValueError(
f"Head size {head_size} is not supported by FlashAttention. "
f"Supported head sizes are: {support_head_sizes}. "
"Set VLLM_USE_V1=0 to use another attention backend.")
FlexAttentionBackend.validate_head_size(head_size)
if is_quantized_kv_cache(self.kv_cache_dtype):
raise NotImplementedError(
"FlexAttention does not support quantized kv-cache. Yet")
@@ -463,6 +456,21 @@ class FlexAttentionImpl(AttentionImpl):
query = query[:, :, :num_actual_tokens, :]
# Doesn't work for now -> constraint violation
# torch._dynamo.try_mark_dynamic(query, 2)
# default M=64, N=64 may run out of shared memory on some GPUs
# TODO: Explicit configs for each GPU?
# Not sure how to calculate the shared memory requirement
extra_kernel_options = defaultdict[str, int](lambda: 64)
if query.dtype == torch.float32:
extra_kernel_options["BLOCK_M"] //= 2
extra_kernel_options["BLOCK_N"] //= 2
if current_platform.is_cuda():
device_props = torch.cuda.get_device_properties()
max_shared_memory = device_props.shared_memory_per_block_optin
if max_shared_memory < 144 * 1024:
extra_kernel_options["BLOCK_M"] //= 2
extra_kernel_options["BLOCK_N"] //= 2
out = flex_attention_compiled(
query,
key_cache,
@@ -471,7 +479,10 @@ class FlexAttentionImpl(AttentionImpl):
attn_metadata.block_mask,
self.scale,
enable_gqa=enable_gqa,
kernel_options={"FORCE_USE_FLEX_ATTENTION": True},
kernel_options={
"FORCE_USE_FLEX_ATTENTION": True,
**extra_kernel_options
},
)
# Flex doesn't have an out variant today, rely on epilogue fusion
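
The second FlexAttention hunk deserves a note: flex_attention's default 64x64 tiles can overflow shared memory, so the change halves BLOCK_M and BLOCK_N for fp32 queries and halves them again on CUDA devices with less than 144 KiB of opt-in shared memory per block, then merges the result into kernel_options. A standalone approximation is below; the helper name and the explicit current-device lookup are assumptions, not vLLM code.

# Approximate sketch of the tile-size heuristic above; the function name and
# the explicit current-device argument are assumptions for illustration.
from collections import defaultdict

import torch


def flex_attention_kernel_options(query: torch.Tensor) -> dict[str, int]:
    # Start from flex_attention's default tile size of 64 for every option.
    options = defaultdict[str, int](lambda: 64)
    if query.dtype == torch.float32:
        # fp32 tiles need roughly twice the shared memory of fp16/bf16.
        options["BLOCK_M"] //= 2
        options["BLOCK_N"] //= 2
    if torch.cuda.is_available():
        props = torch.cuda.get_device_properties(torch.cuda.current_device())
        # 144 KiB is the threshold used in the diff; smaller budgets get
        # another halving.
        if props.shared_memory_per_block_optin < 144 * 1024:
            options["BLOCK_M"] //= 2
            options["BLOCK_N"] //= 2
    return {"FORCE_USE_FLEX_ATTENTION": True, **options}

As the in-line TODO admits, this is a halving heuristic rather than a real shared-memory model, so explicit per-GPU configs remain future work.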

View File

@@ -254,10 +254,21 @@ class MLACommonBackend(AttentionBackend):
) -> tuple[int, ...]:
return (num_blocks, block_size, head_size)
@staticmethod
def get_supported_head_sizes() -> list[int]:
@classmethod
def get_supported_head_sizes(cls) -> list[int]:
return [576]
@classmethod
def validate_head_size(cls, head_size: int) -> None:
supported_head_sizes = cls.get_supported_head_sizes()
if head_size not in supported_head_sizes:
attn_type = cls.__name__.removesuffix("Backend")
raise ValueError(
f"Head size {head_size} is not supported by {attn_type}. "
f"Supported head sizes are: {supported_head_sizes}. "
"Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
"FlexAttention backend which supports all head sizes.")
@dataclass
class MLACommonPrefillMetadata:
@@ -320,12 +331,8 @@ class MLACommonMetadata(Generic[D]):
prefill: Optional[MLACommonPrefillMetadata] = None
def __post_init__(self):
supported_head_sizes = MLACommonBackend.get_supported_head_sizes()
if self.head_dim is not None and self.head_dim \
not in supported_head_sizes:
raise ValueError(
f"Only {supported_head_sizes} are supported for head_dim,",
f"received {self.head_dim}.")
if self.head_dim is not None:
MLACommonBackend.validate_head_size(self.head_dim)
M = TypeVar("M", bound=MLACommonMetadata)

View File

@@ -314,10 +314,21 @@ class AiterFlashAttentionBackend(AttentionBackend):
accept_output_buffer: bool = True
@staticmethod
def get_supported_head_sizes() -> list[int]:
@classmethod
def get_supported_head_sizes(cls) -> list[int]:
return [32, 64, 96, 128, 160, 192, 224, 256]
@classmethod
def validate_head_size(cls, head_size: int) -> None:
supported_head_sizes = cls.get_supported_head_sizes()
if head_size not in supported_head_sizes:
attn_type = cls.__name__.removesuffix("Backend")
raise ValueError(
f"Head size {head_size} is not supported by {attn_type}. "
f"Supported head sizes are: {supported_head_sizes}. "
"Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
"FlexAttention backend which supports all head sizes.")
@staticmethod
def get_name() -> str:
return "FLASH_ATTN_VLLM_V1"
@@ -428,14 +439,7 @@ class AiterFlashAttentionImpl(AttentionImpl):
assert self.num_heads % self.num_kv_heads == 0
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
support_head_sizes = \
AiterFlashAttentionBackend.get_supported_head_sizes()
if head_size not in support_head_sizes:
raise ValueError(
f"Head size {head_size} is not supported by "
"AiterFlashAttention. "
f"Supported head sizes are: {support_head_sizes}. "
"Set VLLM_USE_V1=0 to use another attention backend.")
AiterFlashAttentionBackend.validate_head_size(head_size)
if attn_type != AttentionType.DECODER:
raise NotImplementedError("Encoder self-attention and "

View File

@@ -190,10 +190,21 @@ class TritonAttentionBackend(AttentionBackend):
accept_output_buffer: bool = True
@staticmethod
def get_supported_head_sizes() -> list[int]:
@classmethod
def get_supported_head_sizes(cls) -> list[int]:
return [32, 64, 96, 128, 160, 192, 224, 256]
@classmethod
def validate_head_size(cls, head_size: int) -> None:
supported_head_sizes = cls.get_supported_head_sizes()
if head_size not in supported_head_sizes:
attn_type = cls.__name__.removesuffix("Backend")
raise ValueError(
f"Head size {head_size} is not supported by {attn_type}. "
f"Supported head sizes are: {supported_head_sizes}. "
"Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
"FlexAttention backend which supports all head sizes.")
@staticmethod
def get_name() -> str:
return "TRITON_ATTN_VLLM_V1"
@@ -268,11 +279,7 @@ class TritonAttentionImpl(AttentionImpl):
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
support_head_sizes = TritonAttentionBackend.get_supported_head_sizes()
if head_size not in support_head_sizes:
raise ValueError(
f"Head size {head_size} is not supported by TritonAttention. "
f"Supported head sizes are: {support_head_sizes}.")
TritonAttentionBackend.validate_head_size(head_size)
if attn_type != AttentionType.DECODER:
raise NotImplementedError("Encoder self-attention and "

View File

@@ -101,7 +101,10 @@ class TopKTopPSampler(nn.Module):
"per-request generators. Falling back to "
"PyTorch-native implementation.")
return self.forward_native(logits, generators, k, p)
return flashinfer_sample(logits, k, p, generators)
# flashinfer sampling functions expect contiguous logits.
# In flex_attn/triton_attn fp32 inference, logits can be non-contiguous
# because of the slicing operation in logits_processor.
return flashinfer_sample(logits.contiguous(), k, p, generators)
def forward_tpu(
self,
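
The one-line sampler change above is easy to miss: with the FlexAttention or Triton backends in fp32, the logits processor can hand the sampler a sliced view, and the flashinfer sampling kernels expect contiguous memory, so the tensor is made contiguous before the call. A toy illustration of why a column slice stops being contiguous; the shapes and the slice are invented for the demo.

# Toy demo of the non-contiguity fixed above; shapes are made up.
import torch

logits = torch.randn(4, 32000)               # [num_seqs, vocab_size]
sliced = logits[:, :16000]                   # a view over part of the vocab
print(sliced.is_contiguous())                # False: row stride is still 32000
print(sliced.contiguous().is_contiguous())   # True: copied into dense memory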

View File

@@ -12,8 +12,8 @@ from vllm.logger import init_logger
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.models import supports_multimodal
from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
from vllm.v1.attention.backends.flash_attn import (CommonAttentionMetadata,
FlashAttentionMetadata)
from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.utils import prepare_eagle_input_kernel

View File

@@ -528,19 +528,20 @@ class GPUModelRunner(LoRAModelRunnerMixin):
start_token_index:end_token_index] = new_token_ids
self.input_batch.num_tokens_no_spec[
req_index] = end_token_index
# Add spec_token_ids to token_ids_cpu.
spec_token_ids = (
scheduler_output.scheduled_spec_decode_tokens.get(
req_id, ()))
if spec_token_ids:
start_index = end_token_index
end_token_index += len(spec_token_ids)
self.input_batch.token_ids_cpu[
req_index,
start_index:end_token_index] = spec_token_ids
# NOTE(woosuk): `num_tokens` here may include spec tokens.
self.input_batch.num_tokens[req_index] = end_token_index
# Add spec_token_ids to token_ids_cpu.
spec_token_ids = (
scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
if spec_token_ids:
num_spec_tokens = len(spec_token_ids)
start_index = self.input_batch.num_tokens_no_spec[req_index]
end_token_index = start_index + num_spec_tokens
self.input_batch.token_ids_cpu[
req_index, start_index:end_token_index] = spec_token_ids
# NOTE(woosuk): `num_tokens` here may include spec tokens.
self.input_batch.num_tokens[req_index] += num_spec_tokens
# Add the new or resumed requests to the persistent batch.
# The smaller empty indices are filled first.
for req_id in req_ids_to_add:
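
The model-runner hunk above is the spec-decode fix: the spec-token write now takes its start offset from num_tokens_no_spec and increments num_tokens by the number of spec tokens, rather than reusing an end index computed inside the preceding new-token branch. A toy reproduction of the corrected indexing follows; the array sizes and token values are invented, and only the offset arithmetic mirrors the diff.

# Toy reproduction of the fixed spec-token indexing; shapes and values are
# illustrative only.
import numpy as np

max_model_len = 16
token_ids_cpu = np.zeros((1, max_model_len), dtype=np.int64)
num_tokens = np.zeros(1, dtype=np.int64)
num_tokens_no_spec = np.zeros(1, dtype=np.int64)

req_index = 0
new_token_ids = [101, 102, 103]   # verified tokens written first
spec_token_ids = [7, 8]           # draft tokens scheduled on top of them

# Write the verified tokens and record the spec-free length.
end_token_index = len(new_token_ids)
token_ids_cpu[req_index, :end_token_index] = new_token_ids
num_tokens_no_spec[req_index] = end_token_index
num_tokens[req_index] = end_token_index

# Fixed behaviour: append spec tokens right after the spec-free prefix and
# grow num_tokens by the spec count instead of overwriting it.
if spec_token_ids:
    num_spec_tokens = len(spec_token_ids)
    start_index = num_tokens_no_spec[req_index]
    end_token_index = start_index + num_spec_tokens
    token_ids_cpu[req_index, start_index:end_token_index] = spec_token_ids
    num_tokens[req_index] += num_spec_tokens

assert token_ids_cpu[req_index, :num_tokens[req_index]].tolist() == [
    101, 102, 103, 7, 8
]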