Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)

Compare commits: 111 commits, woosuk/mod ... v0.11.0rc4
SHA1
e4beabd2c8
febb688356
a1825fe645
bab9231bf1
c214d699fd
c3dfb0f6dd
83f3c9beae
d0b178cef1
b3230e1ac0
03df0fb5d2
9471879bd4
ab5b6459df
8ce5d3198d
09c2cbc04a
4c347044c9
19e7ab7315
6de3d431d9
b14773bd64
26a7a33b88
5aa5811a16
c2fa2d4dc9
32335c8b34
04c2b26972
ee10d7e6ff
bb79c4da2f
b761df963c
33f6aaf972
56aafa8c0b
8d52f2b3a7
984d18498a
d4d9899860
db1e42f627
bc9d7b5595
fe6b19c314
2827b3f4a3
2b6b1d7809
633f943e30
b03b1b97f6
dfb9af2014
19f76ee68e
dd70437a4f
99b3a504c5
6e30010d2f
52621c8f5c
d48f4d6daf
e84e0735c7
3edf87d25f
392edee34a
983056e456
13dd93c667
53a30845be
8b77328ffe
9fe4c2bdb9
081b5594a2
57329a8c01
8c435c9bce
e71b8e210d
89fa54e6f7
3d54bdcb73
6b0fcbbf43
0fa673af4c
3468f17ebe
71b25b0d48
0ea80c87d9
b8d9e4a326
13cc7f5370
916bd9204d
e04a1b6b21
2e5df88c92
0754ac4c49
03858e6d1c
532a6cfccb
eb32335e35
69a8c8e99a
6c340da4df
2f17117606
1e9a77e037
d2af67441d
0bcc3a160d
70fbdb26e9
7f570f1caa
eaeca3cd7f
12c1287d64
17b4c6685c
3c2b2ccece
7be9ffcd9f
393de22d2e
1260180c67
af4ee63e0e
bc092ea873
755ed7b05b
a676e668ee
c85be1f6dd
845adb3ec6
90b139cfff
4492e3a554
05c19485a5
52d0cb8458
5c1e496a75
e7f27ea648
1f29141258
6160ba4151
fea8006062
e6750d0b18
8c853050e7
f84a472a03
54e42b72db
2dda3e35d0
d83f3f7cb3
302eb941f3
487745ff49
@@ -76,7 +76,7 @@ steps:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

# Add job to create multi-arch manifest
@@ -58,11 +58,8 @@ function cpu_tests() {
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model

# Note: disable Bart until supports V1
pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
pytest -x -v -s tests/models/language/generation -m cpu_model
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model

pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -x -v -s tests/models/multimodal/generation \
@@ -35,7 +35,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine
@@ -300,10 +300,12 @@ steps:
- pytest -v -s v1/spec_decode
- pytest -v -s v1/kv_connector/unit
- pytest -v -s v1/metrics
- pytest -v -s v1/test_kv_sharing.py
- pytest -v -s v1/test_metrics_reader.py
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_request.py
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_metrics_reader.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -770,8 +772,9 @@ steps:
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/audio_language.py --model-type whisper
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

- label: Blackwell Test # 38 min
timeout_in_minutes: 60
@@ -869,25 +872,27 @@ steps:
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code

- label: Distributed Tests (2 GPUs) # 110min
timeout_in_minutes: 150
- label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/compilation/
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
- vllm/compilation
- vllm/worker/worker_base.py
- entrypoints/llm/test_collective_rpc.py
- vllm/v1/engine/
- vllm/v1/worker/
- tests/compile/test_basic_correctness.py
- tests/compile/test_wrapper.py
- tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py
- tests/v1/test_external_lb_dp.py
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- vllm/v1/engine/
- vllm/v1/worker/
- tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py
commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
@@ -897,20 +902,29 @@ steps:
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py

- label: Distributed Model Tests (2 GPUs) # 37min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/model_executor/model_loader/sharded_state_loader.py
- vllm/model_executor/models/
- tests/basic_correctness/
- tests/model_executor/model_loader/test_sharded_state_loader.py
- tests/models/
commands:
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
# TODO: investigate and fix
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s models/multimodal/generation/test_maverick.py
- pytest -v -s v1/worker/test_worker_memory_snapshot.py

- label: Plugin Tests (2 GPUs) # 40min
timeout_in_minutes: 60
@@ -13,6 +13,7 @@ build:

mkdocs:
configuration: mkdocs.yaml
fail_on_warning: true

# Optionally declare the Python requirements required to build your docs
python:
@@ -1,17 +1,31 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import time
from unittest import mock

import numpy as np
from tabulate import tabulate

from benchmark_utils import TimeCollector
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
from vllm.config import (
    CacheConfig,
    DeviceConfig,
    LoadConfig,
    ModelConfig,
    ParallelConfig,
    SchedulerConfig,
    SpeculativeConfig,
    VllmConfig,
)
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm.v1.worker.gpu_model_runner import GPUModelRunner


def main(args):
def benchmark_propose(args):
    rows = []
    for max_ngram in args.max_ngram:
        collector = TimeCollector(TimeCollector.US)
@@ -69,10 +83,88 @@ def main(args):
    )


def benchmark_batched_propose(args):
    NUM_SPECULATIVE_TOKENS_NGRAM = 10
    PROMPT_LOOKUP_MIN = 5
    PROMPT_LOOKUP_MAX = 15
    MAX_MODEL_LEN = int(1e7)
    DEVICE = current_platform.device_type

    model_config = ModelConfig(model="facebook/opt-125m", runner="generate")

    speculative_config = SpeculativeConfig(
        target_model_config=model_config,
        target_parallel_config=ParallelConfig(),
        method="ngram",
        num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
        prompt_lookup_max=PROMPT_LOOKUP_MAX,
        prompt_lookup_min=PROMPT_LOOKUP_MIN,
    )

    vllm_config = VllmConfig(
        model_config=model_config,
        cache_config=CacheConfig(),
        speculative_config=speculative_config,
        device_config=DeviceConfig(device=current_platform.device_type),
        parallel_config=ParallelConfig(),
        load_config=LoadConfig(),
        scheduler_config=SchedulerConfig(),
    )

    # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
    mock_pp_group = mock.MagicMock()
    mock_pp_group.world_size = 1
    with mock.patch(
        "vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
    ):
        runner = GPUModelRunner(vllm_config, DEVICE)

    # hack max model len
    runner.max_model_len = MAX_MODEL_LEN
    runner.drafter.max_model_len = MAX_MODEL_LEN

    dummy_input_batch = InputBatch(
        max_num_reqs=args.num_req,
        max_model_len=MAX_MODEL_LEN,
        max_num_batched_tokens=args.num_req * args.num_token,
        device=DEVICE,
        pin_memory=False,
        vocab_size=256000,
        block_sizes=[16],
    )
    dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
    dummy_input_batch.spec_decode_unsupported_reqs = ()
    dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
    dummy_input_batch.token_ids_cpu = np.random.randint(
        0, 20, (args.num_req, args.num_token)
    )

    runner.input_batch = dummy_input_batch

    sampled_token_ids = [[0]] * args.num_req

    print("Starting benchmark")
    # first run is warmup so ignore it
    for _ in range(args.num_iteration):
        start = time.time()
        runner.drafter.propose(
            sampled_token_ids,
            dummy_input_batch.req_ids,
            dummy_input_batch.num_tokens_no_spec,
            dummy_input_batch.token_ids_cpu,
            dummy_input_batch.spec_decode_unsupported_reqs,
        )
        end = time.time()
        print(f"Iteration time (s): {end - start}")


def invoke_main() -> None:
    parser = FlexibleArgumentParser(
        description="Benchmark the performance of N-gram speculative decode drafting"
    )
    parser.add_argument(
        "--batched", action="store_true", help="consider time to prepare batch"
    )  # noqa: E501
    parser.add_argument(
        "--num-iteration",
        type=int,
@@ -105,8 +197,17 @@ def invoke_main() -> None:
        help="Number of speculative tokens to generate",
    )
    args = parser.parse_args()
    main(args)

    if not args.batched:
        benchmark_propose(args)
    else:
        benchmark_batched_propose(args)


"""
# Example command lines:
# time python3 benchmarks/benchmark_ngram_proposer.py
# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
"""  # noqa: E501
if __name__ == "__main__":
    invoke_main()  # pragma: no cover
@@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    w8a8_triton_block_scaled_mm,
    w8a8_block_fp8_matmul,
)
from vllm.utils import FlexibleArgumentParser, cdiv
@@ -158,7 +158,7 @@ def bench_fp8(
        "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
        ),
        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
            a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
        ),
        "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
@@ -3,6 +3,7 @@
import argparse
import copy
import itertools
import os

import torch
from weight_shapes import WEIGHT_SHAPES
|
||||
"torch-bf16": dict(enabled=True),
|
||||
"nvfp4": dict(no_a_quant=False, enabled=True),
|
||||
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
|
||||
"fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
|
||||
"fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
|
||||
}
|
||||
|
||||
_needs_fbgemm = any(
|
||||
v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
|
||||
)
|
||||
if _needs_fbgemm:
|
||||
try:
|
||||
from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
|
||||
triton_scale_nvfp4_quant,
|
||||
)
|
||||
except ImportError:
|
||||
print(
|
||||
"WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
|
||||
"These providers will be skipped. Please install fbgemm_gpu with: "
|
||||
"'pip install fbgemm-gpu-genai' to run them."
|
||||
)
|
||||
# Disable FBGEMM providers so the benchmark can run.
|
||||
for cfg in PROVIDER_CFGS.values():
|
||||
if cfg.get("fbgemm"):
|
||||
cfg["enabled"] = False
|
||||
|
||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
||||
|
||||
|
||||
def _quant_weight_nvfp4(b: torch.Tensor, device: str):
|
||||
def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
|
||||
# Compute global scale for weight
|
||||
b_amax = torch.abs(b).max().to(torch.float32)
|
||||
b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
|
||||
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
|
||||
if "fbgemm" in cfg and cfg["fbgemm"]:
|
||||
b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
|
||||
else:
|
||||
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
|
||||
return b_fp4, scale_b_fp4, b_global_scale
|
||||
|
||||
|
||||
def build_nvfp4_runner(cfg, a, b, dtype, device):
|
||||
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
|
||||
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)
|
||||
|
||||
# Compute global scale for activation
|
||||
# NOTE: This is generally provided ahead-of-time by the model checkpoint.
|
||||
@ -46,6 +71,35 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):
|
||||
|
||||
# Alpha for the GEMM operation
|
||||
alpha = 1.0 / (a_global_scale * b_global_scale)
|
||||
if "fbgemm" in cfg and cfg["fbgemm"]:
|
||||
if cfg["no_a_quant"]:
|
||||
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
|
||||
|
||||
def run():
|
||||
return torch.ops.fbgemm.f4f4bf16(
|
||||
a_fp4,
|
||||
b_fp4,
|
||||
scale_a_fp4,
|
||||
scale_b_fp4,
|
||||
global_scale=alpha,
|
||||
use_mx=False,
|
||||
)
|
||||
|
||||
return run
|
||||
else:
|
||||
|
||||
def run():
|
||||
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
|
||||
return torch.ops.fbgemm.f4f4bf16(
|
||||
a_fp4,
|
||||
b_fp4,
|
||||
scale_a_fp4,
|
||||
scale_b_fp4,
|
||||
global_scale=alpha,
|
||||
use_mx=False,
|
||||
)
|
||||
|
||||
return run
|
||||
|
||||
if cfg["no_a_quant"]:
|
||||
# Pre-quantize activation
|
||||
@@ -130,10 +184,13 @@ if __name__ == "__main__":

    for K, N, model in prepare_shapes(args):
        print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
        save_dir = f"bench_nvfp4_res_n{N}_k{K}"
        os.makedirs(save_dir, exist_ok=True)

        benchmark.run(
            print_data=True,
            show_plots=True,
            save_path=f"bench_nvfp4_res_n{N}_k{K}",
            save_path=save_dir,
            N=N,
            K=K,
        )
@@ -51,7 +51,7 @@ def calculate_diff(
):
    """Calculate the difference between Inductor and CUDA implementations."""
    device = torch.device("cuda")
    x = torch.rand((batch_size * hidden_size, 4096), dtype=dtype, device=device)
    x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)

    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
@@ -59,23 +59,25 @@ def calculate_diff(
    torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
    cuda_out, cuda_scale = quant_fp8.forward_cuda(x)

    out_allclose = lambda o1, o2: torch.allclose(
        o1.to(torch.float32),
        o2.to(torch.float32),
        rtol=1e-3,
        atol=1e-5,
    )
    scale_allclose = lambda s1, s2: torch.allclose(s1, s2, rtol=1e-3, atol=1e-5)

    if (
        out_allclose(cuda_out, torch_out)
        and scale_allclose(cuda_scale, torch_scale)
        and out_allclose(cuda_out, torch_eager_out)
        and scale_allclose(cuda_scale, torch_eager_scale)
    ):
    try:
        torch.testing.assert_close(
            cuda_out.to(torch.float32),
            torch_out.to(torch.float32),
            rtol=1e-3,
            atol=1e-5,
        )
        torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
        torch.testing.assert_close(
            cuda_out.to(torch.float32),
            torch_eager_out.to(torch.float32),
            rtol=1e-3,
            atol=1e-5,
        )
        torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
        print("✅ All implementations match")
    else:
    except AssertionError as e:
        print("❌ Implementations differ")
        print(e)


configs = []
@@ -91,7 +93,7 @@ def benchmark_quantization(
):
    device = torch.device("cuda")

    x = torch.randn(batch_size * hidden_size, 4096, device=device, dtype=dtype)
    x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)

    quantiles = [0.5, 0.2, 0.8]
    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
@@ -157,21 +159,21 @@ if __name__ == "__main__":
    )
    parser.add_argument("-c", "--check", action="store_true")
    parser.add_argument(
        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
    )
    parser.add_argument(
        "--hidden-sizes",
        type=int,
        nargs="+",
        default=None,
        help="Hidden sizes to benchmark (default: 1,16,64,128,256,512,1024,2048,4096)",
        default=[896, 1024, 2048, 4096, 7168],
        help="Hidden sizes to benchmark",
    )
    parser.add_argument(
        "--batch-sizes",
        type=int,
        nargs="+",
        default=None,
        help="Batch sizes to benchmark (default: 1,16,32,64,128)",
        default=[1, 16, 128, 512, 1024],
        help="Batch sizes to benchmark",
    )
    parser.add_argument(
        "--group-sizes",
@@ -192,8 +194,8 @@ if __name__ == "__main__":

    dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]

    hidden_sizes = args.hidden_sizes or [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
    batch_sizes = args.batch_sizes or [1, 16, 32, 64, 128]
    hidden_sizes = args.hidden_sizes
    batch_sizes = args.batch_sizes

    if args.group_sizes is not None:
        group_shapes = []
@@ -79,9 +79,9 @@ def make_rand_lora_weight_tensor(


def make_rand_tensors(
    a_shape: tuple[int],
    b_shape: tuple[int],
    c_shape: tuple[int],
    a_shape: tuple[int, ...],
    b_shape: tuple[int, ...],
    c_shape: tuple[int, ...],
    a_dtype: torch.dtype,
    b_dtype: torch.dtype,
    c_dtype: torch.dtype,
@@ -243,7 +243,7 @@ class OpType(Enum):
        lora_rank: int,
        num_loras: int,
        num_slices: int,
    ) -> tuple[tuple[int], tuple[int], tuple[int]]:
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """
        Given num_slices, return the shapes of the A, B, and C matrices
        in A x B = C, for the op_type
@@ -584,8 +584,9 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
    elif config.architectures[0] in (
        "DeepseekV3ForCausalLM",
        "DeepseekV2ForCausalLM",
        "DeepseekV3ForCausalLM",
        "DeepseekV32ForCausalLM",
        "Glm4MoeForCausalLM",
    ):
        E = config.n_routed_experts
@@ -8,12 +8,16 @@ import torch

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    get_col_major_tma_aligned_tensor,
    per_token_group_quant_fp8,
    w8a8_triton_block_scaled_mm,
    w8a8_block_fp8_matmul,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
from vllm.utils.deep_gemm import (
    calc_diff,
    fp8_gemm_nt,
    get_col_major_tma_aligned_tensor,
    per_block_cast_to_fp8,
)


def benchmark_shape(m: int,
@@ -59,7 +63,7 @@ def benchmark_shape(m: int,

    # === vLLM Triton Implementation ===
    def vllm_triton_gemm():
        return w8a8_triton_block_scaled_mm(A_vllm,
        return w8a8_block_fp8_matmul(A_vllm,
                                     B_vllm,
                                     A_scale_vllm,
                                     B_scale_vllm,
@@ -101,6 +101,7 @@ else()
  find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
  find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
  find_isa(${CPUINFO} "S390" S390_FOUND)
  find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
endif()

if (AVX512_FOUND AND NOT AVX512_DISABLED)
@@ -177,8 +178,14 @@ elseif (S390_FOUND)
    "-mzvector"
    "-march=native"
    "-mtune=native")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
  if(RVV_FOUND)
    message(FATAL_ERROR "RVV is not supported yet.")
  else()
    list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
  endif()
else()
  message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
  message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
endif()

#
@@ -18,8 +18,8 @@ if(FLASH_MLA_SRC_DIR)
else()
  FetchContent_Declare(
    flashmla
    GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
    GIT_TAG a757314c04eedd166e329e846c820eb1bdd702de
    GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
    GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
    GIT_PROGRESS TRUE
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ""
@@ -33,23 +33,64 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
# Only build FlashMLA kernels if we are building for something compatible with
# sm90a
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)

set(SUPPORT_ARCHS)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
  list(APPEND SUPPORT_ARCHS 9.0a)
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
  list(APPEND SUPPORT_ARCHS 10.0a)
endif()


cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
if(FLASH_MLA_ARCHS)
  set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
  list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")

  set(FlashMLA_SOURCES
    ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
    ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
    ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
    ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
    ${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu)
    ${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
    ${flashmla_SOURCE_DIR}/csrc/pybind.cpp
    ${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
    ${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
    ${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
    ${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
    ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
    ${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
    ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
    ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
    ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
  )

  set(FlashMLA_Extension_SOURCES
    ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
    ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
    ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
  )

  set(FlashMLA_INCLUDES
    ${flashmla_SOURCE_DIR}/csrc
    ${flashmla_SOURCE_DIR}/csrc/sm90
    ${flashmla_SOURCE_DIR}/csrc/cutlass/include
    ${flashmla_SOURCE_DIR}/csrc)
    ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
  )

  set(FlashMLA_Extension_INCLUDES
    ${flashmla_SOURCE_DIR}/csrc
    ${flashmla_SOURCE_DIR}/csrc/sm90
    ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
    ${flashmla_SOURCE_DIR}/csrc/cutlass/include
    ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
  )

  set_gencode_flags_for_srcs(
    SRCS "${FlashMLA_SOURCES}"
    CUDA_ARCHS "${FLASH_MLA_ARCHS}")

  set_gencode_flags_for_srcs(
    SRCS "${FlashMLA_Extension_SOURCES}"
    CUDA_ARCHS "${FLASH_MLA_ARCHS}")

  define_gpu_extension_target(
    _flashmla_C
    DESTINATION vllm
@@ -60,8 +101,32 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
    INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
    USE_SABI 3
    WITH_SOABI)

  # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
  # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
  target_compile_options(_flashmla_C PRIVATE
    $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
    $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)

  define_gpu_extension_target(
    _flashmla_extension_C
    DESTINATION vllm
    LANGUAGE ${VLLM_GPU_LANG}
    SOURCES ${FlashMLA_Extension_SOURCES}
    COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS}
    ARCHITECTURES ${VLLM_GPU_ARCHES}
    INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES}
    USE_SABI 3
    WITH_SOABI)

  # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
  # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
  target_compile_options(_flashmla_extension_C PRIVATE
    $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
    $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
else()
  # Create an empty target for setup.py when not targeting sm90a systems
  # Create empty targets for setup.py when not targeting sm90a systems
  add_custom_target(_flashmla_C)
  add_custom_target(_flashmla_extension_C)
endif()

@@ -56,3 +56,11 @@ void cp_gather_cache(
    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
    int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);

// Indexer K quantization and cache function
void indexer_k_quant_and_cache(
    torch::Tensor& k,             // [num_tokens, head_dim]
    torch::Tensor& kv_cache,      // [num_blocks, block_size, cache_stride]
    torch::Tensor& slot_mapping,  // [num_tokens]
    int64_t quant_block_size,     // quantization block size
    const std::string& scale_fmt);
@@ -16,6 +16,7 @@

#include <algorithm>
#include <cassert>
#include <cfloat>  // FLT_MIN
#include <map>
#include <vector>

@@ -396,6 +397,180 @@ __global__ void concat_and_cache_mla_kernel(
  copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
}

template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void concat_and_cache_ds_mla_kernel(
    const scalar_t* __restrict__ kv_c,  // [num_tokens, kv_lora_rank]
    const scalar_t* __restrict__ k_pe,  // [num_tokens, pe_dim]
    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank
                                     // + pe_dim)]
    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
    const int block_stride,                    //
    const int entry_stride,                    //
    const int kv_c_stride,                     //
    const int k_pe_stride,                     //
    const int kv_lora_rank,                    //
    const int pe_dim,                          //
    const int block_size,                      //
    const float* scale                         //
) {
  const int64_t token_idx = blockIdx.x;
  const int64_t slot_idx = slot_mapping[token_idx];
  // NOTE: slot_idx can be -1 if the token is padded
  if (slot_idx < 0) {
    return;
  }
  const int64_t block_idx = slot_idx / block_size;
  const int64_t block_offset = slot_idx % block_size;
  const int64_t dst_idx_start =
      block_idx * block_stride + block_offset * entry_stride;

  // Create 4 tile scales in shared memory
  __shared__ float smem[20];
  float* shard_abs_max = smem;
  float* tile_scales = smem + 16;

  // For the NoPE part, each tile of 128 elements is handled by 4 warps
  // (128 threads). There are 4 total tiles, so 16 warps (512 threads).
  // The first thread of the first warp in each tile writes the scale
  // value for the tile. The RoPE part (last 64 elements) is handled
  // by another 2 warps (64 threads).
  // So in total, we use 18 warps (576 threads) per block.

  // Cast kv_cache to 16_bit for RoPE values
  scalar_t* kv_cache_16bit =
      reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);

  // The last 64 threads handle the RoPE part
  if (threadIdx.x >= kv_lora_rank) {
    const int8_t pe_idx = threadIdx.x - kv_lora_rank;
    const int64_t src_idx = token_idx * k_pe_stride + pe_idx;
    // RoPE values start after the packed 8-bit NoPE values and the
    // 32-bit scales
    const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx;
    kv_cache_16bit[dst_idx] = k_pe[src_idx];
    return;
  }

  // Determine the scale for each chunk of NoPE
  const int16_t tile_idx = threadIdx.x >> 7;
  const int16_t warp_idx = (threadIdx.x & 127) >> 5;
  const int16_t lane_idx = threadIdx.x & 31;

  // Load the NoPE element for this thread into registers
  const int64_t src_idx = token_idx * kv_c_stride + threadIdx.x;
  const scalar_t src_val = kv_c[src_idx];

  // Warp-level reduction to find the max absolute value in the warp
  float max_abs = fabsf(src_val);
#pragma unroll
  for (int offset = 16; offset > 0; offset /= 2) {
#ifdef USE_ROCM
    max_abs = fmaxf(max_abs, __shfl_down_sync(UINT64_MAX, max_abs, offset));
#else
    max_abs = fmaxf(max_abs, __shfl_down_sync(0xFFFFFFFF, max_abs, offset));
#endif
  }

  // The first lane of each warp in each tile writes the max_abs of this part
  // of the tile to shared memory
  if (lane_idx == 0) {
    shard_abs_max[tile_idx * 4 + warp_idx] = max_abs;
  }
  __syncthreads();

  // The first lane of the first warp in each tile computes the scale for the
  // tile and writes it to shared memory and to kv_cache
  if (warp_idx == 0 && lane_idx == 0) {
    float4 shard_abs_max_vec =
        reinterpret_cast<float4*>(shard_abs_max)[tile_idx];
    float tile_scale = fmaxf(fmaxf(shard_abs_max_vec.x, shard_abs_max_vec.y),
                             fmaxf(shard_abs_max_vec.z, shard_abs_max_vec.w)) /
                       448.f;

    // Avoid division by zero in `scaled_convert`
    tile_scales[tile_idx] = fmaxf(tile_scale, FLT_MIN);
    float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
    const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
    kv_cache_32bit[dst_idx] = tile_scales[tile_idx];
  }

  __syncthreads();

  // Now all threads in the block scale and write their element
  const float scale_val = tile_scales[tile_idx];
  const int64_t dst_idx = dst_idx_start + threadIdx.x;
  kv_cache[dst_idx] =
      fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
          src_val, scale_val);
}

template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void indexer_k_quant_and_cache_kernel(
    const scalar_t* __restrict__ k,  // [num_tokens, head_dim]
    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, cache_stride]
    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
    const int head_dim,          // dimension of each head
    const int quant_block_size,  // quantization block size
    const int cache_block_size,  // cache block size
    const int cache_stride,      // stride for each token in kv_cache
    const bool use_ue8m0         // use ue8m0 scale format
) {
  constexpr int VEC_SIZE = 4;
  const int64_t token_idx = blockIdx.x;
  const int64_t head_dim_idx = (blockIdx.y * blockDim.y * blockDim.x +
                                threadIdx.y * blockDim.x + threadIdx.x) *
                               VEC_SIZE;
  const int64_t slot_idx = slot_mapping[token_idx];
  const int64_t block_idx = slot_idx / cache_block_size;
  const int64_t block_offset = slot_idx % cache_block_size;

  // NOTE: slot_idx can be -1 if the token is padded
  if (slot_idx < 0 || (head_dim_idx >= head_dim)) {
    return;
  }

  float2 k_val = (reinterpret_cast<const float2*>(
      k))[(token_idx * head_dim + head_dim_idx) / VEC_SIZE];
  scalar_t* k_val_ptr = reinterpret_cast<scalar_t*>(&k_val);
  float amax = 0.0f;
  for (int i = 0; i < VEC_SIZE; i++) {
    amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
  }
#ifndef USE_ROCM
  __syncwarp();
#endif

  // Reduced amax
  for (int mask = 16; mask > 0; mask /= 2) {
#ifdef USE_ROCM
    amax = fmaxf(amax, __shfl_xor_sync(uint64_t(-1), amax, mask));
#else
    amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
#endif
  }
#ifndef USE_ROCM
  __syncwarp();
#endif
  float scale = fmaxf(amax, 1e-4) / 448.0f;
  if (use_ue8m0) {
    scale = exp2f(ceilf(log2f(scale)));
  }

  const int64_t dst_offset = block_idx * cache_block_size * cache_stride +
                             block_offset * head_dim + head_dim_idx;
  for (int i = 0; i < VEC_SIZE; i++) {
    kv_cache[dst_offset + i] =
        fp8::scaled_convert<cache_t, scalar_t, kv_dt>(k_val_ptr[i], scale);
  }
  if (threadIdx.x == 0) {
    const int64_t dst_scale_idx =
        block_idx * cache_block_size * cache_stride +
        cache_block_size * head_dim +
        (block_offset * head_dim + head_dim_idx) * 4 / quant_block_size;
    reinterpret_cast<float*>(kv_cache)[dst_scale_idx / 4] = scale;
  }
}

}  // namespace vllm

// KV_T is the data type of key and value tensors.
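As a reading aid for the hunk above, here is a standalone sketch (my own illustration, not part of the diff) of the fp8_ds_mla cache-entry arithmetic and the thread mapping the kernel assumes: 512 one-byte fp8 NoPE values, then four float32 tile scales, then 64 16-bit RoPE values per entry, with threadIdx.x decomposed into a (tile, warp, lane) triple. The constants match the TORCH_CHECKs in concat_and_cache_mla further down.

#include <cstdio>

int main() {
  constexpr int kv_lora_rank = 512;  // NoPE elements, stored as 1-byte fp8
  constexpr int pe_dim = 64;         // RoPE elements, stored as 2-byte values
  constexpr int num_tiles = 4;       // one float scale per 128-element tile
  // Entry layout: 512 fp8 bytes, then 4 x 4-byte scales, then 64 x 2 bytes.
  constexpr int entry_bytes = kv_lora_rank + num_tiles * 4 + pe_dim * 2;
  static_assert(entry_bytes == 656, "matches the 656-byte TORCH_CHECK");

  // threadIdx.x -> (tile, warp, lane) for the 576-thread (18-warp) block.
  for (int t : {0, 127, 128, 511, 512, 575}) {
    if (t < kv_lora_rank) {
      std::printf("thread %3d -> NoPE tile %d, warp %d, lane %d\n", t,
                  t >> 7, (t & 127) >> 5, t & 31);
    } else {
      std::printf("thread %3d -> RoPE element %d\n", t, t - kv_lora_rank);
    }
  }
  return 0;
}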
@@ -438,7 +613,7 @@ void reshape_and_cache(
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype,
                             CALL_RESHAPE_AND_CACHE)
                             CALL_RESHAPE_AND_CACHE);
}

// KV_T is the data type of key and value tensors.
@@ -509,6 +684,18 @@ void reshape_and_cache_flash(
          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,     \
          reinterpret_cast<const float*>(scale.data_ptr()));

// KV_T is the data type of key and value tensors.
// CACHE_T is the stored data type of kv-cache.
#define CALL_CONCAT_AND_CACHE_DS_MLA(KV_T, CACHE_T, KV_DTYPE)             \
  vllm::concat_and_cache_ds_mla_kernel<KV_T, CACHE_T, KV_DTYPE>           \
      <<<grid, block, 0, stream>>>(                                       \
          reinterpret_cast<KV_T*>(kv_c.data_ptr()),                       \
          reinterpret_cast<KV_T*>(k_pe.data_ptr()),                       \
          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),                \
          slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride,   \
          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,     \
          reinterpret_cast<const float*>(scale.data_ptr()));

void concat_and_cache_mla(
    torch::Tensor& kv_c,  // [num_tokens, kv_lora_rank]
    torch::Tensor& k_pe,  // [num_tokens, pe_dim]
|
||||
int pe_dim = k_pe.size(1);
|
||||
int block_size = kv_cache.size(1);
|
||||
|
||||
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
|
||||
if (kv_cache_dtype == "fp8_ds_mla") {
|
||||
TORCH_CHECK(kv_lora_rank == 512, "kv_lora_rank must be 512 for fp8_ds_mla");
|
||||
TORCH_CHECK(pe_dim == 64, "pe_dim must be 64 for fp8_ds_mla");
|
||||
TORCH_CHECK(kv_cache.size(2) == 656 / kv_cache.itemsize(),
|
||||
"kv_cache.size(2) must be 656 bytes for fp8_ds_mla");
|
||||
TORCH_CHECK(kv_c.itemsize() == 2,
|
||||
"kv_c.itemsize() must be 2 for fp8_ds_mla");
|
||||
TORCH_CHECK(k_pe.itemsize() == 2,
|
||||
"k_pe.itemsize() must be 2 for fp8_ds_mla");
|
||||
} else {
|
||||
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
|
||||
}
|
||||
|
||||
int kv_c_stride = kv_c.stride(0);
|
||||
int k_pe_stride = k_pe.stride(0);
|
||||
int block_stride = kv_cache.stride(0);
|
||||
int entry_stride = kv_cache.stride(1);
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(std::min(kv_lora_rank, 512));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
|
||||
CALL_CONCAT_AND_CACHE_MLA);
|
||||
if (kv_cache_dtype == "fp8_ds_mla") {
|
||||
dim3 grid(num_tokens);
|
||||
// For the NoPE part, each tile of 128 elements is handled by 4 warps
|
||||
// (128 threads). There are 4 total tiles, so 16 warps (512 threads).
|
||||
// The first thread of the first warp in each tile writes the scale
|
||||
// value for the tile. The RoPE part (last 64 elements) is handled
|
||||
// by another 2 warps (64 threads).
|
||||
// So in total, we use 18 warps (576 threads) per block.
|
||||
dim3 block(576);
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
|
||||
CALL_CONCAT_AND_CACHE_DS_MLA);
|
||||
} else {
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(std::min(kv_lora_rank, 512));
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
|
||||
CALL_CONCAT_AND_CACHE_MLA);
|
||||
}
|
||||
}
|
||||
|
||||
namespace vllm {
|
||||
@@ -922,3 +1133,42 @@ void cp_gather_cache(
    TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
  }
}

// Macro to dispatch the kernel based on the data type.
#define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE)           \
  vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE>         \
      <<<grid, block, 0, stream>>>(                                       \
          reinterpret_cast<KV_T*>(k.data_ptr()),                          \
          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),                \
          slot_mapping.data_ptr<int64_t>(), head_dim, quant_block_size,   \
          cache_block_size, cache_stride, use_ue8m0);

void indexer_k_quant_and_cache(
    torch::Tensor& k,             // [num_tokens, head_dim]
    torch::Tensor& kv_cache,      // [num_blocks, block_size, cache_stride]
    torch::Tensor& slot_mapping,  // [num_tokens]
    int64_t quant_block_size,     // quantization block size
    const std::string& scale_fmt) {
  int num_tokens = k.size(0);
  int head_dim = k.size(1);
  int cache_block_size = kv_cache.size(1);
  int cache_stride = kv_cache.size(2);
  bool use_ue8m0 = scale_fmt == "ue8m0";

  TORCH_CHECK(k.device() == kv_cache.device(),
              "k and kv_cache must be on the same device");
  TORCH_CHECK(k.device() == slot_mapping.device(),
              "k and slot_mapping must be on the same device");
  TORCH_CHECK(head_dim % quant_block_size == 0,
              "head_dim must be divisible by quant_block_size");

  constexpr int vec_size = 4;
  dim3 grid(num_tokens, (head_dim + quant_block_size * vec_size - 1) /
                            (quant_block_size * vec_size));
  dim3 block(32, vec_size);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(k));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
                             CALL_INDEXER_K_QUANT_AND_CACHE);
}
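To make the launch arithmetic above concrete, a standalone sketch with assumed sizes (num_tokens, head_dim, and quant_block_size below are hypothetical illustration values, not taken from the diff):

#include <cstdio>

int main() {
  constexpr int vec_size = 4;  // elements written per thread
  int num_tokens = 8;          // hypothetical batch of tokens
  int head_dim = 128;          // hypothetical indexer head size
  int quant_block_size = 128;  // one scale per 128 elements

  // A block is dim3(32, vec_size) = 128 threads, and each thread covers
  // vec_size elements, so one y-block spans quant_block_size * vec_size
  // elements of the head dimension.
  int span = quant_block_size * vec_size;     // 512
  int grid_y = (head_dim + span - 1) / span;  // ceil-div -> 1 here
  std::printf("grid = (%d, %d), block = (32, %d)\n", num_tokens, grid_y,
              vec_size);
  // Threads whose head_dim_idx falls past head_dim return early, so a
  // partially used y-block is harmless.
  return 0;
}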
@@ -14,7 +14,8 @@
// arm implementation
#include "cpu_types_arm.hpp"
#else
#warning "unsupported vLLM cpu implementation"
#warning "unsupported vLLM cpu implementation, vLLM will compile with scalar"
#include "cpu_types_scalar.hpp"
#endif

#ifdef _OPENMP
csrc/cpu/cpu_types_scalar.hpp (new file, 513 lines)
@@ -0,0 +1,513 @@
#include <cmath>
#include <cstdint>
#include <cstring>
#include <torch/all.h>
#include "float_convert.hpp"

namespace vec_op {

#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)             \
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)     \
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)  \
  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)

#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

#ifndef CPU_OP_GUARD
#define CPU_KERNEL_GUARD_IN(NAME)
#define CPU_KERNEL_GUARD_OUT(NAME)
#else
#define CPU_KERNEL_GUARD_IN(NAME) \
  std::cout << #NAME << " invoked." << std::endl;
#define CPU_KERNEL_GUARD_OUT(NAME) \
  std::cout << #NAME << " exit." << std::endl;
#endif

#define FORCE_INLINE __attribute__((always_inline)) inline

#define __max(a, b) ((a) > (b) ? (a) : (b))
#define __min(a, b) ((a) < (b) ? (a) : (b))
#define __abs(a) ((a) < (0) ? (0 - a) : (a))

typedef struct f16x8_t {
  uint16_t val[8];
} f16x8_t;

typedef struct f16x16_t {
  uint16_t val[16];
} f16x16_t;

typedef struct f16x32_t {
  uint16_t val[32];
} f16x32_t;

typedef struct f32x4_t {
  float val[4];
} f32x4_t;

typedef struct f32x8_t {
  float val[8];
} f32x8_t;

typedef struct f32x16_t {
  float val[16];
} f32x16_t;

namespace {
template <typename T, T... indexes, typename F>
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
  (f(std::integral_constant<T, indexes>{}), ...);
};
};  // namespace

template <typename T, T count, typename F,
          typename = std::enable_if_t<std::is_invocable_v<F, T> > >
constexpr void unroll_loop(F&& f) {
  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
}

template <typename T>
|
||||
struct Vec {
|
||||
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
|
||||
};
|
||||
|
||||
struct FP32Vec8;
|
||||
struct FP32Vec16;
|
||||
|
||||
struct FP16Vec8 : public Vec<FP16Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
f16x8_t reg;
|
||||
|
||||
explicit FP16Vec8(const void* ptr)
|
||||
: reg(*reinterpret_cast<const f16x8_t*>(ptr)) {};
|
||||
|
||||
explicit FP16Vec8(const FP32Vec8&);
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f16x8_t*>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct FP16Vec16 : public Vec<FP16Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
f16x16_t reg;
|
||||
|
||||
explicit FP16Vec16(const void* ptr)
|
||||
: reg(*reinterpret_cast<const f16x16_t*>(ptr)) {};
|
||||
|
||||
explicit FP16Vec16(const FP32Vec16&);
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }
|
||||
|
||||
void save(void* ptr, const int elem_num) const {
|
||||
int num = __min(elem_num, VEC_ELEM_NUM);
|
||||
std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
|
||||
}
|
||||
};
|
||||
|
||||
struct BF16Vec8 : public Vec<BF16Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
f16x8_t reg;
|
||||
|
||||
explicit BF16Vec8(const void* ptr)
|
||||
: reg(*reinterpret_cast<const f16x8_t*>(ptr)) {};
|
||||
|
||||
explicit BF16Vec8(const FP32Vec8&);
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f16x8_t*>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct BF16Vec16 : public Vec<BF16Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
f16x16_t reg;
|
||||
|
||||
explicit BF16Vec16(const void* ptr)
|
||||
: reg(*reinterpret_cast<const f16x16_t*>(ptr)) {};
|
||||
|
||||
explicit BF16Vec16(const FP32Vec16&);
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }
|
||||
|
||||
void save(void* ptr, const int elem_num) const {
|
||||
int num = __min(elem_num, VEC_ELEM_NUM);
|
||||
std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
|
||||
}
|
||||
};
|
||||
|
||||
struct BF16Vec32 : public Vec<BF16Vec32> {
|
||||
constexpr static int VEC_ELEM_NUM = 32;
|
||||
f16x32_t reg;
|
||||
|
||||
explicit BF16Vec32(const void* ptr)
|
||||
: reg(*reinterpret_cast<const f16x32_t*>(ptr)) {};
|
||||
|
||||
explicit BF16Vec32(f16x32_t data) : reg(data) {};
|
||||
|
||||
explicit BF16Vec32(BF16Vec8& vec8_data) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = vec8_data.reg.val[i % BF16Vec8::VEC_ELEM_NUM];
|
||||
}
|
||||
}
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f16x32_t*>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct FP32Vec4 : public Vec<FP32Vec4> {
|
||||
constexpr static int VEC_ELEM_NUM = 4;
|
||||
|
||||
f32x4_t reg;
|
||||
|
||||
explicit FP32Vec4(float v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = v;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec4() {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec4(const float* ptr)
|
||||
: reg(*reinterpret_cast<const f32x4_t*>(ptr)) {};
|
||||
|
||||
explicit FP32Vec4(f32x4_t data) : reg(data) {};
|
||||
|
||||
explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {};
|
||||
};
|
||||
|
||||
struct FP32Vec8 : public Vec<FP32Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
|
||||
f32x8_t reg;
|
||||
|
||||
explicit FP32Vec8(float v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = v;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec8() {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec8(const float* ptr)
|
||||
: reg(*reinterpret_cast<const f32x8_t*>(ptr)) {};
|
||||
|
||||
explicit FP32Vec8(f32x8_t data) : reg(data) {};
|
||||
|
||||
explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {};
|
||||
|
||||
explicit FP32Vec8(const FP16Vec8& v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = fp16_to_float(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
FP32Vec8(const BF16Vec8& v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = bf16_to_float(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
float reduce_sum() const {
|
||||
float result = 0;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result += reg.val[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec8 exp() const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = expf(reg.val[i]);
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 tanh() const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = tanhf(reg.val[i]);
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 er() const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = erf(reg.val[i]);
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 operator*(const FP32Vec8& b) const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = reg.val[i] * b.reg.val[i];
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 operator+(const FP32Vec8& b) const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = reg.val[i] + b.reg.val[i];
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 operator-(const FP32Vec8& b) const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = reg.val[i] - b.reg.val[i];
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 operator/(const FP32Vec8& b) const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
      ret.val[i] = reg.val[i] / b.reg.val[i];
    }
    return FP32Vec8(ret);
  }

  void save(void* ptr) const { *reinterpret_cast<f32x8_t*>(ptr) = reg; }
};

struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  f32x16_t reg;

  explicit FP32Vec16(float v) {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = v;
    }
  }

  explicit FP32Vec16() {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = 0.0f;
    }
  }

  explicit FP32Vec16(const float* ptr)
      : reg(*reinterpret_cast<const f32x16_t*>(ptr)) {}

  explicit FP32Vec16(f32x16_t data) : reg(data) {}

  FP32Vec16(const FP32Vec4& data) {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM];
    }
  }

  FP32Vec16(const FP32Vec8& data) {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM];
    }
  }

  FP32Vec16(const FP32Vec16& data) : reg(data.reg) {}

  explicit FP32Vec16(const FP16Vec16& v) {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = fp16_to_float(v.reg.val[i]);
    }
  }

  explicit FP32Vec16(const BF16Vec16& v) {
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      reg.val[i] = bf16_to_float(v.reg.val[i]);
    }
  }

  explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}

  FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}

  FP32Vec16 operator*(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = reg.val[i] * b.reg.val[i];
    }
    return result;
  }

  FP32Vec16 operator+(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = reg.val[i] + b.reg.val[i];
    }
    return result;
  }

  FP32Vec16 operator-(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = reg.val[i] - b.reg.val[i];
    }
    return result;
  }

  FP32Vec16 operator/(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = reg.val[i] / b.reg.val[i];
    }
    return result;
  }

  FP32Vec16 max(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = __max(reg.val[i], b.reg.val[i]);
    }
    return result;
  }

  FP32Vec16 min(const FP32Vec16& b) const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = __min(reg.val[i], b.reg.val[i]);
    }
    return result;
  }

  FP32Vec16 abs() const {
    FP32Vec16 result(0.0f);
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result.reg.val[i] = __abs(reg.val[i]);
    }
    return result;
  }

  float reduce_sum() const {
    float result = 0.0f;
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result += reg.val[i];
    }
    return result;
  }

  float reduce_max() const {
    float result = reg.val[0];
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result = __max(reg.val[i], result);
    }
    return result;
  }

  float reduce_min() const {
    float result = reg.val[0];
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      result = __min(reg.val[i], result);
    }
    return result;
  }

  template <int group_size>
  float reduce_sub_sum(int idx) {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    float sum = 0.0;
    int start = idx * group_size;
    int end = (idx + 1) * group_size;

    for (; (start < VEC_ELEM_NUM) && (start < end); ++start) {
      sum += reg.val[start];
    }

    return sum;
  }

  void save(void* ptr) const { *reinterpret_cast<f32x16_t*>(ptr) = reg; }
};

template <typename T>
struct VecType {
  using vec_type = void;
};

template <typename T>
using vec_t = typename VecType<T>::vec_type;

template <>
struct VecType<float> {
  using vec_type = FP32Vec8;
};

template <>
struct VecType<c10::Half> {
  using vec_type = FP16Vec8;
};

template <>
struct VecType<c10::BFloat16> {
  using vec_type = BF16Vec8;
};

template <typename T>
void storeFP32(float v, T* ptr) {
  *ptr = v;
}

/*
template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
  c10::Half __attribute__((__may_alias__)) *v_ptr =
      reinterpret_cast<c10::Half *>(&v);
  *ptr = *(v_ptr + 1);
}
*/

template <>
inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
  uint16_t fp16 = float_to_fp16(v);
  *reinterpret_cast<uint16_t*>(ptr) = fp16;
}

template <>
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
  c10::BFloat16 __attribute__((__may_alias__))* v_ptr =
      reinterpret_cast<c10::BFloat16*>(&v);
  *ptr = *(v_ptr + 1);
}

inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
  for (int i = 0; i < FP16Vec16::VEC_ELEM_NUM; ++i) {
    reg.val[i] = float_to_fp16(v.reg.val[i]);
  }
}

inline FP16Vec8::FP16Vec8(const FP32Vec8& v) {
  for (int i = 0; i < FP16Vec8::VEC_ELEM_NUM; ++i) {
    reg.val[i] = float_to_fp16(v.reg.val[i]);
  }
}

inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
  acc = acc + a * b;
}

inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
  for (int i = 0; i < BF16Vec8::VEC_ELEM_NUM; ++i) {
    reg.val[i] = float_to_bf16(v.reg.val[i]);
  }
}

inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
  for (int i = 0; i < BF16Vec16::VEC_ELEM_NUM; ++i) {
    reg.val[i] = float_to_bf16(v.reg.val[i]);
  }
}

inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 3); }

};  // namespace vec_op
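
For orientation, here is a minimal Python sketch of the scalar-fallback semantics above (illustrative only, not part of the diff; the `FP32Vec16` name is mirrored purely for readability):

```python
# Minimal sketch of the scalar FP32Vec16 element-wise and reduction semantics.
from dataclasses import dataclass, field

VEC_ELEM_NUM = 16

@dataclass
class FP32Vec16:
    val: list[float] = field(default_factory=lambda: [0.0] * VEC_ELEM_NUM)

    def __mul__(self, other: "FP32Vec16") -> "FP32Vec16":
        return FP32Vec16([a * b for a, b in zip(self.val, other.val)])

    def __add__(self, other: "FP32Vec16") -> "FP32Vec16":
        return FP32Vec16([a + b for a, b in zip(self.val, other.val)])

    def reduce_sum(self) -> float:
        return sum(self.val)

    def reduce_sub_sum(self, group_size: int, idx: int) -> float:
        # Sum one group_size-wide slice, like the templated C++ version.
        assert VEC_ELEM_NUM % group_size == 0
        start = idx * group_size
        return sum(self.val[start:start + group_size])

a = FP32Vec16([float(i) for i in range(16)])
b = FP32Vec16([2.0] * 16)
acc = FP32Vec16()
acc = acc + a * b          # same contract as the C++ fma() helper
assert acc.reduce_sum() == 240.0
assert acc.reduce_sub_sum(group_size=4, idx=0) == 12.0  # 0+2+4+6
```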

csrc/cpu/float_convert.hpp (new file, 106 lines)
@@ -0,0 +1,106 @@
static float bf16_to_float(uint16_t bf16) {
  uint32_t bits = static_cast<uint32_t>(bf16) << 16;
  float fp32;
  std::memcpy(&fp32, &bits, sizeof(fp32));
  return fp32;
}

static uint16_t float_to_bf16(float fp32) {
  uint32_t bits;
  std::memcpy(&bits, &fp32, sizeof(fp32));
  return static_cast<uint16_t>(bits >> 16);
}

/************************************************
 * Copyright (c) 2015 Princeton Vision Group
 * Licensed under the MIT license.
 * Codes below copied from
 * https://github.com/PrincetonVision/marvin/tree/master/tools/tensorIO_matlab
 *************************************************/
static uint16_t float_to_fp16(float fp32) {
  uint16_t fp16;

  unsigned x;
  unsigned u, remainder, shift, lsb, lsb_s1, lsb_m1;
  unsigned sign, exponent, mantissa;

  std::memcpy(&x, &fp32, sizeof(fp32));
  u = (x & 0x7fffffff);

  // Get rid of +NaN/-NaN case first.
  if (u > 0x7f800000) {
    fp16 = 0x7fffU;
    return fp16;
  }

  sign = ((x >> 16) & 0x8000);

  // Get rid of +Inf/-Inf, +0/-0.
  if (u > 0x477fefff) {
    fp16 = sign | 0x7c00U;
    return fp16;
  }
  if (u < 0x33000001) {
    fp16 = (sign | 0x0000);
    return fp16;
  }

  exponent = ((u >> 23) & 0xff);
  mantissa = (u & 0x7fffff);

  if (exponent > 0x70) {
    shift = 13;
    exponent -= 0x70;
  } else {
    shift = 0x7e - exponent;
    exponent = 0;
    mantissa |= 0x800000;
  }
  lsb = (1 << shift);
  lsb_s1 = (lsb >> 1);
  lsb_m1 = (lsb - 1);

  // Round to nearest even.
  remainder = (mantissa & lsb_m1);
  mantissa >>= shift;
  if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
    ++mantissa;
    if (!(mantissa & 0x3ff)) {
      ++exponent;
      mantissa = 0;
    }
  }

  fp16 = (sign | (exponent << 10) | mantissa);

  return fp16;
}

static float fp16_to_float(uint16_t fp16) {
  unsigned sign = ((fp16 >> 15) & 1);
  unsigned exponent = ((fp16 >> 10) & 0x1f);
  unsigned mantissa = ((fp16 & 0x3ff) << 13);
  int temp;
  float fp32;
  if (exponent == 0x1f) { /* NaN or Inf */
    mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
    exponent = 0xff;
  } else if (!exponent) { /* Denorm or Zero */
    if (mantissa) {
      unsigned int msb;
      exponent = 0x71;
      do {
        msb = (mantissa & 0x400000);
        mantissa <<= 1; /* normalize */
        --exponent;
      } while (!msb);
      mantissa &= 0x7fffff; /* 1.mantissa is implicit */
    }
  } else {
    exponent += 0x70;
  }
  temp = ((sign << 31) | (exponent << 23) | mantissa);
  std::memcpy(&fp32, &temp, sizeof(temp));
  return fp32;
}
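
As a quick sanity check on the conversion semantics, here is a small Python sketch (illustrative, not part of the diff) of the same bf16 truncation, plus a round-to-nearest-even spot check via numpy, whose float16 cast uses the same rounding rule as `float_to_fp16()`:

```python
# Illustrative check of the bf16/fp16 conversion semantics above.
import struct

import numpy as np

def float_to_bf16(fp32: float) -> int:
    # bf16 keeps the top 16 bits of the IEEE-754 float32 encoding
    # (the C++ helper truncates the same way; no rounding).
    (bits,) = struct.unpack("<I", struct.pack("<f", fp32))
    return bits >> 16

def bf16_to_float(bf16: int) -> float:
    (fp32,) = struct.unpack("<f", struct.pack("<I", bf16 << 16))
    return fp32

x = 1.0 + 2.0 ** -7          # exactly representable in bf16 (7 mantissa bits)
assert bf16_to_float(float_to_bf16(x)) == x

# fp16 round-to-nearest-even: 2049 is exactly halfway between the
# representable values 2048 and 2050 and rounds to the even mantissa, 2048.
assert float(np.float16(2049.0)) == 2048.0
```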

@@ -44,6 +44,9 @@ __global__ void moe_align_block_size_kernel(

  for (size_t i = tid; i < numel; i += stride) {
    int expert_id = topk_ids[i];
    if (expert_id >= num_experts) {
      continue;
    }
    int warp_idx = expert_id / experts_per_warp;
    int expert_offset = expert_id % experts_per_warp;
    atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
@@ -95,12 +98,15 @@ template <typename scalar_t>
__global__ void count_and_sort_expert_tokens_kernel(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
    size_t numel) {
    size_t numel, int32_t num_experts) {
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = blockDim.x * gridDim.x;

  for (size_t i = tid; i < numel; i += stride) {
    int32_t expert_id = topk_ids[i];
    if (expert_id >= num_experts) {
      continue;
    }
    int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
    sorted_token_ids[rank_post_pad] = i;
  }
@@ -269,7 +275,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
        sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
            topk_ids.data_ptr<scalar_t>(),
            sorted_token_ids.data_ptr<int32_t>(),
            cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
            cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel(), num_experts);
      }
    });
}
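
The new `expert_id >= num_experts` guard skips tokens whose routed expert is out of range for this rank. A rough Python sketch of the count-then-sort logic with the same guard (illustrative only, not the kernel itself):

```python
# Sketch of count_and_sort_expert_tokens with the out-of-range guard.
def count_and_sort_expert_tokens(topk_ids, num_experts):
    counts = [0] * num_experts
    for expert_id in topk_ids:
        if expert_id >= num_experts:   # token routed off this rank: skip
            continue
        counts[expert_id] += 1

    # Exclusive prefix sum gives each expert its output offset.
    cumsum = [0] * (num_experts + 1)
    for e in range(num_experts):
        cumsum[e + 1] = cumsum[e] + counts[e]

    sorted_token_ids = [-1] * cumsum[num_experts]
    cursor = cumsum[:num_experts]      # per-expert write cursor
    for i, expert_id in enumerate(topk_ids):
        if expert_id >= num_experts:
            continue
        sorted_token_ids[cursor[expert_id]] = i
        cursor[expert_id] += 1
    return sorted_token_ids

# Tokens 1 and 4 reference expert 7, which does not exist on this rank.
assert count_and_sort_expert_tokens([2, 7, 0, 2, 7], num_experts=4) == [2, 0, 3]
```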

@@ -576,6 +576,17 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
      TORCH_CHECK(false,                                                \
                  "Unsupported input type of kv cache: ", SRC_DTYPE);   \
    }                                                                   \
  } else if (KV_DTYPE == "fp8_ds_mla") {                                \
    if (SRC_DTYPE == at::ScalarType::Float) {                           \
      FN(float, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3);           \
    } else if (SRC_DTYPE == at::ScalarType::Half) {                     \
      FN(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3);        \
    } else if (SRC_DTYPE == at::ScalarType::BFloat16) {                 \
      FN(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3);   \
    } else {                                                            \
      TORCH_CHECK(false,                                                \
                  "Unsupported input type of kv cache: ", SRC_DTYPE);   \
    }                                                                   \
  } else {                                                              \
    TORCH_CHECK(false, "Unsupported data type of kv cache: ", KV_DTYPE); \
  }                                                                     \
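
Conceptually the macro is a two-level dispatch on (KV cache dtype, source dtype); a hedged Python sketch of the same mapping (function and table names are illustrative):

```python
# Sketch of the two-level (kv_cache_dtype, src_dtype) dispatch above.
SUPPORTED = {
    "fp8_ds_mla": {"float32", "float16", "bfloat16"},
}

def dispatch_kv_cache_kernel(kv_dtype: str, src_dtype: str):
    srcs = SUPPORTED.get(kv_dtype)
    if srcs is None:
        raise TypeError(f"Unsupported data type of kv cache: {kv_dtype}")
    if src_dtype not in srcs:
        raise TypeError(f"Unsupported input type of kv cache: {src_dtype}")
    # The real macro expands FN(src_t, uint8_t, kFp8E4M3) here; entries are
    # stored as raw uint8 bytes regardless of the source dtype.
    return ("uint8", "kFp8E4M3")

assert dispatch_kv_cache_kernel("fp8_ds_mla", "bfloat16") == ("uint8", "kFp8E4M3")
```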

@@ -6,11 +6,11 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
                    const int64_t rows_per_block);

torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
                       const c10::optional<at::Tensor>& in_bias,
                       const std::optional<at::Tensor>& in_bias,
                       const int64_t CuCount);

void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
               const c10::optional<at::Tensor>& in_bias, at::Tensor& out_c,
               const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
               const at::Tensor& scale_a, const at::Tensor& scale_b,
               const int64_t CuCount);

@@ -1271,7 +1271,7 @@ int mindiv(int N, int div1, int div2) {
}

torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
                       const c10::optional<at::Tensor>& in_bias,
                       const std::optional<at::Tensor>& in_bias,
                       const int64_t CuCount) {
  auto M_in = in_a.size(0);
  auto K_in = in_a.size(1);
@@ -1729,7 +1729,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
#endif  // defined(__HIP__MI3XX__) TODO: Add NAVI support

void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
               const c10::optional<at::Tensor>& in_bias, at::Tensor& out_c,
               const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
               const at::Tensor& scale_a, const at::Tensor& scale_b,
               const int64_t CuCount) {
  static c10::ScalarType kFp8Type = is_fp8_ocp()

@@ -713,6 +713,13 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
      "cp_gather_cache(Tensor src_cache, Tensor! dst, Tensor block_table, "
      "Tensor cu_seq_lens, int batch_size, Tensor? seq_starts) -> ()");
  cache_ops.impl("cp_gather_cache", torch::kCUDA, &cp_gather_cache);

  cache_ops.def(
      "indexer_k_quant_and_cache(Tensor k, Tensor! kv_cache, Tensor "
      "slot_mapping, "
      "int quant_block_size, str kv_cache_dtype) -> ()");
  cache_ops.impl("indexer_k_quant_and_cache", torch::kCUDA,
                 &indexer_k_quant_and_cache);
}
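
Once registered, the op is reachable from Python through the usual `torch.ops` path. A hedged sketch (assumes a CUDA build of vLLM that compiled this extension; the tensor shapes and the `kv_cache_dtype` string are placeholders, not taken from the diff):

```python
# Hedged sketch: invoking the newly registered cache op from Python.
import torch

k = torch.randn(32, 128, dtype=torch.bfloat16, device="cuda")
kv_cache = torch.zeros(1024, 64, 132, dtype=torch.uint8, device="cuda")  # placeholder layout
slot_mapping = torch.arange(32, dtype=torch.long, device="cuda")

# Schema: indexer_k_quant_and_cache(Tensor k, Tensor! kv_cache,
#         Tensor slot_mapping, int quant_block_size, str kv_cache_dtype) -> ()
torch.ops._C_cache_ops.indexer_k_quant_and_cache(
    k, kv_cache, slot_mapping, 128, "fp8_e4m3")  # dtype string is illustrative
```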

TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {

@@ -404,6 +404,9 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
    fi
    echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
    export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
    # HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
    uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
    # Build AOT kernels
    TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
        python3 -m flashinfer.aot

@@ -114,9 +114,6 @@ WORKDIR /workspace/vllm
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
    cp requirements/test.in requirements/cpu-test.in && \
    sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
    sed -i 's/^torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
    uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu

RUN --mount=type=cache,target=/root/.cache/uv \

@@ -65,8 +65,6 @@ ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG FA_BRANCH
ARG FA_REPO
RUN git clone ${PYTORCH_REPO} pytorch
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \
    pip install -r requirements.txt && git submodule update --init --recursive \
@@ -77,14 +75,20 @@ RUN git clone ${PYTORCH_VISION_REPO} vision
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
    && python3 setup.py bdist_wheel --dist-dir=dist \
    && pip install dist/*.whl
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
    && cp /app/vision/dist/*.whl /app/install

FROM base AS build_fa
ARG FA_BRANCH
ARG FA_REPO
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl
RUN git clone ${FA_REPO}
RUN cd flash-attention \
    && git checkout ${FA_BRANCH} \
    && git submodule update --init \
    && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
    && cp /app/vision/dist/*.whl /app/install \
    && cp /app/flash-attention/dist/*.whl /app/install
RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install

FROM base AS build_aiter
ARG AITER_BRANCH
@@ -103,6 +107,8 @@ FROM base AS debs
RUN mkdir /app/debs
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
@@ -111,13 +117,7 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs

FROM base AS final
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
    pip install /install/*.whl
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
    pip install /install/*.whl
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
    pip install /install/*.whl

ARG BASE_IMAGE

@@ -139,9 +139,9 @@ there is relatively little gain from TP. On the other hand, TP incurs significan
overhead because of all-reduce being performed after every layer.

Given this, it may be advantageous to instead shard the batched input data using TP, essentially
performing batch-level DP. This has been shown to improve the throughput by around 10% for
performing batch-level DP. This has been shown to improve the throughput and TTFT by around 10% for
`tensor_parallel_size=8`. For vision encoders that use hardware-unoptimized Conv3D operations,
batch-level DP can provide another 40% increase to throughput compared to regular TP.
batch-level DP can provide another 40% improvement compared to regular TP.

Nevertheless, since the weights of the multi-modal encoder are replicated across each TP rank,
there will be a minor increase in memory consumption, which may cause OOM if you can barely fit the model already.
@@ -172,14 +172,15 @@ Batch-level DP needs to be implemented on a per-model basis,
and enabled by setting `supports_encoder_tp_data = True` in the model class.
Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to use this feature (see the sketch after the list below).

Known supported models:
Known supported models (with corresponding benchmarks):

- GLM-4.5V GLM-4.1V (<gh-pr:23168>)
- dots_ocr (<gh-pr:25466>)
- GLM-4.1V or above (<gh-pr:23168>)
- InternVL (<gh-pr:23909>)
- Kimi-VL (<gh-pr:23817>)
- Llama4 (<gh-pr:18368>)
- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
- Qwen2.5-VL (<gh-pr:22742>)
- Qwen2-VL or above (<gh-pr:22742>, <gh-pr:24955>, <gh-pr:25445>)
- Step3 (<gh-pr:22697>)

## Input Processing

@@ -6,6 +6,10 @@ This page teaches you how to pass multi-modal inputs to [multi-modal models][supported-mm-models].
We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes,
and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.

!!! tip
    When serving multi-modal models, consider setting `--allowed-media-domains` to restrict the domains that vLLM can access, to prevent it from accessing arbitrary endpoints that can potentially be vulnerable to Server-Side Request Forgery (SSRF) attacks. You can provide a list of domains for this arg. For example: `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`
    This restriction is especially important if you run vLLM in a containerized environment where the vLLM pods may have unrestricted access to internal networks.

## Offline Inference

To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:

@@ -9,7 +9,7 @@ NixlConnector is a high-performance KV cache transfer connector for vLLM's disag
Install the NIXL library: `uv pip install nixl`, as a quick start.

- Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
- The specified required NIXL version can be found in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files
- The specified required NIXL version can be found in [requirements/kv_connectors.txt](gh-file:requirements/kv_connectors.txt) and other relevant config files

### Transport Configuration

@@ -154,6 +154,6 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \

Refer to these example scripts in the vLLM repository:

- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)
- [run_accuracy_test.sh](gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
- [toy_proxy_server.py](gh-file:tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
- [test_accuracy.py](gh-file:tests/v1/kv_connector/nixl_integration/test_accuracy.py)

@@ -6,6 +6,17 @@ vLLM supports the generation of structured outputs using
This document shows you some examples of the different options that are
available to generate structured outputs.

!!! warning
    If you are still using the following deprecated API fields, please update your code to use `structured_outputs` as demonstrated in the rest of this document (a minimal migration sketch also follows this list):

    - `guided_json` -> `{"structured_outputs": {"json": ...}}` or `StructuredOutputsParams(json=...)`
    - `guided_regex` -> `{"structured_outputs": {"regex": ...}}` or `StructuredOutputsParams(regex=...)`
    - `guided_choice` -> `{"structured_outputs": {"choice": ...}}` or `StructuredOutputsParams(choice=...)`
    - `guided_grammar` -> `{"structured_outputs": {"grammar": ...}}` or `StructuredOutputsParams(grammar=...)`
    - `guided_whitespace_pattern` -> `{"structured_outputs": {"whitespace_pattern": ...}}` or `StructuredOutputsParams(whitespace_pattern=...)`
    - `structural_tag` -> `{"structured_outputs": {"structural_tag": ...}}` or `StructuredOutputsParams(structural_tag=...)`
    - `guided_decoding_backend` -> Remove this field from your request
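
A minimal migration sketch, assuming the OpenAI-compatible server and the Python `openai` client (model name and base URL are placeholders):

```python
# Hedged sketch: migrating from guided_choice to structured_outputs.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="Qwen/Qwen3-8B",  # placeholder model name
    messages=[{"role": "user", "content": "Is vLLM fast? Answer yes or no."}],
    # Previously: extra_body={"guided_choice": ["yes", "no"]}
    extra_body={"structured_outputs": {"choice": ["yes", "no"]}},
)
print(completion.choices[0].message.content)
```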

## Online Serving (OpenAI API)

You can generate structured outputs using OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API.

@@ -310,6 +310,15 @@ Flags:
* For non-reasoning: `--tool-call-parser hunyuan_a13b`
* For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning`

### LongCat-Flash-Chat Models (`longcat`)

Supported models:

* `meituan-longcat/LongCat-Flash-Chat`
* `meituan-longcat/LongCat-Flash-Chat-FP8`

Flags: `--tool-call-parser longcat`

### GLM-4.5 Models (`glm45`)

Supported models:

@@ -20,7 +20,80 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]

--8<-- "docs/getting_started/installation/cpu/build.inc.md"
Install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:

```bash
sudo apt-get update -y
sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
```

Clone the vLLM project:

```bash
git clone https://github.com/vllm-project/vllm.git vllm_source
cd vllm_source
```

Install the required dependencies:

```bash
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
uv pip install -r requirements/cpu.txt --torch-backend cpu
```

??? console "pip"
    ```bash
    pip install --upgrade pip
    pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
    pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
    ```

Build and install vLLM:

```bash
VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
```

If you want to develop vLLM, install it in editable mode instead.

```bash
VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
```

Optionally, build a portable wheel which you can then install elsewhere:

```bash
VLLM_TARGET_DEVICE=cpu uv build --wheel
```

```bash
uv pip install dist/*.whl
```

??? console "pip"
    ```bash
    VLLM_TARGET_DEVICE=cpu python -m build --wheel --no-isolation
    ```

    ```bash
    pip install dist/*.whl
    ```

!!! example "Troubleshooting"
    - **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
    - **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed.
    - `AMD` requires at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
    - If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency.
      ```toml title="pyproject.toml"
      [build-system]
      requires = [
          "cmake>=3.26.1",
          ...
          "torch==X.Y.Z+cpu"   # <-------
      ]
      ```
    - If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM.

# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
@@ -57,4 +130,4 @@ docker run --rm \

# --8<-- [end:build-image-from-source]
# --8<-- [start:extra-information]
# --8<-- [end:extra-information]
# --8<-- [end:extra-information]

@@ -32,8 +32,9 @@ def auto_mock(module, attr, max_mocks=50):
    for _ in range(max_mocks):
        try:
            # First treat attr as an attr, then as a submodule
            return getattr(importlib.import_module(module), attr,
                           importlib.import_module(f"{module}.{attr}"))
            with patch("importlib.metadata.version", return_value="0.0.0"):
                return getattr(importlib.import_module(module), attr,
                               importlib.import_module(f"{module}.{attr}"))
        except importlib.metadata.PackageNotFoundError as e:
            raise e
        except ModuleNotFoundError as e:
@@ -167,5 +168,5 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
    doc_path = ARGPARSE_DOC_DIR / f"{stem}.md"
    # Specify encoding for building on Windows
    with open(doc_path, "w", encoding="utf-8") as f:
        f.write(parser.format_help())
        f.write(super(type(parser), parser).format_help())
    logger.info("Argparse generated: %s", doc_path.relative_to(ROOT_DIR))

@@ -4,7 +4,7 @@ vLLM provides first-class support for generative models, which covers most of LL

In vLLM, generative models implement the [VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface.
Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
which are then passed through [Sampler][vllm.model_executor.layers.sampler.Sampler] to obtain the final text.
which are then passed through [Sampler][vllm.v1.sample.sampler.Sampler] to obtain the final text.
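
For context, the standard offline entry point that exercises this sampling path (a minimal, grounded example):

```python
# Minimal offline generation example exercising the sampling path.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0.8, max_tokens=32)
outputs = llm.generate(["The capital of France is"], params)
print(outputs[0].outputs[0].text)
```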

## Configuration

@@ -29,7 +29,7 @@ _*Vision-language models currently accept only image inputs. Support for video i

If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM:

- All the features listed in the [compatibility matrix](../features/compatibility_matrix.md#feature-x-feature)
- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
- Any combination of the following vLLM parallelisation schemes:
    - Pipeline parallel
    - Tensor parallel
@@ -428,6 +428,7 @@ th {
| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | ✅︎ |
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ |
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ |
| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ | ✅︎ |

Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!

@@ -1,6 +1,6 @@
# Using vLLM

First, vLLM must be [installed](../getting_started/installation) for your chosen device in either a Python or Docker environment.
First, vLLM must be [installed](../getting_started/installation/) for your chosen device in either a Python or Docker environment.

Then, vLLM supports the following usage patterns:

@@ -60,6 +60,12 @@ Key points from the PyTorch security guide:
- Implement proper authentication and authorization for management interfaces
- Follow the principle of least privilege for all system components

### 4. **Restrict Domain Access for Media URLs:**

Restrict the domains that vLLM can access for media URLs by setting
`--allowed-media-domains` to prevent Server-Side Request Forgery (SSRF) attacks.
(e.g. `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`)

## Security and Firewalls: Protecting Exposed vLLM Systems

While vLLM is designed to allow unsafe network services to be isolated to

@@ -87,6 +87,7 @@ def main(args: dict):
        use_tqdm=False,
        chat_template=chat_template,
    )
    print_outputs(outputs)


if __name__ == "__main__":

@@ -54,6 +54,7 @@ def parse_args():
        "--method",
        type=str,
        default="eagle",
        choices=["ngram", "eagle", "eagle3", "mtp"],
    )
    parser.add_argument("--num-spec-tokens", type=int, default=2)
    parser.add_argument("--prompt-lookup-max", type=int, default=5)
@@ -118,9 +119,9 @@ def main(args):
            "prompt_lookup_max": args.prompt_lookup_max,
            "prompt_lookup_min": args.prompt_lookup_min,
        }
    elif args.method.endswith("mtp"):
    elif args.method == "mtp":
        speculative_config = {
            "method": args.method,
            "method": "mtp",
            "num_speculative_tokens": args.num_spec_tokens,
        }
    else:

@@ -11,9 +11,9 @@ vLLM performance and metrics.

## Dashboard Descriptions

- **[performance_statistics.json](./performance_statistics.json)**: Tracks performance metrics including latency and
- **performance_statistics.json**: Tracks performance metrics including latency and
  throughput for your vLLM service.
- **[query_statistics.json](./query_statistics.json)**: Tracks query performance, request volume, and key
- **query_statistics.json**: Tracks query performance, request volume, and key
  performance indicators for your vLLM service.

## Deployment Options

@@ -21,9 +21,9 @@ deployment methods:

## Dashboard Descriptions

- **[performance_statistics.yaml](./performance_statistics.yaml)**: Performance metrics with aggregated latency
- **performance_statistics.yaml**: Performance metrics with aggregated latency
  statistics
- **[query_statistics.yaml](./query_statistics.yaml)**: Query performance and deployment metrics
- **query_statistics.yaml**: Query performance and deployment metrics

## Deployment Options

@@ -1,12 +1,11 @@
# Temporarily used for x86 CPU backend to avoid performance regression of torch>2.6.0+cpu,
# see https://github.com/pytorch/pytorch/pull/151218
cmake>=3.26.1
ninja
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu
torch==2.8.0+cpu; platform_machine == "x86_64"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
wheel
jinja2>=3.1.6
regex

@@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9' and platform_machine != "s390x"
packaging>=24.2
setuptools>=77.0.3,<80.0.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu; platform_machine == "x86_64"  # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
torch==2.8.0+cpu; platform_machine == "x86_64"
torch==2.8.0; platform_system == "Darwin"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"

@@ -23,7 +23,7 @@ datasets  # for benchmark scripts

# Intel Extension for PyTorch, only for x86_64 CPUs
intel-openmp==2024.2.1; platform_machine == "x86_64"
intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64"  # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
intel_extension_for_pytorch==2.8.0; platform_machine == "x86_64"
triton==3.2.0; platform_machine == "x86_64"  # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.

# Use this to gather CPU info and optimize based on ARM Neoverse cores

setup.py (4 changed lines)
@@ -322,6 +322,8 @@ class precompiled_wheel_utils:
        "vllm/_C.abi3.so",
        "vllm/_moe_C.abi3.so",
        "vllm/_flashmla_C.abi3.so",
        "vllm/_flashmla_extension_C.abi3.so",
        "vllm/_sparse_flashmla_C.abi3.so",
        "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
        "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
        "vllm/cumem_allocator.abi3.so",
@@ -589,6 +591,8 @@ if _is_cuda():
        # not targeting a hopper system
        ext_modules.append(
            CMakeExtension(name="vllm._flashmla_C", optional=True))
        ext_modules.append(
            CMakeExtension(name="vllm._flashmla_extension_C", optional=True))
        ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))

if _build_custom_ops():

@@ -103,7 +103,7 @@ backend_configs = {
    # Triton Attention
    "TritonAttn":
    BackendConfig(name="TritonAttn",
                  env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1"},
                  env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
                  comp_config={
                      "cudagraph_mode": "FULL",
                  }),

@@ -191,7 +191,6 @@ class AttentionQuantPatternModel(torch.nn.Module):
                num_kv_heads=self.num_kv_heads,
                head_size=self.head_size,
                dtype=self.kv_cache_dtype,
                use_mla=False,
            ),
            layer_names=[self.attn.layer_name],
            vllm_config=self.vllm_config,
@@ -338,7 +337,7 @@ else:
@pytest.mark.parametrize("model_name, model_class", MODELS)
@pytest.mark.parametrize("backend",
                         [_Backend.FLASHINFER] if current_platform.is_cuda()
                         else [_Backend.TRITON_ATTN_VLLM_V1])
                         else [_Backend.TRITON_ATTN])
@pytest.mark.parametrize(
    "split_attention",
    [False, True] if current_platform.is_rocm() else [False])

@@ -50,8 +50,11 @@ def test_is_type(type_hint, type, expected):

@pytest.mark.parametrize(("type_hints", "type", "expected"), [
    ({float, int}, int, True),
    ({int, tuple}, int, True),
    ({int, tuple[int]}, int, True),
    ({int, tuple[int, ...]}, int, True),
    ({int, tuple[int]}, float, False),
    ({int, tuple[int, ...]}, float, False),
    ({str, Literal["x", "y"]}, Literal, True),
])
def test_contains_type(type_hints, type, expected):

@@ -15,7 +15,7 @@ from transformers import AutoConfig
from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME = "facebook/opt-125m"

CONFIG = AutoConfig.from_pretrained(MODEL_NAME)

@@ -27,7 +27,7 @@ def default_server_args() -> list[str]:
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "2048",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
@@ -36,6 +36,27 @@ def default_server_args() -> list[str]:
    ]


EXAMPLE_PROMPTS = [
    "Hello, my name is",
    "What is an LLM?",
]


def _encode_embeds(embeds: torch.Tensor):
    buffer = io.BytesIO()
    torch.save(embeds, buffer)
    return base64.b64encode(buffer.getvalue()).decode('utf-8')
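
For reference, the inverse of this helper (not part of the diff; a hedged sketch of how a client or test could decode the payload):

```python
# Hedged sketch: the inverse of _encode_embeds above.
import base64
import io

import torch

def _decode_embeds(encoded: str) -> torch.Tensor:
    # base64 -> bytes -> torch.load round-trips the tensor saved above.
    return torch.load(io.BytesIO(base64.b64decode(encoded)))
```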


@pytest.fixture(scope="module")
def example_prompt_embeds(hf_runner):
    """Create example embeddings and return them as base64 encoded string."""
    with hf_runner(MODEL_NAME) as hf_model:
        example_embeddings = hf_model.get_prompt_embeddings(EXAMPLE_PROMPTS)

    return [_encode_embeds(item) for item in example_embeddings]


@pytest.fixture(scope="module",
                params=["", "--disable-frontend-multiprocessing"])
def server_with_prompt_embeds(default_server_args, request):
@@ -52,21 +73,16 @@ async def client_with_prompt_embeds(server_with_prompt_embeds):
        yield async_client


def create_dummy_embeds(num_tokens: int = 5) -> str:
    """Create dummy embeddings and return them as base64 encoded string."""
    dummy_embeds = torch.randn(num_tokens, CONFIG.hidden_size)
    buffer = io.BytesIO()
    torch.save(dummy_embeds, buffer)
    return base64.b64encode(buffer.getvalue()).decode('utf-8')


@pytest.mark.skip("This test is skipped because it is flaky.")
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_completions_with_prompt_embeds(
        client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str):
        example_prompt_embeds,
        client_with_prompt_embeds: openai.AsyncOpenAI,
        model_name: str,
):
    encoded_embeds, encoded_embeds2 = example_prompt_embeds

    # Test case: Single prompt embeds input
    encoded_embeds = create_dummy_embeds()
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
@@ -77,7 +93,6 @@ async def test_completions_with_prompt_embeds(
    assert completion.choices[0].prompt_logprobs is None

    # Test case: batch completion with prompt_embeds
    encoded_embeds2 = create_dummy_embeds()
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
@@ -89,7 +104,6 @@ async def test_completions_with_prompt_embeds(
    assert len(completion.choices[1].text) >= 1

    # Test case: streaming with prompt_embeds
    encoded_embeds = create_dummy_embeds()
    single_completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
@@ -117,7 +131,6 @@ async def test_completions_with_prompt_embeds(
    assert "".join(chunks) == single_output

    # Test case: batch streaming with prompt_embeds
    encoded_embeds2 = create_dummy_embeds()
    stream = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
@@ -139,7 +152,6 @@ async def test_completions_with_prompt_embeds(
    assert len(chunks_stream_embeds[1]) > 0

    # Test case: mixed text and prompt_embeds
    encoded_embeds = create_dummy_embeds()
    completion_mixed = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="This is a prompt",
@@ -184,10 +196,14 @@ async def test_completions_errors_with_prompt_embeds(
@pytest.mark.parametrize("logprobs_arg", [1, 0])
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_completions_with_logprobs_and_prompt_embeds(
        client_with_prompt_embeds: openai.AsyncOpenAI, logprobs_arg: int,
        model_name: str):
        example_prompt_embeds,
        client_with_prompt_embeds: openai.AsyncOpenAI,
        logprobs_arg: int,
        model_name: str,
):
    encoded_embeds, encoded_embeds2 = example_prompt_embeds

    # Test case: Logprobs using prompt_embeds
    encoded_embeds = create_dummy_embeds()
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
@@ -207,7 +223,6 @@ async def test_completions_with_logprobs_and_prompt_embeds(
    assert len(logprobs.tokens) == 5

    # Test case: Log probs with batch completion and prompt_embeds
    encoded_embeds2 = create_dummy_embeds()
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
@@ -232,9 +247,12 @@ async def test_completions_with_logprobs_and_prompt_embeds(

@pytest.mark.asyncio
async def test_prompt_logprobs_raises_error(
        client_with_prompt_embeds: openai.AsyncOpenAI):
        example_prompt_embeds,
        client_with_prompt_embeds: openai.AsyncOpenAI,
):
    encoded_embeds, _ = example_prompt_embeds

    with pytest.raises(BadRequestError, match="not compatible"):
        encoded_embeds = create_dummy_embeds()
        await client_with_prompt_embeds.completions.create(
            model=MODEL_NAME,
            prompt="",
@@ -45,6 +45,7 @@ class MockModelConfig:
    logits_processor_pattern: Optional[str] = None
    diff_sampling_param: Optional[dict] = None
    allowed_local_media_path: str = ""
    allowed_media_domains: Optional[list[str]] = None
    encoder_config = None
    generation_config: str = "auto"
    skip_tokenizer_init: bool = False

@@ -68,7 +68,7 @@ def default_server_args(with_tool_parser: bool):
def gptoss_server(monkeypatch_module: pytest.MonkeyPatch,
                  default_server_args: list[str]):
    with monkeypatch_module.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
        with RemoteOpenAIServer(GPT_OSS_MODEL_NAME,
                                default_server_args) as remote_server:
            yield remote_server
@@ -240,6 +240,7 @@ class MockModelConfig:
    logits_processor_pattern = None
    diff_sampling_param: Optional[dict] = None
    allowed_local_media_path: str = ""
    allowed_media_domains: Optional[list[str]] = None
    encoder_config = None
    generation_config: str = "auto"
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)

@@ -19,6 +19,7 @@ from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
                                         parse_chat_messages,
                                         parse_chat_messages_futures,
                                         resolve_chat_template_content_format,
                                         resolve_chat_template_kwargs,
                                         resolve_hf_chat_template)
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64,
@@ -37,6 +38,7 @@ QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
QWEN3_MODEL_ID = "Qwen/Qwen3-8B"
LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B"
HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"
MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
@@ -2255,6 +2257,89 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
    assert isinstance(chat_template, str)


@pytest.mark.parametrize(
    "model, expected_kwargs",
    [
        (
            QWEN2VL_MODEL_ID,
            {
                "add_vision_id", "add_generation_prompt",
                "continue_final_message", "tools"
            },
        ),
        (
            QWEN3_MODEL_ID,
            {
                "enable_thinking", "add_generation_prompt",
                "continue_final_message", "tools"
            },
        ),
    ],
)
def test_resolve_hf_chat_template_kwargs(sample_json_schema, model,
                                         expected_kwargs):
    """checks that chat_template is a dict type for HF models."""
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")

    tools = ([{
        "type": "function",
        "function": {
            "name": "dummy_function_name",
            "description": "This is a dummy function",
            "parameters": sample_json_schema,
        },
    }])

    chat_template_kwargs = {
        # both unused
        "unsed_kwargs_1": 123,
        "unsed_kwargs_2": "abc",
        # should not appear
        "chat_template": "{% Hello world! %}",
        # used by tokenizer
        "continue_final_message": True,
        "tools": tools,
        # both used by Qwen2-VL and Qwen3
        "add_generation_prompt": True,
        # only used by Qwen2-VL
        "add_vision_id": True,
        # only used by Qwen3
        "enable_thinking": True,
    }

    model_config = ModelConfig(
        model,
        tokenizer=model_info.tokenizer or model,
        tokenizer_mode=model_info.tokenizer_mode,
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
        skip_tokenizer_init=model_info.skip_tokenizer_init,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype)

    # Build the tokenizer
    tokenizer = get_tokenizer(
        model,
        trust_remote_code=model_config.trust_remote_code,
    )

    # Test detecting the tokenizer's chat_template
    chat_template = resolve_hf_chat_template(
        tokenizer,
        chat_template=None,
        tools=tools,
        model_config=model_config,
    )
    resolved_chat_template_kwargs = resolve_chat_template_kwargs(
        tokenizer,
        chat_template=chat_template,
        chat_template_kwargs=chat_template_kwargs,
    )
    assert set(resolved_chat_template_kwargs.keys()) == expected_kwargs


# NOTE: Qwen2-Audio default chat template is specially defined inside
# processor class instead of using `tokenizer_config.json`
# yapf: disable

@@ -31,7 +31,7 @@ DEVICE_MLA_BACKENDS = {
}

DEVICE_REGULAR_ATTN_BACKENDS = {
    "cuda": ["XFORMERS", "FLASHINFER"],
    "cuda": ["XFORMERS", "FLASHINFER", "FLASH_ATTN"],
    "hip": ["ROCM_FLASH"],
    "cpu": ["TORCH_SDPA"],
}
@@ -86,7 +86,7 @@ def test_env(
        with patch("vllm.attention.selector.current_platform",
                   CpuPlatform()):
            backend = get_attn_backend(16, torch.float16, None, block_size)
            assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
            assert backend.get_name() == "TORCH_SDPA"

    elif device == "hip":
        with patch("vllm.attention.selector.current_platform",
@@ -125,7 +125,7 @@ def test_env(
                                           None,
                                           block_size,
                                           use_mla=use_mla)
                expected = f"{name}_VLLM_V1"
                expected = name
                assert backend.get_name() == expected
            else:
                backend = get_attn_backend(16,
@@ -133,7 +133,7 @@ def test_env(
                                           None,
                                           block_size,
                                           use_mla=use_mla)
                expected = "TRITON_ATTN_VLLM_V1"
                expected = "TRITON_ATTN"
                assert backend.get_name() == expected

    elif device == "cuda":
@@ -160,7 +160,7 @@ def test_env(
                                           None,
                                           block_size,
                                           use_mla=use_mla)
                expected = "CUTLASS_MLA_VLLM_V1"
                expected = "CUTLASS_MLA"
                assert backend.get_name() == expected
            elif name == "FLASHINFER_MLA":
                if block_size not in [32, 64]:
@@ -193,7 +193,7 @@ def test_env(
                                           None,
                                           block_size,
                                           use_mla=use_mla)
                expected = f"{name}_VLLM_V1"
                expected = name
                assert backend.get_name() == expected
            elif name == "FLASH_ATTN_MLA":
                backend = get_attn_backend(16,
@@ -210,7 +210,7 @@ def test_env(
                                           None,
                                           block_size,
                                           use_mla=use_mla)
                expected = "TRITON_MLA_VLLM_V1"
                expected = "TRITON_MLA"
                assert backend.get_name() == expected
            elif name == "FLASHINFER":
                backend = get_attn_backend(16,
@@ -218,25 +218,24 @@ def test_env(
                                           None,
                                           block_size,
                                           use_mla=use_mla)
                expected = "FLASHINFER_VLLM_V1"
                expected = "FLASHINFER"
                assert backend.get_name() == expected
            else:
            elif name == "XFORMERS":
                backend = get_attn_backend(32,
                                           torch.float16,
                                           None,
                                           block_size,
                                           use_mla=use_mla)
                expected = "FLASH_ATTN_VLLM_V1"
                expected = "XFORMERS"
                assert backend.get_name() == expected

                backend = get_attn_backend(16,
            elif name == "FLASH_ATTN":
                backend = get_attn_backend(32,
                                           torch.float16,
                                           None,
                                           block_size,
                                           use_mla=use_mla)
                assert backend.get_name() == "FLEX_ATTENTION", (
                    "Should fallback to FlexAttention if head size is "
                    "not supported by FlashAttention")
                expected = "FLASH_ATTN"
                assert backend.get_name() == expected


@pytest.mark.parametrize("device", ["cpu", "cuda"])
@@ -252,7 +251,7 @@ def test_fp32_fallback(
        with patch("vllm.attention.selector.current_platform",
                   CpuPlatform()):
            backend = get_attn_backend(16, torch.float32, None, 16)
            assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
            assert backend.get_name() == "TORCH_SDPA"

    elif device == "cuda":
        with patch("vllm.attention.selector.current_platform",
@@ -266,6 +265,9 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
    # TODO: When testing for v1, pipe in `use_v1` as an argument to
    # get_attn_backend

    pytest.skip("Skipping as current backend selector does not " \
                "handle fallbacks when a backend is set via env var.")

    with monkeypatch.context() as m:
        m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)
@@ -593,6 +593,119 @@ def test_concat_and_cache_mla(
    torch.testing.assert_close(kv_cache, ref_kv_cache)


@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS_MLA)
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_concat_and_cache_ds_mla(
    kv_lora_rank: int,
    qk_rope_head_dim: int,
    num_tokens: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
) -> None:
    if dtype.itemsize != 2:
        pytest.skip("ds_mla only supports 16-bit input")
    kv_cache_dtype = "fp8_ds_mla"
    current_platform.seed_everything(seed)
    torch.set_default_device(device)

    total_slots = num_blocks * block_size
    slot_mapping_lst = random.sample(range(total_slots), num_tokens)
    slot_mapping = torch.tensor(slot_mapping_lst,
                                dtype=torch.long,
                                device=device)

    kv_c = torch.randn(num_tokens, kv_lora_rank, dtype=dtype, device=device)
    k_pe = torch.randn(num_tokens,
                       qk_rope_head_dim,
                       dtype=dtype,
                       device=device)
    entry_size = kv_lora_rank + (4 * 4) + (2 * qk_rope_head_dim)

    scale = torch.tensor(1.0, dtype=torch.float32, device=device)
    kv_cache = _create_mla_cache(num_blocks,
                                 block_size,
                                 entry_size,
                                 dtype=torch.uint8,
                                 kv_cache_dtype=kv_cache_dtype,
                                 device=device)

    ref_cache = torch.zeros_like(kv_cache, dtype=kv_cache.dtype)
    tile_data = torch.zeros(128, dtype=dtype, device=device)

    for i in range(num_tokens):
        slot = slot_mapping[i].item()
        block_idx = slot // block_size
        block_offset = slot % block_size

        ref_cache_slice = ref_cache[block_idx, block_offset]
        ref_cache_16bit = ref_cache_slice.view(dtype)
        ref_cache_32bit = ref_cache_slice.view(torch.float32)

        kv_c_data = kv_c[i]
        for tile_idx in range(4):
            tile_start = tile_idx * 128
            tile_end = (tile_idx + 1) * 128
            tile_data[:] = kv_c_data[tile_start:tile_end]

            # tile_scale = tile_data.amax().to(torch.float32) / 448.
            # NOTE: Using torch's amax() gives different results,
            # so this must be manually computed.
            tile_data_float = tile_data.to(torch.float32)
            manual_max = abs(tile_data_float[0])
            for j in range(1, 128):
                manual_max = max(manual_max, abs(tile_data_float[j]))
            tile_scale = manual_max / 448.

            ref_cache_32bit[kv_lora_rank // 4 + tile_idx] = tile_scale

            ops.convert_fp8(ref_cache_slice[tile_start:tile_end],
                            tile_data,
                            tile_scale.item(),
                            kv_dtype="fp8")

        for j in range(qk_rope_head_dim):
            ref_cache_16bit[kv_lora_rank // 2 + 8 + j] = k_pe[i, j]

    opcheck(
        torch.ops._C_cache_ops.concat_and_cache_mla,
        (kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale),
        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
    )

    ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping,
                             kv_cache_dtype, scale)

    for i in range(num_tokens):
        slot = slot_mapping[i].item()
        block_idx = slot // block_size
        block_offset = slot % block_size
        kv_cache_slice = kv_cache[block_idx, block_offset]
        ref_cache_slice = ref_cache[block_idx, block_offset]

        kv_nope = kv_cache_slice[:kv_lora_rank]
        ref_nope = ref_cache_slice[:kv_lora_rank]
        kv_scales = kv_cache_slice.view(torch.float32)[kv_lora_rank //
                                                       4:kv_lora_rank // 4 + 4]
        ref_scales = ref_cache_slice.view(
            torch.float32)[kv_lora_rank // 4:kv_lora_rank // 4 + 4]
        kv_rope = kv_cache_slice.view(dtype)[kv_lora_rank // 2 + 8:]
        ref_rope = ref_cache_slice.view(dtype)[kv_lora_rank // 2 + 8:]

        torch.testing.assert_close(kv_nope, ref_nope, atol=0.001, rtol=0.1)
        torch.testing.assert_close(kv_scales, ref_scales, atol=0.001, rtol=0.1)
        torch.testing.assert_close(kv_rope, ref_rope, atol=0.001, rtol=0.1)
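
Each `fp8_ds_mla` cache entry packs three regions per token; this sketch derives the byte offsets from the `entry_size` arithmetic and the views used in the test above (the default dimensions are illustrative placeholders):

```python
# Byte layout of one fp8_ds_mla cache entry, as exercised by the test above.
def ds_mla_entry_layout(kv_lora_rank: int = 512, qk_rope_head_dim: int = 64):
    # Assumes kv_lora_rank is split into four 128-dim tiles, one fp32 scale each.
    nope_bytes = kv_lora_rank              # fp8 (1 byte) per latent dim
    scale_bytes = 4 * 4                    # four fp32 tile scales
    rope_bytes = 2 * qk_rope_head_dim      # 16-bit rope part
    entry_size = nope_bytes + scale_bytes + rope_bytes
    return {
        "nope": (0, nope_bytes),
        "scales": (nope_bytes, nope_bytes + scale_bytes),
        "rope": (nope_bytes + scale_bytes, entry_size),
        "entry_size": entry_size,
    }

assert ds_mla_entry_layout(512, 64)["entry_size"] == 512 + 16 + 128
```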


@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)

tests/kernels/attention/test_deepgemm_attention.py (new file, 279 lines)
@ -0,0 +1,279 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import cdiv, has_deep_gemm
|
||||
from vllm.utils.deep_gemm import (_ceil_to_ue8m0, calc_diff, fp8_mqa_logits,
|
||||
fp8_paged_mqa_logits, get_num_sms,
|
||||
get_paged_mqa_logits_metadata)
|
||||
|
||||
|
||||
def kv_cache_cast_to_fp8(x: torch.Tensor) -> torch.Tensor:
|
||||
# x: (num_blocks, block_size, 1, head_dim)
|
||||
num_blocks, block_size, num_heads, head_dim = x.shape
|
||||
assert num_heads == 1
|
||||
x_amax = x.abs().float().amax(dim=3, keepdim=True).clamp(1e-4)
|
||||
sf = x_amax / 448.0
|
||||
x_scaled = (x * (1.0 / sf)).to(torch.float8_e4m3fn)
|
||||
x_fp8 = torch.empty(
|
||||
(num_blocks, block_size * (head_dim + 4)),
|
||||
device=x.device,
|
||||
dtype=torch.uint8,
|
||||
)
|
||||
x_fp8[:, :block_size * head_dim] = x_scaled.view(
|
||||
num_blocks, block_size * head_dim).view(dtype=torch.uint8)
|
||||
x_fp8[:,
|
||||
block_size * head_dim:] = sf.view(num_blocks,
|
||||
block_size).view(dtype=torch.uint8)
|
||||
return x_fp8.view(num_blocks, block_size, num_heads, head_dim + 4)
|
||||
|
||||
|
||||
def per_custom_dims_cast_to_fp8(
        x: torch.Tensor, dims: tuple,
        use_ue8m0: bool) -> tuple[torch.Tensor, torch.Tensor]:
    excluded_dims = tuple([i for i in range(x.dim()) if i not in set(dims)])
    x_amax = x.abs().float().amax(dim=excluded_dims, keepdim=True).clamp(1e-4)
    sf = x_amax / 448.0
    sf = _ceil_to_ue8m0(sf) if use_ue8m0 else sf
    x_scaled = (x * (1.0 / sf)).to(torch.float8_e4m3fn)
    return x_scaled, sf.squeeze()


def _generate_cp_test_data(seq_len: int, seq_len_kv: int):
    assert seq_len_kv % seq_len == 0 and seq_len % 2 == 0
    chunk_size = seq_len // 2
    cp_size = seq_len_kv // seq_len
    cp_id = cp_size // 3
    ks = torch.zeros(seq_len, dtype=torch.int, device="cuda")
    ke = torch.zeros(seq_len, dtype=torch.int, device="cuda")
    for i in range(chunk_size):
        ke[i] = cp_id * chunk_size + i
        ke[i + chunk_size] = (cp_size * 2 - 1 - cp_id) * chunk_size + i
    return ks, ke


def _ref_fp8_mqa_logits(
    q: torch.Tensor,
    kv: torch.Tensor,
    weights: torch.Tensor,
    cu_seqlen_ks: torch.Tensor,
    cu_seqlen_ke: torch.Tensor,
):
    seq_len_kv = kv.shape[0]

    k = kv
    q = q.float()
    k = k.float()

    mask_lo = (torch.arange(0, seq_len_kv, device="cuda")[None, :]
               >= cu_seqlen_ks[:, None])
    mask_hi = (torch.arange(0, seq_len_kv, device="cuda")[None, :]
               < cu_seqlen_ke[:, None])
    mask = mask_lo & mask_hi

    score = torch.einsum("mhd,and->hmn", q, k)
    logits = (score.relu() * weights.unsqueeze(-1).transpose(0, 1)).sum(dim=0)
    logits = logits.masked_fill(~mask, float("-inf"))

    return logits


@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA only")
@pytest.mark.skipif(not has_deep_gemm(), reason="DeepGEMM not available")
@pytest.mark.skipif(not current_platform.has_device_capability(90),
                    reason="SM90 and SM100 only")
def test_deepgemm_fp8_mqa_logits():
    torch.manual_seed(0)
    random.seed(0)
    num_heads, head_dim = 32, 128
    for seq_len in (512, ):
        for seq_len_kv in (1024, ):
            for disable_cp in (False, True):
                q = torch.randn(
                    seq_len,
                    num_heads,
                    head_dim,
                    device="cuda",
                    dtype=torch.bfloat16,
                )
                kv = torch.randn(seq_len_kv,
                                 head_dim,
                                 device="cuda",
                                 dtype=torch.bfloat16)
                weights = torch.randn(seq_len,
                                      num_heads,
                                      device="cuda",
                                      dtype=torch.float32)

                if disable_cp:
                    ks = torch.zeros(seq_len, dtype=torch.int, device="cuda")
                    ke = torch.arange(seq_len, dtype=torch.int,
                                      device="cuda") + (seq_len_kv - seq_len)
                else:
                    ks, ke = _generate_cp_test_data(seq_len, seq_len_kv)

                q_fp8 = q.to(torch.float8_e4m3fn)
                kv_fp8 = per_custom_dims_cast_to_fp8(kv, (0, ), False)
                logits = fp8_mqa_logits(q_fp8, kv_fp8, weights, ks, ke)

                ref_logits = _ref_fp8_mqa_logits(
                    q=q,
                    kv=kv,
                    weights=weights,
                    cu_seqlen_ks=ks,
                    cu_seqlen_ke=ke,
                )

                ref_neginf_mask = ref_logits == float("-inf")
                neginf_mask = logits == float("-inf")
                assert torch.equal(neginf_mask, ref_neginf_mask)

                ref_logits = ref_logits.masked_fill(ref_neginf_mask, 0)
                logits = logits.masked_fill(neginf_mask, 0)
                diff = calc_diff(logits, ref_logits)
                assert diff < 1e-3, f"{diff=}"


def _ref_fp8_paged_mqa_logits(
    q: torch.Tensor,
    kv_cache: torch.Tensor,
    weights: torch.Tensor,
    context_lens: torch.Tensor,
    block_tables: torch.Tensor,
    max_model_len: int,
):
    batch_size, next_n, _, _ = q.size()
    _, block_size, _, _ = kv_cache.size()
    logits = torch.full(
        [batch_size * next_n, max_model_len],
        float("-inf"),
        device=q.device,
        dtype=torch.float32,
    )
    context_lens_list = context_lens.tolist()
    for i in range(batch_size):
        context_len = context_lens_list[i]
        q_offsets = torch.arange(context_len - next_n,
                                 context_len,
                                 device="cuda")
        weight_slice = (weights[i * next_n:(i + 1) * next_n, :].transpose(
            0, 1).contiguous())
        for block_rk in range(cdiv(context_len, block_size)):
            block_idx = block_tables[i][block_rk]
            qx, kx = q[i], kv_cache[block_idx]
            k_offsets = torch.arange(
                block_rk * block_size,
                (block_rk + 1) * block_size,
                device="cuda",
            )
            mask = (k_offsets[None, :] < context_len) & (k_offsets[None, :]
                                                         <= q_offsets[:, None])
            s = torch.where(
                mask[None, :, :],
                (qx.transpose(0, 1) @ kx.transpose(0, 1).transpose(1, 2)).to(
                    logits.dtype),
                float("-inf"),
            )
            s = torch.relu(s) * weight_slice[..., None]
            s = s.sum(dim=0)
            logits[
                i * next_n:(i + 1) * next_n,
                block_rk * block_size:(block_rk + 1) * block_size,
            ] = torch.where(k_offsets[None, :] <= q_offsets[:, None], s,
                            float("-inf"))
    return logits


@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA only")
@pytest.mark.skipif(not has_deep_gemm(), reason="DeepGEMM not available")
@pytest.mark.skipif(not current_platform.has_device_capability(90),
                    reason="SM90 and SM100 only")
def test_deepgemm_fp8_paged_mqa_logits():
    torch.manual_seed(0)
    random.seed(0)

    max_model_len = 4096
    for batch_size, next_n in [(4, 1), (2, 2)]:
        for heads, index_dim in [(32, 128)]:
            for avg_kv in (2048, ):
                num_blocks, blocksize = max_model_len * 2, 64

                q = torch.randn(
                    (batch_size, next_n, heads, index_dim),
                    device="cuda",
                    dtype=torch.bfloat16,
                )
                kv_cache = torch.randn(
                    (num_blocks, blocksize, 1, index_dim),
                    device="cuda",
                    dtype=torch.bfloat16,
                )
                weights = torch.randn(
                    (batch_size * next_n, heads),
                    device="cuda",
                    dtype=torch.float32,
                )

                context_lens = (torch.randint(int(0.8 * avg_kv),
                                              int(1.2 * avg_kv),
                                              (batch_size, )).cuda().to(
                                                  torch.int32))
                max_block_len = ((context_lens.max().item() + blocksize - 1) //
                                 blocksize * blocksize)
                block_tables = torch.zeros(
                    (batch_size, max_block_len),
                    device="cuda",
                    dtype=torch.int32,
                )

                counter = 0
                block_idx_pool = list(range(num_blocks))
                random.shuffle(block_idx_pool)
                for i in range(batch_size):
                    ctx_len = int(context_lens[i].item())
                    for j in range((ctx_len + blocksize - 1) // blocksize):
                        block_tables[i][j] = block_idx_pool[counter]
                        counter += 1

                q_fp8 = q.to(torch.float8_e4m3fn)
                kv_cache_fp8 = kv_cache_cast_to_fp8(kv_cache)

                schedule_metadata = get_paged_mqa_logits_metadata(
                    context_lens, blocksize, get_num_sms())
                logits = fp8_paged_mqa_logits(
                    q_fp8,
                    kv_cache_fp8,
                    weights,
                    context_lens,
                    block_tables,
                    schedule_metadata,
                    max_model_len,
                )

                ref_logits = _ref_fp8_paged_mqa_logits(
                    q,
                    kv_cache,
                    weights,
                    context_lens,
                    block_tables,
                    max_model_len,
                )

                positions = (torch.arange(max_model_len,
                                          device="cuda").unsqueeze(0).expand(
                                              batch_size * next_n, -1))
                row_indices = (
                    torch.arange(batch_size * next_n, device="cuda") // next_n)
                next_n_offset = (
                    torch.arange(batch_size * next_n, device="cuda") % next_n)
                mask = positions <= (context_lens[row_indices] - next_n +
                                     next_n_offset).unsqueeze(1)

                logits = logits.masked_fill(~mask, 0)
                ref_logits = ref_logits.masked_fill(~mask, 0)
                diff = calc_diff(logits, ref_logits)
                assert diff < 1e-3, f"{diff=}"
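The `calc_diff` criterion used by both tests is a similarity-based relative error rather than an elementwise norm. From memory of the DeepGEMM utility (treat this as an assumption, not a quote of the library):

def calc_diff_sketch(x: torch.Tensor, y: torch.Tensor) -> float:
    # 1 minus a cosine-like similarity: ~0 for matching tensors,
    # approaching 1 for unrelated ones.
    x, y = x.double(), y.double()
    denom = (x * x + y * y).sum()
    return float(1 - 2 * (x * y).sum() / denom)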
@ -97,18 +97,16 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal,
     descale_k = None
 
     def flash_mla():
-        return flash_mla_with_kvcache(
-            q,
-            blocked_k,
-            block_table,
-            cache_seqlens,
-            dv,
-            tile_scheduler_metadata,
-            num_splits,
-            causal=causal,
-            descale_q=descale_q,
-            descale_k=descale_k,
-        )
+        return flash_mla_with_kvcache(q,
+                                      blocked_k,
+                                      block_table,
+                                      cache_seqlens,
+                                      dv,
+                                      tile_scheduler_metadata,
+                                      num_splits,
+                                      causal=causal,
+                                      descale_q=descale_q,
+                                      descale_k=descale_k)
 
     def scaled_dot_product_attention(query, key, value, is_causal=False):
         query = query.float()
tests/kernels/attention/test_flashmla_sparse.py (new file, +119 lines)
@ -0,0 +1,119 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch


def _cuda_sm90_available() -> bool:
    if not torch.cuda.is_available():
        return False
    major, _ = torch.cuda.get_device_capability()
    return major == 9


def test_sparse_flashmla_metadata_smoke():
    import vllm.attention.ops.flashmla as fm
    ok, reason = fm.is_flashmla_supported()
    if not ok or not _cuda_sm90_available():
        pytest.skip(reason or "SM90 not available")

    device = torch.device("cuda")
    batch_size = 1
    seqlen_q = 1
    num_heads_q = 128
    num_heads_k = 1
    q_seq_per_hk = seqlen_q * num_heads_q // num_heads_k
    topk = 128

    cache_seqlens = torch.zeros(batch_size, dtype=torch.int32, device=device)

    tile_md, num_splits = fm.get_mla_metadata(cache_seqlens,
                                              q_seq_per_hk,
                                              num_heads_k,
                                              num_heads_q=num_heads_q,
                                              topk=topk,
                                              is_fp8_kvcache=True)
    assert tile_md.dtype == torch.int32
    assert num_splits.dtype == torch.int32


def test_sparse_flashmla_decode_smoke():
    import vllm.attention.ops.flashmla as fm
    ok, reason = fm.is_flashmla_supported()
    if not ok or not _cuda_sm90_available():
        pytest.skip(reason or "SM90 not available")

    device = torch.device("cuda")
    batch_size = 1
    seqlen_q = 1
    num_heads_q = 1
    head_dim_k = 576
    head_dim_v = 512
    num_heads_k = 1
    page_block_size = 64
    bytes_per_token = 656
    topk = 128

    # Metadata
    q_seq_per_hk = seqlen_q * num_heads_q // num_heads_k
    # q_heads_per_hk = num_heads_q // num_heads_k
    cache_seqlens = torch.zeros(batch_size, dtype=torch.int32, device=device)
    tile_md, num_splits = fm.get_mla_metadata(cache_seqlens,
                                              q_seq_per_hk,
                                              num_heads_k,
                                              num_heads_q=num_heads_q,
                                              topk=topk,
                                              is_fp8_kvcache=True)

    # Inputs
    q = torch.zeros((batch_size, seqlen_q, num_heads_q, head_dim_k),
                    dtype=torch.bfloat16,
                    device=device)
    k_cache = torch.zeros((1, page_block_size, num_heads_k, bytes_per_token),
                          dtype=torch.uint8,
                          device=device)
    indices = torch.zeros((batch_size, seqlen_q, topk),
                          dtype=torch.int32,
                          device=device)

    block_table = torch.zeros((batch_size, 128),
                              dtype=torch.int32,
                              device=device)
    out, lse = fm.flash_mla_with_kvcache(q,
                                         k_cache,
                                         block_table,
                                         cache_seqlens,
                                         head_dim_v,
                                         tile_md,
                                         num_splits,
                                         indices=indices,
                                         is_fp8_kvcache=True)
    assert out.shape[0] == batch_size
    assert out.shape[-1] == head_dim_v
    assert lse.shape[0] == batch_size


def test_sparse_flashmla_prefill_smoke():
    import vllm.attention.ops.flashmla as fm
    ok, reason = fm.is_flashmla_supported()
    if not ok or not _cuda_sm90_available():
        pytest.skip(reason or "SM90 not available")

    device = torch.device("cuda")
    s_q = 1
    s_kv = 1
    h_q = 64  # kernel expects multiple of 64
    h_kv = 1
    d_qk = 576
    d_v = 512
    topk = 128

    q = torch.zeros((s_q, h_q, d_qk), dtype=torch.bfloat16, device=device)
    kv = torch.zeros((s_kv, h_kv, d_qk), dtype=torch.bfloat16, device=device)
    indices = torch.zeros((s_q, h_kv, topk), dtype=torch.int32, device=device)

    out, max_logits, lse = fm.flash_mla_sparse_prefill(q, kv, indices, 1.0,
                                                       d_v)
    assert out.shape == (s_q, h_q, d_v)
    assert max_logits.shape == (s_q, h_q)
    assert lse.shape == (s_q, h_q)
tests/kernels/attention/test_pack_unpack_triton.py (new file, +245 lines)
@ -0,0 +1,245 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import torch
from torch.testing import assert_close

from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton


def test_pack_seq_basic_fp8():
    """Test basic functionality of pack_seq_triton with fp8 and 3D tensors."""
    device = "cuda"
    dtype = torch.float8_e4m3fn

    # Test cases with 3D tensors (N, H, D)
    test_cases = [
        (6, 8, 4, 2, [3, 3]),  # (6, 8, 4) -> (2, 3, 8, 4)
        (10, 4, 8, 3, [2, 4, 4]),  # (10, 4, 8) -> (3, 4, 4, 8)
        (20, 16, 32, 4, [5, 5, 5, 5]),  # (20, 16, 32) -> (4, 5, 16, 32)
    ]

    for N, H, D, B, lengths_list in test_cases:
        # Create input tensor with small values for fp8
        x = torch.randn(N, H, D, dtype=torch.float32, device=device) * 0.1
        x = x.to(dtype=dtype)
        lengths = torch.tensor(lengths_list, device=device)

        # Pack the data
        packed = pack_seq_triton(x, lengths)

        # Check output shape and properties
        expected_shape = (B, max(lengths_list), H, D)
        assert packed.shape == expected_shape
        assert packed.dtype == dtype
        assert packed.device == x.device

        # Check that valid data is preserved (within fp8 precision)
        for b in range(B):
            start_idx = sum(lengths_list[:b])
            seq_len = lengths_list[b]

            expected_data = x[start_idx:start_idx + seq_len].to(torch.float32)
            actual_data = packed[b, :seq_len].to(torch.float32)

            assert_close(actual_data, expected_data, rtol=1e-1, atol=1e-2)


def test_pack_seq_custom_padding_fp8():
    """Test pack_seq_triton with custom padding values for fp8."""
    device = "cuda"
    dtype = torch.float8_e4m3fn
    N, H, D, B = 20, 8, 16, 2
    lengths = torch.tensor([10, 10], device=device)

    x = torch.randn(N, H, D, dtype=torch.float32, device=device) * 0.1
    x = x.to(dtype=dtype)

    # Test with different padding values
    for pad_value in [-100.0, -10.0, 0.0, 10.0, 100.0]:
        result = pack_seq_triton(x, lengths, pad_value=pad_value)

        # Check valid data
        for b in range(B):
            start_idx = b * 10
            expected_data = x[start_idx:start_idx + 10].to(torch.float32)
            actual_data = result[b, :10].to(torch.float32)
            assert_close(actual_data, expected_data, rtol=1e-1, atol=1e-2)

        # Check padding (fp8 has limited range, so check for large values)
        padded_data = result[:, 10:].to(torch.float32)
        if pad_value < 0:
            assert torch.all(padded_data < -50)  # Large negative values
        elif pad_value > 0:
            assert torch.all(padded_data > 50)  # Large positive values
        else:
            assert torch.allclose(padded_data,
                                  torch.zeros_like(padded_data),
                                  atol=1e-2)


def test_pack_seq_default_negative_inf_padding_fp8():
    """Test that pack_seq_triton uses -inf padding by default for fp8."""
    device = "cuda"
    dtype = torch.float8_e4m3fn
    # B = 2
    N, H, D = 20, 8, 16
    lengths = torch.tensor([10, 10], device=device)

    x = torch.randn(N, H, D, dtype=torch.float32, device=device) * 0.1
    x = x.to(dtype=dtype)
    result = pack_seq_triton(x, lengths)

    # Check that padding is large negative values (fp8 representation of -inf)
    padded_data = result[:, 10:].to(torch.float32)
    assert torch.all(
        padded_data < -100)  # fp8 -inf is represented as large negative number


def test_pack_seq_edge_cases_fp8():
    """Test pack_seq_triton with edge cases for fp8."""
    device = "cuda"
    dtype = torch.float8_e4m3fn

    # Test with single batch element
    x = torch.randn(10, 8, 16, dtype=torch.float32, device=device) * 0.1
    x = x.to(dtype=dtype)
    lengths = torch.tensor([10], device=device)
    result = pack_seq_triton(x, lengths)
    assert result.shape == (1, 10, 8, 16)

    # Test with very short sequences
    x = torch.randn(20, 4, 8, dtype=torch.float32, device=device) * 0.1
    x = x.to(dtype=dtype)
    lengths = torch.tensor([1, 1, 1], device=device)
    result = pack_seq_triton(x, lengths)
    assert result.shape == (3, 1, 4, 8)

    # Test with different sequence lengths
    x = torch.randn(15, 8, 16, dtype=torch.float32, device=device) * 0.1
    x = x.to(dtype=dtype)
    lengths = torch.tensor([5, 7, 3], device=device)
    result = pack_seq_triton(x, lengths)
    assert result.shape == (3, 7, 8, 16)


def test_pack_seq_different_block_sizes_fp8():
    """Test pack_seq_triton with different block sizes for fp8."""
    device = "cuda"
    dtype = torch.float8_e4m3fn
    N, H, D, B = 100, 16, 32, 4
    lengths = torch.tensor([25, 25, 25, 25], device=device)

    x = torch.randn(N, H, D, dtype=torch.float32, device=device) * 0.1
    x = x.to(dtype=dtype)

    # Test different block sizes
    for block_t, block_d in [(32, 32), (64, 64), (128, 128)]:
        result = pack_seq_triton(x, lengths, block_t=block_t, block_d=block_d)

        assert result.shape == (B, 25, H, D)

        # Check that valid data is preserved (within fp8 precision)
        for b in range(B):
            start_idx = b * 25
            expected_data = x[start_idx:start_idx + 25].to(torch.float32)
            actual_data = result[b, :25].to(torch.float32)
            assert_close(actual_data, expected_data, rtol=1e-1, atol=1e-2)


def test_pack_seq_shape_consistency():
    """Test that pack_seq_triton maintains shape consistency."""
    device = "cuda"
    dtype = torch.float8_e4m3fn
    N, H, D, B = 20, 8, 16, 2
    lengths = torch.tensor([10, 10], device=device)

    x = torch.randn(N, H, D, dtype=torch.float32, device=device) * 0.1
    x = x.to(dtype=dtype)

    result = pack_seq_triton(x, lengths)

    # Check shape consistency
    assert result.shape[0] == B  # Batch dimension
    assert result.shape[1] == lengths.max().item()  # Max sequence length
    assert result.shape[2:] == x.shape[1:]  # Feature dimensions preserved


def test_pack_unpack_roundtrip_fp8():
    """Test that pack -> unpack gives us back the original data for fp8."""
    device = "cuda"
    dtype = torch.float8_e4m3fn

    # Test cases with 3D tensors
    test_cases = [
        (6, 8, 4, 2, [3, 3]),
        (10, 4, 8, 3, [2, 4, 4]),
        (20, 16, 32, 4, [5, 5, 5, 5]),
        (15, 8, 16, 3, [7, 5, 3]),
    ]

    for N, H, D, B, lengths_list in test_cases:
        # Create input tensor with small values for fp8
        x = torch.randn(N, H, D, dtype=torch.float32, device=device) * 0.1
        x = x.to(dtype=dtype)
        lengths = torch.tensor(lengths_list, device=device)

        # Pack the data
        packed = pack_seq_triton(x, lengths)

        # Unpack the data
        unpacked = unpack_seq_triton(packed, lengths)

        # Check that we get back the original data (within fp8 precision)
        assert unpacked.shape == x.shape
        x_f32 = x.to(torch.float32)
        unpacked_f32 = unpacked.to(torch.float32)
        assert_close(x_f32, unpacked_f32, rtol=1e-3, atol=1e-3)

        # Unpack without explicit start locations (computed in kernel)
        unpacked_with_loc = unpack_seq_triton(packed, lengths)
        assert_close(x_f32,
                     unpacked_with_loc.to(torch.float32),
                     rtol=1e-3,
                     atol=1e-2)


def test_unpack_seq_triton_edge_cases_fp8():
    """Test unpack function with edge cases for fp8."""
    device = "cuda"
    dtype = torch.float8_e4m3fn

    # Test with single batch element
    x = torch.randn(10, 8, 16, dtype=torch.float32, device=device) * 0.1
    x = x.to(dtype=dtype)
    lengths = torch.tensor([10], device=device)
    packed = pack_seq_triton(x, lengths)
    unpacked = unpack_seq_triton(packed, lengths)
    assert unpacked.shape == x.shape
    assert_close(x.to(torch.float32),
                 unpacked.to(torch.float32),
                 rtol=1e-1,
                 atol=1e-2)

    # Test with very short sequences
    x = torch.randn(20, 4, 8, dtype=torch.float32, device=device) * 0.1
    x = x.to(dtype=dtype)
    lengths = torch.tensor([1, 1, 1], device=device)
    packed = pack_seq_triton(x, lengths)
    unpacked = unpack_seq_triton(packed, lengths)
    # Only compare the first 3 elements that were actually packed
    assert_close(x[:3].to(torch.float32),
                 unpacked.to(torch.float32),
                 rtol=1e-1,
                 atol=1e-2)

    x = torch.randn(15, 8, 16, dtype=torch.float32, device=device) * 0.1
    x = x.to(dtype=dtype)
    lengths = torch.tensor([5, 7, 3], device=device)
    packed = pack_seq_triton(x, lengths)
    unpacked = unpack_seq_triton(packed, lengths)
    assert unpacked.shape == x.shape
    assert_close(x.to(torch.float32),
                 unpacked.to(torch.float32),
                 rtol=1e-1,
                 atol=1e-2)
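Taken together, the helpers above convert between a flat varlen layout and a padded batch. A minimal round-trip (a sketch; the fp8 dtype and CUDA device mirror the tests above):

import torch

from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton

# Three sequences of lengths 5, 4, 3 stored flat as (N, H, D) = (12, 4, 8).
x = (torch.randn(12, 4, 8, device="cuda") * 0.1).to(torch.float8_e4m3fn)
lengths = torch.tensor([5, 4, 3], device="cuda")

packed = pack_seq_triton(x, lengths)           # (B, max_len, H, D) = (3, 5, 4, 8)
restored = unpack_seq_triton(packed, lengths)  # back to (12, 4, 8)
assert restored.shape == x.shape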
@ -28,7 +28,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
         # Test standard ROCm attention
         backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
         assert (backend.get_name() == "ROCM_FLASH"
-                or backend.get_name() == "TRITON_ATTN_VLLM_V1")
+                or backend.get_name() == "TRITON_ATTN")
 
         # MLA test for deepseek related
 
@ -40,8 +40,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
                                    16,
                                    False,
                                    use_mla=True)
-        assert (backend.get_name() == "TRITON_MLA"
-                or backend.get_name() == "TRITON_MLA_VLLM_V1")
+        assert backend.get_name() == "TRITON_MLA"
 
         # If attention backend is None
         # If use_mla is true
@ -53,8 +52,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
                                    16,
                                    False,
                                    use_mla=True)
-        assert (backend.get_name() == "TRITON_MLA"
-                or backend.get_name() == "TRITON_MLA_VLLM_V1")
+        assert backend.get_name() == "TRITON_MLA"
 
         # change the attention backend to AITER MLA
         m.setenv(STR_BACKEND_ENV_VAR, "ROCM_AITER_MLA")
@ -64,8 +62,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
                                    1,
                                    False,
                                    use_mla=True)
-        assert (backend.get_name() == "ROCM_AITER_MLA"
-                or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1")
+        assert backend.get_name() == "ROCM_AITER_MLA"
 
         # If attention backend is None
         # If use_mla is true
@ -79,5 +76,4 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
                                    1,
                                    False,
                                    use_mla=True)
-        assert (backend.get_name() == "ROCM_AITER_MLA"
-                or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1")
+        assert backend.get_name() == "ROCM_AITER_MLA"
@ -46,6 +46,8 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
     # o will have the same shape as q
     o = torch.zeros(B, H_Q, D_V, dtype=dtype, device="cuda")
 
+    lse = torch.zeros(B, H_Q, dtype=dtype, device="cuda")
+
     b_seq_len = torch.full((B, ), seq_len, device="cuda")
 
     attn_logits = torch.empty(
@ -60,6 +62,7 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
         k_buffer,
         v_buffer,
         o,
+        lse,
         req_to_token,
         b_seq_len,
         attn_logits,
@ -72,12 +75,14 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
     v_buffer = v_buffer.view(CACHE_SIZE // PAGE_SIZE, PAGE_SIZE, H_KV, D_V)
 
     o1 = torch.zeros_like(o)
+    lse1 = torch.zeros_like(lse)
 
     decode_attention_fwd(
         q,
         k_buffer,
         v_buffer,
         o1,
+        lse1,
         req_to_page,
         b_seq_len,
         attn_logits,
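The `lse` output threaded through this hunk is, by the usual attention convention, the log-sum-exp of the raw scores per query head (the softmax normalizer in log space). A reference computation might look like this (a sketch with assumed shapes, not the kernel's code):

import torch

scores = torch.randn(4, 8, 128)            # (B, H_Q, context_len), assumed layout
lse_ref = torch.logsumexp(scores, dim=-1)  # (B, H_Q)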
@ -60,7 +60,7 @@ TENSORS_SHAPES_FN = [
 @torch.inference_mode()
 def test_rotary_embedding(
     is_neox_style: bool,
-    tensor_shape_fn: Callable[[int, int, int, int], tuple[int]],
+    tensor_shape_fn: Callable[[int, int, int, int], tuple[int, ...]],
     batch_size: int,
     seq_len: int,
     num_heads: int,
@ -7,7 +7,7 @@ import torch.nn.functional as F
 from einops import rearrange, repeat
 
 from vllm.model_executor.layers.mamba.ops.ssd_combined import (
-    mamba_chunk_scan_combined)
+    mamba_chunk_scan_combined_varlen)
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.mamba2_attn import (
     _query_start_loc_to_chunk_indices_offsets)
@ -185,9 +185,14 @@ def generate_continuous_batched_examples(example_lens_by_batch,
         IND_S = [x % full_length for x in IND_E]
         IND_E = [end_boundary(x + y) for x, y in zip(IND_S, spec)]
 
+        # varlen has implicit batch=1
+        dt2 = dt2.squeeze(0)
+        X2 = X2.squeeze(0)
+        B2 = B2.squeeze(0)
+        C2 = C2.squeeze(0)
         yield ([Y_min[s, IND_S[s]:IND_E[s]]
                 for s in range(num_examples)] if return_naive_ref else None,
-               cu_seqlens, seq_idx.unsqueeze(0), (A, dt2, X2, B2, C2))
+               cu_seqlens, seq_idx, (A, dt2, X2, B2, C2))
 
 
 @pytest.mark.parametrize("itype",
@ -198,7 +203,7 @@ def generate_continuous_batched_examples(example_lens_by_batch,
 def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
                                          itype):
 
-    # this tests the kernels on a single example (no batching)
+    # this tests the kernels on a single example (bs=1)
 
     # TODO: the bfloat16 case requires higher thresholds. To be investigated
 
@ -219,23 +224,40 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
     Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1), A * dt,
                                                   B, C, chunk_size)
 
+    cu_seqlens = torch.tensor((0, seqlen), device='cuda').cumsum(dim=0)
+    seq_idx = torch.zeros(seqlen, dtype=torch.int32, device=cu_seqlens.device)
+
+    chunk_indices, chunk_offsets = \
+        _query_start_loc_to_chunk_indices_offsets(
+            cu_seqlens, chunk_size, cu_seqlens[-1])
+
+    # varlen has implicit batch=1
+    X = X.squeeze(0)
+    dt = dt.squeeze(0)
+    A = A.squeeze(0)
+    B = B.squeeze(0)
+    C = C.squeeze(0)
     Y = torch.empty_like(X)
-    final_state = mamba_chunk_scan_combined(X,
-                                            dt,
-                                            A,
-                                            B,
-                                            C,
-                                            chunk_size,
-                                            D=None,
-                                            return_final_states=True,
-                                            out=Y)
+    final_state = mamba_chunk_scan_combined_varlen(X,
+                                                   dt,
+                                                   A,
+                                                   B,
+                                                   C,
+                                                   chunk_size,
+                                                   D=None,
+                                                   cu_seqlens=cu_seqlens,
+                                                   seq_idx=seq_idx,
+                                                   chunk_indices=chunk_indices,
+                                                   chunk_offsets=chunk_offsets,
+                                                   out=Y)
 
     # just test the last in sequence
-    torch.testing.assert_close(Y[:, -1], Y_min[:, -1], atol=atol, rtol=rtol)
+    torch.testing.assert_close(Y[-1], Y_min[0, -1], atol=atol, rtol=rtol)
 
     # just test the last head
     # NOTE, in the kernel we always cast states to fp32
-    torch.testing.assert_close(final_state[:, -1],
+    torch.testing.assert_close(final_state[:, -1].to(torch.float32),
                                final_state_min[:, -1].to(torch.float32),
                                atol=atol,
                                rtol=rtol)
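These hunks migrate the tests from the batched `(B, L, ...)` entry point to `mamba_chunk_scan_combined_varlen`, which takes all sequences flattened into one token dimension plus `cu_seqlens`/`seq_idx` descriptors. A minimal sketch of that layout (hypothetical tensors, for illustration only):

import torch
import torch.nn.functional as F

# Two sequences of lengths 3 and 5, flattened into one varlen batch.
lengths = torch.tensor([3, 5])
cu_seqlens = F.pad(lengths.cumsum(0), (1, 0))  # tensor([0, 3, 8])
seq_idx = torch.repeat_interleave(
    torch.arange(len(lengths), dtype=torch.int32), lengths)  # [0, 0, 0, 1, 1, 1, 1, 1]

x_per_seq = [torch.randn(3, 4), torch.randn(5, 4)]
x_varlen = torch.cat(x_per_seq, dim=0)  # (8, 4): the "implicit batch=1" layout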
@ -300,7 +322,7 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
             cu_seqlens, chunk_size, cu_seqlens[-1])
 
         Y = torch.empty_like(X)
-        new_states = mamba_chunk_scan_combined(
+        new_states = mamba_chunk_scan_combined_varlen(
             X,
             dt,
             A,
@ -312,7 +334,6 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
             seq_idx=seq_idx,
             chunk_indices=chunk_indices,
             chunk_offsets=chunk_offsets,
-            return_varlen_states=True,
             initial_states=states,
             out=Y,
         )
@ -321,7 +342,7 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
         for i in range(num_examples):
 
             # just test one dim and dstate
-            Y_eg = Y[0, cu_seqlens[i]:cu_seqlens[i + 1], 0, 0]
+            Y_eg = Y[cu_seqlens[i]:cu_seqlens[i + 1], 0, 0]
             Y_min_eg = Y_min[i][:, 0, 0]
             torch.testing.assert_close(Y_eg, Y_min_eg, atol=atol, rtol=rtol)
 
@ -386,7 +407,7 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
         _query_start_loc_to_chunk_indices_offsets(
             cu_seqlens, chunk_size, cu_seqlens[-1])
     Y_ref = torch.empty_like(X)
-    state_ref = mamba_chunk_scan_combined(
+    state_ref = mamba_chunk_scan_combined_varlen(
         X,
         dt,
         A,
@ -398,7 +419,6 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
         seq_idx=seq_idx,
         chunk_indices=chunk_indices,
         chunk_offsets=chunk_offsets,
-        return_varlen_states=True,
         initial_states=None,
         out=Y_ref,
     )
@ -414,27 +434,27 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
     chunked_seq_idx = torch.repeat_interleave(
         torch.arange(len(chunked_seqlens), device=device),
         chunked_seqlens,
-        output_size=chunked_cu_seqlens[-1]).unsqueeze(0).to(torch.int32)
+        output_size=chunked_cu_seqlens[-1]).to(torch.int32)
     chunked_input_seq_len = chunked_cu_seqlens[-1]
-    X_chunked = torch.zeros_like(X)[:, :chunked_input_seq_len, ...]
-    dt_chunked = torch.zeros_like(dt)[:, :chunked_input_seq_len, ...]
-    B_chunked = torch.zeros_like(B)[:, :chunked_input_seq_len, ...]
-    C_chunked = torch.zeros_like(C)[:, :chunked_input_seq_len, ...]
+    X_chunked = torch.zeros_like(X)[:chunked_input_seq_len, ...]
+    dt_chunked = torch.zeros_like(dt)[:chunked_input_seq_len, ...]
+    B_chunked = torch.zeros_like(B)[:chunked_input_seq_len, ...]
+    C_chunked = torch.zeros_like(C)[:chunked_input_seq_len, ...]
     for i in range(num_sequences):
         # fmt: off
-        chunk_f = lambda x, i: x[:, cu_seqlens[i]:cu_seqlens[i] + chunked_seqlens[i], ...]  # noqa: E501
+        chunk_f = lambda x, i: x[cu_seqlens[i]:cu_seqlens[i] + chunked_seqlens[i], ...]  # noqa: E501
 
-        X_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(X, i)  # noqa: E501
-        dt_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(dt, i)  # noqa: E501
-        B_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(B, i)  # noqa: E501
-        C_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(C, i)  # noqa: E501
+        X_chunked[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(X, i)  # noqa: E501
+        dt_chunked[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(dt, i)  # noqa: E501
+        B_chunked[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(B, i)  # noqa: E501
+        C_chunked[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(C, i)  # noqa: E501
         # fmt: on
 
     chunk_indices, chunk_offsets = \
         _query_start_loc_to_chunk_indices_offsets(
             chunked_cu_seqlens, chunk_size, chunked_cu_seqlens[-1])
     Y_partial = torch.empty_like(X_chunked)
-    partial_state = mamba_chunk_scan_combined(
+    partial_state = mamba_chunk_scan_combined_varlen(
         X_chunked,
         dt_chunked,
         A,
@ -446,7 +466,6 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
         seq_idx=chunked_seq_idx,
         chunk_indices=chunk_indices,
         chunk_offsets=chunk_offsets,
-        return_varlen_states=True,
         initial_states=None,
         out=Y_partial,
     )
@ -461,29 +480,28 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
     remaining_chunked_seq_idx = torch.repeat_interleave(
         torch.arange(len(remaining_chunked_seqlens), device=device),
         remaining_chunked_seqlens,
-        output_size=remaining_chunked_cu_seqlens[-1]).unsqueeze(0).to(
-            torch.int32)
+        output_size=remaining_chunked_cu_seqlens[-1]).to(torch.int32)
     remaining_chunked_input_seq_len = remaining_chunked_cu_seqlens[-1]
     # fmt: off
-    remaining_X_chunked = torch.zeros_like(X)[:, :remaining_chunked_input_seq_len, ...]  # noqa: E501
-    remaining_dt_chunked = torch.zeros_like(dt)[:, :remaining_chunked_input_seq_len, ...]  # noqa: E501
-    remaining_B_chunked = torch.zeros_like(B)[:, :remaining_chunked_input_seq_len, ...]  # noqa: E501
-    remaining_C_chunked = torch.zeros_like(C)[:, :remaining_chunked_input_seq_len, ...]  # noqa: E501
+    remaining_X_chunked = torch.zeros_like(X)[:remaining_chunked_input_seq_len, ...]  # noqa: E501
+    remaining_dt_chunked = torch.zeros_like(dt)[:remaining_chunked_input_seq_len, ...]  # noqa: E501
+    remaining_B_chunked = torch.zeros_like(B)[:remaining_chunked_input_seq_len, ...]  # noqa: E501
+    remaining_C_chunked = torch.zeros_like(C)[:remaining_chunked_input_seq_len, ...]  # noqa: E501
     for i in range(num_sequences):
-        remaining_chunk_f = lambda x, i: x[:, cu_seqlens[i] + chunked_seqlens[i]:cu_seqlens[i+1], ...]  # noqa: E501
+        remaining_chunk_f = lambda x, i: x[cu_seqlens[i] + chunked_seqlens[i]:cu_seqlens[i+1], ...]  # noqa: E501
 
-        remaining_X_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(X, i)  # noqa: E501
-        remaining_dt_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(dt, i)  # noqa: E501
-        remaining_B_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(B, i)  # noqa: E501
-        remaining_C_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(C, i)  # noqa: E501
+        remaining_X_chunked[remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(X, i)  # noqa: E501
+        remaining_dt_chunked[remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(dt, i)  # noqa: E501
+        remaining_B_chunked[remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(B, i)  # noqa: E501
+        remaining_C_chunked[remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(C, i)  # noqa: E501
 
     # assert input chunking is correct
     concat_chunk_f = lambda pt1, pt2, i: torch.cat([
-        pt1[:,chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1],...],
-        pt2[:,remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1],...],
+        pt1[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1],...],
+        pt2[remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1],...],
     ],
-                                         dim=1)
-    concat_batch_f = lambda pt1, pt2: torch.cat([concat_chunk_f(pt1, pt2, i) for i in range(num_sequences)], dim=1)  # noqa: E501
+                                         dim=0)
+    concat_batch_f = lambda pt1, pt2: torch.cat([concat_chunk_f(pt1, pt2, i) for i in range(num_sequences)], dim=0)  # noqa: E501
     # fmt: on
 
     assert concat_batch_f(X_chunked, remaining_X_chunked).equal(X)
@ -498,7 +516,7 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
             remaining_chunked_cu_seqlens[-1])
 
     Y_chunked = torch.empty_like(remaining_X_chunked)
-    state_chunked = mamba_chunk_scan_combined(
+    state_chunked = mamba_chunk_scan_combined_varlen(
        remaining_X_chunked,
         remaining_dt_chunked,
         A,
@ -510,7 +528,6 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
         seq_idx=remaining_chunked_seq_idx,
         chunk_indices=chunk_indices,
         chunk_offsets=chunk_offsets,
-        return_varlen_states=True,
         initial_states=partial_state,
         out=Y_chunked,
     )
@ -518,17 +535,17 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
 
     # kernel chunked is same as kernel overall
     for i in range(num_sequences):
-        Y_seq = Y[:, cu_seqlens[i]:cu_seqlens[i + 1], ...]
-        Y_ref_seq = Y_ref[:, cu_seqlens[i]:cu_seqlens[i + 1], ...]
+        Y_seq = Y[cu_seqlens[i]:cu_seqlens[i + 1], ...]
+        Y_ref_seq = Y_ref[cu_seqlens[i]:cu_seqlens[i + 1], ...]
         torch.testing.assert_close(
-            Y_seq[:, :chunked_seqlens[i], ...],
-            Y_ref_seq[:, :chunked_seqlens[i], ...],
+            Y_seq[:chunked_seqlens[i], ...],
+            Y_ref_seq[:chunked_seqlens[i], ...],
             atol=atol,
             rtol=rtol,
            msg=lambda x: f"seq{i} output part1 " + x)  # noqa: B023
         torch.testing.assert_close(
-            Y_seq[:, chunked_seqlens[i]:, ...],
-            Y_ref_seq[:, chunked_seqlens[i]:, ...],
+            Y_seq[chunked_seqlens[i]:, ...],
+            Y_ref_seq[chunked_seqlens[i]:, ...],
             atol=atol,
             rtol=rtol,
             msg=lambda x: f"seq{i} output part2 " + x)  # noqa: B023
@ -222,7 +222,8 @@ if (has_flashinfer_cutlass_fused_moe()
     from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
         FlashInferExperts)
     from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
-        FlashInferCutlassMoEPrepareAndFinalize)
+        FlashInferCutlassMoEPrepareAndFinalize,
+        create_flashinfer_prepare_finalize)
 
     register_prepare_and_finalize(
         FlashInferCutlassMoEPrepareAndFinalize,
@ -373,7 +374,7 @@ def make_prepare_finalize(
         assert prepare_finalize is not None
         return prepare_finalize
     elif prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize:
-        return FlashInferCutlassMoEPrepareAndFinalize(
+        return create_flashinfer_prepare_finalize(
            use_dp=moe.moe_parallel_config.dp_size > 1)
     else:
         return MoEPrepareAndFinalizeNoEP()
@ -138,7 +138,7 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
     td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=True)
 
     score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
-    topk_weights, topk_ids = FusedMoE.select_experts(
+    topk_weights, topk_ids, _ = FusedMoE.select_experts(
         hidden_states=td.hidden_states,
         router_logits=score,
         use_grouped_topk=False,
@ -206,7 +206,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
     td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=False)
 
     score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
-    topk_weights, topk_ids = FusedMoE.select_experts(
+    topk_weights, topk_ids, _ = FusedMoE.select_experts(
         hidden_states=td.hidden_states,
         router_logits=score,
         use_grouped_topk=False,
@ -11,11 +11,12 @@ from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
                                        native_w8a8_block_matmul)
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    cutlass_scaled_mm, get_col_major_tma_aligned_tensor,
-    per_token_group_quant_fp8, w8a8_triton_block_scaled_mm)
+    cutlass_scaled_mm, per_token_group_quant_fp8, w8a8_block_fp8_matmul)
 from vllm.platforms import current_platform
 from vllm.utils import has_deep_gemm
-from vllm.utils.deep_gemm import fp8_gemm_nt, per_block_cast_to_fp8
+from vllm.utils.deep_gemm import (fp8_gemm_nt,
+                                  get_col_major_tma_aligned_tensor,
+                                  per_block_cast_to_fp8)
 
 if current_platform.get_device_capability() < (9, 0):
     pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
@ -90,8 +91,7 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
 
     ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
                                        out_dtype)
-    out = w8a8_triton_block_scaled_mm(A_fp8, B_fp8, As, Bs, block_size,
-                                      out_dtype)
+    out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
 
     rel_diff = (torch.mean(
         torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
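For readers unfamiliar with the block-scaled format being exercised: `A_fp8` carries one scale per token per K-block and `B_fp8` one scale per (N-block, K-block) tile, so a dequantize-then-matmul reference looks roughly like this (a sketch assuming block-divisible shapes; the repo's own reference is `native_w8a8_block_matmul`):

import torch

def ref_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype):
    # As: (M, K // block_k) per-token, per-K-block scales.
    # Bs: (N // block_n, K // block_k) per-tile scales.
    block_n, block_k = block_size
    A = A_fp8.to(torch.float32) * As.repeat_interleave(block_k, dim=1)
    B = B_fp8.to(torch.float32) * Bs.repeat_interleave(
        block_n, dim=0).repeat_interleave(block_k, dim=1)
    return (A @ B.t()).to(out_dtype)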
@ -20,11 +20,9 @@ from vllm.platforms import current_platform
     (8, 513, 64),  # Non-divisible (native only)
 ])
 @pytest.mark.parametrize("seed", [42])
-@pytest.mark.parametrize("use_ue8m0", [True, False])
 @torch.inference_mode()
 def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
-                                      group_size: int, seed: int,
-                                      use_ue8m0: bool) -> None:
+                                      group_size: int, seed: int) -> None:
     """Test QuantFP8 group quantization with various configurations.
 
     Tests both CUDA and native implementations, column-major scales,
@ -40,8 +38,7 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
     group_shape = GroupShape(1, group_size)
     quant_op = QuantFP8(static=False,
                         group_shape=group_shape,
-                        column_major_scales=False,
-                        use_ue8m0=use_ue8m0)
+                        column_major_scales=False)
 
     # 1. Test native implementation (always available)
     x_quant_native, scales_native = quant_op.forward_native(x.clone())
@ -51,15 +48,9 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
     # 2. Test column-major scales configuration
     quant_op_col = QuantFP8(static=False,
                             group_shape=group_shape,
-                            column_major_scales=True,
-                            use_ue8m0=use_ue8m0)
+                            column_major_scales=True)
     _, scales_col = quant_op_col.forward_native(x.clone())
-    assert scales_col.shape == (batch_size, expected_num_groups)
-    assert scales_col.stride(0) == 1
-    assert scales_col.stride(1) == batch_size
-
-    # Test column-major scales consistency
-    assert torch.allclose(scales_col, scales_native, rtol=1e-9, atol=1e-8)
+    assert scales_col.shape == (expected_num_groups, batch_size)
 
     # 3. Test CUDA implementation (only for divisible dimensions)
     if is_divisible:
@ -77,9 +68,8 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
 
 
 @pytest.mark.parametrize("seed", [42])
-@pytest.mark.parametrize("use_ue8m0", [True, False])
 @torch.inference_mode()
-def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
+def test_quantfp8_group_multidimensional(seed: int) -> None:
     current_platform.seed_everything(seed)
 
     group_size = 64
@ -92,8 +82,7 @@ def test_quantfp8_group_multidimensional(seed: int) -> None:
     group_shape = GroupShape(1, group_size)
     quant_op = QuantFP8(static=False,
                         group_shape=group_shape,
-                        column_major_scales=False,
-                        use_ue8m0=use_ue8m0)
+                        column_major_scales=False)
 
     x_quant, scales = quant_op.forward_native(x_3d.clone())
     assert x_quant.shape == x_3d.shape
@ -102,8 +91,7 @@ def test_quantfp8_group_multidimensional(seed: int) -> None:
     # Test column_major_scales with multi-dim
     quant_op_col = QuantFP8(static=False,
                             group_shape=group_shape,
-                            column_major_scales=True,
-                            use_ue8m0=use_ue8m0)
+                            column_major_scales=True)
     _, scales_col = quant_op_col.forward_native(x_3d.clone())
     assert scales_col.shape == (batch1, hidden_dim // group_size, batch2)
@ -165,7 +165,7 @@ def onednn_gemm_test_helper(primitive_cache_size: int,
 def test_onednn_int8_scaled_gemm(
     n: int,
     k: int,
-    m_list: tuple[int],
+    m_list: tuple[int, ...],
     per_tensor_a_scale: bool,
     per_tensor_b_scale: bool,
     use_bias: bool,
@ -196,7 +196,7 @@ def test_onednn_int8_scaled_gemm(
 def test_onednn_gemm(
     n: int,
     k: int,
-    m_list: tuple[int],
+    m_list: tuple[int, ...],
     use_bias: bool,
     use_stride: bool,
     dtype: torch.dtype,
@ -524,14 +524,14 @@ def make_backend(backend_name: str) -> AttentionBackend:
 
     * Backend instance
     '''
-    if backend_name in (STR_XFORMERS_ATTN_VAL, "XFORMERS_VLLM_V1"):
+    if backend_name == STR_XFORMERS_ATTN_VAL:
         from vllm.v1.attention.backends.xformers import (
             XFormersAttentionBackend)
         return XFormersAttentionBackend()
-    if backend_name in (STR_FLASH_ATTN_VAL, "FLASH_ATTN_VLLM_V1"):
+    if backend_name == STR_FLASH_ATTN_VAL:
         from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
         return FlashAttentionBackend()
-    if backend_name == "TRITON_ATTN_VLLM_V1":
+    if backend_name == "TRITON_ATTN":
         from vllm.v1.attention.backends.triton_attn import (
             TritonAttentionBackend)
         return TritonAttentionBackend()
@ -539,7 +539,7 @@ def make_backend(backend_name: str) -> AttentionBackend:
         from vllm.v1.attention.backends.flex_attention import (
             FlexAttentionBackend)
         return FlexAttentionBackend()
-    if backend_name in ("TORCH_SDPA", "TORCH_SDPA_VLLM_V1"):
+    if backend_name == "TORCH_SDPA":
         from vllm.v1.attention.backends.cpu_attn import TorchSDPABackend
         return TorchSDPABackend()
     if backend_name == "FLASHINFER":
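The pattern in this hunk is a pure rename: the legacy `_VLLM_V1` suffixes disappear now that V1 is the only engine. A hypothetical shim for callers still passing old names (not part of this diff) could be:

def normalize_backend_name(name: str) -> str:
    # Map legacy V1-suffixed backend names onto their canonical spelling.
    return name.removesuffix("_VLLM_V1")

assert normalize_backend_name("TRITON_ATTN_VLLM_V1") == "TRITON_ATTN"
assert normalize_backend_name("FLASHINFER") == "FLASHINFER"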
@ -91,8 +91,7 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
 @pytest.mark.parametrize("enable_lora", [False, True])
 @pytest.mark.parametrize("tp_size", [1, 2])
 def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
-                              llama_3p2_1b_files,
-                              monkeypatch: pytest.MonkeyPatch):
+                              llama_3p2_1b_files):
     if num_gpus_available < tp_size:
         pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
 
@ -17,6 +17,8 @@ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
 from vllm.model_executor.layers.layernorm import (RMSNorm,
                                                   dispatch_rocm_rmsnorm_func,
                                                   fused_add_rms_norm, rms_norm)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    cutlass_scaled_mm, dispatch_w8a8_blockscale_func, w8a8_block_fp8_matmul)
 from vllm.platforms import current_platform
 
 RMS_NORM_SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
@ -109,6 +111,34 @@ def test_enabled_ops_invalid(env: str):
         RMSNorm(1024).enabled()
 
 
+@pytest.mark.skipif(
+    not current_platform.is_rocm() or not current_platform.is_fp8_fnuz(),
+    reason="AITER is a feature exclusive for ROCm and FP8_FNUZ")
+@pytest.mark.parametrize("use_cutlass", [True, False])
+@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
+@pytest.mark.parametrize("use_rocm_aiter_gemm_w8a8_blockscale", ["0", "1"])
+def test_w8a8_blockscale_dispatch(use_cutlass: bool, use_rocm_aiter: str,
+                                  use_rocm_aiter_gemm_w8a8_blockscale: str,
+                                  monkeypatch):
+
+    monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
+    monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR",
+                       use_rocm_aiter_gemm_w8a8_blockscale)
+
+    use_aiter_and_is_supported = (bool(int(use_rocm_aiter)) and bool(
+        int(use_rocm_aiter_gemm_w8a8_blockscale)))
+    block_scale_func = dispatch_w8a8_blockscale_func(
+        use_cutlass, use_aiter_and_is_supported=use_aiter_and_is_supported)
+    if use_cutlass:
+        assert block_scale_func == cutlass_scaled_mm
+    elif current_platform.is_rocm() and int(use_rocm_aiter) and int(
+            use_rocm_aiter_gemm_w8a8_blockscale):
+        assert block_scale_func == (
+            torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale)
+    else:
+        assert block_scale_func == w8a8_block_fp8_matmul
+
+
 @pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
 def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
     monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
tests/models/multimodal/generation/test_qwen2_5_vl.py (new file, +132 lines)
@ -0,0 +1,132 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm.multimodal.video import sample_frames_from_video

from ....conftest import VIDEO_ASSETS

models = ["Qwen/Qwen2.5-VL-3B-Instruct"]
target_dtype = "bfloat16"

VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"


def qwen2_5_vl_chat_template(*query):
    return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n"  # noqa: E501


VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
    "baby_reading":
    qwen2_5_vl_chat_template(
        VIDEO_PLACEHOLDER,
        "Describe this video with a short sentence ",
        "(no more than 20 words)",
    ),
})


@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
                                      video_pruning_rate: float,
                                      num_frames: int, dtype: str,
                                      max_tokens: int) -> None:
    """Test EVS (Efficient Video Sampling) functionality with different
    pruning rates.
    """

    # Sample frames from video assets
    sampled_vids = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]

    prompts = [VIDEO_PROMPTS[0]]
    videos = [sampled_vids[0]]

    # Initialize model with EVS configuration
    with vllm_runner(model,
                     runner="generate",
                     max_model_len=4000,
                     max_num_seqs=1,
                     dtype=dtype,
                     limit_mm_per_prompt={"video": 1},
                     tensor_parallel_size=1,
                     video_pruning_rate=video_pruning_rate) as vllm_model:

        # Generate output - this should not crash
        outputs = vllm_model.generate_greedy(prompts,
                                             max_tokens,
                                             videos=videos)

        # Basic validation that we got a response
        assert len(outputs) == 1
        output_ids, output_text = outputs[0]

        # Ensure we got some output
        assert len(output_ids) > 0
        assert len(output_text) > 0

        # Ensure the output is a string
        assert isinstance(output_text, str)


@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
                                       video_pruning_rate: float,
                                       num_frames: int, dtype: str,
                                       max_tokens: int) -> None:
    """Test EVS functionality with batched videos.

    This test validates that:
    1. The model handles batched video inputs correctly with EVS
    2. Both pruning configurations work with multiple videos
    3. The model doesn't crash when processing multiple videos simultaneously
    """
    # Sample frames from video assets
    sampled_vids = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]

    # Test batched videos
    prompts = [VIDEO_PROMPTS[0], VIDEO_PROMPTS[0]]
    videos = [sampled_vids[0],
              sampled_vids[0]]  # Use same video twice for testing

    # Initialize model with EVS configuration
    with vllm_runner(model,
                     runner="generate",
                     max_model_len=4000,
                     max_num_seqs=2,
                     dtype=dtype,
                     limit_mm_per_prompt={"video": 2},
                     tensor_parallel_size=1,
                     video_pruning_rate=video_pruning_rate) as vllm_model:

        # Generate output - this should not crash
        outputs = vllm_model.generate_greedy(prompts,
                                             max_tokens,
                                             videos=videos)

        # Basic validation that we got responses for both videos
        assert len(outputs) == 2

        for output_ids, output_text in outputs:
            # Ensure we got some output for each video
            assert len(output_ids) > 0
            assert len(output_text) > 0

            # Ensure the output is a string
            assert isinstance(output_text, str)
@ -101,7 +101,7 @@ class VLMTestInfo(NamedTuple):
     # Function for converting ImageAssets to image embeddings;
     # We need to define this explicitly for embedding tests
     convert_assets_to_embeddings: Optional[Callable[[ImageTestAssets],
-                                                    torch.Tensor]] = None
+                                                    list[torch.Tensor]]] = None
 
     # Exposed options for vLLM runner; we change these in a several tests,
     # but the defaults are derived from VllmRunner & the engine defaults
@ -137,12 +137,12 @@ class VLMTestInfo(NamedTuple):
     # Default expandable params per test; these defaults can be overridden in
     # instances of this object; the complete set of test cases for the model
     # is all combinations of .models + all fields below
-    max_tokens: Union[int, tuple[int]] = 128
-    num_logprobs: Union[int, tuple[int]] = 5
-    dtype: Union[str, Union[list[str], tuple[str, ...]]] = "auto"
-    distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
+    max_tokens: int = 128
+    num_logprobs: int = 5
+    dtype: str = "auto"
+    distributed_executor_backend: Optional[str] = None
     # Only expanded in video tests
-    num_video_frames: Union[int, tuple[int]] = 16
+    num_video_frames: int = 16
 
     # Fixed image sizes / image size factors; most tests use image_size_factors
     # The values provided for these two fields will be stacked and expanded
@ -213,6 +213,7 @@ _IGNORE_MM_KEYS = {
 MM_DATA_PATCHES = {
     # GLM4.1V and Qwen3-VL requires video metadata to be included in the input
     "glm4v": glm4_1v_patch_mm_data,
+    "glm4v_moe": glm4_1v_patch_mm_data,
     "qwen3_vl": qwen3_vl_patch_mm_data,
     "qwen3_vl_moe": qwen3_vl_patch_mm_data,
 }
@ -19,6 +19,8 @@ from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel)
|
||||
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
|
||||
from vllm.model_executor.models.interfaces import (SupportsMultiModal,
|
||||
supports_multimodal)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
|
||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||
InputProcessingContext)
|
||||
@ -88,6 +90,7 @@ def resize_mm_data(
|
||||
|
||||
|
||||
def create_batched_mm_kwargs(
|
||||
model_cls: type[SupportsMultiModal],
|
||||
model_config: ModelConfig,
|
||||
processor: BaseMultiModalProcessor,
|
||||
size_factors: tuple[float, ...] = (1.0, 0.5, 0.25),
|
||||
@@ -127,16 +130,22 @@ def create_batched_mm_kwargs(
         mm_data=resized_mm_data,
         hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
         tokenization_kwargs=processor_inputs.tokenization_kwargs,
-    )["mm_kwargs"]
+    )["mm_kwargs"].require_data()
     items = [
         item for modality in supported_mm_limits
         for item in mm_kwargs[modality]
     ]
-    return group_mm_kwargs_by_modality(items)
+    return group_mm_kwargs_by_modality(
+        items,
+        merge_by_field_config=model_cls.merge_by_field_config,
+    )


 @contextmanager
-def initialize_dummy_model(model_cls: nn.Module, model_config: ModelConfig):
+def initialize_dummy_model(
+    model_cls: type[nn.Module],
+    model_config: ModelConfig,
+):
     temp_file = tempfile.mkstemp()[1]
     init_distributed_environment(
         world_size=1,
@@ -198,8 +207,12 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
         hf_overrides=hf_overrides_fn,
         skip_tokenizer_init=model_info.skip_tokenizer_init,
         enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype)
+        dtype=model_info.dtype,
+    )
+
+    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
+    assert supports_multimodal(model_cls)

     factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]

     inputs_parse_methods = []
@@ -228,7 +241,7 @@ def test_model_tensor_schema(model_arch: str, model_id: str):

     with initialize_dummy_model(model_cls, model_config) as model:
         for modality, _, mm_kwargs in create_batched_mm_kwargs(
-                model_config, processor):
+                model_cls, model_config, processor):
             for method_name in inputs_parse_methods:
                 print(f"Testing `{method_name}` with modality={modality} "
                       f"and mm_kwargs{list(mm_kwargs.keys())}")
@@ -196,6 +196,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                            trust_remote_code=True),
     "Cohere2ForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r7b-12-2024",  # noqa: E501
                                           trust_remote_code=True),
+    "CwmForCausalLM": _HfExamplesInfo("facebook/cwm",  # noqa: E501
+                                      trust_remote_code=True,
+                                      is_available_online=False),
     "DbrxForCausalLM": _HfExamplesInfo("databricks/dbrx-instruct"),
     "DeciLMForCausalLM": _HfExamplesInfo("nvidia/Llama-3_3-Nemotron-Super-49B-v1",  # noqa: E501
                                          trust_remote_code=True),
@@ -204,6 +207,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                             trust_remote_code=True),
     "DeepseekV3ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V3",  # noqa: E501
                                              trust_remote_code=True),
+    "DeepseekV32ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V3.2-Exp"),
     "Ernie4_5ForCausalLM": _HfExamplesInfo("baidu/ERNIE-4.5-0.3B-PT",
                                            min_transformers_version="4.54"),
     "Ernie4_5_MoeForCausalLM": _HfExamplesInfo("baidu/ERNIE-4.5-21B-A3B-PT",
@@ -273,6 +277,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                         is_available_online=False),
     "Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct",  # noqa: E501
                                          is_available_online=False),
+    "LongcatFlashForCausalLM": _HfExamplesInfo
+    ("meituan-longcat/LongCat-Flash-Chat", trust_remote_code=True),
     "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
     "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1",
                                          min_transformers_version="4.55.3",
@@ -526,7 +532,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                      trust_remote_code=True),
     "Llama_Nemotron_Nano_VL" : _HfExamplesInfo("nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",  # noqa: E501
                                                trust_remote_code=True),
-    "NemotronH_Nano_VL": _HfExamplesInfo("nano_vl_dummy",
+    "NemotronH_Nano_VL_V2": _HfExamplesInfo("nano_vl_dummy",
                                          is_available_online=False,
                                          trust_remote_code=True),
     "Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True,
@@ -639,6 +645,10 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
                                speculative_model="zai-org/GLM-4.5",
                                min_transformers_version="4.54",
                                is_available_online=False),
+    "LongCatFlashMTPModel": _HfExamplesInfo(
+        "meituan-longcat/LongCat-Flash-Chat",
+        trust_remote_code=True,
+        speculative_model="meituan-longcat/LongCat-Flash-Chat"),
     "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
                                     trust_remote_code=True,
                                     speculative_model="XiaomiMiMo/MiMo-7B-RL"),
@@ -8,7 +8,8 @@ import pytest

 from vllm import LLM
 from vllm.utils import GiB_bytes
-from vllm.v1.core.kv_cache_utils import get_kv_cache_configs
+from vllm.v1.core.kv_cache_utils import (generate_scheduler_kv_cache_config,
+                                         get_kv_cache_configs)
 from vllm.v1.engine.core import EngineCore as V1EngineCore

 from ..utils import create_new_process_for_each_test
@@ -62,11 +63,13 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
     # Avoid calling model.forward()
     def _initialize_kv_caches_v1(self, vllm_config):
         kv_cache_specs = self.model_executor.get_kv_cache_specs()
-        scheduler_kv_cache_config = get_kv_cache_configs(
+        kv_cache_configs = get_kv_cache_configs(
             vllm_config,
             kv_cache_specs,
             [10 * GiB_bytes],
-        )[0]
+        )
+        scheduler_kv_cache_config = generate_scheduler_kv_cache_config(
+            kv_cache_configs)

         # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
         return 1, 0, scheduler_kv_cache_config
@@ -84,7 +87,7 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
         # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
         # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
         # L4 supports FA3.
-        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
+        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
         if model_arch == "WhisperForConditionalGeneration":
             m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
         LLM(
@@ -428,9 +428,8 @@ def dummy_hf_overrides(
     num_hidden_layers = (3 if model_arch
                          == "Gemma3nForConditionalGeneration" else 1)

-    text_config.update({
+    update_dict = {
         "num_layers": num_layers,
-        "num_hidden_layers": num_hidden_layers,
         "num_experts": num_experts,
         "num_experts_per_tok": 2,
         "num_local_experts": num_experts,
@@ -440,7 +439,14 @@ def dummy_hf_overrides(
         "n_routed_experts": num_experts,
         # For Gemma-3n
         "num_kv_shared_layers": 1,
-    })
+    }
+
+    # Update num_hidden_layers for non-Longcat architectures
+    if model_arch != "LongcatFlashForCausalLM" \
+            and model_arch != "LongCatFlashMTPModel":
+        update_dict["num_hidden_layers"] = num_hidden_layers
+
+    text_config.update(update_dict)

     if hasattr(hf_config, "vision_config"):
         hf_config.vision_config.update({
@@ -5,7 +5,6 @@ import base64
 import mimetypes
 import os
 from tempfile import NamedTemporaryFile, TemporaryDirectory
-from typing import TYPE_CHECKING, NamedTuple

 import numpy as np
 import pytest
@@ -15,9 +14,6 @@ from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import PlaceholderRange
 from vllm.multimodal.utils import MediaConnector, argsort_mm_positions

-if TYPE_CHECKING:
-    from vllm.multimodal.inputs import MultiModalPlaceholderDict
-
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
 TEST_IMAGE_ASSETS = [
     "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",  # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
@@ -70,7 +66,12 @@ async def test_fetch_image_http(image_url: str):
 @pytest.mark.parametrize("suffix", get_supported_suffixes())
 async def test_fetch_image_base64(url_images: dict[str, Image.Image],
                                   raw_image_url: str, suffix: str):
-    connector = MediaConnector()
+    connector = MediaConnector(
+        # Domain restriction should not apply to data URLs.
+        allowed_media_domains=[
+            "www.bogotobogo.com",
+            "github.com",
+        ])
     url_image = url_images[raw_image_url]

     try:
@@ -218,18 +219,13 @@ async def test_fetch_video_http_with_dynamic_loader(
     assert metadata_sync["video_backend"] == "opencv_dynamic"


-# Used for `test_argsort_mm_positions`.
-class TestCase(NamedTuple):
-    mm_positions: "MultiModalPlaceholderDict"
-    expected_modality_idxs: list[tuple[str, int]]
-
-
-def test_argsort_mm_positions():
-
-    test_cases = [
+# yapf: disable
+@pytest.mark.parametrize(
+    "case",
+    [
         # Single modality
         ## Internally sorted
-        TestCase(
+        dict(
             mm_positions={
                 "image": [
                     PlaceholderRange(offset=0, length=2),
@@ -242,7 +238,7 @@ def test_argsort_mm_positions():
             ],
         ),
         ## Internally unsorted
-        TestCase(
+        dict(
             mm_positions={
                 "image": [
                     PlaceholderRange(offset=3, length=2),
@@ -257,7 +253,7 @@ def test_argsort_mm_positions():

         # Two modalities
         ## Internally sorted
-        TestCase(
+        dict(
             mm_positions={
                 "image": [
                     PlaceholderRange(offset=7, length=4),
@@ -276,7 +272,7 @@ def test_argsort_mm_positions():
             ],
         ),
         ## Interleaved, internally sorted
-        TestCase(
+        dict(
             mm_positions={
                 "image": [
                     PlaceholderRange(offset=0, length=4),
@@ -295,7 +291,7 @@ def test_argsort_mm_positions():
             ],
         ),
         ## Interleaved, internally unsorted
-        TestCase(
+        dict(
             mm_positions={
                 "image": [
                     PlaceholderRange(offset=8, length=2),
@@ -316,7 +312,7 @@ def test_argsort_mm_positions():

         # Three modalities
         ## Internally sorted
-        TestCase(
+        dict(
             mm_positions={
                 "image": [
                     PlaceholderRange(offset=15, length=7),
@@ -341,7 +337,7 @@ def test_argsort_mm_positions():
             ],
         ),
         ## Interleaved, internally sorted
-        TestCase(
+        dict(
             mm_positions={
                 "image": [
                     PlaceholderRange(offset=0, length=2),
@@ -363,8 +359,8 @@ def test_argsort_mm_positions():
                 ("image", 2),
             ],
         ),
-        ## Interleaved, internally sunorted
-        TestCase(
+        ## Interleaved, internally unsorted
+        dict(
             mm_positions={
                 "image": [
                     PlaceholderRange(offset=0, length=2),
@@ -386,9 +382,39 @@ def test_argsort_mm_positions():
                 ("image", 1),
             ],
         ),
-    ]
+    ],
+)
+# yapf: enable
+def test_argsort_mm_positions(case):
+    mm_positions = case["mm_positions"]
+    expected_modality_idxs = case["expected_modality_idxs"]

-    for mm_positions, expected_modality_idxs in test_cases:
-        modality_idxs = argsort_mm_positions(mm_positions)
+    modality_idxs = argsort_mm_positions(mm_positions)

-        assert modality_idxs == expected_modality_idxs
+    assert modality_idxs == expected_modality_idxs
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
+@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
+async def test_allowed_media_domains(video_url: str, num_frames: int):
+    connector = MediaConnector(
+        media_io_kwargs={"video": {
+            "num_frames": num_frames,
+        }},
+        allowed_media_domains=[
+            "www.bogotobogo.com",
+            "github.com",
+        ])
+
+    video_sync, metadata_sync = connector.fetch_video(video_url)
+    video_async, metadata_async = await connector.fetch_video_async(video_url)
+    assert np.array_equal(video_sync, video_async)
+    assert metadata_sync == metadata_async
+
+    disallowed_url = "https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png"
+    with pytest.raises(ValueError):
+        _, _ = connector.fetch_video(disallowed_url)
+
+    with pytest.raises(ValueError):
+        _, _ = await connector.fetch_video_async(disallowed_url)
@@ -26,5 +26,5 @@ class DummyPlatform(Platform):

     def get_attn_backend_cls(self, backend_name, head_size, dtype,
                              kv_cache_dtype, block_size, use_v1, use_mla,
-                             has_sink):
+                             has_sink, use_sparse):
         return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend"  # noqa E501
@@ -18,9 +18,6 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
     CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
     CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
-from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
-from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    W8A8BlockFp8LinearOp)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     cutlass_fp4_supported)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
@@ -745,35 +742,3 @@ def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt,
     perplexity = llm.generate_prompt_perplexity([prompt])[0]
     print(perplexity)
     assert perplexity <= exp_perplexity
-
-
-def test_compressed_tensors_fp8_block_enabled(vllm_runner):
-    model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
-    with vllm_runner(model_path) as llm:
-
-        fp8_dtype = current_platform.fp8_dtype()
-
-        def check_model(model):
-            layer = model.model.layers[0]
-
-            qkv_proj = layer.self_attn.qkv_proj
-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
-            assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8)
-            assert isinstance(qkv_proj.scheme.w8a8_block_fp8_linear,
-                              W8A8BlockFp8LinearOp)
-
-            assert qkv_proj.weight.dtype is fp8_dtype
-            assert qkv_proj.weight_scale.dtype is torch.float32
-            assert len(qkv_proj.weight.shape) == 2
-            assert len(qkv_proj.weight_scale.shape) == 2
-
-            input_quant_op = \
-                qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
-            assert isinstance(input_quant_op, QuantFP8)
-            assert input_quant_op._forward_method == input_quant_op.forward_cuda
-
-        llm.apply_model(check_model)
-
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
-        assert output
@@ -97,7 +97,6 @@ def test_auto_task(model_id, expected_runner_type, expected_convert_type,

     assert config.runner_type == expected_runner_type
     assert config.convert_type == expected_convert_type
-    assert expected_task in config.supported_tasks


 # Can remove once --task option is fully deprecated
@@ -120,7 +119,6 @@ def test_score_task(model_id, expected_runner_type, expected_convert_type,

     assert config.runner_type == expected_runner_type
     assert config.convert_type == expected_convert_type
-    assert expected_task in config.supported_tasks


 # Can remove once --task option is fully deprecated
@@ -137,7 +135,6 @@ def test_transcription_task(model_id, expected_runner_type,

     assert config.runner_type == expected_runner_type
     assert config.convert_type == expected_convert_type
-    assert expected_task in config.supported_tasks


 @pytest.mark.parametrize(
@@ -96,7 +96,7 @@ def test_routing_strategy_integration(monkeypatch, device):
         envs.environment_variables[env_name] = lambda s=strategy: s

         # Test the select_experts method
-        topk_weights, topk_ids = FusedMoE.select_experts(
+        topk_weights, topk_ids, _ = FusedMoE.select_experts(
             hidden_states=hidden_states,
             router_logits=router_logits,
             top_k=top_k,
@@ -1,61 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-from vllm import LLM, envs
-from vllm.sampling_params import SamplingParams
-
-if not envs.VLLM_USE_V1:
-    pytest.skip(
-        "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
-        allow_module_level=True,
-    )
-
-
-@pytest.mark.parametrize("model_name", ["Qwen/Qwen2.5-1.5B-Instruct"])
-# TODO TPU will appear busy if we fan-out test params here
-@pytest.mark.parametrize("n_prompts", [1])
-def test_logprobs(model_name: str, n_prompts: int):
-    """
-    Request top logprobs with different sampling settings and check
-    that results contains the requested number, ordered ascendingly.
-    """
-
-    def check_num_logprobs(logprobs, expected_num: int):
-        for step in logprobs:
-            prev_logp = 1.0
-            # order by rank
-            sorted_step = dict(
-                sorted(step.items(), key=lambda item: item[1].rank))
-
-            if len(step) != expected_num:
-                print("watch out", sorted_step)
-
-            # check results are ordered by prob value
-            # assert len(step) == expected_num
-            for rankno, (tid, logp) in enumerate(sorted_step.items()):
-                assert logp.logprob <= prev_logp
-                prev_logp = logp.logprob
-                assert logp.rank == rankno + 1
-
-    llm = LLM(model_name,
-              enforce_eager=False,
-              max_num_seqs=1,
-              max_model_len=128,
-              max_num_batched_tokens=128)
-    prompts = [
-        "Write a short story about a robot that dreams for the first time."
-    ] * n_prompts
-    greedy_sampling_params = SamplingParams(temperature=0.0, max_tokens=64,\
-        logprobs=4)
-    regular_sampling_params = SamplingParams(temperature=0.4, max_tokens=64,\
-        logprobs=4)
-    topkp_sampling_params = SamplingParams(temperature=0.4, max_tokens=64,\
-        logprobs=4, top_k=12, top_p=0.5)
-
-    for sp in [greedy_sampling_params, regular_sampling_params, \
-               topkp_sampling_params]:
-        output = llm.generate(prompts, sp)
-        for o in output:
-            check_num_logprobs(o.outputs[0].logprobs, 4)
@@ -69,6 +69,8 @@ def test_triton_placeholder_language():
     assert lang.constexpr is None
     assert lang.dtype is None
     assert lang.int64 is None
+    assert lang.int32 is None
+    assert lang.tensor is None


 def test_triton_placeholder_language_from_parent():
@@ -1131,14 +1131,14 @@ def has_module_attribute(module_name, attribute_name):

 def get_attn_backend_list_based_on_platform() -> list[str]:
     if current_platform.is_cuda():
-        return ["FLASH_ATTN_VLLM_V1", "TRITON_ATTN_VLLM_V1", "TREE_ATTN"]
+        return ["FLASH_ATTN", "TRITON_ATTN", "TREE_ATTN"]
     elif current_platform.is_rocm():
-        attn_backend_list = ["TRITON_ATTN_VLLM_V1"]
+        attn_backend_list = ["TRITON_ATTN"]
         try:
             import aiter  # noqa: F401
-            attn_backend_list.append("FLASH_ATTN_VLLM_V1")
+            attn_backend_list.append("FLASH_ATTN")
         except Exception:
-            print("Skip FLASH_ATTN_VLLM_V1 on ROCm as aiter is not installed")
+            print("Skip FLASH_ATTN on ROCm as aiter is not installed")

         return attn_backend_list
     else:
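The dropped `_VLLM_V1` suffix recurs throughout this release, so any script that pins an attention backend via the environment needs the new identifiers. A minimal sketch of the renamed usage (the full set of valid names is defined by vLLM's `_Backend` enum, shown in the hunks below):

# Hypothetical usage sketch: select the Triton attention backend by its
# new name. Before this release the same backend was spelled
# "TRITON_ATTN_VLLM_V1".
import os

os.environ["VLLM_ATTENTION_BACKEND"] = "TRITON_ATTN"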
@@ -23,15 +23,16 @@ from vllm_test_utils.monitor import monitor
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.transformers_utils.detokenizer_utils import (
     convert_ids_list_to_tokens)
-from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache,
-                        MemorySnapshot, PlaceholderModule, StoreBoolean,
-                        bind_kv_cache, common_broadcastable_dtype,
-                        current_stream, deprecate_kwargs, get_open_port,
-                        get_tcp_uri, is_lossless_cast, join_host_port,
-                        make_zmq_path, make_zmq_socket, memory_profiling,
-                        merge_async_iterators, sha256, split_host_port,
-                        split_zmq_path, supports_kw, swap_dict_values)

+# isort: off
+from vllm.utils import (
+    CacheInfo, FlexibleArgumentParser, LRUCache, MemorySnapshot,
+    PlaceholderModule, bind_kv_cache, common_broadcastable_dtype,
+    current_stream, deprecate_kwargs, get_open_port, get_tcp_uri,
+    is_lossless_cast, join_host_port, make_zmq_path, make_zmq_socket,
+    memory_profiling, merge_async_iterators, sha256, split_host_port,
+    split_zmq_path, supports_kw, swap_dict_values, unique_filepath)
+# isort: on
 from ..utils import create_new_process_for_each_test, error_on_warning
@@ -1032,3 +1033,15 @@ def test_load_config_file(tmp_path):
     # Assert that the processed arguments match the expected output
     assert processed_args == expected_args
     os.remove(str(config_file_path))
+
+
+def test_unique_filepath():
+    temp_dir = tempfile.mkdtemp()
+    path_fn = lambda i: Path(temp_dir) / f"file_{i}.txt"
+    paths = set()
+    for i in range(10):
+        path = unique_filepath(path_fn)
+        path.write_text("test")
+        paths.add(path)
+    assert len(paths) == 10
+    assert len(list(Path(temp_dir).glob("*.txt"))) == 10
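For context, the contract this new test exercises: `unique_filepath` takes a callable from an integer to a `Path` and returns the first generated path that does not yet exist. A minimal sketch of that behavior, under the assumption that the real helper in `vllm.utils` may differ in detail:

# Sketch of the behavior test_unique_filepath checks; not the actual
# vllm.utils implementation.
from pathlib import Path
from typing import Callable


def unique_filepath(fn: Callable[[int], Path]) -> Path:
    # Probe fn(0), fn(1), ... and return the first path not on disk.
    i = 0
    while True:
        path = fn(i)
        if not path.exists():
            return path
        i += 1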
@@ -21,16 +21,15 @@ from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
 from vllm.v1.kv_cache_interface import FullAttentionSpec

 BACKENDS_TO_TEST = [
-    _Backend.FLASH_ATTN_VLLM_V1, _Backend.FLASHINFER_VLLM_V1,
-    _Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN_VLLM_V1, _Backend.TREE_ATTN,
-    "FLEX_ATTENTION_SLOW"
+    _Backend.FLASH_ATTN, _Backend.FLASHINFER, _Backend.FLEX_ATTENTION,
+    _Backend.TRITON_ATTN, _Backend.TREE_ATTN, "FLEX_ATTENTION_SLOW"
 ]

 # Remove flashinfer from the list if it's not available
 try:
     import flashinfer  # noqa: F401
 except ImportError:
-    BACKENDS_TO_TEST.remove(_Backend.FLASHINFER_VLLM_V1)
+    BACKENDS_TO_TEST.remove(_Backend.FLASHINFER)


 def _convert_dtype_to_torch(dtype):
@@ -214,7 +213,7 @@ def run_attention_backend(
     builder_cls, impl_cls = get_attention_backend(actual_backend)

     # Mock flashinfer's get_per_layer_parameters if needed
-    if actual_backend == _Backend.FLASHINFER_VLLM_V1:
+    if actual_backend == _Backend.FLASHINFER:
         import unittest.mock

         from vllm.v1.attention.backends.utils import PerLayerParameters
@@ -434,7 +433,7 @@ def _test_backend_correctness(
         # [num_blocks, 2, block_size, num_kv_heads, head_size]
         # Select the appropriate KV cache format for each backend
         kv_cache_for_backend = kv_cache
-        if backend_name == _Backend.FLASHINFER_VLLM_V1:
+        if backend_name == _Backend.FLASHINFER:
             kv_cache_for_backend = kv_cache.transpose(0, 1)

             # For FlashInfer default to HND layout and
@@ -518,8 +517,8 @@ def test_causal_backend_correctness(batch_spec_name: str, model: str):


 SLIDING_WINDOW_BACKENDS_TO_TEST = [
-    _Backend.FLASH_ATTN_VLLM_V1, _Backend.FLEX_ATTENTION,
-    _Backend.TRITON_ATTN_VLLM_V1, "FLEX_ATTENTION_SLOW"
+    _Backend.FLASH_ATTN, _Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN,
+    "FLEX_ATTENTION_SLOW"
 ]
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for v1 MLA backends without GPUModelRunner dependency."""
+from typing import Optional, Union

 import pytest
 import torch
@@ -10,13 +11,14 @@ from tests.v1.attention.utils import (BatchSpec, _Backend,
                                       create_standard_kv_cache_spec,
                                       create_vllm_config,
                                       get_attention_backend)
+from vllm import _custom_ops as ops
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.kv_cache_interface import FullAttentionSpec

 BACKENDS_TO_TEST = [
-    _Backend.CUTLASS_MLA, _Backend.FLASHMLA_VLLM_V1, _Backend.FLASH_ATTN_MLA,
-    _Backend.TRITON_MLA_VLLM_V1
+    _Backend.CUTLASS_MLA, _Backend.FLASHMLA, _Backend.FLASH_ATTN_MLA,
+    _Backend.TRITON_MLA
 ]

 # Remove CUTLASS_MLA from the list if not using sm100
@@ -78,7 +80,9 @@ def create_and_prepopulate_kv_cache(
         device: torch.device,
         num_blocks: int,
         common_attn_metadata: CommonAttentionMetadata,
-        randomize_blocks: bool = True) -> torch.Tensor:
+        randomize_blocks: bool = True,
+        kv_cache_dtype: Optional[str] = None,
+        scale: Union[float, torch.Tensor] = 1.0) -> torch.Tensor:
     """Create and prepopulate an MLA KV cache with context data.

     Args:
@@ -93,6 +97,11 @@ def create_and_prepopulate_kv_cache(
         common_attn_metadata: Common attention metadata
         randomize_blocks: Whether to randomly permute blocks
                           or use sequential order
+        kv_cache_dtype: Optional kv cache dtype string. When set to
+                        "fp8_ds_mla" the cache is populated using the
+                        fp8 DeepSeek MLA layout via concat_and_cache_mla.
+        scale: Scaling factor forwarded to concat_and_cache_mla when the
+               fp8 cache layout is requested.

     Returns:
         MLA KV cache tensor
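The fp8_ds_mla layout introduced here packs, per token: the fp8 latent values (one byte per element of the latent), four float32 tile scales, and the rope values at 2-byte precision. A worked check of the entry-size formula used in the hunk below, taking kv_lora_rank=512 and rope_dim=64 from the sparse-backend test as assumed inputs:

# Sketch of the per-token entry size for the fp8_ds_mla cache layout.
kv_lora_rank = 512  # fp8 latent payload: one byte per element
rope_dim = 64       # rope values stored at 2-byte (16-bit) precision

entry_size = kv_lora_rank + 4 * 4 + 2 * rope_dim
assert entry_size == 512 + 16 + 128 == 656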
@@ -105,23 +114,61 @@ def create_and_prepopulate_kv_cache(
     block_table = common_attn_metadata.block_table_tensor
     slot_mapping = common_attn_metadata.slot_mapping

-    # Create MLA KV cache: (num_blocks, block_size, head_size)
-    kv_cache = torch.empty(num_blocks,
-                           block_size,
-                           head_size,
-                           dtype=dtype,
-                           device=device)
-    kv_cache_flat = kv_cache.view(-1, head_size)
+    use_fp8_ds_mla = kv_cache_dtype == "fp8_ds_mla"
+
+    if use_fp8_ds_mla:
+        if not kv_c_contexts:
+            raise ValueError("kv_c_contexts cannot be empty when using"
+                             " fp8_ds_mla cache dtype")
+        kv_lora_rank = kv_c_contexts[0].shape[-1]
+        rope_dim = k_pe_contexts[0].shape[-1]
+        entry_size = kv_lora_rank + 4 * 4 + 2 * rope_dim
+        kv_cache = torch.zeros(num_blocks,
+                               block_size,
+                               entry_size,
+                               dtype=torch.uint8,
+                               device=device)
+        scale_tensor = (scale
+                        if isinstance(scale, torch.Tensor) else torch.tensor(
+                            scale, dtype=torch.float32, device=device))
+        scale_tensor = scale_tensor.to(device=device, dtype=torch.float32)
+    else:
+        # Create MLA KV cache: (num_blocks, block_size, head_size)
+        kv_cache = torch.empty(num_blocks,
+                               block_size,
+                               head_size,
+                               dtype=dtype,
+                               device=device)
+        kv_cache_flat = kv_cache.view(-1, head_size)

     # Populate the cache with the context tokens
     # Start from block_id=1 since block_id=0 is considered the null block
     start_block_idx = 1
     for i in range(batch_size):
         kv_c_context, k_pe_context = kv_c_contexts[i], k_pe_contexts[i]
-        kv_context = torch.cat([kv_c_context, k_pe_context.squeeze(1)], dim=-1)
+        context_len = kv_c_context.shape[0]
+        if context_len == 0:
+            start_block_idx += cdiv(int(seq_lens[i]), block_size)
+            continue
+
         start = start_block_idx * block_size
-        end = start + kv_context.shape[0]
-        kv_cache_flat[start:end, ...] = kv_context
+
+        if use_fp8_ds_mla:
+            slots = torch.arange(context_len, device=device,
+                                 dtype=torch.long) + start
+            ops.concat_and_cache_mla(
+                kv_c_context,
+                k_pe_context.squeeze(1),
+                kv_cache,
+                slots,
+                kv_cache_dtype="fp8_ds_mla",
+                scale=scale_tensor,
+            )
+        else:
+            kv_context = torch.cat(
+                [kv_c_context, k_pe_context.squeeze(1)], dim=-1)
+            end = start + kv_context.shape[0]
+            kv_cache_flat[start:end, ...] = kv_context

         # Stay block aligned and allocate enough blocks for the new tokens
         start_block_idx += cdiv(int(seq_lens[i]), block_size)
tests/v1/attention/test_sparse_mla_backends.py (new file, 426 lines)
@@ -0,0 +1,426 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for the FlashMLA sparse backend utilities."""
+
+import math
+from types import MethodType, SimpleNamespace
+
+import numpy as np
+import pytest
+import torch
+
+from tests.v1.attention.test_mla_backends import (
+    BATCH_SPECS, BatchSpec, MockAttentionLayer,
+    create_and_prepopulate_kv_cache)
+from tests.v1.attention.utils import (create_common_attn_metadata,
+                                      create_standard_kv_cache_spec,
+                                      create_vllm_config)
+from vllm import _custom_ops as ops
+from vllm.attention.ops import flashmla
+from vllm.model_executor.layers.linear import ColumnParallelLinear
+from vllm.utils import cdiv
+from vllm.v1.attention.backends.mla.flashmla_sparse import (
+    FlashMLASparseBackend, FlashMLASparseDecodeAndContextMetadata,
+    FlashMLASparseImpl, FlashMLASparseMetadata)
+
+SPARSE_BACKEND_BATCH_SPECS = {
+    name: BATCH_SPECS[name]
+    for name in [
+        "mixed_small",
+        "mixed_medium",
+        "small_prefill",
+        "medium_prefill",
+        "single_prefill",
+    ]
+}
+
+SPARSE_BACKEND_BATCH_SPECS["large_q_prefill"] = BatchSpec(seq_lens=[1024] * 2,
+                                                          query_lens=[256] * 2)
+SPARSE_BACKEND_BATCH_SPECS["large_q_pure_prefill"] = BatchSpec(
+    seq_lens=[256] * 2, query_lens=[256] * 2)
+
+
+def _dequantize_fp8_ds_mla_entry(
+        cache_slice: torch.Tensor, kv_lora_rank: int, rope_dim: int,
+        dtype: torch.dtype) -> tuple[torch.Tensor, torch.Tensor]:
+    """Dequantize a single fp8_ds_mla cache entry back to latent + rope."""
+
+    # The first kv_lora_rank bytes store FP8 latent values with one scale per
+    # 128 element tile written as float32 right after the latent payload.
+    scales = cache_slice.view(torch.float32)[kv_lora_rank //
+                                             4:kv_lora_rank // 4 + 4]
+    latent = torch.empty(kv_lora_rank,
+                         dtype=torch.float16,
+                         device=cache_slice.device)
+    for tile_idx in range(4):
+        tile_start = tile_idx * 128
+        tile_end = tile_start + 128
+        ops.convert_fp8(latent[tile_start:tile_end],
+                        cache_slice[tile_start:tile_end],
+                        float(scales[tile_idx].item()),
+                        kv_dtype="fp8")
+    latent = latent.to(dtype)
+
+    rope_offset = kv_lora_rank // 2 + 8
+    rope_vals = cache_slice.view(dtype)[rope_offset:rope_offset + rope_dim]
+    return latent, rope_vals.clone()
+
+
+def _quantize_dequantize_fp8_ds_mla(
+        kv_c: torch.Tensor, k_pe: torch.Tensor, block_size: int,
+        scale: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    """Round-trip kv_c/k_pe though the fp8_ds_mla cache layout."""
+
+    if kv_c.numel() == 0:
+        return kv_c.clone(), k_pe.clone()
+
+    kv_lora_rank = kv_c.shape[-1]
+    rope_dim = k_pe.shape[-1]
+    num_tokens = kv_c.shape[0]
+    num_blocks = max(1, math.ceil(num_tokens / block_size))
+    entry_size = kv_lora_rank + 4 * 4 + 2 * rope_dim
+
+    tmp_cache = torch.zeros(num_blocks,
+                            block_size,
+                            entry_size,
+                            dtype=torch.uint8,
+                            device=kv_c.device)
+    slot_mapping = torch.arange(num_tokens,
+                                dtype=torch.long,
+                                device=kv_c.device)
+
+    ops.concat_and_cache_mla(kv_c,
+                             k_pe,
+                             tmp_cache,
+                             slot_mapping,
+                             kv_cache_dtype="fp8_ds_mla",
+                             scale=scale)
+
+    dequant_kv_c = torch.empty_like(kv_c)
+    dequant_k_pe = torch.empty_like(k_pe)
+
+    for token_idx in range(num_tokens):
+        slot = slot_mapping[token_idx].item()
+        block_idx = slot // block_size
+        block_offset = slot % block_size
+        cache_slice = tmp_cache[block_idx, block_offset]
+        latent, rope_vals = _dequantize_fp8_ds_mla_entry(
+            cache_slice, kv_lora_rank, rope_dim, kv_c.dtype)
+        dequant_kv_c[token_idx] = latent
+        dequant_k_pe[token_idx] = rope_vals
+
+    return dequant_kv_c, dequant_k_pe
+
+
+def test_sparse_backend_metadata_registration():
+    backend = FlashMLASparseBackend
+
+    assert backend.get_name() == "FLASHMLA_SPARSE_VLLM_V1"
+    assert backend.get_metadata_cls() is FlashMLASparseMetadata
+    assert backend.get_impl_cls() is FlashMLASparseImpl
+
+    dtype_list = backend.get_supported_dtypes()
+    assert torch.bfloat16 in dtype_list
+
+    shape = backend.get_kv_cache_shape(num_blocks=2,
+                                       block_size=64,
+                                       num_kv_heads=1,
+                                       head_size=576)
+    assert shape == (2, 64, 576)
+
+
+def test_sparse_decode_metadata_filters_prefill_indices():
+    prefill_context_lengths = torch.tensor([4, 2], dtype=torch.int32)
+    metadata = FlashMLASparseDecodeAndContextMetadata(
+        scheduler_metadata=torch.tensor([[0]], dtype=torch.int32),
+        num_splits=torch.tensor([1, 1], dtype=torch.int32),
+        cache_lens=torch.tensor([10, 12], dtype=torch.int32),
+        prefill_context_lengths=prefill_context_lengths,
+    )
+
+    indices = torch.tensor([[0, 3, 5], [1, 2, 4]], dtype=torch.int32)
+
+    context_indices, new_token_indices = metadata.filter_prefill_indices(
+        indices)
+
+    expected_context = torch.tensor([[-1, -1, 5], [-1, -1, 4]],
+                                    dtype=torch.int32)
+    expected_new_tokens = torch.tensor([[-1, -1, 1], [-1, 0, 2]],
+                                       dtype=torch.int32)
+
+    assert torch.equal(context_indices, expected_context)
+    assert torch.equal(new_token_indices, expected_new_tokens)
+
+
+def test_sparse_impl_zero_fills_when_metadata_missing():
+    impl = FlashMLASparseImpl.__new__(FlashMLASparseImpl)
+    dummy_layer = object()
+    q = torch.zeros((2, 1, 3))
+    k_c = torch.zeros((2, 3))
+    k_pe = torch.zeros((2, 1, 1))
+    kv_cache = torch.zeros((1, 1, 1))
+    output = torch.ones((2, 4))
+
+    result = FlashMLASparseImpl.forward(impl,
+                                        dummy_layer,
+                                        q,
+                                        k_c,
+                                        k_pe,
+                                        kv_cache,
+                                        attn_metadata=None,
+                                        output=output)
+
+    assert result is output
+    assert torch.all(result == 0)
+
+
+@pytest.mark.parametrize("batch_name", list(SPARSE_BACKEND_BATCH_SPECS.keys()))
+@pytest.mark.parametrize("kv_cache_dtype", ["fp8_ds_mla", "auto"])
+def test_sparse_backend_decode_correctness(dist_init, batch_name,
+                                           kv_cache_dtype):
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA is required for sparse MLA decode test")
+
+    device = torch.device("cuda")
+    dtype = torch.bfloat16
+
+    batch_spec = SPARSE_BACKEND_BATCH_SPECS[batch_name]
+
+    # Model hyper-parameters (kept intentionally small for the unit test)
+    num_heads = 128
+    kv_lora_rank = 512
+    qk_nope_head_dim = 128
+    qk_rope_head_dim = 64
+    v_head_dim = 128
+    head_size = kv_lora_rank + qk_rope_head_dim
+    topk_tokens = 2048
+
+    max_seqlen = max(batch_spec.seq_lens)
+    total_cache_tokens = sum(batch_spec.seq_lens)
+    block_size = 64
+
+    vllm_config = create_vllm_config(
+        model_name="deepseek-ai/DeepSeek-V2-Lite-Chat",
+        max_model_len=max_seqlen,
+        num_gpu_blocks=max(2048,
+                           cdiv(total_cache_tokens, block_size) + 1),
+        block_size=block_size)
+    model_config = vllm_config.model_config
+    model_config.hf_config = SimpleNamespace(
+        attn_module_list_cfg=[{
+            "topk_tokens": topk_tokens
+        }])
+    model_config.hf_text_config = SimpleNamespace(
+        q_lora_rank=None,
+        kv_lora_rank=kv_lora_rank,
+        qk_nope_head_dim=qk_nope_head_dim,
+        qk_rope_head_dim=qk_rope_head_dim,
+        v_head_dim=v_head_dim,
+        model_type="deepseek_v2",
+    )
+    model_config.dtype = dtype
+    model_config.get_num_attention_heads = MethodType(
+        lambda self, parallel_config: num_heads, model_config)
+    model_config.get_num_kv_heads = MethodType(lambda self, parallel_config: 1,
+                                               model_config)
+    model_config.get_head_size = MethodType(lambda self: head_size,
+                                            model_config)
+    model_config.get_sliding_window = MethodType(lambda self: None,
+                                                 model_config)
+
+    kv_cache_spec = create_standard_kv_cache_spec(vllm_config)
+
+    torch.manual_seed(0)
+
+    scale = 1.0 / math.sqrt(head_size)
+
+    # Shared MLA projection weights to keep reference and backend in sync
+    W_UK = torch.randn(kv_lora_rank,
+                       num_heads,
+                       qk_nope_head_dim,
+                       dtype=dtype,
+                       device=device)
+    W_UV = torch.randn(kv_lora_rank,
+                       num_heads,
+                       v_head_dim,
+                       dtype=dtype,
+                       device=device)
+
+    # Build synthetic decode-only workload
+    seq_lens = batch_spec.seq_lens
+    query_lens = batch_spec.query_lens
+
+    all_q_vllm, all_kv_c_vllm, all_k_pe_vllm = [], [], []
+    kv_c_contexts, k_pe_contexts = [], []
+    reference_outputs = []
+
+    kv_cache_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
+
+    for i in range(batch_spec.batch_size):
+        s_len = seq_lens[i]
+        q_len = query_lens[i]
+        ctx_len = s_len - q_len
+
+        q_c = torch.rand(q_len,
+                         num_heads,
+                         qk_nope_head_dim + qk_rope_head_dim,
+                         dtype=dtype,
+                         device=device)
+        kv_c_full = torch.rand(s_len, kv_lora_rank, dtype=dtype, device=device)
+        k_pe_full = torch.rand(s_len,
+                               1,
+                               qk_rope_head_dim,
+                               dtype=dtype,
+                               device=device)
+
+        kv_c_full, k_pe_full = _quantize_dequantize_fp8_ds_mla(
+            kv_c_full,
+            k_pe_full.squeeze(1),
+            block_size=vllm_config.cache_config.block_size,
+            scale=kv_cache_scale,
+        )
+
+        q_nope, q_pe = q_c.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1)
+        ql_nope = torch.einsum("qnh,lnh->qnl", q_nope, W_UK)
+        q_mqa = torch.cat([ql_nope, q_pe], dim=-1)
+
+        k_mqa = torch.cat([kv_c_full, k_pe_full], dim=-1)
+        k_mqa = k_mqa.unsqueeze(1).expand(-1, num_heads, -1)
+        v_mqa = kv_c_full.unsqueeze(1).expand(-1, num_heads, -1)
+
+        attn_mask = torch.ones(q_len, s_len, dtype=torch.bool, device=device)
+        causal_mask = torch.tril(torch.ones(q_len, q_len, device=device))
+        attn_mask[:, ctx_len:] = causal_mask
+
+        q_sdpa_in = q_mqa.unsqueeze(0).transpose(1, 2)
+        k_sdpa_in = k_mqa.unsqueeze(0).transpose(1, 2)
+        v_sdpa_in = v_mqa.unsqueeze(0).transpose(1, 2)
+
+        sdpa_out = torch.nn.functional.scaled_dot_product_attention(
+            q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=scale)
+        sdpa_out = sdpa_out.transpose(1, 2).squeeze(0)
+
+        sdpa_out = torch.einsum("qnl,lnv->qnv", sdpa_out, W_UV)
+        reference_outputs.append(sdpa_out.flatten(start_dim=-2))
+
+        all_q_vllm.append(q_c)
+        all_kv_c_vllm.append(kv_c_full[ctx_len:])
+        all_k_pe_vllm.append(k_pe_full[ctx_len:])
+        kv_c_contexts.append(kv_c_full[:ctx_len + 1])
+        k_pe_contexts.append(k_pe_full[:ctx_len + 1])
+
+    query_vllm = torch.cat(all_q_vllm, dim=0)
+    kv_c_vllm = torch.cat(all_kv_c_vllm, dim=0)
+    k_pe_vllm = torch.cat(all_k_pe_vllm, dim=0)
+    sdpa_reference = torch.cat(reference_outputs, dim=0)
+
+    vllm_config.cache_config.cache_dtype = kv_cache_dtype
+
+    common_attn_metadata = create_common_attn_metadata(
+        batch_spec,
+        vllm_config.cache_config.block_size,
+        device,
+        arange_block_indices=True)
+
+    kv_cache = create_and_prepopulate_kv_cache(
+        kv_c_contexts=kv_c_contexts,
+        k_pe_contexts=k_pe_contexts,
+        block_size=vllm_config.cache_config.block_size,
+        head_size=head_size,
+        dtype=dtype,
+        device=device,
+        num_blocks=vllm_config.cache_config.num_gpu_blocks,
+        common_attn_metadata=common_attn_metadata,
+        randomize_blocks=False,
+        kv_cache_dtype=vllm_config.cache_config.cache_dtype,
+        scale=kv_cache_scale,
+    )
+
+    builder_cls = FlashMLASparseBackend.get_builder_cls()
+    builder = builder_cls(kv_cache_spec, ["placeholder"], vllm_config, device)
+    metadata = builder.build(common_prefix_len=0,
+                             common_attn_metadata=common_attn_metadata)
+
+    starts = np.asarray(common_attn_metadata.query_start_loc_cpu,
+                        dtype=np.int32)
+    seg_lengths = np.diff(starts)
+    positions = np.arange(starts[-1], dtype=np.int32) - np.repeat(
+        starts[:-1], seg_lengths)
+    seq_lengths = np.asarray(common_attn_metadata.seq_lens_cpu, dtype=np.int32)
+    prefix_lengths = seq_lengths - seg_lengths
+    positions += np.repeat(prefix_lengths, seg_lengths)
+
+    pos_gpu = torch.as_tensor(positions, device=device, dtype=torch.int32)
+    topk = metadata.topk_tokens
+    debug_indices = torch.arange(topk, device=device,
+                                 dtype=torch.int32).unsqueeze(0)
+    token_positions = pos_gpu.unsqueeze(1)
+    causal_mask = (debug_indices <= token_positions)
+    debug_indices = torch.where(causal_mask, debug_indices,
+                                torch.full_like(debug_indices, -1))
+
+    # FlashMLASparseImpl now reads top-k indices from the indexer-provided
+    # buffer, so emulate that contract with a simple namespace mock.
+    debug_indices = debug_indices.expand(metadata.num_actual_tokens,
+                                         -1).clone()
+    mock_indexer = SimpleNamespace(topk_indices_buffer=debug_indices)
+
+    ok, reason = flashmla.is_flashmla_supported()
+    if not ok:
+        pytest.skip(reason)
+
+    kv_b_proj_weight = torch.cat([W_UK, W_UV], dim=-1)
+    kv_b_proj_weight = kv_b_proj_weight.view(
+        kv_lora_rank, num_heads * (qk_nope_head_dim + v_head_dim))
+
+    mock_kv_b_proj = ColumnParallelLinear(input_size=kv_lora_rank,
+                                          output_size=num_heads *
+                                          (qk_nope_head_dim + v_head_dim),
+                                          bias=False).to(device=device,
+                                                         dtype=dtype)
+    mock_kv_b_proj.weight = torch.nn.Parameter(kv_b_proj_weight.T.contiguous())
+
+    impl_cls = FlashMLASparseBackend.get_impl_cls()
+    impl = impl_cls(num_heads=num_heads,
+                    head_size=head_size,
+                    scale=scale,
+                    num_kv_heads=1,
+                    alibi_slopes=None,
+                    sliding_window=None,
+                    kv_cache_dtype=vllm_config.cache_config.cache_dtype,
+                    logits_soft_cap=None,
+                    attn_type="decoder",
+                    kv_sharing_target_layer_name=None,
+                    q_lora_rank=None,
+                    kv_lora_rank=kv_lora_rank,
+                    qk_nope_head_dim=qk_nope_head_dim,
+                    qk_rope_head_dim=qk_rope_head_dim,
+                    qk_head_dim=qk_nope_head_dim + qk_rope_head_dim,
+                    v_head_dim=v_head_dim,
+                    kv_b_proj=mock_kv_b_proj,
+                    indexer=mock_indexer)
+
+    impl.process_weights_after_loading(dtype)
+
+    layer = MockAttentionLayer(device)
+    out_buffer = torch.empty(metadata.num_actual_tokens,
+                             num_heads * v_head_dim,
+                             dtype=dtype,
+                             device=device)
+
+    backend_output = impl.forward(layer,
+                                  query_vllm,
+                                  kv_c_vllm,
+                                  k_pe_vllm,
+                                  kv_cache,
+                                  metadata,
+                                  output=out_buffer)
+
+    assert backend_output.shape == sdpa_reference.shape
+    assert backend_output.dtype == sdpa_reference.dtype
+    assert torch.isfinite(backend_output).all()
+
+    torch.testing.assert_close(backend_output,
+                               sdpa_reference,
+                               rtol=0.5,
+                               atol=0.5)
@@ -120,30 +120,30 @@ def get_attention_backend(backend_name: _Backend):
         Tuple of (backend_builder_class, backend_impl_class)
     """
     backend_map = {
-        _Backend.FLASH_ATTN_VLLM_V1:
+        _Backend.FLASH_ATTN:
         ("vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
          if current_platform.is_cuda() else
         "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
         ),
-        _Backend.FLASHINFER_VLLM_V1:
+        _Backend.FLASHINFER:
         "vllm.v1.attention.backends.flashinfer.FlashInferBackend",
         _Backend.FLEX_ATTENTION:
        "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend",
-        _Backend.TRITON_ATTN_VLLM_V1:
+        _Backend.TRITON_ATTN:
         "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend",
         _Backend.TREE_ATTN:
         "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend",
-        _Backend.XFORMERS_VLLM_V1:
+        _Backend.XFORMERS:
         "vllm.v1.attention.backends.xformers.XFormersAttentionBackend",
         _Backend.CUTLASS_MLA:
         "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend",
-        _Backend.FLASHMLA_VLLM_V1:
+        _Backend.FLASHMLA:
         "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend",
         _Backend.FLASH_ATTN_MLA:
         "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend",
         _Backend.FLASHINFER_MLA:
         "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend",
-        _Backend.TRITON_MLA_VLLM_V1:
+        _Backend.TRITON_MLA:
         "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend",
     }
@@ -168,7 +168,6 @@ def create_standard_kv_cache_spec(
             vllm_config.parallel_config),
         head_size=vllm_config.model_config.get_head_size(),
         dtype=vllm_config.model_config.dtype,
-        use_mla=vllm_config.model_config.use_mla,
         sliding_window=vllm_config.model_config.get_sliding_window(),
     )
@@ -24,7 +24,8 @@ from vllm.v1.core.kv_cache_utils import (
     make_block_hash_with_group_id)
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, KVCacheSpec,
-                                        KVCacheTensor, SlidingWindowSpec,
+                                        KVCacheTensor, MLAAttentionSpec,
+                                        SlidingWindowSpec,
                                         UniformTypeKVCacheSpecs)
 from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request
@@ -77,13 +78,11 @@ def new_kv_cache_spec(block_size=16,
                       num_kv_heads=2,
                       head_size=64,
                       dtype=torch.float32,
-                      use_mla=False,
                       sliding_window=None):
     return FullAttentionSpec(block_size=block_size,
                              num_kv_heads=num_kv_heads,
                              head_size=head_size,
                              dtype=dtype,
-                             use_mla=use_mla,
                              sliding_window=sliding_window)
@@ -91,13 +90,11 @@ def new_sliding_window_spec(block_size=16,
                             num_kv_heads=2,
                             head_size=64,
                             dtype=torch.float32,
-                            use_mla=False,
                             sliding_window=1):
     return SlidingWindowSpec(block_size=block_size,
                              num_kv_heads=num_kv_heads,
                              head_size=head_size,
                              dtype=dtype,
-                             use_mla=use_mla,
                              sliding_window=sliding_window)
@@ -894,7 +891,6 @@ def test_merge_kv_cache_spec():
             num_kv_heads=full_spec.num_kv_heads,
             head_size=full_spec.head_size,
             dtype=full_spec.dtype,
-            use_mla=full_spec.use_mla,
             sliding_window=1,
         ),
     ]
@@ -991,7 +987,6 @@ def test_estimate_max_model_len(model_id, max_model_len,
         num_kv_heads=32,
         head_size=128,
         dtype=torch.float16,
-        use_mla=False,
     )
     # Estimate the maximum model length, 16384 model_len need 8GB
     estimated_max_len = estimate_max_model_len(vllm_config, kv_cache_spec,
@@ -1022,7 +1017,6 @@ def test_get_max_concurrency_for_kv_cache_config():
         num_kv_heads=32,
         head_size=128,
         dtype=torch.float16,
-        use_mla=False,
     )

     sliding_window_spec = SlidingWindowSpec(
@@ -1030,7 +1024,6 @@ def test_get_max_concurrency_for_kv_cache_config():
         num_kv_heads=32,
         head_size=128,
         dtype=torch.float16,
-        use_mla=False,
         sliding_window=1024,
     )
@@ -1412,3 +1405,48 @@ def test_generate_scheduler_kv_cache_config():
             KVCacheGroupSpec(['layer_1', 'layer_2'], new_kv_cache_spec())
         ],
     )
+
+
+def new_mla_spec(cache_dtype_str=None):
+    return MLAAttentionSpec(block_size=16,
+                            num_kv_heads=16,
+                            head_size=64,
+                            dtype=torch.float32,
+                            cache_dtype_str=cache_dtype_str)
+
+
+def test_merge_mla_spec():
+    kv_cache_specs = [
+        new_mla_spec(),
+        new_mla_spec(),
+    ]
+    mla_spec = kv_cache_specs[0].merge(kv_cache_specs)
+    assert mla_spec == new_mla_spec()
+
+    kv_cache_specs = [
+        new_mla_spec(cache_dtype_str="fp8_ds_mla"),
+        new_mla_spec(cache_dtype_str="fp8_ds_mla"),
+    ]
+    mla_spec = kv_cache_specs[0].merge(kv_cache_specs)
+    assert mla_spec == new_mla_spec(cache_dtype_str="fp8_ds_mla")
+
+    kv_cache_specs = [
+        new_mla_spec(cache_dtype_str="fp8_ds_mla"),
+        new_mla_spec(cache_dtype_str=None),
+    ]
+    with pytest.raises(AssertionError):
+        kv_cache_specs[0].merge(kv_cache_specs)
+
+    kv_cache_specs = [
+        new_kv_cache_spec(),
+        new_mla_spec(),
+    ]
+    with pytest.raises(AssertionError):
+        kv_cache_specs[0].merge(kv_cache_specs)
+
+    kv_cache_specs = [
+        new_mla_spec(cache_dtype_str="fp8_ds_mla"),
+        new_kv_cache_spec(),
+    ]
+    with pytest.raises(AssertionError):
+        kv_cache_specs[0].merge(kv_cache_specs)
@@ -76,7 +76,7 @@ def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
         kv_cache_groups=[
             KVCacheGroupSpec(
                 ["layer"],
-                FullAttentionSpec(block_size, 1, 1, torch.float32, False),
+                FullAttentionSpec(block_size, 1, 1, torch.float32),
             )
         ],
     )
@@ -90,7 +90,7 @@ def make_kv_cache_config_hybrid_model(block_size: int,
         kv_cache_groups=[
             KVCacheGroupSpec(
                 ["layer1"],
-                FullAttentionSpec(block_size, 1, 1, torch.float32, False),
+                FullAttentionSpec(block_size, 1, 1, torch.float32),
             ),
             KVCacheGroupSpec(
                 ["layer2"],
@@ -98,7 +98,6 @@ def make_kv_cache_config_hybrid_model(block_size: int,
                 1,
                 1,
                 torch.float32,
-                False,
                 sliding_window=2 * block_size),
             ),
             KVCacheGroupSpec(
@@ -107,7 +106,6 @@ def make_kv_cache_config_hybrid_model(block_size: int,
                 1,
                 1,
                 torch.float32,
-                False,
                 sliding_window=2 * block_size),
             ),
         ],
@@ -1338,7 +1336,6 @@ def test_eagle_with_sliding_window():
         head_size=1,
         dtype=torch.float32,
         sliding_window=block_size,
-        use_mla=False,
     )
     manager = KVCacheManager(
         KVCacheConfig(
@@ -35,7 +35,6 @@ def test_chunked_local_attention_possible_cached_prefix():
         head_size=1,
         dtype=torch.float32,
         attention_chunk_size=4,
-        use_mla=False,
     )

     block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
|
||||
head_size=1,
|
||||
dtype=torch.float32,
|
||||
sliding_window=4,
|
||||
use_mla=False,
|
||||
)
|
||||
|
||||
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
|
||||
@@ -165,7 +163,6 @@ def test_chunked_local_attention_remove_skipped_blocks():
         head_size=1,
         dtype=torch.float32,
         attention_chunk_size=4,
-        use_mla=False,
     )

     block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True)
@@ -217,7 +214,6 @@ def test_sliding_window_remove_skipped_blocks():
         head_size=1,
         dtype=torch.float32,
         sliding_window=4,
-        use_mla=False,
     )

     block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True)
@@ -285,7 +281,6 @@ def test_get_num_blocks_to_allocate():
         head_size=1,
         dtype=torch.float32,
         sliding_window=4,  # Placeholder value, not related to test result
-        use_mla=False,
     )

     block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
@@ -308,7 +303,6 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
         head_size=1,
         dtype=torch.float32,
         attention_chunk_size=4,  # Placeholder value, not related to test result
-        use_mla=False,
    )

     block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
@@ -89,7 +89,7 @@ backend_configs = {
     # Triton Attention
     "TritonAttn":
     BackendConfig(name="TritonAttn",
-                  env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1"},
+                  env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
                   comp_config={
                       "cudagraph_mode": "FULL_AND_PIECEWISE",
                   }),
@@ -9,11 +9,14 @@ from ...utils import create_new_process_for_each_test


 @create_new_process_for_each_test()
-@pytest.mark.parametrize("attn_backend",
-                         ["FLASH_ATTN_VLLM_V1", "FLASHINFER_VLLM_V1"])
+@pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "FLASHINFER"])
 def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
     prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"

+    if attn_backend == "FLASHINFER":
+        pytest.skip("This test is failing with FlashInfer backend and "
+                    "needs investigation. See issue #25679.")
+
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")
         m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
@@ -15,6 +15,8 @@ from vllm.assets.image import VLM_IMAGES_DIR
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.platforms import current_platform

+MTP_SIMILARITY_RATE = 0.8
+

 def get_test_prompts(mm_enabled: bool):
     prompt_types = ["repeat", "sentence"]
@@ -176,12 +178,11 @@ def test_eagle_correctness(
         m.setenv("VLLM_MLA_DISABLE", "1")
         m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

-        if (attn_backend == "TRITON_ATTN_VLLM_V1"
-                and not current_platform.is_rocm()):
-            pytest.skip("TRITON_ATTN_VLLM_V1 does not support "
+        if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
+            pytest.skip("TRITON_ATTN does not support "
                         "multi-token eagle spec decode on current platform")

-        if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm():
+        if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
             m.setenv("VLLM_ROCM_USE_AITER", "1")

         method, model_name, spec_model_name, tp_size = model_setup
@@ -223,3 +224,66 @@ def test_eagle_correctness(
     del spec_llm
     torch.cuda.empty_cache()
     cleanup_dist_env_and_memory()
+
+
+@pytest.mark.parametrize(["model_setup", "mm_enabled"], [
+    (("mtp", "XiaomiMiMo/MiMo-7B-Base", 1), False),
+    (("mtp", "ZixiQi/DeepSeek-V3-4layers-MTP-FP8", 1), False),
+],
+                         ids=["mimo", "deepseek"])
+def test_mtp_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    sampling_config: SamplingParams,
+    model_setup: tuple[str, str, int],
+    mm_enabled: bool,
+):
+    # Generate test prompts inside the function instead of using fixture
+    test_prompts = get_test_prompts(mm_enabled)
+    '''
+    Compare the outputs of a original LLM and a speculative LLM
+    should be the same when using MTP speculative decoding.
+    model_setup: (method, model_name, tp_size)
+    '''
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        m.setenv("VLLM_MLA_DISABLE", "1")
+
+        method, model_name, tp_size = model_setup
+
+        ref_llm = LLM(model=model_name,
+                      max_model_len=2048,
+                      tensor_parallel_size=tp_size,
+                      trust_remote_code=True)
+        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
+        del ref_llm
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
+
+        spec_llm = LLM(
+            model=model_name,
+            trust_remote_code=True,
+            tensor_parallel_size=tp_size,
+            speculative_config={
+                "method": method,
+                "num_speculative_tokens": 1,
+                "max_model_len": 2048,
+            },
+            max_model_len=2048,
+        )
+        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
+        matches = 0
+        misses = 0
+        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+            if ref_output.outputs[0].text == spec_output.outputs[0].text:
+                matches += 1
+            else:
+                misses += 1
+                print(f"ref_output: {ref_output.outputs[0].text}")
+                print(f"spec_output: {spec_output.outputs[0].text}")
+
+        # Heuristic: expect at least 80% of the prompts to match exactly
+        # Upon failure, inspect the outputs to check for inaccuracy.
+        assert matches > int(MTP_SIMILARITY_RATE * len(ref_outputs))
+        del spec_llm
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
@@ -836,8 +836,7 @@ def test_engine_core_proc_instantiation_cuda_empty(
     mock_spec = FullAttentionSpec(block_size=16,
                                   num_kv_heads=1,
                                   head_size=64,
-                                  dtype=torch.float16,
-                                  use_mla=False)
+                                  dtype=torch.float16)

     mock_executor.get_kv_cache_specs.return_value = [{
         "default": mock_spec
@@ -12,6 +12,7 @@ from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST,
                                    STOP_STRINGS,
                                    DummyOutputProcessorTestVectors,
                                    MockEngineCore)
+from vllm import PoolingParams
 from vllm.logprobs import PromptLogprobs, SampleLogprobs
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.sampling_params import RequestOutputKind, SamplingParams
@@ -998,3 +999,35 @@ async def test_cumulative_output_collector_n():
     third = [k for k in result.outputs if k.index == 2]
     assert len(third) == 1
     assert third[0].text == "c"
+
+
+@pytest.mark.parametrize("runner", ["generate", "pooling"])
+def test_abort_requests(runner: str, dummy_test_vectors):
+    output_processor = OutputProcessor(dummy_test_vectors.tokenizer,
+                                       log_stats=True)
+    requests = [
+        EngineCoreRequest(
+            request_id=f"request-{idx}",
+            prompt_token_ids=prompt_tokens,
+            mm_features=None,
+            eos_token_id=None,
+            arrival_time=0,
+            lora_request=None,
+            cache_salt=None,
+            data_parallel_rank=None,
+            sampling_params=SamplingParams() if runner == "generate" else None,
+            pooling_params=PoolingParams(
+                task="embed") if runner == "pooling" else None,
+        ) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
+    ]
+
+    for request in requests:
+        if runner == "generate":
+            output_kind = request.sampling_params.output_kind
+        else:
+            output_kind = request.pooling_params.output_kind
+        queue = RequestOutputCollector(output_kind=output_kind)
+        output_processor.add_request(request, None, queue=queue)
+
+    for request in requests:
+        output_processor.abort_requests([request.request_id])
@@ -26,6 +26,8 @@ from vllm.distributed.kv_transfer.kv_connector.v1.multi_connector import (
 from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
     KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata,
     NixlConnectorWorker, NixlKVConnectorStats)
+from vllm.distributed.kv_transfer.kv_transfer_state import (
+    ensure_kv_transfer_shutdown, has_kv_transfer_group)
 from vllm.forward_context import ForwardContext
 from vllm.platforms.interface import Platform
 from vllm.sampling_params import SamplingParams
@@ -35,6 +37,26 @@ from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
 
 from .utils import create_request, create_scheduler, create_vllm_config
 
 
+@pytest.fixture(scope="module", autouse=True)
+def clear_kv_transfer():
+    """
+    The test cases in this file use `VLLM_ENABLE_V1_MULTIPROCESSING=0`,
+    causing the global variable `_KV_CONNECTOR_AGENT`
+    to be assigned but never deleted.
+
+    Since the current pytest process does not terminate and instead
+    continues running tests from other files,
+    this global variable remains in memory and interferes
+    with test cases in other modules.
+
+    So we use this fixture to ensure that the global variable
+    `_KV_CONNECTOR_AGENT` is properly cleaned up after each test.
+    """
+    yield
+    if has_kv_transfer_group():
+        ensure_kv_transfer_shutdown()
+
+
 class FakeNixlWrapper:
     """Mock implementation of NixlWrapper for testing.
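The autouse fixture above is a standard pytest teardown pattern. A self-contained sketch of the same idea, using a hypothetical module-level global in place of vLLM's actual `_KV_CONNECTOR_AGENT` state:

import pytest

_AGENT = None  # hypothetical global, standing in for _KV_CONNECTOR_AGENT

@pytest.fixture(scope="module", autouse=True)
def cleanup_agent():
    # Everything before `yield` runs when the module's tests start; everything
    # after it runs once they finish, so the global cannot leak into tests
    # collected from other files.
    yield
    global _AGENT
    _AGENT = None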
@@ -233,8 +255,9 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
         time.sleep(self._hand_shake_latency)
         # These should've been done in register_kv_caches(), called by
         # gpu_model_runner. Here we just hardcode some dummy values.
-        self.slot_size_bytes = 4096
-        self.block_len = self.slot_size_bytes * self.block_size
+        slot_size_bytes = 4096
+        self.slot_size_per_layer = [slot_size_bytes]
+        self.block_len_per_layer = [slot_size_bytes * self.block_size]
         self.num_blocks = 1
         self.dst_num_blocks[self.engine_id] = self.num_blocks
 
@@ -246,7 +269,7 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
             agent_metadata=FakeNixlWrapper.AGENT_METADATA,
             kv_caches_base_addr=[0],
             num_blocks=1,
-            block_len=self.block_len,
+            block_lens=self.block_len_per_layer,
             attn_backend_name=self.backend_name,
             # `self.kv_cache_layout` is only forced to HND when vllm engine
             # is started. We mock HND here.
@@ -463,8 +486,8 @@ class TestNixlHandshake:
         worker = connector.connector_worker
 
         # Minimal local registration params used by add_remote_agent
-        worker.slot_size_bytes = 4096
-        worker.block_len = worker.slot_size_bytes * worker.block_size
+        worker.slot_size_per_layer = [4096]
+        worker.block_len_per_layer = [4096 * worker.block_size]
         worker.num_blocks = 1
         worker.dst_num_blocks[worker.engine_id] = worker.num_blocks
 
@@ -476,7 +499,7 @@ class TestNixlHandshake:
             agent_metadata=FakeNixlWrapper.AGENT_METADATA,
             kv_caches_base_addr=[0],
             num_blocks=1,
-            block_len=worker.block_len,
+            block_lens=worker.block_len_per_layer,
             attn_backend_name=worker.backend_name,
             kv_cache_layout=mismatched_layout,
         )
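The block_len to block_lens change in the hunks above replaces a single scalar with per-layer values. A short sketch of how each layer's block length is derived, using only the relationship visible in the assignments above (the concrete numbers are illustrative, not taken from the test):

block_size = 16                    # tokens per KV-cache block (illustrative)
slot_size_per_layer = [4096]       # bytes per token slot, one entry per layer
block_len_per_layer = [slot_size * block_size
                       for slot_size in slot_size_per_layer]
assert block_len_per_layer == [65536]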
@@ -8,7 +8,8 @@ import ray
 from vllm.config import ModelDType
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
-from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger
+from vllm.v1.metrics.ray_wrappers import (RayPrometheusMetric,
+                                          RayPrometheusStatLogger)
 
 
 @pytest.fixture(scope="function", autouse=True)
@@ -65,3 +66,39 @@ def test_engine_log_metrics_ray(
     # Create the actor and call the async method
     actor = EngineTestActor.remote()  # type: ignore[attr-defined]
     ray.get(actor.run.remote())
+
+
+def test_sanitized_opentelemetry_name():
+    """Test the metric name sanitization logic for Ray."""
+
+    # Only a-z, A-Z, 0-9 and _ are valid; check valid characters are preserved
+    valid_name = "valid_metric_123_abcDEF"
+    assert RayPrometheusMetric._get_sanitized_opentelemetry_name(
+        valid_name) == valid_name
+
+    # Test that dash and dot are replaced
+    name_with_dash_dot = "metric-name.test"
+    expected = "metric_name_test"
+    assert RayPrometheusMetric._get_sanitized_opentelemetry_name(
+        name_with_dash_dot) == expected
+
+    # Test colon is replaced with underscore
+    name_with_colon = "metric:name"
+    expected = "metric_name"
+    assert RayPrometheusMetric._get_sanitized_opentelemetry_name(
+        name_with_colon) == expected
+
+    # Test multiple invalid characters are replaced
+    name_with_invalid = "metric:name@with#special%chars"
+    expected = "metric_name_with_special_chars"
+    assert RayPrometheusMetric._get_sanitized_opentelemetry_name(
+        name_with_invalid) == expected
+
+    # Test mixed valid and invalid characters
+    complex_name = "vllm:engine_stats/time.latency_ms-99p"
+    expected = "vllm_engine_stats_time_latency_ms_99p"
+    assert RayPrometheusMetric._get_sanitized_opentelemetry_name(
+        complex_name) == expected
+
+    # Test empty string
+    assert RayPrometheusMetric._get_sanitized_opentelemetry_name("") == ""
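The expected values in this test imply a simple character-class substitution. One implementation that satisfies every assertion above, offered as a sketch rather than vLLM's actual `_get_sanitized_opentelemetry_name`:

import re

def sanitize_opentelemetry_name(name: str) -> str:
    # Replace every character outside [a-zA-Z0-9_] with a single underscore.
    return re.sub(r"[^a-zA-Z0-9_]", "_", name)

assert (sanitize_opentelemetry_name("vllm:engine_stats/time.latency_ms-99p")
        == "vllm_engine_stats_time_latency_ms_99p")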