mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)

Compare commits: releases/v... add-utils (3 commits)
Commits: 7d092fc32c, 1a6c27f271, 3c6fd286b4
@@ -52,7 +52,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow"
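As an aside, a trimmed-down local equivalent of the release build command above might look like the sketch below; the image tag is a placeholder and the sccache/ECR-specific flags are omitted.

```bash
# Hypothetical local build using the same Dockerfile target and KV-connector flag.
DOCKER_BUILDKIT=1 docker build \
  --build-arg max_jobs=16 \
  --build-arg CUDA_VERSION=12.8.1 \
  --build-arg INSTALL_KV_CONNECTORS=true \
  --tag vllm-openai:local \
  --target vllm-openai \
  -f docker/Dockerfile .
```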
@@ -107,9 +107,10 @@ fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/stest_attention_selector.py \
--ignore=kernels/attention/test_blocksparse_attention.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
.github/CODEOWNERS (vendored, 2 changes)
@@ -16,7 +16,7 @@
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm
/vllm/entrypoints @aarnphm
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
CMakeLists.txt @tlrmchlsmth

# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
.github/workflows/lint-and-deploy.yaml (vendored, 2 changes)
@@ -68,7 +68,7 @@ jobs:
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
- name: curl test
  run: |
@@ -259,7 +259,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")

# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})

@@ -615,26 +615,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"in CUDA target architectures.")
endif()
endif()

cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
"not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
"if you intend on running FP8 quantized MoE models on Blackwell.")
else()
message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
"in CUDA target architectures")
endif()
endif()

#
# Machete kernels
benchmarks/benchmark_one_concurrent.py (new file, 362 lines)
@@ -0,0 +1,362 @@
# SPDX-License-Identifier: Apache-2.0
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import aiohttp # Import aiohttp
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
from backend_request_func import RequestFuncInput, RequestFuncOutput
|
||||
from benchmark_dataset import RandomDataset, SampleRequest
|
||||
|
||||
try:
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
except ImportError:
|
||||
from backend_request_func import get_tokenizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenchmarkMetrics:
|
||||
completed: int
|
||||
total_input: int
|
||||
total_output: int
|
||||
mean_ttft_ms: float
|
||||
median_ttft_ms: float
|
||||
std_ttft_ms: float
|
||||
percentiles_ttft_ms: list[tuple[float, float]]
|
||||
mean_itl_ms: float
|
||||
median_itl_ms: float
|
||||
std_itl_ms: float
|
||||
percentiles_itl_ms: list[tuple[float, float]]
|
||||
mean_e2el_ms: float
|
||||
median_e2el_ms: float
|
||||
std_e2el_ms: float
|
||||
percentiles_e2el_ms: list[tuple[float, float]]
|
||||
|
||||
|
||||
async def reset_cache(reset_url: str):
|
||||
"""Sends a POST request to reset the prefix cache."""
|
||||
logger.debug("Resetting prefix cache at %s", reset_url)
|
||||
try:
|
||||
async with (
|
||||
aiohttp.ClientSession() as session,
|
||||
session.post(reset_url) as response,
|
||||
):
|
||||
response.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)
|
||||
logger.debug("Prefix cache reset successful: %s", response.status)
|
||||
except aiohttp.ClientConnectorError as e:
|
||||
logger.error("Failed to connect to cache reset endpoint %s: %s}", reset_url, e)
|
||||
except aiohttp.ClientResponseError as e:
|
||||
logger.error(
|
||||
"Cache reset request failed with status %s: %s", e.status, e.message
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("An unexpected error occurred during cache reset: %s", e)
|
||||
|
||||
|
||||
async def sequential_benchmark(
|
||||
backend: str,
|
||||
api_url: str,
|
||||
model_id: str,
|
||||
tokenizer,
|
||||
input_requests: list[SampleRequest],
|
||||
request_func,
|
||||
selected_percentiles: list[float],
|
||||
cache_reset_url: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Benchmark that processes requests sequentially, waiting for each to complete
|
||||
before starting the next one. Resets prefix cache between requests.
|
||||
"""
|
||||
outputs = []
|
||||
|
||||
pbar = tqdm(total=len(input_requests))
|
||||
|
||||
benchmark_start_time = time.perf_counter()
|
||||
|
||||
# Process requests sequentially
|
||||
for request in input_requests:
|
||||
prompt, prompt_len, output_len = (
|
||||
request.prompt,
|
||||
request.prompt_len,
|
||||
request.expected_output_len,
|
||||
)
|
||||
|
||||
logger.info("Sending request with len %s", request.prompt_len)
|
||||
logger.debug('Request str: "%s"', request.prompt[:50])
|
||||
request_start_time = time.perf_counter()
|
||||
|
||||
request_func_input = RequestFuncInput(
|
||||
model=model_id,
|
||||
prompt=prompt,
|
||||
api_url=api_url,
|
||||
prompt_len=prompt_len,
|
||||
output_len=output_len,
|
||||
)
|
||||
|
||||
output = await request_func(request_func_input=request_func_input)
|
||||
|
||||
request_end_time = time.perf_counter()
|
||||
# Add timing information
|
||||
if output.success and not hasattr(output, "latency"):
|
||||
output.latency = request_end_time - request_start_time
|
||||
logger.info("Finished request with latency %.4f s", output.latency)
|
||||
|
||||
outputs.append(output)
|
||||
pbar.update(1)
|
||||
|
||||
pbar.close()
|
||||
|
||||
benchmark_duration = time.perf_counter() - benchmark_start_time
|
||||
|
||||
# Calculate metrics
|
||||
metrics = calculate_metrics(
|
||||
input_requests=input_requests,
|
||||
outputs=outputs,
|
||||
dur_s=benchmark_duration,
|
||||
tokenizer=tokenizer,
|
||||
selected_percentiles=selected_percentiles,
|
||||
)
|
||||
|
||||
print_results(metrics, benchmark_duration)
|
||||
|
||||
result = {
|
||||
"duration": benchmark_duration,
|
||||
"completed": metrics.completed,
|
||||
"total_input_tokens": metrics.total_input,
|
||||
"total_output_tokens": metrics.total_output,
|
||||
"input_lens": [request.prompt_len for request in input_requests],
|
||||
"output_lens": [
|
||||
output.output_tokens if output.success else 0 for output in outputs
|
||||
],
|
||||
"ttfts": [output.ttft for output in outputs if output.success],
|
||||
"itls": [output.itl for output in outputs if output.success],
|
||||
"generated_texts": [
|
||||
output.generated_text for output in outputs if output.success
|
||||
],
|
||||
"errors": [output.error for output in outputs if not output.success],
|
||||
}
|
||||
|
||||
# Add summary statistics
|
||||
for stat_name in ["ttft", "itl", "e2el"]:
|
||||
for metric_name in ["mean", "median", "std"]:
|
||||
result[f"{metric_name}_{stat_name}_ms"] = getattr(
|
||||
metrics, f"{metric_name}_{stat_name}_ms"
|
||||
)
|
||||
|
||||
for p, value in getattr(metrics, f"percentiles_{stat_name}_ms"):
|
||||
p_word = str(int(p)) if int(p) == p else str(p)
|
||||
result[f"p{p_word}_{stat_name}_ms"] = value
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def calculate_metrics(
|
||||
input_requests: list[SampleRequest],
|
||||
outputs: list[RequestFuncOutput],
|
||||
dur_s: float,
|
||||
tokenizer,
|
||||
selected_percentiles: list[float],
|
||||
) -> BenchmarkMetrics:
|
||||
"""Calculate benchmark metrics from results."""
|
||||
total_input = 0
|
||||
completed = 0
|
||||
total_output = 0
|
||||
ttfts = []
|
||||
itls = []
|
||||
e2els = []
|
||||
|
||||
for i, output in enumerate(outputs):
|
||||
if output.success:
|
||||
output_len = output.output_tokens
|
||||
|
||||
if not output_len:
|
||||
# Use tokenizer to count output tokens if not provided
|
||||
output_len = len(
|
||||
tokenizer(output.generated_text, add_special_tokens=False).input_ids
|
||||
)
|
||||
|
||||
total_output += output_len
|
||||
total_input += input_requests[i].prompt_len
|
||||
|
||||
if hasattr(output, "ttft") and output.ttft is not None:
|
||||
ttfts.append(output.ttft)
|
||||
|
||||
if hasattr(output, "itl") and output.itl:
|
||||
# Ensure itl is a list of floats
|
||||
if isinstance(output.itl, list):
|
||||
itls.extend(output.itl)
|
||||
else:
|
||||
logger.warning(
|
||||
"Expected list for ITL but got %s. Appending as is.",
|
||||
type(output.itl),
|
||||
)
|
||||
itls.append(output.itl)
|
||||
|
||||
if hasattr(output, "latency") and output.latency is not None:
|
||||
e2els.append(output.latency)
|
||||
|
||||
completed += 1
|
||||
|
||||
return BenchmarkMetrics(
|
||||
completed=completed,
|
||||
total_input=total_input,
|
||||
total_output=total_output,
|
||||
mean_ttft_ms=np.mean(ttfts or [0]) * 1000,
|
||||
median_ttft_ms=np.median(ttfts or [0]) * 1000,
|
||||
std_ttft_ms=np.std(ttfts or [0]) * 1000,
|
||||
percentiles_ttft_ms=[
|
||||
(p, np.percentile(ttfts or [0], p) * 1000) for p in selected_percentiles
|
||||
],
|
||||
mean_itl_ms=np.mean(itls or [0]) * 1000,
|
||||
median_itl_ms=np.median(itls or [0]) * 1000,
|
||||
std_itl_ms=np.std(itls or [0]) * 1000,
|
||||
percentiles_itl_ms=[
|
||||
(p, np.percentile(itls or [0], p) * 1000) for p in selected_percentiles
|
||||
],
|
||||
mean_e2el_ms=np.mean(e2els or [0]) * 1000,
|
||||
median_e2el_ms=np.median(e2els or [0]) * 1000,
|
||||
std_e2el_ms=np.std(e2els or [0]) * 1000,
|
||||
percentiles_e2el_ms=[
|
||||
(p, np.percentile(e2els or [0], p) * 1000) for p in selected_percentiles
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
def print_results(metrics: BenchmarkMetrics, benchmark_duration: float):
|
||||
"""Print benchmark results in a formatted way."""
|
||||
print("{s:{c}^{n}}".format(s=" Sequential Benchmark Result ", n=60, c="="))
|
||||
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
|
||||
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
|
||||
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
|
||||
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
|
||||
|
||||
def print_metric_stats(metric_name, header):
|
||||
print("{s:{c}^{n}}".format(s=header, n=60, c="-"))
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Mean {metric_name} (ms):",
|
||||
getattr(metrics, f"mean_{metric_name.lower()}_ms"),
|
||||
)
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Median {metric_name} (ms):",
|
||||
getattr(metrics, f"median_{metric_name.lower()}_ms"),
|
||||
)
|
||||
)
|
||||
|
||||
for p, value in getattr(metrics, f"percentiles_{metric_name.lower()}_ms"):
|
||||
p_word = str(int(p)) if int(p) == p else str(p)
|
||||
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
|
||||
|
||||
print_metric_stats("TTFT", "Time to First Token")
|
||||
print_metric_stats("ITL", "Inter-token Latency")
|
||||
print_metric_stats("E2EL", "End-to-end Latency")
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
async def main_async(args):
|
||||
# Import needed functions based on your setup
|
||||
from backend_request_func import ASYNC_REQUEST_FUNCS
|
||||
|
||||
backend = args.backend
|
||||
model_id = args.model
|
||||
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
|
||||
|
||||
# Set up API URL
|
||||
if args.base_url is not None:
|
||||
api_url = f"{args.base_url}{args.endpoint}"
|
||||
else:
|
||||
api_url = f"http://{args.host}:{args.port}{args.endpoint}"
|
||||
|
||||
# Set up Cache Reset URL
|
||||
cache_reset_url = f"http://{args.host}:{args.port}/reset_prefix_cache"
|
||||
logger.info("Prefix cache reset configured at: %s", cache_reset_url)
|
||||
|
||||
# Get tokenizer
|
||||
tokenizer = get_tokenizer(tokenizer_id, trust_remote_code=args.trust_remote_code)
|
||||
|
||||
# Get request function
|
||||
if backend in ASYNC_REQUEST_FUNCS:
|
||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||
else:
|
||||
raise ValueError(f"Unknown backend: {backend}")
|
||||
|
||||
input_requests = RandomDataset().sample(
|
||||
tokenizer=tokenizer,
|
||||
num_requests=args.num_requests,
|
||||
prefix_len=0,
|
||||
input_len=args.input_len,
|
||||
output_len=args.output_len,
|
||||
range_ratio=0.0,
|
||||
)
|
||||
|
||||
# Run benchmark
|
||||
result = await sequential_benchmark(
|
||||
backend=backend,
|
||||
api_url=api_url,
|
||||
model_id=model_id,
|
||||
tokenizer=tokenizer,
|
||||
input_requests=input_requests,
|
||||
request_func=request_func,
|
||||
selected_percentiles=[50, 90, 95, 99],
|
||||
cache_reset_url=cache_reset_url,
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def main(args):
|
||||
print(args)
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
|
||||
asyncio.run(main_async(args))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Sequential benchmark for LLM serving")
|
||||
parser.add_argument(
|
||||
"--backend", type=str, default="vllm", help="Backend to use for requests"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base-url",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Server base URL (overrides --host and --port)",
|
||||
)
|
||||
parser.add_argument("--host", type=str, default="127.0.0.1")
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
parser.add_argument(
|
||||
"--endpoint", type=str, default="/v1/completions", help="API endpoint"
|
||||
)
|
||||
parser.add_argument("--model", type=str, required=True, help="Name of the model")
|
||||
parser.add_argument(
|
||||
"--tokenizer", type=str, help="Name of the tokenizer (defaults to model name)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-requests", type=int, default=100, help="Number of requests to process"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input-len", type=int, default=128, help="Input len for generated prompts"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-len", type=int, default=None, help="Override output len for requests"
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42)
|
||||
parser.add_argument(
|
||||
"--trust-remote-code",
|
||||
action="store_true",
|
||||
help="Trust remote code from HuggingFace",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
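For reference, a minimal invocation of this new benchmark might look like the sketch below; it assumes a vLLM OpenAI-compatible server is already running locally, and the model name is a placeholder. The flags mirror the argparse definitions above.

```bash
# Assumed: a server was started beforehand, e.g.
#   vllm serve facebook/opt-125m --port 8000

python benchmarks/benchmark_one_concurrent.py \
  --model facebook/opt-125m \
  --host 127.0.0.1 --port 8000 \
  --endpoint /v1/completions \
  --num-requests 50 \
  --input-len 256 \
  --output-len 128
```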
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools

@@ -620,7 +620,7 @@ def main(args: argparse.Namespace):
4096,
]
else:
batch_sizes = args.batch_size
batch_sizes = [args.batch_size]

use_deep_gemm = bool(args.use_deep_gemm)

@@ -728,7 +728,7 @@ if __name__ == "__main__":
)
parser.add_argument("--use-deep-gemm", action="store_true")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, nargs="+", required=False)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--tune", action="store_true")
parser.add_argument("--trust-remote-code", action="store_true")
parser.add_argument("--model-prefix", type=str, required=False)
@@ -45,6 +45,7 @@
#include "cute/algorithm/functional.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cute/algorithm/gemm.hpp"
#include "cute/tensor_predicate.hpp"
#include "cute/numeric/arithmetic_tuple.hpp"

#include "cutlass_extensions/gemm/dispatch_policy.hpp"
@@ -162,11 +162,10 @@ __global__ void dynamic_scaled_int8_quant_kernel(

// calculate for absmax
float thread_max = 0.f;
vectorize_read_with_alignment<16>(
row_in, hidden_size, tid, stride, [&] __device__(const scalar_t& src) {
const float v = fabsf(static_cast<float>(src));
thread_max = fmaxf(thread_max, v);
});
for (int i = tid; i < hidden_size; i += stride) {
const auto v = fabsf(static_cast<float>(row_in[i]));
thread_max = fmaxf(thread_max, v);
}
using BlockReduce = cub::BlockReduce<float, 256>;
__shared__ typename BlockReduce::TempStorage tmp;
float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);

@@ -233,10 +232,9 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(

// 1. calculate min & max
MinMax thread_mm;
vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
[&] __device__(const scalar_t& src) {
thread_mm += static_cast<float>(src);
});
for (int i = tid; i < hidden_size; i += stride) {
thread_mm += static_cast<float>(row_in[i]);
}

using BlockReduce = cub::BlockReduce<MinMax, 256>;
__shared__ typename BlockReduce::TempStorage tmp;
@@ -51,8 +51,7 @@ struct cutlass_3x_gemm {
// These are the minimum alignments needed for the kernels to compile
static constexpr int AlignmentAB =
128 / cutlass::sizeof_bits<ElementAB>::value;
static constexpr int AlignmentCD =
128 / cutlass::sizeof_bits<ElementD>::value;
static constexpr int AlignmentCD = 4;

using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
@ -1,374 +0,0 @@
|
||||
#include "core/registration.h"
|
||||
|
||||
#include <torch/all.h>
|
||||
#include <cutlass/arch/arch.h>
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cutlass/tensor_ref.h"
|
||||
#include "cutlass/epilogue/collective/default_epilogue.hpp"
|
||||
#include "cutlass/epilogue/thread/linear_combination.h"
|
||||
#include "cutlass/gemm/dispatch_policy.hpp"
|
||||
#include "cutlass/gemm/group_array_problem_shape.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
|
||||
#include "cutlass/util/command_line.h"
|
||||
#include "cutlass/util/distribution.h"
|
||||
#include "cutlass/util/host_tensor.h"
|
||||
#include "cutlass/util/packed_stride.hpp"
|
||||
#include "cutlass/util/tensor_view_io.h"
|
||||
#include "cutlass/util/reference/device/gemm.h"
|
||||
#include "cutlass/util/reference/device/tensor_compare.h"
|
||||
#include "cutlass/util/reference/host/tensor_fill.h"
|
||||
#include "cutlass/util/reference/host/gett.hpp"
|
||||
#include "cutlass/util/reference/host/tensor_norm.h"
|
||||
#include "cutlass/util/reference/host/tensor_compare.h"
|
||||
#include <cassert>
|
||||
|
||||
using namespace cute;
|
||||
|
||||
template <typename ElementAB, typename ElementC, typename ElementAccumulator,
|
||||
typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
|
||||
__global__ void get_ggemm_starts(
|
||||
int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets,
|
||||
ElementC** out_offsets, ElementAccumulator** a_scale_offsets,
|
||||
ElementAccumulator** b_scale_offsets, ElementAB* a_base_as_int,
|
||||
ElementAB* b_base_as_int, ElementC* out_base_as_int,
|
||||
ElementAccumulator* a_scale_base_as_int,
|
||||
ElementAccumulator* b_scale_base_as_int, LayoutSFA* layout_sfa_base_as_int,
|
||||
LayoutSFB* layout_sfb_base_as_int, int* problem_sizes) {
|
||||
int expert_id = threadIdx.x;
|
||||
|
||||
if (expert_id >= gridDim.x * blockDim.x) {
|
||||
return;
|
||||
}
|
||||
|
||||
int m = problem_sizes[expert_id * 3];
|
||||
int n = problem_sizes[expert_id * 3 + 1];
|
||||
int k = problem_sizes[expert_id * 3 + 2];
|
||||
|
||||
int32_t expert_offset = expert_offsets[expert_id];
|
||||
int a_stride = expert_offset * k;
|
||||
int b_stride = expert_id * k * n;
|
||||
int a_scale_stride = expert_offset * k / 128;
|
||||
int b_scale_stride = expert_id * k * n / 128 / 128;
|
||||
|
||||
a_offsets[expert_id] = a_base_as_int + a_stride;
|
||||
b_offsets[expert_id] = b_base_as_int + b_stride;
|
||||
out_offsets[expert_id] = out_base_as_int + expert_offset * n;
|
||||
a_scale_offsets[expert_id] = a_scale_base_as_int + a_scale_stride;
|
||||
b_scale_offsets[expert_id] = b_scale_base_as_int + b_scale_stride;
|
||||
|
||||
LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id;
|
||||
LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id;
|
||||
|
||||
*layout_sfa_ptr =
|
||||
ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1));
|
||||
*layout_sfb_ptr =
|
||||
ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1));
|
||||
}
|
||||
|
||||
#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, \
|
||||
ScaleConfig) \
|
||||
else if (out_tensors.dtype() == TENSOR_C_TYPE) { \
|
||||
get_ggemm_starts<cutlass::float_e4m3_t, C_TYPE, float, LayoutSFA, \
|
||||
LayoutSFB, ScaleConfig><<<1, num_experts, 0, stream>>>( \
|
||||
static_cast<int32_t*>(expert_offsets.data_ptr()), \
|
||||
static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()), \
|
||||
static_cast<cutlass::float_e4m3_t**>(b_ptrs.data_ptr()), \
|
||||
static_cast<C_TYPE**>(out_ptrs.data_ptr()), \
|
||||
static_cast<float**>(a_scales_ptrs.data_ptr()), \
|
||||
static_cast<float**>(b_scales_ptrs.data_ptr()), \
|
||||
static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()), \
|
||||
static_cast<cutlass::float_e4m3_t*>(b_tensors.data_ptr()), \
|
||||
static_cast<C_TYPE*>(out_tensors.data_ptr()), \
|
||||
static_cast<float*>(a_scales.data_ptr()), \
|
||||
static_cast<float*>(b_scales.data_ptr()), \
|
||||
reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()), \
|
||||
reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()), \
|
||||
static_cast<int*>(problem_sizes.data_ptr())); \
|
||||
}
|
||||
|
||||
template <typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
|
||||
void run_get_ggemm_starts(
|
||||
torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
|
||||
torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
|
||||
torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
|
||||
torch::Tensor const& a_tensors, torch::Tensor const& b_tensors,
|
||||
torch::Tensor out_tensors, torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales, torch::Tensor const& layout_sfa,
|
||||
torch::Tensor const& layout_sfb, torch::Tensor const& problem_sizes) {
|
||||
TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
|
||||
TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0);
|
||||
TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0);
|
||||
|
||||
int num_experts = (int)expert_offsets.size(0);
|
||||
auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
|
||||
|
||||
if (false) {
|
||||
}
|
||||
__CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA,
|
||||
LayoutSFB, ScaleConfig)
|
||||
__CALL_GET_STARTS_KERNEL(torch::kFloat16, cutlass::half_t, LayoutSFA,
|
||||
LayoutSFB, ScaleConfig)
|
||||
else {
|
||||
TORCH_CHECK(false, "Unsupported output tensor type");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename OutType, typename ScheduleConfig, typename LayoutD>
|
||||
void run_blockwise_scaled_group_mm(
|
||||
torch::Tensor& out_ptrs, const torch::Tensor& a_ptrs,
|
||||
const torch::Tensor& b_ptrs, const torch::Tensor& a_scales_ptrs,
|
||||
const torch::Tensor& b_scales_ptrs, const torch::Tensor& stride_a,
|
||||
const torch::Tensor& stride_b, const torch::Tensor& stride_c,
|
||||
const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb,
|
||||
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
|
||||
using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;
|
||||
|
||||
// Types
|
||||
using ElementA = cutlass::float_e4m3_t;
|
||||
using ElementB = cutlass::float_e4m3_t;
|
||||
using ElementC = OutType;
|
||||
using ElementD = ElementC;
|
||||
using ElementAccumulator = float;
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = LayoutD;
|
||||
|
||||
// Alignments
|
||||
static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
|
||||
static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
|
||||
static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
|
||||
|
||||
using ArchTag = cutlass::arch::Sm100;
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp;
|
||||
|
||||
using CollectiveEpilogue =
|
||||
typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass, typename ScheduleConfig::MmaTileShape,
|
||||
typename ScheduleConfig::ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
|
||||
ElementAccumulator, void, LayoutC*, AlignmentC, ElementD, LayoutC*,
|
||||
AlignmentC, typename ScheduleConfig::EpilogueSchedule>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop =
|
||||
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass, ElementA,
|
||||
cute::tuple<LayoutA*, typename ScheduleConfig::LayoutSFA*>,
|
||||
AlignmentA, ElementB,
|
||||
cute::tuple<LayoutB*, typename ScheduleConfig::LayoutSFB*>,
|
||||
AlignmentB, ElementAccumulator, typename ScheduleConfig::MmaTileShape,
|
||||
typename ScheduleConfig::ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
|
||||
sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
typename ScheduleConfig::KernelSchedule>::CollectiveOp;
|
||||
|
||||
using GemmKernel =
|
||||
cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
|
||||
CollectiveEpilogue, void>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
using StrideA = typename Gemm::GemmKernel::InternalStrideA;
|
||||
using StrideB = typename Gemm::GemmKernel::InternalStrideB;
|
||||
using StrideC = typename Gemm::GemmKernel::InternalStrideC;
|
||||
using StrideD = typename Gemm::GemmKernel::InternalStrideD;
|
||||
|
||||
using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
|
||||
int num_experts = (int)expert_offsets.size(0);
|
||||
|
||||
Gemm gemm_op;
|
||||
|
||||
// Mainloop Arguments
|
||||
typename GemmKernel::MainloopArguments mainloop_args{
|
||||
static_cast<const ElementA**>(a_ptrs.data_ptr()),
|
||||
static_cast<StrideA*>(stride_a.data_ptr()),
|
||||
static_cast<const ElementB**>(b_ptrs.data_ptr()),
|
||||
static_cast<StrideB*>(stride_b.data_ptr()),
|
||||
static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
|
||||
reinterpret_cast<typename ScheduleConfig::LayoutSFA*>(
|
||||
layout_sfa.data_ptr()),
|
||||
static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
|
||||
reinterpret_cast<typename ScheduleConfig::LayoutSFB*>(
|
||||
layout_sfb.data_ptr())};
|
||||
|
||||
cutlass::KernelHardwareInfo hw_info;
|
||||
hw_info.device_id = a_ptrs.get_device();
|
||||
hw_info.sm_count =
|
||||
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
|
||||
hw_info.device_id);
|
||||
|
||||
// Epilogue Arguments
|
||||
typename GemmKernel::EpilogueArguments epilogue_args{
|
||||
{}, // epilogue.thread
|
||||
nullptr,
|
||||
static_cast<StrideC*>(stride_c.data_ptr()),
|
||||
static_cast<ElementD**>(out_ptrs.data_ptr()),
|
||||
static_cast<StrideC*>(stride_c.data_ptr())};
|
||||
|
||||
UnderlyingProblemShape* problem_sizes_as_shapes =
|
||||
static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
|
||||
|
||||
// Gemm Arguments
|
||||
typename GemmKernel::Arguments args{
|
||||
cutlass::gemm::GemmUniversalMode::kGrouped,
|
||||
{num_experts, problem_sizes_as_shapes, nullptr},
|
||||
mainloop_args,
|
||||
epilogue_args,
|
||||
hw_info};
|
||||
|
||||
at::cuda::CUDAGuard device_guard{(char)a_ptrs.device().index()};
|
||||
const cudaStream_t stream =
|
||||
at::cuda::getCurrentCUDAStream(a_ptrs.get_device());
|
||||
|
||||
auto can_implement_status = gemm_op.can_implement(args);
|
||||
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
|
||||
"Failed to implement GEMM");
|
||||
|
||||
size_t workspace_size = gemm_op.get_workspace_size(args);
|
||||
auto const workspace_options =
|
||||
torch::TensorOptions().dtype(torch::kUInt8).device(a_ptrs.device());
|
||||
auto workspace = torch::empty(workspace_size, workspace_options);
|
||||
|
||||
auto status = gemm_op.initialize(args, workspace.data_ptr(), stream);
|
||||
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
|
||||
|
||||
status = gemm_op.run(stream);
|
||||
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
|
||||
}
|
||||
|
||||
template <typename OutType>
|
||||
void blockwise_scaled_group_mm_dispatch_shape(
|
||||
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
|
||||
const torch::Tensor& scales_a, const torch::Tensor& scales_b,
|
||||
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
|
||||
struct MmaConfig {
|
||||
using ElementA = cutlass::float_e4m3_t;
|
||||
using KernelSchedule =
|
||||
cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100;
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
|
||||
using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
|
||||
1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>;
|
||||
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
|
||||
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
|
||||
using LayoutC = cutlass::layout::RowMajor;
|
||||
using MmaTileShape = Shape<_128, _128, _128>;
|
||||
using ClusterShape = Shape<_1, _1, _1>;
|
||||
};
|
||||
|
||||
int num_experts = (int)expert_offsets.size(0);
|
||||
|
||||
auto a_ptrs = torch::empty(
|
||||
{num_experts},
|
||||
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||
auto b_ptrs = torch::empty(
|
||||
{num_experts},
|
||||
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||
auto out_ptrs = torch::empty(
|
||||
{num_experts},
|
||||
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||
auto a_scales_ptrs = torch::empty(
|
||||
{num_experts},
|
||||
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||
auto b_scales_ptrs = torch::empty(
|
||||
{num_experts},
|
||||
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||
|
||||
auto layout_sfa = torch::empty(
|
||||
{num_experts, 5},
|
||||
torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
|
||||
auto layout_sfb = torch::empty(
|
||||
{num_experts, 5},
|
||||
torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
|
||||
|
||||
auto stride_a = torch::full(
|
||||
{num_experts}, a.size(1),
|
||||
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||
auto stride_b = torch::full(
|
||||
{num_experts}, a.size(1),
|
||||
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||
auto stride_c = torch::full(
|
||||
{num_experts}, output.size(1),
|
||||
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||
|
||||
torch::TensorOptions options_int =
|
||||
torch::TensorOptions().dtype(torch::kInt64).device(a.device());
|
||||
|
||||
run_get_ggemm_starts<typename MmaConfig::LayoutSFA,
|
||||
typename MmaConfig::LayoutSFB,
|
||||
typename MmaConfig::ScaleConfig>(
|
||||
expert_offsets, a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, a,
|
||||
b, output, scales_a, scales_b, layout_sfa, layout_sfb, problem_sizes);
|
||||
|
||||
run_blockwise_scaled_group_mm<OutType, MmaConfig,
|
||||
typename MmaConfig::LayoutC>(
|
||||
out_ptrs, a_ptrs, b_ptrs, a_scales_ptrs, b_scales_ptrs, stride_a,
|
||||
stride_b, stride_c, layout_sfa, layout_sfb, problem_sizes,
|
||||
expert_offsets);
|
||||
}
|
||||
|
||||
void cutlass_blockwise_scaled_grouped_mm(
|
||||
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
|
||||
const torch::Tensor& scales_a, const torch::Tensor& scales_b,
|
||||
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
|
||||
TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
|
||||
TORCH_CHECK(problem_sizes.size(1) == 3,
|
||||
"problem_sizes must have shape (num_experts, 3)");
|
||||
TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
|
||||
"Number of experts in problem_sizes must match expert_offsets");
|
||||
TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
|
||||
"problem_sizes must be int32");
|
||||
TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn,
|
||||
"a must be kFloat8_e4m3fn");
|
||||
TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn,
|
||||
"b must be kFloat8_e4m3fn");
|
||||
TORCH_CHECK(output.scalar_type() == torch::kBFloat16 ||
|
||||
output.scalar_type() == torch::kHalf,
|
||||
"output must be bfloat16 or half");
|
||||
TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32,
|
||||
"scales_a must be float32");
|
||||
TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32,
|
||||
"scales_b must be float32");
|
||||
TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32,
|
||||
"expert_offsets must be int32");
|
||||
|
||||
TORCH_CHECK(output.dim() == 2, "output must be 2D tensor");
|
||||
TORCH_CHECK(a.dim() == 2, "a must be 2D tensor");
|
||||
TORCH_CHECK(b.dim() == 3, "b must be 3D tensor");
|
||||
TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor");
|
||||
TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor");
|
||||
TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
|
||||
TORCH_CHECK(problem_sizes.size(1) == 3,
|
||||
"problem_sizes must have shape (num_experts, 3)");
|
||||
TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
|
||||
"Number of experts in problem_sizes must match expert_offsets");
|
||||
TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
|
||||
"problem_sizes must be int32");
|
||||
TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor");
|
||||
|
||||
#if defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100
|
||||
if (output.scalar_type() == torch::kBFloat16) {
|
||||
blockwise_scaled_group_mm_dispatch_shape<cutlass::bfloat16_t>(
|
||||
output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
|
||||
} else if (output.scalar_type() == torch::kFloat16) {
|
||||
blockwise_scaled_group_mm_dispatch_shape<cutlass::half_t>(
|
||||
output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported output tensor type");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
m.impl("cutlass_blockwise_scaled_grouped_mm",
|
||||
&cutlass_blockwise_scaled_grouped_mm);
|
||||
}
|
@@ -38,6 +38,7 @@
#include "cute/atom/mma_atom.hpp"
#include "cute/atom/copy_traits_sm90_tma.hpp"
#include "cute/algorithm/gemm.hpp"
#include "cute/tensor_predicate.hpp"
#include "cute/numeric/arithmetic_tuple.hpp"
#include "cutlass/pipeline/pipeline.hpp"
#include "cutlass/transform/collective/sm90_wgmma_transpose.hpp"
@ -27,26 +27,6 @@ __device__ inline void vectorize_with_alignment(
|
||||
constexpr int WIDTH = VEC_SIZE * sizeof(InT); // eg: 64 B
|
||||
uintptr_t addr = reinterpret_cast<uintptr_t>(in);
|
||||
|
||||
// fast path when the whole region is already aligned
|
||||
// Note: currently the output is guaranteed to be same as the input, so we
|
||||
// don't check it here, comments here just for future reference.
|
||||
bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0);
|
||||
if (can_vec) {
|
||||
int num_vec = len / VEC_SIZE;
|
||||
|
||||
using vin_t = vec_n_t<InT, VEC_SIZE>;
|
||||
using vout_t = vec_n_t<OutT, VEC_SIZE>;
|
||||
auto* v_in = reinterpret_cast<const vin_t*>(in);
|
||||
auto* v_out = reinterpret_cast<vout_t*>(out);
|
||||
|
||||
for (int i = tid; i < num_vec; i += stride) {
|
||||
vout_t tmp;
|
||||
vec_op(tmp, v_in[i]);
|
||||
v_out[i] = tmp;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
int misalignment_offset = addr & (WIDTH - 1); // addr % 64
|
||||
int alignment_bytes = WIDTH - misalignment_offset; // 64 - (addr % 64)
|
||||
int prefix_elems = alignment_bytes & (WIDTH - 1); // handle 64
|
||||
@ -92,81 +72,4 @@ __device__ __forceinline__ void vectorize_with_alignment(const InT* in,
|
||||
std::forward<ScaOp>(scalar_op));
|
||||
}
|
||||
|
||||
template <int VEC_SIZE, typename InT, typename ScaOp>
|
||||
struct DefaultReadVecOp {
|
||||
ScaOp scalar_op;
|
||||
|
||||
__device__ __forceinline__ void operator()(
|
||||
const vec_n_t<InT, VEC_SIZE>& src) const {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VEC_SIZE; ++i) {
|
||||
scalar_op(src.val[i]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// read-only version: iterate over the input with alignment guarantees
|
||||
template <int VEC_SIZE, typename InT, typename VecOp, typename ScaOp>
|
||||
__device__ inline void vectorize_read_with_alignment(const InT* in, int len,
|
||||
int tid, int stride,
|
||||
VecOp&& vec_op,
|
||||
ScaOp&& scalar_op) {
|
||||
static_assert(VEC_SIZE > 0 && (VEC_SIZE & (VEC_SIZE - 1)) == 0,
|
||||
"VEC_SIZE must be a positive power-of-two");
|
||||
constexpr int WIDTH = VEC_SIZE * sizeof(InT);
|
||||
uintptr_t addr = reinterpret_cast<uintptr_t>(in);
|
||||
|
||||
// fast path when the whole region is already aligned
|
||||
bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0);
|
||||
if (can_vec) {
|
||||
int num_vec = len / VEC_SIZE;
|
||||
|
||||
using vin_t = vec_n_t<InT, VEC_SIZE>;
|
||||
auto* v_in = reinterpret_cast<const vin_t*>(in);
|
||||
|
||||
for (int i = tid; i < num_vec; i += stride) {
|
||||
vec_op(v_in[i]);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
int misalignment_offset = addr & (WIDTH - 1);
|
||||
int alignment_bytes = WIDTH - misalignment_offset;
|
||||
int prefix_elems = alignment_bytes & (WIDTH - 1);
|
||||
prefix_elems /= sizeof(InT);
|
||||
prefix_elems = min(prefix_elems, len);
|
||||
|
||||
// 1. handle the possibly unaligned prefix with scalar access.
|
||||
for (int i = tid; i < prefix_elems; i += stride) {
|
||||
scalar_op(in[i]);
|
||||
}
|
||||
|
||||
in += prefix_elems;
|
||||
len -= prefix_elems;
|
||||
|
||||
int num_vec = len / VEC_SIZE;
|
||||
using vin_t = vec_n_t<InT, VEC_SIZE>;
|
||||
auto* v_in = reinterpret_cast<const vin_t*>(in);
|
||||
|
||||
// 2. vectorized traversal of the main aligned region.
|
||||
for (int i = tid; i < num_vec; i += stride) {
|
||||
vec_op(v_in[i]);
|
||||
}
|
||||
|
||||
// 3. handle remaining tail elements.
|
||||
int tail_start = num_vec * VEC_SIZE;
|
||||
for (int i = tid + tail_start; i < len; i += stride) {
|
||||
scalar_op(in[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// overload that requires only a scalar_op
|
||||
template <int VEC_SIZE, typename InT, typename ScaOp>
|
||||
__device__ __forceinline__ void vectorize_read_with_alignment(
|
||||
const InT* in, int len, int tid, int stride, ScaOp&& scalar_op) {
|
||||
using Vec = DefaultReadVecOp<VEC_SIZE, InT, std::decay_t<ScaOp>>;
|
||||
vectorize_read_with_alignment<VEC_SIZE>(in, len, tid, stride, Vec{scalar_op},
|
||||
std::forward<ScaOp>(scalar_op));
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
|
@@ -59,8 +59,6 @@ void apply_repetition_penalties_(
int vocab_size = logits.size(-1);
int num_seqs = logits.size(0);

if (num_seqs == 0) return;

// Get number of SMs on the current device
int sms = 0;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount,
@@ -79,8 +79,7 @@ struct cutlass_sparse_3x_gemm {
// These are the minimum alignments needed for the kernels to compile
static constexpr int AlignmentAB =
128 / cutlass::sizeof_bits<ElementAB>::value;
static constexpr int AlignmentCD =
128 / cutlass::sizeof_bits<ElementD>::value;
static constexpr int AlignmentCD = 4;

using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
@@ -393,14 +393,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
{stride_tag});
ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);

// cutlass blockwise scaledgroup GEMM
ops.def(
"cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, "
"Tensor scales_a, Tensor scales_b, "
"Tensor problem_sizes, Tensor expert_offsets) -> ()",
{stride_tag});
// conditionally compiled so impl registration is in source file

// cutlass nvfp4 block scaled group GEMM
ops.def(
"cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b,"
@@ -1,4 +1,3 @@

# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

@@ -63,16 +62,12 @@ ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
ARG PIP_KEYRING_PROVIDER=disabled
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}

# Flag enables build-in KV-connector dependency libs into docker images
ARG INSTALL_KV_CONNECTORS=false

#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM ${BUILD_BASE_IMAGE} AS base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ENV DEBIAN_FRONTEND=noninteractive

ARG DEADSNAKES_MIRROR_URL

@@ -281,7 +276,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
FROM ${FINAL_BASE_IMAGE} AS vllm-base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG INSTALL_KV_CONNECTORS=false
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM

@@ -491,7 +485,6 @@ RUN mv mkdocs.yaml test_docs/
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL

@@ -500,13 +493,8 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

COPY requirements/kv_connectors.txt requirements/kv_connectors.txt

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
uv pip install --system -r requirements/kv_connectors.txt; \
fi; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
else \
@@ -14,7 +14,7 @@ Before setting up the incremental build:
VLLM_USE_PRECOMPILED=1 uv pip install -U -e . --torch-backend=auto
```

2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu/cuda.inc.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
3. **Build Tools:** It is highly recommended to install `ccache` for fast rebuilds by caching compilation results (e.g., `sudo apt install ccache` or `conda install ccache`). Also, ensure the core build dependencies like `cmake` and `ninja` are installed. These are installable through `requirements/build.txt` or your system's package manager.
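The toolchain checks described in items 2 and 3 above can be scripted; a minimal sketch (the package-manager commands are assumptions that depend on your environment) might be:

```bash
# Confirm the CUDA compiler CMake will pick up is on PATH.
which nvcc          # typically resolves to $CUDA_HOME/bin/nvcc
nvcc --version

# Recommended: ccache for fast rebuilds (use `conda install ccache` on conda setups).
sudo apt install ccache

# Core build dependencies such as cmake and ninja.
uv pip install -r requirements/build.txt
```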
@@ -101,49 +101,6 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>

If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:

```python
from vllm import LLM
from vllm.assets.image import ImageAsset

llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image_url = "https://picsum.photos/id/32/512/512"
image_pil = ImageAsset('cherry_blossom').pil_image
image_embeds = torch.load(...)

conversation = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hello! How can I assist you today?"},
    {
        "role": "user",
        "content": [{
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        },{
            "type": "image_pil",
            "image_pil": image_pil
        }, {
            "type": "image_embeds",
            "image_embeds": image_embeds
        }, {
            "type": "text",
            "text": "What's in these images?"
        }],
    },
]

# Perform inference and log output.
outputs = llm.chat(conversation)

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```

Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:

??? Code

@@ -271,7 +228,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.

For certain models, we provide alternative chat templates inside <gh-dir:examples>.
For certain models, we provide alternative chat templates inside <gh-dir:vllm/examples>.
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.

### Image Inputs
@ -470,7 +470,6 @@ Specified using `--task classify`.
|
||||
|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------|-----------------------|
|
||||
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | |
|
||||
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ |
|
||||
|
||||
If your model is not in the above list, we will try to automatically convert the model using
|
||||
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
|
||||
|
||||
@ -478,20 +477,12 @@ If your model is not in the above list, we will try to automatically convert the
|
||||
|
||||
Specified using `--task score`.
|
||||
|
||||
| Architecture | Models | Example HF Models | [V1](gh-issue:8779) |
|
||||
|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|---------------------|
|
||||
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | |
|
||||
| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ |
|
||||
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ |
|
||||
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | |
|
||||
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | |
|
||||
|
||||
!!! note
|
||||
Load the official original `mxbai-rerank-v2` by using the following command.
|
||||
|
||||
```bash
|
||||
vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}'
|
||||
```
|
||||
| Architecture | Models | Example HF Models | [V1](gh-issue:8779) |
|
||||
|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|-----------------------|
|
||||
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | |
|
||||
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ |
|
||||
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | |
|
||||
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | |
|
||||
|
||||
!!! note
|
||||
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: <gh-file:examples/offline_inference/qwen3_reranker.py>.
|
||||
@ -499,7 +490,6 @@ Specified using `--task score`.
|
||||
```bash
|
||||
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
|
||||
```
|
||||
|
||||
[](){ #supported-mm-models }
|
||||
|
||||
## List of Multimodal Language Models
|
||||
@ -626,6 +616,9 @@ Specified using `--task generate`.
|
||||
!!! note
|
||||
Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently.
|
||||
|
||||
!!! note
|
||||
`h2oai/h2ovl-mississippi-2b` will be available in V1 once we support head size 80.
|
||||
|
||||
!!! note
|
||||
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
|
||||
|
||||
@@ -668,8 +661,11 @@ Specified using `--task generate`.
    Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.

!!! note
    For Qwen2.5-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`)
    is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
    To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from source via
    `pip install git+https://github.com/huggingface/transformers.git`.

    Read audio from video pre-processing is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
    `--mm-processor-kwargs '{"use_audio_in_video": true}'`.
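A minimal serving sketch for the note above. The `Qwen/Qwen2.5-Omni-7B` checkpoint and the use of `VLLM_USE_V1=0` to select V0 are assumptions, not part of the diff above; adjust both for your setup:

```bash
# Sketch only: model ID and env var are assumptions; verify against your deployment.
VLLM_USE_V1=0 vllm serve Qwen/Qwen2.5-Omni-7B \
  --mm-processor-kwargs '{"use_audio_in_video": true}'
```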
#### Transcription

@@ -6,7 +6,6 @@ import argparse

from vllm import LLM
from vllm.sampling_params import SamplingParams
from vllm.assets.image import ImageAsset

# This script is an offline demo for running Mistral-Small-3.1
#
@@ -72,16 +71,14 @@ def run_simple_demo(args: argparse.Namespace):
    )

    prompt = "Describe this image in one sentence."
    image_url = "https://picsum.photos/id/237/200/300"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_pil",
                    "image_pil": ImageAsset("cherry_blossom").pil_image,
                },
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        },
    ]

@@ -57,10 +57,7 @@ Once you have collected your profiles with this script, you can visualize them u
Here are most likely the dependencies you need to install:

```bash
pip install tensorflow-cpu \
  tensorboard-plugin-profile \
  etils \
  importlib_resources
pip install tensorflow-cpu tensorboard-plugin-profile etils importlib_resources
```

Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser:
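For instance, a launch command along these lines should work; the `profiles/` directory name is an assumption, so use whatever path the profiling script wrote its traces to:

```bash
# Point TensorBoard at the saved profile traces (directory name assumed).
tensorboard --logdir=profiles/ --port 6006
```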
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from transformers import AutoTokenizer

@@ -98,7 +98,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    prompts = [f"Question: {question} Answer:" for question in questions]
    engine_args = EngineArgs(
        model="Salesforce/blip2-opt-2.7b",
        model="Salesforce/blip2-opt-6.7b",
        limit_mm_per_prompt={modality: 1},
    )

@@ -677,7 +677,6 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={modality: 1},
        ignore_patterns=["consolidated.safetensors"],
    )

    prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -971,7 +970,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
    )


# Qwen-VL
# Qwen
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

@@ -505,7 +505,6 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        ignore_patterns=["consolidated.safetensors"],
    )

    placeholders = "[IMG]" * len(image_urls)

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os
import socket

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from typing import Optional

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call

@@ -13,15 +13,13 @@ vllm serve Qwen/Qwen2.5-3B-Instruct
To serve a reasoning model, you can use the following command:

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
  --reasoning-parser deepseek_r1
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1
```

If you want to run this script standalone with `uv`, you can use the following:

```bash
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
  structured-output
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs structured-output
```

See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
@@ -46,9 +44,7 @@ uv run structured_outputs.py --stream
Run certain constraints, for example `structural_tag` and `regex`, streaming:

```bash
uv run structured_outputs.py \
  --constraint structural_tag regex \
  --stream
uv run structured_outputs.py --constraint structural_tag regex --stream
```

Run all constraints, with reasoning models and streaming:

|
||||
|
||||
|
||||
|
||||
def deserialize(args, tensorizer_config):
|
||||
def deserialize():
|
||||
if args.lora_path:
|
||||
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
|
||||
llm = LLM(model=args.model,
|
||||
@@ -242,7 +242,7 @@ def deserialize(args, tensorizer_config):
    return llm


def main():
if __name__ == '__main__':
    args = parse_args()

    s3_access_key_id = (getattr(args, 's3_access_key_id', None)
@@ -260,6 +260,8 @@ def main():

    model_ref = args.model

    model_name = model_ref.split("/")[1]

    if args.command == "serialize" or args.command == "deserialize":
        keyfile = args.keyfile
    else:
@@ -307,10 +309,6 @@ def main():
            encryption_keyfile = keyfile,
            **credentials
        )
        deserialize(args, tensorizer_config)
        deserialize()
    else:
        raise ValueError("Either serialize or deserialize must be specified.")


if __name__ == "__main__":
    main()

@@ -1 +0,0 @@
lmcache
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional

import pytest

@ -1,57 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import sys
|
||||
from unittest.mock import patch
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.v1.engine.async_llm import AsyncLLM
|
||||
|
||||
|
||||
def test_mp_reducer(monkeypatch):
|
||||
"""
|
||||
Test that _reduce_config reducer is registered when AsyncLLM is instantiated
|
||||
without transformers_modules. This is a regression test for
|
||||
https://github.com/vllm-project/vllm/pull/18640.
|
||||
"""
|
||||
|
||||
# Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
|
||||
monkeypatch.setenv('VLLM_USE_V1', '1')
|
||||
|
||||
# Ensure transformers_modules is not in sys.modules
|
||||
if 'transformers_modules' in sys.modules:
|
||||
del sys.modules['transformers_modules']
|
||||
|
||||
with patch('multiprocessing.reducer.register') as mock_register:
|
||||
engine_args = AsyncEngineArgs(
|
||||
model="facebook/opt-125m",
|
||||
max_model_len=32,
|
||||
gpu_memory_utilization=0.1,
|
||||
disable_log_stats=True,
|
||||
disable_log_requests=True,
|
||||
)
|
||||
|
||||
async_llm = AsyncLLM.from_engine_args(
|
||||
engine_args,
|
||||
start_engine_loop=False,
|
||||
)
|
||||
|
||||
assert mock_register.called, (
|
||||
"multiprocessing.reducer.register should have been called")
|
||||
|
||||
vllm_config_registered = False
|
||||
for call_args in mock_register.call_args_list:
|
||||
# Verify that a reducer for VllmConfig was registered
|
||||
if len(call_args[0]) >= 2 and call_args[0][0] == VllmConfig:
|
||||
vllm_config_registered = True
|
||||
|
||||
reducer_func = call_args[0][1]
|
||||
assert callable(
|
||||
reducer_func), "Reducer function should be callable"
|
||||
break
|
||||
|
||||
assert vllm_config_registered, (
|
||||
"VllmConfig should have been registered to multiprocessing.reducer"
|
||||
)
|
||||
|
||||
async_llm.shutdown()
|
@ -37,6 +37,7 @@ async def test_basic_audio(mary_had_lamb):
|
||||
model_name = "openai/whisper-large-v3-turbo"
|
||||
server_args = ["--enforce-eager"]
|
||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||
prompt = "THE FIRST WORDS I SPOKE"
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
transcription = await client.audio.transcriptions.create(
|
||||
@ -47,6 +48,16 @@ async def test_basic_audio(mary_had_lamb):
|
||||
temperature=0.0)
|
||||
out = json.loads(transcription)['text']
|
||||
assert "Mary had a little lamb," in out
|
||||
# This should "force" Whisper to continue the prompt in all caps.
|
||||
transcription_wprompt = await client.audio.transcriptions.create(
|
||||
model=model_name,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
response_format="text",
|
||||
prompt=prompt,
|
||||
temperature=0.0)
|
||||
out_capital = json.loads(transcription_wprompt)['text']
|
||||
assert prompt not in out_capital
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -227,31 +238,3 @@ async def test_sampling_params(mary_had_lamb):
|
||||
extra_body=dict(seed=42))
|
||||
|
||||
assert greedy_transcription.text != transcription.text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_audio_prompt(mary_had_lamb):
|
||||
model_name = "openai/whisper-large-v3-turbo"
|
||||
server_args = ["--enforce-eager"]
|
||||
prompt = "This is a speech, recorded in a phonograph."
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
# The prompt should not cause parts of the original transcription to be omitted.
|
||||
prefix = "The first words I spoke in the original phonograph"
|
||||
client = remote_server.get_async_client()
|
||||
transcription = await client.audio.transcriptions.create(
|
||||
model=model_name,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
response_format="text",
|
||||
temperature=0.0)
|
||||
out = json.loads(transcription)['text']
|
||||
assert prefix in out
|
||||
transcription_wprompt = await client.audio.transcriptions.create(
|
||||
model=model_name,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
response_format="text",
|
||||
prompt=prompt,
|
||||
temperature=0.0)
|
||||
out_prompt = json.loads(transcription_wprompt)['text']
|
||||
assert prefix in out_prompt
|
||||
|
@ -264,8 +264,10 @@ def test_parse_chat_messages_multiple_images(
|
||||
"url": image_url
|
||||
}
|
||||
}, {
|
||||
"type": "image_pil",
|
||||
"image_pil": ImageAsset('cherry_blossom').pil_image
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
}
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": "What's in these images?"
|
||||
@ -301,8 +303,10 @@ async def test_parse_chat_messages_multiple_images_async(
|
||||
"url": image_url
|
||||
}
|
||||
}, {
|
||||
"type": "image_pil",
|
||||
"image_pil": ImageAsset('cherry_blossom').pil_image
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
}
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": "What's in these images?"
|
||||
|
@ -450,8 +450,7 @@ def test_multi_query_kv_attention(
|
||||
start += seq_len
|
||||
# xformers.AttentionBias to Tensor for use in reference impl.
|
||||
alibi_bias = [
|
||||
b.materialize((1, num_query_heads, i, i), device=device).squeeze()
|
||||
for b, i in zip(attn_bias, seq_lens)
|
||||
b.materialize(b.shape, device=device).squeeze() for b in attn_bias
|
||||
]
|
||||
else:
|
||||
attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)
|
||||
|
@ -171,7 +171,7 @@ def test_env(
|
||||
expected = "FLASHINFER_VLLM_V1" if use_v1 else name
|
||||
assert backend.get_name() == expected
|
||||
else:
|
||||
backend = get_attn_backend(32,
|
||||
backend = get_attn_backend(16,
|
||||
torch.float16,
|
||||
torch.float16,
|
||||
block_size,
|
||||
@ -180,45 +180,6 @@ def test_env(
|
||||
expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
|
||||
assert backend.get_name() == expected
|
||||
|
||||
if use_v1:
|
||||
backend = get_attn_backend(16,
|
||||
torch.float16,
|
||||
torch.float16,
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
assert backend.get_name() == "FLEX_ATTENTION", (
|
||||
"Should fallback to FlexAttention if head size is "
|
||||
"not supported by FlashAttention")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", ["cpu", "cuda"])
|
||||
@pytest.mark.parametrize("use_v1", [True, False])
|
||||
def test_fp32_fallback(
|
||||
device: str,
|
||||
use_v1: bool,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
"""Test attention backend selection with fp32."""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||
|
||||
if device == "cpu":
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
CpuPlatform()):
|
||||
backend = get_attn_backend(16, torch.float32, torch.float32,
|
||||
16, False)
|
||||
assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
|
||||
if use_v1 else "TORCH_SDPA")
|
||||
|
||||
elif device == "cuda":
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
CudaPlatform()):
|
||||
backend = get_attn_backend(16, torch.float32, torch.float32,
|
||||
16, False)
|
||||
assert (backend.get_name() == "FLEX_ATTENTION"
|
||||
if use_v1 else "XFORMERS")
|
||||
|
||||
|
||||
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
|
||||
"""Test FlashAttn validation."""
|
||||
|
@ -1,5 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
DeepEP test utilities
|
||||
"""
|
||||
@ -138,7 +137,8 @@ def make_deepep_ht_a2a(pg: ProcessGroup,
|
||||
low_latency_mode=low_latency_mode,
|
||||
num_qps_per_rank=num_qps_per_rank)
|
||||
return DeepEPHTPrepareAndFinalize(buffer=buffer,
|
||||
num_dispatchers=pgi.world_size,
|
||||
world_size=pgi.world_size,
|
||||
rank=pgi.rank,
|
||||
dp_size=dp_size,
|
||||
rank_expert_offset=pgi.rank *
|
||||
ht_args.num_local_experts)
|
||||
@ -146,6 +146,7 @@ def make_deepep_ht_a2a(pg: ProcessGroup,
|
||||
|
||||
def make_deepep_ll_a2a(pg: ProcessGroup,
|
||||
pgi: ProcessGroupInfo,
|
||||
dp_size: int,
|
||||
deepep_ll_args: DeepEPLLArgs,
|
||||
q_dtype: Optional[torch.dtype] = None,
|
||||
block_shape: Optional[list[int]] = None):
|
||||
@ -165,7 +166,8 @@ def make_deepep_ll_a2a(pg: ProcessGroup,
|
||||
|
||||
return DeepEPLLPrepareAndFinalize(
|
||||
buffer=buffer,
|
||||
num_dispatchers=pgi.world_size,
|
||||
world_size=pgi.world_size,
|
||||
dp_size=dp_size,
|
||||
max_tokens_per_rank=deepep_ll_args.max_tokens_per_rank,
|
||||
use_fp8_dispatch=deepep_ll_args.use_fp8_dispatch,
|
||||
)
|
||||
@ -184,4 +186,5 @@ def make_deepep_a2a(pg: ProcessGroup,
|
||||
block_shape)
|
||||
|
||||
assert deepep_ll_args is not None
|
||||
return make_deepep_ll_a2a(pg, pgi, deepep_ll_args, q_dtype, block_shape)
|
||||
return make_deepep_ll_a2a(pg, pgi, dp_size, deepep_ll_args, q_dtype,
|
||||
block_shape)
|
||||
|
@ -10,7 +10,7 @@ import triton.language as tl
|
||||
|
||||
from tests.kernels.moe.utils import (batched_moe,
|
||||
make_quantized_test_activations,
|
||||
make_test_weights, naive_batched_moe)
|
||||
make_test_weights, triton_moe)
|
||||
from tests.kernels.quant_utils import native_batched_masked_quant_matmul
|
||||
from tests.kernels.utils import torch_experts
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
@ -33,10 +33,12 @@ MNK_FACTORS = [
|
||||
(45, 512, 512),
|
||||
(45, 1024, 128),
|
||||
(45, 1024, 2048),
|
||||
(64, 128, 128),
|
||||
(64, 512, 512),
|
||||
(64, 1024, 2048),
|
||||
(222, 128, 128),
|
||||
(222, 128, 2048),
|
||||
(222, 512, 512),
|
||||
(222, 1024, 128),
|
||||
(222, 1024, 2048),
|
||||
]
|
||||
@ -93,12 +95,11 @@ class BatchedMMTensors:
|
||||
@pytest.mark.parametrize("max_tokens_per_expert",
|
||||
[32, 64, 128, 192, 224, 256, 512])
|
||||
@pytest.mark.parametrize("K", [128, 256, 1024])
|
||||
@pytest.mark.parametrize("N", [128, 256, 1024])
|
||||
@pytest.mark.parametrize(
|
||||
"dtype",
|
||||
[torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
|
||||
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||
@pytest.mark.parametrize("N", [128, 256, 512, 1024])
|
||||
@pytest.mark.parametrize("dtype",
|
||||
[torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("block_shape", [None])
|
||||
@pytest.mark.parametrize("per_act_token_quant", [False])
|
||||
def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||
N: int, dtype: torch.dtype,
|
||||
block_shape: Optional[list[int]],
|
||||
@ -133,8 +134,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||
in_dtype=act_dtype,
|
||||
quant_dtype=quant_dtype,
|
||||
block_shape=block_shape,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
)
|
||||
per_act_token_quant=per_act_token_quant)
|
||||
|
||||
B, B_q, B_scale, _, _, _ = make_test_weights(
|
||||
num_experts,
|
||||
@ -143,7 +143,6 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||
in_dtype=act_dtype,
|
||||
quant_dtype=quant_dtype,
|
||||
block_shape=block_shape,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
)
|
||||
|
||||
out_shape = (num_experts, max_tokens_per_expert, N)
|
||||
@ -178,7 +177,6 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||
"BLOCK_SIZE_N": 16,
|
||||
"BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32
|
||||
},
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
@ -187,13 +185,15 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||
B,
|
||||
ref_output,
|
||||
num_expert_tokens,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
||||
q_ref_output = native_batched_masked_quant_matmul(A_q, B_q, q_ref_output,
|
||||
num_expert_tokens,
|
||||
A_scale, B_scale,
|
||||
block_shape,
|
||||
per_act_token_quant)
|
||||
block_shape)
|
||||
|
||||
rtol, atol = {
|
||||
torch.float16: (6e-2, 6e-2),
|
||||
@ -201,17 +201,16 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||
torch.float32: (1e-2, 1e-2),
|
||||
}[test_output.dtype]
|
||||
|
||||
torch.testing.assert_close(ref_output, q_ref_output, atol=atol, rtol=rtol)
|
||||
torch.testing.assert_close(ref_output, test_output, atol=atol, rtol=rtol)
|
||||
torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS)
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
|
||||
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
|
||||
@pytest.mark.parametrize("input_scales", [False])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("per_act_token_quant", [False])
|
||||
@pytest.mark.parametrize("block_shape", [None])
|
||||
def test_fused_moe_batched_experts(
|
||||
m: int,
|
||||
n: int,
|
||||
@ -221,19 +220,15 @@ def test_fused_moe_batched_experts(
|
||||
dtype: torch.dtype,
|
||||
per_act_token_quant: bool,
|
||||
block_shape: Optional[list[int]],
|
||||
input_scales: bool,
|
||||
):
|
||||
current_platform.seed_everything(7)
|
||||
|
||||
use_fp8_w8a8 = dtype == torch.float8_e4m3fn
|
||||
|
||||
if topk > e:
|
||||
pytest.skip("topk > e")
|
||||
|
||||
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
|
||||
pytest.skip("Skip quantization test for non-quantized type")
|
||||
|
||||
if per_act_token_quant and block_shape is not None:
|
||||
if per_act_token_quant and block_shape is not None or topk > e:
|
||||
pytest.skip("Skip illegal quantization test.")
|
||||
|
||||
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
|
||||
@ -246,26 +241,27 @@ def test_fused_moe_batched_experts(
|
||||
act_dtype = dtype
|
||||
quant_dtype = None
|
||||
|
||||
w1_16, w1, w1_s, w2_16, w2, w2_s = make_test_weights(
|
||||
e,
|
||||
n,
|
||||
k,
|
||||
block_shape=block_shape,
|
||||
in_dtype=act_dtype,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
)
|
||||
|
||||
if input_scales and quant_dtype is not None:
|
||||
a1_scale = torch.tensor(1, device="cuda", dtype=torch.float32)
|
||||
a2_scale = torch.tensor(1, device="cuda", dtype=torch.float32)
|
||||
else:
|
||||
a1_scale = None
|
||||
a2_scale = None
|
||||
_, w1, w1_s, _, w2, w2_s = make_test_weights(e,
|
||||
n,
|
||||
k,
|
||||
block_shape=block_shape,
|
||||
in_dtype=act_dtype,
|
||||
quant_dtype=quant_dtype)
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
||||
|
||||
batched_output = batched_moe(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
topk_weight,
|
||||
topk_ids,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
baseline_output = torch_experts(
|
||||
a,
|
||||
w1,
|
||||
@ -274,14 +270,11 @@ def test_fused_moe_batched_experts(
|
||||
topk_ids,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
block_shape=block_shape)
|
||||
|
||||
batched_output = naive_batched_moe(
|
||||
triton_output = triton_moe(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
@ -289,31 +282,14 @@ def test_fused_moe_batched_experts(
|
||||
topk_ids,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
triton_output = batched_moe(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
topk_weight,
|
||||
topk_ids,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
torch.testing.assert_close(batched_output,
|
||||
torch.testing.assert_close(triton_output,
|
||||
baseline_output,
|
||||
atol=3e-2,
|
||||
atol=2e-2,
|
||||
rtol=2e-2)
|
||||
|
||||
torch.testing.assert_close(triton_output,
|
||||
|
@ -1,116 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# DeepGEMM Style Cutlass Grouped GEMM Test
|
||||
# See https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py
|
||||
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import baseline_scaled_mm
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def cdiv(a, b):
|
||||
return (a + b - 1) // b
|
||||
|
||||
|
||||
def per_token_cast_to_fp8(
|
||||
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
assert x.dim() == 2
|
||||
m, n = x.shape
|
||||
pad_size = (128 - (n % 128)) % 128
|
||||
x = torch.nn.functional.pad(x,
|
||||
(0, pad_size), value=0) if pad_size > 0 else x
|
||||
x_view = x.view(m, -1, 128)
|
||||
x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
|
||||
fp8_data = (x_view *
|
||||
(448.0 / x_amax.unsqueeze(2))).to(dtype=torch.float8_e4m3fn)
|
||||
return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)
|
||||
|
||||
|
||||
def per_block_cast_to_fp8(
|
||||
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
assert x.dim() == 2
|
||||
m, n = x.shape
|
||||
x_padded = torch.zeros((cdiv(m, 128) * 128, cdiv(n, 128) * 128),
|
||||
device=x.device,
|
||||
dtype=x.dtype)
|
||||
x_padded[:m, :n] = x
|
||||
x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
|
||||
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
|
||||
x_scaled = (x_view * (448.0 / x_amax)).to(dtype=torch.float8_e4m3fn)
|
||||
return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
|
||||
x_amax / 448.0).view(x_view.size(0), x_view.size(2))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [
|
||||
(4, 8192, 7168, 4096),
|
||||
(4, 8192, 2048, 7168),
|
||||
(8, 4096, 7168, 4096),
|
||||
(8, 4096, 2048, 7168),
|
||||
(32, 1024, 7168, 4096),
|
||||
(32, 1024, 2048, 7168),
|
||||
])
|
||||
@pytest.mark.parametrize("out_dtype", [torch.float16])
|
||||
@pytest.mark.skipif(
|
||||
(lambda x: x is None or x.to_int() != 100)(
|
||||
current_platform.get_device_capability()),
|
||||
reason="Block Scaled Grouped GEMM is only supported on SM100.")
|
||||
def test_cutlass_grouped_gemm(
|
||||
num_groups: int,
|
||||
expected_m_per_group: int,
|
||||
k: int,
|
||||
n: int,
|
||||
out_dtype: torch.dtype,
|
||||
):
|
||||
device = "cuda"
|
||||
alignment = 128
|
||||
group_ms = [
|
||||
int(expected_m_per_group * random.uniform(0.7, 1.3))
|
||||
for _ in range(num_groups)
|
||||
]
|
||||
m = sum([cdiv(m, alignment) * alignment for m in group_ms])
|
||||
|
||||
x = torch.randn((m, k), device=device, dtype=out_dtype)
|
||||
y = torch.randn((num_groups, n, k), device=device, dtype=out_dtype)
|
||||
out = torch.empty((m, n), device=device, dtype=out_dtype)
|
||||
ref_out = torch.randn((m, n), device=device, dtype=out_dtype)
|
||||
|
||||
ep_offset = [0] + [sum(group_ms[:i]) for i in range(1, num_groups)] + [m]
|
||||
pb_size = []
|
||||
for i in range(num_groups):
|
||||
pb_size.append([ep_offset[i + 1] - ep_offset[i], n, k])
|
||||
problem_sizes = torch.tensor(pb_size, device=device, dtype=torch.int32)
|
||||
expert_offsets = torch.tensor(ep_offset, device=device, dtype=torch.int32)
|
||||
|
||||
x_fp8 = per_token_cast_to_fp8(x)
|
||||
y_fp8 = (torch.empty_like(y, dtype=torch.float8_e4m3fn),
|
||||
torch.empty((num_groups, cdiv(n, 128), k // 128),
|
||||
device=device,
|
||||
dtype=torch.float))
|
||||
for i in range(num_groups):
|
||||
y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i])
|
||||
|
||||
for i in range(num_groups):
|
||||
a = x_fp8[0][ep_offset[i]:ep_offset[i + 1]]
|
||||
a_scale = x_fp8[1][ep_offset[i]:ep_offset[i + 1]]
|
||||
b = y_fp8[0][i].t()
|
||||
b_scale = y_fp8[1][i].t()
|
||||
baseline = baseline_scaled_mm(a, b, a_scale, b_scale, out_dtype)
|
||||
ref_out[ep_offset[i]:ep_offset[i + 1]] = baseline
|
||||
|
||||
ops.cutlass_blockwise_scaled_grouped_mm(
|
||||
out,
|
||||
x_fp8[0],
|
||||
y_fp8[0],
|
||||
x_fp8[1],
|
||||
y_fp8[1],
|
||||
problem_sizes,
|
||||
expert_offsets[:-1],
|
||||
)
|
||||
|
||||
torch.testing.assert_close(ref_out, out, atol=5e-1, rtol=1e-3)
|
@ -1,5 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Test DeepEP + DeepGEMM integration
|
||||
DeepGEMM are gemm kernels specialized for the
|
||||
@ -149,7 +148,8 @@ def make_ll_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo,
|
||||
|
||||
fused_experts = BatchedDeepGemmExperts(
|
||||
max_num_tokens=max_tokens_per_rank,
|
||||
num_dispatchers=pgi.world_size // dp_size,
|
||||
world_size=pgi.world_size,
|
||||
dp_size=dp_size,
|
||||
block_shape=test_config.block_size,
|
||||
per_act_token_quant=test_config.per_act_token_quant)
|
||||
mk = FusedMoEModularKernel(prepare_finalize=a2a,
|
||||
|
@ -1,5 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Test deepep dispatch-combine logic
|
||||
"""
|
||||
@ -155,13 +154,12 @@ def make_modular_kernel(
|
||||
deepep_ht_args = ht_args,
|
||||
deepep_ll_args = ll_args)
|
||||
|
||||
num_dispatchers = pgi.world_size // dp_size
|
||||
|
||||
if low_latency_mode:
|
||||
assert not per_act_token_quant, "not supported in ll mode"
|
||||
fused_experts = BatchedTritonExperts(
|
||||
max_num_tokens=MAX_TOKENS_PER_RANK,
|
||||
num_dispatchers=num_dispatchers,
|
||||
world_size=pgi.world_size,
|
||||
dp_size=dp_size,
|
||||
use_fp8_w8a8=is_quantized,
|
||||
use_int8_w8a8=False,
|
||||
use_int8_w8a16=False,
|
||||
|
@ -1,5 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Unit-test DeepGEMM FP8 kernels (no DeepEP).
|
||||
Compare DeepGEMM path against the Triton fallback inside vLLM's fused_experts.
|
||||
|
@ -14,7 +14,6 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
||||
FusedMoEModularKernel)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import cdiv
|
||||
|
||||
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||
|
||||
@ -113,21 +112,18 @@ def pplx_cutlass_moe(
|
||||
w2_scale = w2_scale.to(device)
|
||||
a1_scale = a1_scale.to(device)
|
||||
|
||||
assert num_experts % world_size == 0
|
||||
num_local_experts = cdiv(num_experts, world_size)
|
||||
num_dispatchers = pgi.world_size // dp_size
|
||||
|
||||
prepare_finalize = PplxPrepareAndFinalize(
|
||||
ata,
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_local_experts=num_local_experts,
|
||||
num_dispatchers=num_dispatchers)
|
||||
max_num_tokens,
|
||||
pgi.world_size,
|
||||
rank,
|
||||
dp_size,
|
||||
)
|
||||
|
||||
experts = CutlassExpertsFp8(num_local_experts,
|
||||
experts = CutlassExpertsFp8((num_experts + world_size - 1) // world_size,
|
||||
out_dtype,
|
||||
per_act_token,
|
||||
per_out_ch,
|
||||
num_dispatchers=num_dispatchers,
|
||||
use_batched_format=True)
|
||||
|
||||
fused_cutlass_experts = FusedMoEModularKernel(
|
||||
@ -185,40 +181,35 @@ def _pplx_moe(
|
||||
per_out_ch: bool,
|
||||
use_internode: bool,
|
||||
):
|
||||
try:
|
||||
if use_internode:
|
||||
uid = nvshmem_get_unique_id(
|
||||
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
||||
torch.distributed.broadcast(uid, src=0)
|
||||
nvshmem_init(uid, pgi.rank, pgi.world_size)
|
||||
else:
|
||||
group_ranks = list(range(pgi.world_size))
|
||||
cpu_group = torch.distributed.new_group(group_ranks,
|
||||
backend="gloo")
|
||||
group_name = cpu_group.group_name
|
||||
if use_internode:
|
||||
uid = nvshmem_get_unique_id(
|
||||
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
||||
torch.distributed.broadcast(uid, src=0)
|
||||
nvshmem_init(uid, pgi.rank, pgi.world_size)
|
||||
else:
|
||||
group_ranks = list(range(pgi.world_size))
|
||||
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
|
||||
group_name = cpu_group.group_name
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
torch_output = torch_experts(a_full, w1_full, w2_full,
|
||||
topk_weights, topk_ids)
|
||||
pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale,
|
||||
w2_scale, topk_weights, topk_ids,
|
||||
a1_scale, out_dtype, per_act_token,
|
||||
per_out_ch, group_name)
|
||||
with set_current_vllm_config(vllm_config):
|
||||
torch_output = torch_experts(a_full, w1_full, w2_full, topk_weights,
|
||||
topk_ids)
|
||||
pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale,
|
||||
w2_scale, topk_weights, topk_ids,
|
||||
a1_scale, out_dtype, per_act_token,
|
||||
per_out_ch, group_name)
|
||||
|
||||
torch_output = chunk_by_rank(torch_output, pgi.rank,
|
||||
pgi.world_size).to(pplx_output.device)
|
||||
torch_output = chunk_by_rank(torch_output, pgi.rank,
|
||||
pgi.world_size).to(pplx_output.device)
|
||||
|
||||
# Uncomment if more debugging is needed
|
||||
# print("PPLX OUT:", pplx_output)
|
||||
# print("TORCH OUT:", torch_output)
|
||||
# Uncomment if more debugging is needed
|
||||
# print("PPLX OUT:", pplx_output)
|
||||
# print("TORCH OUT:", torch_output)
|
||||
|
||||
torch.testing.assert_close(pplx_output,
|
||||
torch_output,
|
||||
atol=0.05,
|
||||
rtol=0)
|
||||
finally:
|
||||
if use_internode:
|
||||
nvshmem_finalize()
|
||||
torch.testing.assert_close(pplx_output, torch_output, atol=0.05, rtol=0)
|
||||
|
||||
if use_internode:
|
||||
nvshmem_finalize()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", [2, 224])
|
||||
|
@ -4,10 +4,7 @@
|
||||
|
||||
Run `pytest tests/kernels/test_pplx_moe.py`.
|
||||
"""
|
||||
import itertools
|
||||
import textwrap
|
||||
import traceback
|
||||
from typing import Callable, Optional
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -22,13 +19,12 @@ except ImportError:
|
||||
has_pplx = False
|
||||
|
||||
from tests.kernels.moe.utils import make_test_weights, naive_batched_moe
|
||||
from tests.kernels.quant_utils import dequant
|
||||
from tests.kernels.utils import torch_experts
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe import fused_topk, override_config
|
||||
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
|
||||
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
|
||||
BatchedTritonExperts)
|
||||
BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts)
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
|
||||
from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
||||
FusedMoEModularKernel)
|
||||
@ -42,22 +38,22 @@ requires_pplx = pytest.mark.skipif(
|
||||
reason="Requires PPLX kernels",
|
||||
)
|
||||
|
||||
PPLX_COMBOS = [
|
||||
# TODO: figure out why this fails, seems to be test problem
|
||||
#(1, 128, 128),
|
||||
PPLX_PREPARE_COMBOS = [(4, 128, 128), (32, 1024, 512), (64, 1024, 512),
|
||||
(222, 2048, 1024)]
|
||||
|
||||
PPLX_MOE_COMBOS = [
|
||||
(1, 128, 128),
|
||||
(2, 128, 512),
|
||||
(3, 1024, 2048),
|
||||
(4, 128, 128),
|
||||
(32, 1024, 512),
|
||||
(32, 128, 1024),
|
||||
(45, 512, 2048),
|
||||
(64, 1024, 512),
|
||||
(222, 2048, 1024),
|
||||
(256, 1408, 2048),
|
||||
(64, 1024, 1024),
|
||||
(222, 1024, 2048),
|
||||
]
|
||||
|
||||
NUM_EXPERTS = [8, 64]
|
||||
EP_SIZE = [1, 4]
|
||||
TOP_KS = [1, 2, 6]
|
||||
DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
@ -173,11 +169,9 @@ def test_fused_moe_batched_experts(
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
||||
baseline_output = torch_experts(a, w1, w2, topk_weight,
|
||||
topk_ids) # only for baseline
|
||||
baseline_output = torch_experts(a, w1, w2, topk_weight, topk_ids)
|
||||
torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids)
|
||||
batched_output = naive_batched_moe(
|
||||
a, w1, w2, topk_weight, topk_ids) # pick torch_experts or this
|
||||
batched_output = naive_batched_moe(a, w1, w2, topk_weight, topk_ids)
|
||||
|
||||
torch.testing.assert_close(baseline_output,
|
||||
torch_output,
|
||||
@ -189,63 +183,6 @@ def test_fused_moe_batched_experts(
|
||||
rtol=0)
|
||||
|
||||
|
||||
def create_pplx_prepare_finalize(
|
||||
num_tokens: int,
|
||||
hidden_dim: int,
|
||||
topk: int,
|
||||
num_experts: int,
|
||||
rank: int,
|
||||
dp_size: int,
|
||||
world_size: int,
|
||||
in_dtype: torch.dtype,
|
||||
quant_dtype: Optional[torch.dtype],
|
||||
block_shape: Optional[list[int]],
|
||||
per_act_token_quant: bool,
|
||||
group_name: Optional[str],
|
||||
):
|
||||
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
|
||||
PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes)
|
||||
|
||||
max_num_tokens = max(rank_chunk(num_tokens, 0, world_size), 1)
|
||||
num_local_experts = rank_chunk(num_experts, 0, world_size)
|
||||
|
||||
hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
|
||||
max_num_tokens,
|
||||
hidden_dim,
|
||||
in_dtype,
|
||||
quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
args = dict(
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_experts=num_experts,
|
||||
experts_per_token=topk,
|
||||
rank=rank,
|
||||
world_size=world_size,
|
||||
dp_size=dp_size,
|
||||
hidden_dim=hidden_dim,
|
||||
hidden_dim_bytes=hidden_dim_bytes,
|
||||
hidden_dim_scale_bytes=scale_bytes,
|
||||
)
|
||||
|
||||
if group_name is None:
|
||||
ata = AllToAll.internode(**args)
|
||||
else:
|
||||
args["group_name"] = group_name
|
||||
ata = AllToAll.intranode(**args)
|
||||
|
||||
prepare_finalize = PplxPrepareAndFinalize(
|
||||
ata,
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_local_experts=num_local_experts,
|
||||
num_dispatchers=world_size // dp_size,
|
||||
)
|
||||
|
||||
return prepare_finalize, ata
|
||||
|
||||
|
||||
def rank_chunk(num: int, r: int, w: int) -> int:
|
||||
rem = num % w
|
||||
return (num // w) + (1 if r < rem else 0)
|
||||
@ -256,35 +193,6 @@ def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
|
||||
return t[(r * chunk):(r + 1) * chunk]
|
||||
|
||||
|
||||
def maybe_chunk_by_rank(t: Optional[torch.Tensor], r: int,
|
||||
w: int) -> Optional[torch.Tensor]:
|
||||
if t is not None:
|
||||
return chunk_by_rank(t, r, w)
|
||||
else:
|
||||
return t
|
||||
|
||||
|
||||
def chunk_scales_by_rank(t: Optional[torch.Tensor], r: int,
|
||||
w: int) -> Optional[torch.Tensor]:
|
||||
if t is not None and t.numel() > 1:
|
||||
chunk = rank_chunk(t.shape[0], r, w)
|
||||
return t[(r * chunk):(r + 1) * chunk]
|
||||
else:
|
||||
return t
|
||||
|
||||
|
||||
def chunk_scales(t: Optional[torch.Tensor], start: int,
|
||||
end: int) -> Optional[torch.Tensor]:
|
||||
if t is not None and t.numel() > 1:
|
||||
return t[start:end]
|
||||
else:
|
||||
return t
|
||||
|
||||
|
||||
def dummy_work(a: torch.Tensor) -> torch.Tensor:
|
||||
return a * 1.1
|
||||
|
||||
|
||||
def pplx_prepare_finalize(
|
||||
pgi: ProcessGroupInfo,
|
||||
dp_size: int,
|
||||
@ -292,11 +200,11 @@ def pplx_prepare_finalize(
|
||||
topk_weight: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
num_experts: int,
|
||||
quant_dtype: Optional[torch.dtype],
|
||||
block_shape: Optional[list[int]],
|
||||
per_act_token_quant: bool,
|
||||
group_name: Optional[str],
|
||||
) -> torch.Tensor:
|
||||
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
|
||||
PplxPrepareAndFinalize)
|
||||
|
||||
assert torch.cuda.current_device() == pgi.local_rank
|
||||
|
||||
topk = topk_ids.shape[1]
|
||||
@ -304,66 +212,60 @@ def pplx_prepare_finalize(
|
||||
device = pgi.device
|
||||
rank = pgi.rank
|
||||
world_size = pgi.world_size
|
||||
max_num_tokens = rank_chunk(num_tokens, 0, world_size)
|
||||
|
||||
args = dict(
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_experts=num_experts,
|
||||
experts_per_token=topk,
|
||||
rank=rank,
|
||||
world_size=world_size,
|
||||
dp_size=dp_size,
|
||||
hidden_dim=hidden_dim,
|
||||
hidden_dim_bytes=hidden_dim * a.dtype.itemsize,
|
||||
hidden_dim_scale_bytes=0,
|
||||
)
|
||||
|
||||
if group_name is None:
|
||||
ata = AllToAll.internode(**args)
|
||||
else:
|
||||
args["group_name"] = group_name
|
||||
ata = AllToAll.intranode(**args)
|
||||
|
||||
topk_ids = topk_ids.to(dtype=torch.uint32)
|
||||
|
||||
prepare_finalize, ata = create_pplx_prepare_finalize(
|
||||
num_tokens,
|
||||
hidden_dim,
|
||||
topk,
|
||||
num_experts,
|
||||
prepare_finalize = PplxPrepareAndFinalize(
|
||||
ata,
|
||||
max_num_tokens,
|
||||
world_size,
|
||||
rank,
|
||||
dp_size,
|
||||
world_size,
|
||||
a.dtype,
|
||||
quant_dtype,
|
||||
block_shape,
|
||||
per_act_token_quant,
|
||||
group_name,
|
||||
)
|
||||
|
||||
assert a.shape[0] == topk_ids.shape[0]
|
||||
|
||||
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
|
||||
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
|
||||
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
|
||||
|
||||
assert a_chunk.shape[0] == chunk_topk_ids.shape[0]
|
||||
|
||||
out = torch.full(
|
||||
a_chunk.shape,
|
||||
torch.nan,
|
||||
dtype=a.dtype,
|
||||
device=device,
|
||||
)
|
||||
|
||||
if (quant_dtype is not None and not per_act_token_quant
|
||||
and block_shape is None):
|
||||
a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||
a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||
else:
|
||||
a1_scale = None
|
||||
a2_scale = None
|
||||
|
||||
b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare(
|
||||
a_chunk,
|
||||
a1_scale,
|
||||
a2_scale,
|
||||
None,
|
||||
None,
|
||||
chunk_topk_weight,
|
||||
chunk_topk_ids,
|
||||
num_experts,
|
||||
None,
|
||||
False,
|
||||
FusedMoEQuantConfig(
|
||||
quant_dtype,
|
||||
per_act_token_quant,
|
||||
False,
|
||||
block_shape,
|
||||
),
|
||||
FusedMoEQuantConfig(),
|
||||
)
|
||||
|
||||
b_a = dummy_work(
|
||||
dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype))
|
||||
b_a = b_a * 1.5
|
||||
|
||||
out = torch.full(
|
||||
(max_num_tokens, hidden_dim),
|
||||
torch.nan,
|
||||
dtype=a.dtype,
|
||||
device=device,
|
||||
)
|
||||
|
||||
prepare_finalize.finalize(
|
||||
out,
|
||||
@ -389,96 +291,70 @@ def _pplx_prepare_finalize(
|
||||
score: torch.Tensor,
|
||||
topk: torch.Tensor,
|
||||
num_experts: int,
|
||||
quant_dtype: Optional[torch.dtype],
|
||||
block_shape: Optional[list[int]],
|
||||
per_act_token_quant: bool,
|
||||
use_internode: bool,
|
||||
):
|
||||
try:
|
||||
if use_internode:
|
||||
uid = nvshmem_get_unique_id(
|
||||
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
||||
torch.distributed.broadcast(uid, src=0)
|
||||
nvshmem_init(uid, pgi.rank, pgi.world_size)
|
||||
group_name = None
|
||||
else:
|
||||
group_ranks = list(range(pgi.world_size))
|
||||
cpu_group = torch.distributed.new_group(group_ranks,
|
||||
backend="gloo")
|
||||
group_name = cpu_group.group_name
|
||||
if use_internode:
|
||||
uid = nvshmem_get_unique_id(
|
||||
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
||||
torch.distributed.broadcast(uid, src=0)
|
||||
nvshmem_init(uid, pgi.rank, pgi.world_size)
|
||||
group_name = None
|
||||
else:
|
||||
group_ranks = list(range(pgi.world_size))
|
||||
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
|
||||
group_name = cpu_group.group_name
|
||||
|
||||
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
||||
m, k = a.shape
|
||||
device = pgi.device
|
||||
|
||||
a_rep = torch.repeat_interleave(dummy_work(a), topk, dim=0)
|
||||
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
||||
k = a.shape[1]
|
||||
|
||||
torch_output = (a_rep.view(m, topk, k) *
|
||||
topk_weight.view(m, topk, 1).to(a_rep.dtype)).sum(
|
||||
dim=1)
|
||||
a_rep = torch.repeat_interleave(a, topk, dim=0).to(device)
|
||||
|
||||
pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight,
|
||||
topk_ids, num_experts, quant_dtype,
|
||||
block_shape, per_act_token_quant,
|
||||
group_name)
|
||||
torch_output = (a_rep.view(-1, topk, k) * 1.5 *
|
||||
topk_weight.view(-1, topk, 1).to(device)).sum(dim=1).to(
|
||||
a.dtype)
|
||||
|
||||
torch_output = chunk_by_rank(torch_output, pgi.rank,
|
||||
pgi.world_size).to(pgi.device)
|
||||
pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight, topk_ids,
|
||||
num_experts, group_name)
|
||||
|
||||
torch.testing.assert_close(pplx_output,
|
||||
torch_output,
|
||||
atol=3e-2,
|
||||
rtol=3e-2)
|
||||
finally:
|
||||
if use_internode:
|
||||
nvshmem_finalize()
|
||||
torch_output = chunk_by_rank(torch_output, pgi.rank,
|
||||
pgi.world_size).to(pplx_output.device)
|
||||
|
||||
torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0)
|
||||
|
||||
if use_internode:
|
||||
nvshmem_finalize()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mnk", PPLX_COMBOS)
|
||||
# TODO (bnell): this test point does not work for odd M due to how the test is
|
||||
# written, not due to limitations of the pplx kernels. The pplx_moe
|
||||
# test below is able to deal with odd M.
|
||||
# TODO (bnell) add fp8 tests
|
||||
@pytest.mark.parametrize("mnk", PPLX_PREPARE_COMBOS)
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
|
||||
@pytest.mark.parametrize("use_internode", [False])
|
||||
@pytest.mark.optional
|
||||
@requires_pplx
|
||||
def test_pplx_prepare_finalize_slow(
|
||||
def test_pplx_prepare_finalize(
|
||||
mnk: tuple[int, int, int],
|
||||
e: int,
|
||||
topk: int,
|
||||
dtype: torch.dtype,
|
||||
world_dp_size: tuple[int, int],
|
||||
per_act_token_quant: bool,
|
||||
block_shape: Optional[list[int]],
|
||||
use_internode: bool,
|
||||
):
|
||||
if dtype == torch.float8_e4m3fn:
|
||||
use_fp8_w8a8 = True
|
||||
act_dtype = torch.bfloat16
|
||||
quant_dtype = dtype
|
||||
else:
|
||||
use_fp8_w8a8 = False
|
||||
act_dtype = dtype
|
||||
quant_dtype = None
|
||||
|
||||
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
|
||||
pytest.skip("Skip quantization test for non-quantized type")
|
||||
|
||||
if per_act_token_quant and block_shape is not None:
|
||||
pytest.skip("Skip illegal quantization combination")
|
||||
|
||||
current_platform.seed_everything(7)
|
||||
m, n, k = mnk
|
||||
world_size, dp_size = world_dp_size
|
||||
device = "cuda"
|
||||
|
||||
a = torch.randn((m, k), device=device, dtype=act_dtype) / 10
|
||||
score = torch.randn((m, e), device=device, dtype=act_dtype)
|
||||
a = torch.randn((m, k), device=device, dtype=dtype) / 10
|
||||
score = torch.randn((m, e), device=device, dtype=dtype)
|
||||
|
||||
parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, score,
|
||||
topk, e, quant_dtype, block_shape, per_act_token_quant,
|
||||
use_internode)
|
||||
topk, e, use_internode)
|
||||
|
||||
|
||||
def pplx_moe(
|
||||
@ -493,62 +369,84 @@ def pplx_moe(
|
||||
topk_ids: torch.Tensor,
|
||||
w1_scale: Optional[torch.Tensor] = None,
|
||||
w2_scale: Optional[torch.Tensor] = None,
|
||||
a1_scale: Optional[torch.Tensor] = None,
|
||||
a2_scale: Optional[torch.Tensor] = None,
|
||||
quant_dtype: Optional[torch.dtype] = None,
|
||||
qtype: Optional[torch.dtype] = None,
|
||||
per_act_token_quant=False,
|
||||
block_shape: Optional[list[int]] = None,
|
||||
use_compile: bool = False,
|
||||
use_cudagraphs: bool = True,
|
||||
) -> torch.Tensor:
|
||||
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
|
||||
PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes)
|
||||
|
||||
num_tokens, hidden_dim = a.shape
|
||||
device = torch.device("cuda", rank)
|
||||
hidden_dim = a.shape[1]
|
||||
num_experts = w1.shape[0]
|
||||
topk = topk_ids.shape[1]
|
||||
max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 16)
|
||||
max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 64)
|
||||
|
||||
prepare_finalize, ata = create_pplx_prepare_finalize(
|
||||
num_tokens,
|
||||
hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
|
||||
max_num_tokens,
|
||||
hidden_dim,
|
||||
topk,
|
||||
num_experts,
|
||||
rank,
|
||||
dp_size,
|
||||
world_size,
|
||||
a.dtype,
|
||||
quant_dtype,
|
||||
block_shape,
|
||||
per_act_token_quant,
|
||||
group_name,
|
||||
qtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
args = dict(
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_experts=num_experts,
|
||||
experts_per_token=topk,
|
||||
rank=rank,
|
||||
world_size=world_size,
|
||||
dp_size=dp_size,
|
||||
hidden_dim=hidden_dim,
|
||||
hidden_dim_bytes=hidden_dim_bytes,
|
||||
hidden_dim_scale_bytes=scale_bytes,
|
||||
)
|
||||
|
||||
if group_name is None:
|
||||
ata = AllToAll.internode(**args)
|
||||
else:
|
||||
args["group_name"] = group_name
|
||||
ata = AllToAll.intranode(**args)
|
||||
|
||||
topk_ids = topk_ids.to(dtype=torch.uint32)
|
||||
|
||||
experts = BatchedTritonExperts(
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_dispatchers=prepare_finalize.num_dispatchers(),
|
||||
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
|
||||
block_shape=block_shape,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
prepare_finalize = PplxPrepareAndFinalize(
|
||||
ata,
|
||||
max_num_tokens,
|
||||
world_size,
|
||||
rank,
|
||||
dp_size,
|
||||
)
|
||||
|
||||
experts = BatchedTritonExperts(max_num_tokens=max_num_tokens,
|
||||
world_size=world_size,
|
||||
dp_size=dp_size,
|
||||
use_fp8_w8a8=qtype == torch.float8_e4m3fn,
|
||||
block_shape=block_shape)
|
||||
|
||||
fused_experts = FusedMoEModularKernel(
|
||||
prepare_finalize,
|
||||
experts,
|
||||
)
|
||||
|
||||
# Note: workers with the same dp_rank must use the exact same inputs.
|
||||
a_chunk = chunk_by_rank(a, rank, world_size)
|
||||
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size)
|
||||
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size)
|
||||
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
|
||||
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
|
||||
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
|
||||
|
||||
# Chunking weights like this only works for batched format
|
||||
w1_chunk = chunk_by_rank(w1, rank, world_size)
|
||||
w2_chunk = chunk_by_rank(w2, rank, world_size)
|
||||
w1_scale_chunk = maybe_chunk_by_rank(w1_scale, rank, world_size)
|
||||
w2_scale_chunk = maybe_chunk_by_rank(w2_scale, rank, world_size)
|
||||
a1_scale_chunk = chunk_scales_by_rank(a1_scale, rank, world_size)
|
||||
a2_scale_chunk = chunk_scales_by_rank(a2_scale, rank, world_size)
|
||||
w1_chunk = chunk_by_rank(w1, rank, world_size).to(device)
|
||||
w2_chunk = chunk_by_rank(w2, rank, world_size).to(device)
|
||||
|
||||
if w1_scale is not None:
|
||||
w1_scale_chunk = chunk_by_rank(w1_scale, rank, world_size).to(device)
|
||||
w2_scale_chunk = chunk_by_rank(w2_scale, rank, world_size).to(device)
|
||||
else:
|
||||
w1_scale_chunk = None
|
||||
w2_scale_chunk = None
|
||||
|
||||
# Note: for now use_compile will error out if the problem size is
|
||||
# large enough to trigger chunking. I'm leaving the flag and
|
||||
@ -570,8 +468,6 @@ def pplx_moe(
|
||||
chunk_topk_ids,
|
||||
w1_scale=w1_scale_chunk,
|
||||
w2_scale=w2_scale_chunk,
|
||||
a1_scale=a1_scale_chunk,
|
||||
a2_scale=a2_scale_chunk,
|
||||
global_num_experts=num_experts)
|
||||
|
||||
if use_cudagraphs:
|
||||
@ -586,8 +482,6 @@ def pplx_moe(
|
||||
chunk_topk_ids,
|
||||
w1_scale=w1_scale_chunk,
|
||||
w2_scale=w2_scale_chunk,
|
||||
a1_scale=a1_scale_chunk,
|
||||
a2_scale=a2_scale_chunk,
|
||||
global_num_experts=num_experts)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
@ -600,6 +494,48 @@ def pplx_moe(
|
||||
return out
|
||||
|
||||
|
||||
def _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids):
|
||||
assert torch.cuda.current_device() == pgi.local_rank
|
||||
|
||||
num_experts = w1.shape[0]
|
||||
device = pgi.device
|
||||
rank = pgi.rank
|
||||
world_size = pgi.world_size
|
||||
max_num_tokens = rank_chunk(a.shape[0], 0, world_size)
|
||||
|
||||
prepare_finalize = BatchedPrepareAndFinalize(
|
||||
max_num_tokens=max_num_tokens,
|
||||
world_size=world_size,
|
||||
dp_size=dp_size,
|
||||
rank=rank,
|
||||
)
|
||||
|
||||
experts = NaiveBatchedExperts(max_num_tokens=a.shape[0],
|
||||
world_size=1,
|
||||
dp_size=1)
|
||||
|
||||
fused_experts = FusedMoEModularKernel(
|
||||
prepare_finalize,
|
||||
experts,
|
||||
)
|
||||
|
||||
# Note: workers with the same dp_rank must use the exact same inputs.
|
||||
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
|
||||
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
|
||||
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
|
||||
|
||||
out = fused_experts(
|
||||
a_chunk,
|
||||
# Chunking weights like this only works for batched format
|
||||
chunk_by_rank(w1, rank, world_size).to(device),
|
||||
chunk_by_rank(w2, rank, world_size).to(device),
|
||||
chunk_topk_weight,
|
||||
chunk_topk_ids,
|
||||
global_num_experts=num_experts)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def _pplx_moe(
|
||||
pgi: ProcessGroupInfo,
|
||||
dp_size: int,
|
||||
@ -608,130 +544,75 @@ def _pplx_moe(
|
||||
w2: torch.Tensor,
|
||||
score: torch.Tensor,
|
||||
topk: int,
|
||||
num_experts: int,
|
||||
w1_s: Optional[torch.Tensor] = None,
|
||||
w2_s: Optional[torch.Tensor] = None,
|
||||
quant_dtype: Optional[torch.dtype] = None,
|
||||
qtype: Optional[torch.dtype] = None,
|
||||
per_act_token_quant: bool = False,
|
||||
block_shape: Optional[list[int]] = None,
|
||||
use_internode: bool = False,
|
||||
):
|
||||
try:
|
||||
if use_internode:
|
||||
uid = nvshmem_get_unique_id(
|
||||
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
||||
torch.distributed.broadcast(uid, src=0)
|
||||
nvshmem_init(uid, pgi.rank, pgi.world_size)
|
||||
group_name = None
|
||||
else:
|
||||
group_ranks = list(range(pgi.world_size))
|
||||
cpu_group = torch.distributed.new_group(group_ranks,
|
||||
backend="gloo")
|
||||
group_name = cpu_group.group_name
|
||||
if use_internode:
|
||||
uid = nvshmem_get_unique_id(
|
||||
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
||||
torch.distributed.broadcast(uid, src=0)
|
||||
nvshmem_init(uid, pgi.rank, pgi.world_size)
|
||||
group_name = None
|
||||
else:
|
||||
group_ranks = list(range(pgi.world_size))
|
||||
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
|
||||
group_name = cpu_group.group_name
|
||||
|
||||
m, k = a.shape
|
||||
e, _, n = w2.shape
|
||||
m, k = a.shape
|
||||
e, _, n = w2.shape
|
||||
|
||||
moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)
|
||||
moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)
|
||||
|
||||
device = torch.device("cuda", pgi.rank)
|
||||
rank = pgi.rank
|
||||
world_size = pgi.world_size
|
||||
device = torch.device("cuda", pgi.rank)
|
||||
a = a.to(device)
|
||||
w1 = w1.to(device)
|
||||
w2 = w2.to(device)
|
||||
w1_s = w1_s.to(device) if w1_s is not None else None
|
||||
w2_s = w2_s.to(device) if w2_s is not None else None
|
||||
|
||||
a = a.to(device)
|
||||
w1 = w1.to(device)
|
||||
w2 = w2.to(device)
|
||||
w1_s = w1_s.to(device) if w1_s is not None else None
|
||||
w2_s = w2_s.to(device) if w2_s is not None else None
|
||||
with set_current_vllm_config(vllm_config), override_config(moe_config):
|
||||
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
||||
torch_output = torch_experts(a,
|
||||
w1,
|
||||
w2,
|
||||
topk_weight,
|
||||
topk_ids,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
quant_dtype=qtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape)
|
||||
pplx_output = pplx_moe(group_name, pgi.rank, pgi.world_size, dp_size,
|
||||
a, w1, w2, topk_weight, topk_ids, w1_s, w2_s,
|
||||
qtype, per_act_token_quant, block_shape)
|
||||
# TODO (bnell): fix + re-enable
|
||||
#batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight,
|
||||
# topk_ids)
|
||||
|
||||
if (quant_dtype is not None and not per_act_token_quant
|
||||
and block_shape is None):
|
||||
a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||
a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||
else:
|
||||
a1_scale = None
|
||||
a2_scale = None
|
||||
torch_output = chunk_by_rank(torch_output, pgi.rank,
|
||||
pgi.world_size).to(pplx_output.device)
|
||||
|
||||
with set_current_vllm_config(vllm_config), override_config(moe_config):
|
||||
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
||||
torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0)
|
||||
#torch.testing.assert_close(batched_output, torch_output, atol=2e-2, rtol=0)
|
||||
|
||||
torch_output = torch_experts(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
topk_weight,
|
||||
topk_ids,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
batched_output = naive_batched_moe(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
topk_weight,
|
||||
topk_ids,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
pplx_output = pplx_moe(
|
||||
group_name,
|
||||
rank,
|
||||
world_size,
|
||||
dp_size,
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
topk_weight,
|
||||
topk_ids,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
chunked_batch_output = chunk_by_rank(
|
||||
batched_output, pgi.rank, pgi.world_size).to(pplx_output.device)
|
||||
|
||||
torch.testing.assert_close(batched_output,
|
||||
torch_output,
|
||||
atol=3e-2,
|
||||
rtol=3e-2)
|
||||
|
||||
torch.testing.assert_close(pplx_output,
|
||||
chunked_batch_output,
|
||||
atol=3e-2,
|
||||
rtol=3e-2)
|
||||
finally:
|
||||
if use_internode:
|
||||
nvshmem_finalize()
|
||||
if use_internode:
|
||||
nvshmem_finalize()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mnk", PPLX_COMBOS)
|
||||
@pytest.mark.parametrize("mnk", PPLX_MOE_COMBOS)
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
|
||||
@pytest.mark.parametrize("use_internode", [False])
|
||||
@pytest.mark.optional
|
||||
@requires_pplx
|
||||
def test_pplx_moe_slow(
|
||||
def test_pplx_moe(
|
||||
mnk: tuple[int, int, int],
|
||||
e: int,
|
||||
topk: int,
|
||||
@ -752,143 +633,18 @@ def test_pplx_moe_slow(
|
||||
use_fp8_w8a8 = False
|
||||
quant_dtype = None
|
||||
|
||||
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
|
||||
if not use_fp8_w8a8 and per_act_token_quant and block_shape is not None:
|
||||
pytest.skip("Skip quantization test for non-quantized type")
|
||||
|
||||
if per_act_token_quant and block_shape is not None:
|
||||
pytest.skip("Skip illegal quantization combination")
|
||||
|
||||
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
|
||||
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
|
||||
|
||||
_, w1, w1_s, _, w2, w2_s = make_test_weights(
|
||||
e,
|
||||
n,
|
||||
k,
|
||||
quant_dtype=quant_dtype,
|
||||
block_shape=block_shape,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
)
|
||||
_, w1, w1_s, _, w2, w2_s = make_test_weights(e,
|
||||
n,
|
||||
k,
|
||||
quant_dtype=quant_dtype,
|
||||
block_shape=block_shape)
|
||||
|
||||
parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, e,
|
||||
parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk,
|
||||
w1_s, w2_s, quant_dtype, per_act_token_quant, block_shape,
|
||||
use_internode)
|
||||
|
||||
|
||||
def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
|
||||
make_weights: bool, test_fn: Callable):
|
||||
|
||||
def format_result(msg, ex=None):
|
||||
if ex is not None:
|
||||
x = str(ex)
|
||||
newx = x.strip(" \n\t")[:16]
|
||||
if len(newx) < len(x):
|
||||
newx = newx + " ..."
|
||||
|
||||
prefix = "E\t"
|
||||
print(f"{textwrap.indent(traceback.format_exc(), prefix)}")
|
||||
print(f"FAILED {msg} - {newx}\n")
|
||||
else:
|
||||
print(f"PASSED {msg}")
|
||||
|
||||
current_platform.seed_everything(7)
|
||||
combos = itertools.product(PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES,
|
||||
[False, True], [None, [128, 128]])
|
||||
exceptions = []
|
||||
count = 0
|
||||
for mnk, e, topk, dtype, per_act_token_quant, block_shape in combos:
|
||||
count = count + 1
|
||||
m, n, k = mnk
|
||||
|
||||
if dtype == torch.float8_e4m3fn:
|
||||
use_fp8_w8a8 = True
|
||||
quant_dtype = dtype
|
||||
else:
|
||||
use_fp8_w8a8 = False
|
||||
quant_dtype = None
|
||||
|
||||
test_desc = (f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, "
|
||||
f"dtype={dtype}, per_act_token={per_act_token_quant}, "
|
||||
f"block_shape={block_shape}")
|
||||
|
||||
if not use_fp8_w8a8 and (per_act_token_quant
|
||||
or block_shape is not None):
|
||||
print(
|
||||
f"{test_desc} - Skip quantization test for non-quantized type."
|
||||
)
|
||||
continue
|
||||
|
||||
if per_act_token_quant and block_shape is not None:
|
||||
print(f"{test_desc} - Skip illegal quantization combination.")
|
||||
continue
|
||||
|
||||
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
|
||||
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
|
||||
|
||||
args = dict()
|
||||
if make_weights:
|
||||
_, w1, w1_s, _, w2, w2_s = make_test_weights(
|
||||
e,
|
||||
n,
|
||||
k,
|
||||
quant_dtype=quant_dtype,
|
||||
block_shape=block_shape,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
)
|
||||
args["w1"] = w1
|
||||
args["w2"] = w2
|
||||
args["w1_s"] = w1_s
|
||||
args["w2_s"] = w2_s
|
||||
|
||||
try:
|
||||
test_fn(
|
||||
pgi=pgi,
|
||||
dp_size=dp_size,
|
||||
a=a,
|
||||
score=score,
|
||||
topk=topk,
|
||||
num_experts=e,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
use_internode=use_internode,
|
||||
**args,
|
||||
)
|
||||
format_result(test_desc)
|
||||
except Exception as ex:
|
||||
format_result(test_desc, ex)
|
||||
exceptions.append(ex)
|
||||
|
||||
if len(exceptions) > 0:
|
||||
raise RuntimeError(
|
||||
f"{len(exceptions)} of {count} tests failed in child process, "
|
||||
f"rank={pgi.rank}.")
|
||||
else:
|
||||
print(f"{count} of {count} tests passed in child process, "
|
||||
f"rank={pgi.rank}.")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||
@pytest.mark.parametrize("use_internode", [False])
|
||||
@requires_pplx
|
||||
def test_pplx_prepare_finalize(
|
||||
world_dp_size: tuple[int, int],
|
||||
use_internode: bool,
|
||||
):
|
||||
current_platform.seed_everything(7)
|
||||
world_size, dp_size = world_dp_size
|
||||
parallel_launch(world_size * dp_size, _pplx_test_loop, dp_size,
|
||||
use_internode, False, _pplx_prepare_finalize)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||
@pytest.mark.parametrize("use_internode", [False])
|
||||
@requires_pplx
|
||||
def test_pplx_moe(
|
||||
world_dp_size: tuple[int, int],
|
||||
use_internode: bool,
|
||||
):
|
||||
current_platform.seed_everything(7)
|
||||
world_size, dp_size = world_dp_size
|
||||
parallel_launch(world_size, _pplx_test_loop, dp_size, use_internode, True,
|
||||
_pplx_moe)
|
||||
|
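Note: `chunk_by_rank` above is only used to slice the reference tensors into per-rank shards before comparison. A minimal sketch of the assumed semantics (an even split along dim 0); this is illustrative, not the actual test utility:

import torch

def chunk_by_rank(t: torch.Tensor, rank: int, world_size: int) -> torch.Tensor:
    # Assumes the leading dimension divides evenly across world_size ranks.
    chunk = t.shape[0] // world_size
    return t[rank * chunk:(rank + 1) * chunk]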
@ -63,12 +63,13 @@ def batched_moe(
|
||||
|
||||
fused_experts = FusedMoEModularKernel(
|
||||
BatchedPrepareAndFinalize(max_num_tokens,
|
||||
num_dispatchers=1,
|
||||
num_local_experts=w1.shape[0],
|
||||
world_size=1,
|
||||
dp_size=1,
|
||||
rank=0),
|
||||
BatchedTritonExperts(
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_dispatchers=1,
|
||||
world_size=1,
|
||||
dp_size=1,
|
||||
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
@ -104,12 +105,13 @@ def naive_batched_moe(
|
||||
|
||||
fused_experts = FusedMoEModularKernel(
|
||||
BatchedPrepareAndFinalize(max_num_tokens,
|
||||
num_dispatchers=1,
|
||||
num_local_experts=w1.shape[0],
|
||||
world_size=1,
|
||||
dp_size=1,
|
||||
rank=0),
|
||||
NaiveBatchedExperts(
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_dispatchers=1,
|
||||
dp_size=1,
|
||||
world_size=1,
|
||||
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
|
@ -277,24 +277,6 @@ def dequant(
|
||||
return t.to(out_dtype)
|
||||
|
||||
|
||||
def batched_dequant(
|
||||
t: torch.Tensor,
|
||||
scale: Optional[torch.Tensor],
|
||||
block_shape: Optional[list[int]],
|
||||
per_act_token_quant: bool,
|
||||
out_dtype: Optional[torch.dtype] = torch.float32,
|
||||
) -> torch.Tensor:
|
||||
if scale is not None:
|
||||
assert t.shape[0] == scale.shape[0]
|
||||
out = torch.empty_like(t, dtype=out_dtype)
|
||||
for e in range(t.shape[0]):
|
||||
out[e] = dequant(t[e], scale[e], block_shape, per_act_token_quant,
|
||||
out_dtype)
|
||||
return out
|
||||
|
||||
return t.to(out_dtype)
|
||||
|
||||
|
||||
def native_batched_masked_quant_matmul(
|
||||
A: torch.Tensor,
|
||||
B: torch.Tensor,
|
||||
|
@ -1,5 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
@ -75,51 +74,3 @@ def test_apply_repetition_penalties(
    # Test the operator by applying the opcheck utility
    opcheck(torch.ops._C.apply_repetition_penalties_,
            (logits.clone(), prompt_mask, output_mask, repetition_penalties))


@pytest.mark.skipif(not current_platform.is_cuda(),
                    reason="This test for checking CUDA kernel")
@torch.inference_mode()
def test_apply_repetition_penalties_zero_seqs() -> None:
    """
    Test the apply_repetition_penalties custom op with num_seqs=0
    against a reference implementation.
    """
    num_seqs = 0
    vocab_size = 17
    repetition_penalty = 1.05
    dtype = torch.float32
    seed = 0

    current_platform.seed_everything(seed)
    torch.set_default_device("cuda:0")

    # Create test data
    logits = torch.randn(num_seqs, vocab_size, dtype=dtype)

    # Create masks with some random tokens marked as repeated
    prompt_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
    output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)

    # No tokens to mark as repeated since num_seqs=0

    # Create repetition penalties tensor
    repetition_penalties = torch.full((num_seqs, ),
                                      repetition_penalty,
                                      dtype=dtype)

    # Run all three implementations
    logits_torch = logits.clone()
    logits_cuda = logits.clone()

    apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask,
                                     repetition_penalties)
    apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask,
                                    repetition_penalties)

    # Compare all outputs to reference
    torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3)

    # Test the operator by applying the opcheck utility
    opcheck(torch.ops._C.apply_repetition_penalties_,
            (logits.clone(), prompt_mask, output_mask, repetition_penalties))
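Note: the test above only checks that the torch and CUDA paths agree. A minimal torch-only sketch of the CTRL-style penalty they are presumably expected to implement (an illustration of the assumed semantics, not the vLLM reference code):

import torch

def apply_repetition_penalties_ref(logits: torch.Tensor,
                                   prompt_mask: torch.Tensor,
                                   output_mask: torch.Tensor,
                                   repetition_penalties: torch.Tensor) -> None:
    # Penalize, in place, every token already seen in the prompt or the output.
    repeated = prompt_mask | output_mask              # [num_seqs, vocab_size]
    penalties = repetition_penalties.unsqueeze(1)     # [num_seqs, 1]
    # Positive logits shrink, negative logits move further below zero.
    penalized = torch.where(logits > 0, logits / penalties, logits * penalties)
    logits[:] = torch.where(repeated, penalized, logits)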
@ -1,5 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Integration tests for FlexAttention backend vs default backend"""
|
||||
|
||||
import random
|
||||
|
@ -1094,8 +1094,6 @@ def torch_experts(
|
||||
if expert_map is not None:
|
||||
topk_ids = expert_map[topk_ids]
|
||||
|
||||
f32 = torch.float32
|
||||
|
||||
for i in range(num_experts):
|
||||
mask = topk_ids == i
|
||||
if mask.sum():
|
||||
@ -1111,8 +1109,7 @@ def torch_experts(
|
||||
out.dtype)
|
||||
tmp2 = SiluAndMul()(tmp1)
|
||||
tmp2, b_scale = moe_kernel_quantize_input(
|
||||
tmp2, a2_scale, quant_dtype, per_act_token_quant,
|
||||
block_shape)
|
||||
tmp2, None, quant_dtype, per_act_token_quant, block_shape)
|
||||
|
||||
out[mask] = native_w8a8_block_matmul(tmp2, w2[i], b_scale,
|
||||
w2_scale[i], block_shape,
|
||||
@ -1120,6 +1117,7 @@ def torch_experts(
|
||||
else:
|
||||
assert (a_scale is not None and w1_scale is not None
|
||||
and w2_scale is not None)
|
||||
f32 = torch.float32
|
||||
scales = a_scale if a_scale.numel() == 1 else a_scale[mask]
|
||||
tmp1 = a[mask].to(f32) * scales
|
||||
w1_dq = (w1[i].to(f32) * w1_scale[i]).transpose(0, 1)
|
||||
@ -1128,8 +1126,8 @@ def torch_experts(
|
||||
w2_dq = (w2[i].to(f32) * w2_scale[i]).transpose(0, 1)
|
||||
out[mask] = (tmp2 @ w2_dq).to(out.dtype)
|
||||
|
||||
return (out.view(M, -1, w2.shape[1]).to(f32) *
|
||||
topk_weight.view(M, -1, 1)).sum(dim=1).to(out.dtype)
|
||||
return (out.view(M, -1, w2.shape[1]) *
|
||||
topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1)
|
||||
|
||||
|
||||
def torch_moe(a: torch.Tensor,
|
||||
|
@ -249,6 +249,23 @@ def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
|
||||
model_runner.model)
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
|
||||
def run_with_both_engines_lora(request, monkeypatch):
|
||||
# Automatically runs tests twice, once with V1 and once without
|
||||
use_v1 = request.param
|
||||
# Tests decorated with `@skip_v1` are only run without v1
|
||||
skip_v1 = request.node.get_closest_marker("skip_v1")
|
||||
|
||||
if use_v1:
|
||||
if skip_v1:
|
||||
pytest.skip("Skipping test on vllm V1")
|
||||
monkeypatch.setenv('VLLM_USE_V1', '1')
|
||||
else:
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
yield
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def reset_default_device():
|
||||
"""
|
||||
|
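Note: a hypothetical illustration of how a LoRA test module could opt into the run_with_both_engines_lora fixture added above. The autouse wrapper and the skip_v1 marker follow the fixture's own checks; the wrapper and test names are made up:

import pytest

@pytest.fixture(autouse=True)
def run_both_engines(run_with_both_engines_lora):
    # Pull in the parametrized fixture so every test in this module runs
    # once with VLLM_USE_V1=1 and once with VLLM_USE_V1=0.
    yield

@pytest.mark.skip_v1
def test_v0_only_lora_feature():
    # Runs only in the V0 pass; the fixture skips the V1 parametrization.
    ...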
@ -3,7 +3,6 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.registry import HF_EXAMPLE_MODELS
|
||||
from tests.utils import multi_gpu_test
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.sampling_params import SamplingParams
|
||||
@ -20,55 +19,31 @@ pytestmark = pytest.mark.hybrid_model
|
||||
SSM_MODELS = [
|
||||
"state-spaces/mamba-130m-hf",
|
||||
"tiiuae/falcon-mamba-tiny-dev",
|
||||
# TODO: Compare to a Mamba2 model. The HF transformers implementation of
|
||||
# Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
|
||||
# doesn't compare vLLM output with HF output.
|
||||
# See https://github.com/huggingface/transformers/pull/35943
|
||||
"mistralai/Mamba-Codestral-7B-v0.1",
|
||||
]
|
||||
|
||||
HYBRID_MODELS = [
|
||||
"ai21labs/Jamba-tiny-dev",
|
||||
# NOTE: Currently the test failes due to HF transformers issue fixed in:
|
||||
# https://github.com/huggingface/transformers/pull/39033
|
||||
# We will enable vLLM test for Granite after next HF transformers release.
|
||||
# "ibm-granite/granite-4.0-tiny-preview",
|
||||
# NOTE: Running Plamo2 in transformers implementation requires to install
|
||||
# causal-conv1d package, which is not listed as a test dependency as it's
|
||||
# not compatible with pip-compile.
|
||||
"pfnet/plamo-2-1b",
|
||||
"Zyphra/Zamba2-1.2B-instruct",
|
||||
"hmellor/tiny-random-BambaForCausalLM",
|
||||
"ibm-ai-platform/Bamba-9B-v1",
|
||||
"nvidia/Nemotron-H-8B-Base-8K",
|
||||
"ibm-granite/granite-4.0-tiny-preview",
|
||||
"tiiuae/Falcon-H1-0.5B-Base",
|
||||
]
|
||||
|
||||
HF_UNSUPPORTED_MODELS = [
|
||||
# The HF transformers implementation of
|
||||
# Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
|
||||
# doesn't compare vLLM output with HF output.
|
||||
# See https://github.com/huggingface/transformers/pull/35943
|
||||
"mistralai/Mamba-Codestral-7B-v0.1",
|
||||
# Note: I'm not seeing the same output from vLLM V0 vs. HF transformers
|
||||
# for Nemotron-H-8B; currently only compare vLLM V0 vs. vLLM V1
|
||||
"nvidia/Nemotron-H-8B-Base-8K",
|
||||
# NOTE: Currently the test fails due to HF transformers issue fixed in:
|
||||
# https://github.com/huggingface/transformers/pull/39033
|
||||
# We will enable vLLM test for Granite after next HF transformers release.
|
||||
"ibm-granite/granite-4.0-tiny-preview",
|
||||
]
|
||||
|
||||
V1_SUPPORTED_MODELS = [
|
||||
"mistralai/Mamba-Codestral-7B-v0.1",
|
||||
"ibm-ai-platform/Bamba-9B-v1",
|
||||
"Zyphra/Zamba2-1.2B-instruct",
|
||||
"nvidia/Nemotron-H-8B-Base-8K",
|
||||
"ibm-granite/granite-4.0-tiny-preview",
|
||||
"tiiuae/Falcon-H1-0.5B-Base",
|
||||
]
|
||||
|
||||
ATTN_BLOCK_SIZES = {
|
||||
"ibm-ai-platform/Bamba-9B-v1": 528,
|
||||
"Zyphra/Zamba2-1.2B-instruct": 80,
|
||||
"nvidia/Nemotron-H-8B-Base-8K": 528,
|
||||
"ibm-granite/granite-4.0-tiny-preview": 400,
|
||||
"tiiuae/Falcon-H1-0.5B-Base": 800,
|
||||
}
|
||||
|
||||
# Avoid OOM
|
||||
MAX_NUM_SEQS = 4
|
||||
|
||||
@ -85,16 +60,8 @@ def test_models(
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
if model not in HF_UNSUPPORTED_MODELS:
|
||||
if model != "mistralai/Mamba-Codestral-7B-v0.1":
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
else:
|
||||
@ -105,21 +72,12 @@ def test_models(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
if model in V1_SUPPORTED_MODELS:
|
||||
if model in HYBRID_MODELS and model in ATTN_BLOCK_SIZES:
|
||||
block_size = ATTN_BLOCK_SIZES[model]
|
||||
else:
|
||||
block_size = 16
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
if model in HYBRID_MODELS:
|
||||
# required due to reorder_batch behaviour
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
|
||||
with vllm_runner(model,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
enforce_eager=True,
|
||||
enable_prefix_caching=False,
|
||||
block_size=block_size) as vllm_model:
|
||||
enable_prefix_caching=False) as vllm_model:
|
||||
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
else:
|
||||
@ -153,14 +111,6 @@ def test_batching(
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
for_loop_outputs = []
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
for prompt in example_prompts:
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
@ -75,13 +74,6 @@ def test_models(
|
||||
vllm_extra_kwargs["override_pooler_config"] = \
|
||||
PoolerConfig(pooling_type="MEAN", normalize=False)
|
||||
|
||||
max_model_len: Optional[int] = 512
|
||||
if model in [
|
||||
"sentence-transformers/all-MiniLM-L12-v2",
|
||||
"sentence-transformers/stsb-roberta-base-v2"
|
||||
]:
|
||||
max_model_len = None
|
||||
|
||||
# The example_prompts has ending "\n", for example:
|
||||
# "Write a short story about a robot that dreams for the first time.\n"
|
||||
# sentence_transformers will strip the input texts, see:
|
||||
@ -95,7 +87,7 @@ def test_models(
|
||||
|
||||
with vllm_runner(model,
|
||||
task="embed",
|
||||
max_model_len=max_model_len,
|
||||
max_model_len=512,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
|
@ -56,16 +56,10 @@ MODELS = [
|
||||
enable_test=False),
|
||||
]
|
||||
|
||||
V1FlashAttentionImpNotSupported = [
|
||||
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-modernbert-base"
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo,
|
||||
monkeypatch) -> None:
|
||||
if model_info.name in V1FlashAttentionImpNotSupported:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
|
||||
vllm_extra_kwargs: dict[str, Any] = {}
|
||||
if model_info.architecture == "GteNewModel":
|
||||
@ -77,10 +71,8 @@ def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo,
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_correctness(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo, example_prompts,
|
||||
monkeypatch) -> None:
|
||||
if model_info.name in V1FlashAttentionImpNotSupported:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts) -> None:
|
||||
|
||||
vllm_extra_kwargs: dict[str, Any] = {}
|
||||
if model_info.architecture == "GteNewModel":
|
||||
|
@ -1,5 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from ...utils import EmbedModelInfo
|
||||
|
@ -1,84 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.conftest import HfRunner
|
||||
|
||||
from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
|
||||
|
||||
RERANK_MODELS = [
|
||||
RerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
|
||||
architecture="Qwen2ForSequenceClassification",
|
||||
dtype="float32",
|
||||
enable_test=True),
|
||||
RerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
|
||||
architecture="Qwen2ForSequenceClassification",
|
||||
dtype="float32",
|
||||
enable_test=False)
|
||||
]
|
||||
|
||||
|
||||
class MxbaiRerankerHfRunner(HfRunner):
|
||||
|
||||
def __init__(self,
|
||||
model_name: str,
|
||||
dtype: str = "auto",
|
||||
*args: Any,
|
||||
**kwargs: Any) -> None:
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
|
||||
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
padding_side='left')
|
||||
self.yes_loc = self.tokenizer.convert_tokens_to_ids("1")
|
||||
self.no_loc = self.tokenizer.convert_tokens_to_ids("0")
|
||||
|
||||
def predict(self, prompts: list[list[str]], *args,
|
||||
**kwargs) -> torch.Tensor:
|
||||
|
||||
def process_inputs(pairs):
|
||||
inputs = self.tokenizer(pairs,
|
||||
padding=False,
|
||||
truncation='longest_first',
|
||||
return_attention_mask=False)
|
||||
for i, ele in enumerate(inputs['input_ids']):
|
||||
inputs['input_ids'][i] = ele
|
||||
inputs = self.tokenizer.pad(inputs,
|
||||
padding=True,
|
||||
return_tensors="pt")
|
||||
for key in inputs:
|
||||
inputs[key] = inputs[key].to(self.model.device)
|
||||
return inputs
|
||||
|
||||
@torch.no_grad()
|
||||
def compute_logits(inputs):
|
||||
logits = self.model(**inputs).logits[:, -1, :]
|
||||
yes_logits = logits[:, self.yes_loc]
|
||||
no_logits = logits[:, self.no_loc]
|
||||
logits = yes_logits - no_logits
|
||||
scores = logits.float().sigmoid()
|
||||
return scores
|
||||
|
||||
scores = []
|
||||
for prompt in prompts:
|
||||
inputs = process_inputs([prompt])
|
||||
score = compute_logits(inputs)
|
||||
scores.append(score[0].item())
|
||||
return torch.Tensor(scores)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
|
||||
vllm_extra_kwargs: dict[str, Any] = {}
|
||||
if model_info.architecture == "Qwen2ForSequenceClassification":
|
||||
vllm_extra_kwargs["hf_overrides"] = {
|
||||
"architectures": ["Qwen2ForSequenceClassification"],
|
||||
"classifier_from_token": ["0", "1"],
|
||||
"method": "from_2_way_softmax",
|
||||
}
|
||||
|
||||
mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info,
|
||||
vllm_extra_kwargs)
|
@ -33,6 +33,9 @@ if current_platform.is_rocm():
|
||||
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
||||
|
||||
REQUIRES_V0_MODELS = [
|
||||
# V1 Test: no way to fall back for head_dim = 80
|
||||
# https://github.com/vllm-project/vllm/issues/14524
|
||||
"qwen_vl",
|
||||
# V1 Test: not enough KV cache space in C1.
|
||||
"fuyu",
|
||||
]
|
||||
@ -218,7 +221,8 @@ VLM_TEST_SETTINGS = {
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"blip2": VLMTestInfo(
|
||||
models=["Salesforce/blip2-opt-2.7b"],
|
||||
# TODO: Change back to 2.7b once head_dim = 80 is supported
|
||||
models=["Salesforce/blip2-opt-6.7b"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
|
||||
img_idx_to_prompt=lambda idx: "",
|
||||
@ -336,7 +340,8 @@ VLM_TEST_SETTINGS = {
|
||||
"h2ovl": VLMTestInfo(
|
||||
models = [
|
||||
"h2oai/h2ovl-mississippi-800m",
|
||||
"h2oai/h2ovl-mississippi-2b",
|
||||
# TODO: Re-enable once head_dim = 80 is supported
|
||||
# "h2oai/h2ovl-mississippi-2b",
|
||||
],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
|
||||
|
@ -83,7 +83,7 @@ MODELS = [
|
||||
QWEN2_CONFIG,
|
||||
PHI3_CONFIG,
|
||||
GPT2_CONFIG,
|
||||
STABLELM_CONFIG,
|
||||
# STABLELM_CONFIG, # enable this when v1 support head_size=80
|
||||
DOLPHIN_CONFIG,
|
||||
# STARCODER_CONFIG, # broken
|
||||
]
|
||||
|
@ -169,7 +169,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501
|
||||
"Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501
|
||||
"FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
|
||||
"FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base",
|
||||
"FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-1.5B-Instruct",
|
||||
min_transformers_version="4.53"),
|
||||
"GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
|
||||
"Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
|
||||
@ -240,9 +240,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",
|
||||
trust_remote_code=True),
|
||||
"PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
|
||||
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
|
||||
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2", v0_only=True),
|
||||
"Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
|
||||
# Blocksparse attention not supported in V1 yet
|
||||
"Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct",
|
||||
trust_remote_code=True,
|
||||
v0_only=True),
|
||||
@ -259,8 +258,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
|
||||
"Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"), # noqa: E501
|
||||
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
|
||||
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"), # noqa: E501
|
||||
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
|
||||
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501
|
||||
v0_only=True),
|
||||
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t",
|
||||
v0_only=True),
|
||||
"Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
|
||||
"SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
|
||||
"TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
|
||||
@ -329,7 +330,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
|
||||
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501
|
||||
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b", # noqa: E501
|
||||
extras={"6b": "Salesforce/blip2-opt-6.7b"}), # noqa: E501
|
||||
extras={"6b": "Salesforce/blip2-opt-6.7b"}, # noqa: E501
|
||||
v0_only=True),
|
||||
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501
|
||||
"DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501
|
||||
extras={"fork": "Isotr0py/deepseek-vl2-tiny"}, # noqa: E501
|
||||
@ -357,7 +359,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True),
|
||||
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
|
||||
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
|
||||
trust_remote_code=True),
|
||||
trust_remote_code=True,
|
||||
v0_only=True),
|
||||
"Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
|
||||
max_model_len=10240),
|
||||
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
|
||||
|
@ -22,8 +22,7 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
# FIXME: Possible memory leak in the previous tests?
|
||||
if model_arch in ("GraniteSpeechForConditionalGeneration",
|
||||
"KimiVLForConditionalGeneration"):
|
||||
if model_arch == "GraniteSpeechForConditionalGeneration":
|
||||
pytest.skip("Avoid OOM")
|
||||
|
||||
# Avoid OOM and reduce initialization time by only using 1 layer
|
||||
|
@ -1,5 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Copyright © 2025, Oracle and/or its affiliates.
|
||||
"""Tests RTN quantization startup and generation,
|
||||
doesn't test correctness
|
||||
|
@ -20,11 +20,10 @@ from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache,
|
||||
MemorySnapshot, PlaceholderModule, StoreBoolean,
|
||||
bind_kv_cache, common_broadcastable_dtype,
|
||||
deprecate_kwargs, get_open_port, get_tcp_uri,
|
||||
is_lossless_cast, join_host_port, make_zmq_path,
|
||||
make_zmq_socket, memory_profiling,
|
||||
merge_async_iterators, sha256, split_host_port,
|
||||
split_zmq_path, supports_kw, swap_dict_values)
|
||||
deprecate_kwargs, get_open_port, is_lossless_cast,
|
||||
make_zmq_path, make_zmq_socket, memory_profiling,
|
||||
merge_async_iterators, sha256, split_zmq_path,
|
||||
supports_kw, swap_dict_values)
|
||||
|
||||
from .utils import create_new_process_for_each_test, error_on_warning
|
||||
|
||||
@ -877,44 +876,3 @@ def test_make_zmq_socket_ipv6():
def test_make_zmq_path():
    assert make_zmq_path("tcp", "127.0.0.1", "5555") == "tcp://127.0.0.1:5555"
    assert make_zmq_path("tcp", "::1", "5555") == "tcp://[::1]:5555"


def test_get_tcp_uri():
    assert get_tcp_uri("127.0.0.1", 5555) == "tcp://127.0.0.1:5555"
    assert get_tcp_uri("::1", 5555) == "tcp://[::1]:5555"


def test_split_host_port():
    # valid ipv4
    assert split_host_port("127.0.0.1:5555") == ("127.0.0.1", 5555)
    # invalid ipv4
    with pytest.raises(ValueError):
        # multi colon
        assert split_host_port("127.0.0.1::5555")
    with pytest.raises(ValueError):
        # tailing colon
        assert split_host_port("127.0.0.1:5555:")
    with pytest.raises(ValueError):
        # no colon
        assert split_host_port("127.0.0.15555")
    with pytest.raises(ValueError):
        # none int port
        assert split_host_port("127.0.0.1:5555a")

    # valid ipv6
    assert split_host_port("[::1]:5555") == ("::1", 5555)
    # invalid ipv6
    with pytest.raises(ValueError):
        # multi colon
        assert split_host_port("[::1]::5555")
    with pytest.raises(IndexError):
        # no colon
        assert split_host_port("[::1]5555")
    with pytest.raises(ValueError):
        # none int port
        assert split_host_port("[::1]:5555a")


def test_join_host_port():
    assert join_host_port("127.0.0.1", 5555) == "127.0.0.1:5555"
    assert join_host_port("::1", 5555) == "[::1]:5555"
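Note: a minimal sketch of host/port helpers consistent with the assertions above (bracketed IPv6 literals, integer ports); illustrative only, not the vllm.utils source:

def join_host_port(host: str, port: int) -> str:
    # Bracket IPv6 literals so the port separator stays unambiguous.
    return f"[{host}]:{port}" if ":" in host else f"{host}:{port}"


def split_host_port(host_port: str) -> tuple[str, int]:
    if host_port.startswith("["):
        parts = host_port.rsplit("]:", 1)
        host, port = parts[0][1:], parts[1]  # IndexError when "]:" is missing
        return host, int(port)               # ValueError for a non-integer port
    host, sep, port = host_port.partition(":")
    if not sep or not port.isdigit():
        raise ValueError(f"invalid host:port string: {host_port!r}")
    return host, int(port)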
@ -1,5 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa: E501
|
||||
|
||||
import json
|
||||
|
@ -1,5 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
|
@ -9,7 +9,7 @@ import torch
|
||||
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
|
||||
SchedulerConfig, SpeculativeConfig, VllmConfig)
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
@ -17,7 +17,6 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
from vllm.v1.structured_output.request import StructuredOutputRequest
|
||||
|
||||
EOS_TOKEN_ID = 50256
|
||||
|
||||
@ -34,7 +33,6 @@ def create_scheduler(
|
||||
block_size: int = 16,
|
||||
max_model_len: Optional[int] = None,
|
||||
num_speculative_tokens: Optional[int] = None,
|
||||
skip_tokenizer_init: bool = False,
|
||||
) -> Scheduler:
|
||||
'''Create scheduler under test.
|
||||
|
||||
@ -67,7 +65,6 @@ def create_scheduler(
|
||||
trust_remote_code=True,
|
||||
dtype="float16",
|
||||
seed=42,
|
||||
skip_tokenizer_init=skip_tokenizer_init,
|
||||
)
|
||||
# Cache config, optionally force APC
|
||||
kwargs_cache = ({} if enable_prefix_caching is None else {
|
||||
@ -189,7 +186,7 @@ def test_get_num_unfinished_requests():
|
||||
])
|
||||
def test_schedule(enable_prefix_caching: Optional[bool],
|
||||
prompt_logprobs: Optional[int]):
|
||||
'''Test scheduling.
|
||||
'''Test scheduling.
|
||||
Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
|
||||
'''
|
||||
scheduler = create_scheduler(enable_prefix_caching=enable_prefix_caching)
|
||||
@ -1411,7 +1408,7 @@ def create_requests_with_priority(
|
||||
|
||||
|
||||
def test_priority_scheduling_basic_ordering():
|
||||
"""Test that requests are scheduled in priority order
|
||||
"""Test that requests are scheduled in priority order
|
||||
(lower value = higher priority)."""
|
||||
scheduler = create_scheduler_with_priority()
|
||||
|
||||
@ -1440,7 +1437,7 @@ def test_priority_scheduling_basic_ordering():
|
||||
|
||||
|
||||
def test_priority_scheduling_arrival_time_tiebreaker():
|
||||
"""Test that arrival time is used
|
||||
"""Test that arrival time is used
|
||||
as tiebreaker when priorities are equal."""
|
||||
scheduler = create_scheduler_with_priority()
|
||||
|
||||
@ -1498,7 +1495,7 @@ def test_priority_scheduling_mixed_priority_and_arrival():
|
||||
|
||||
|
||||
def test_priority_scheduling_preemption():
|
||||
"""Test that priority scheduling preempts
|
||||
"""Test that priority scheduling preempts
|
||||
lower priority requests when memory is constrained."""
|
||||
# Create scheduler with very limited memory to force preemption
|
||||
scheduler = create_scheduler_with_priority(
|
||||
@ -1579,7 +1576,7 @@ def test_priority_scheduling_preemption():
|
||||
|
||||
|
||||
def test_priority_scheduling_no_preemption_when_space_available():
|
||||
"""Test that preemption doesn't happen
|
||||
"""Test that preemption doesn't happen
|
||||
when there's space for new requests."""
|
||||
scheduler = create_scheduler_with_priority(
|
||||
max_num_seqs=3, # Allow 3 concurrent requests
|
||||
@ -1629,7 +1626,7 @@ def test_priority_scheduling_no_preemption_when_space_available():
|
||||
|
||||
|
||||
def test_priority_scheduling_preemption_victim_selection():
|
||||
"""Test that the correct victim is selected for
|
||||
"""Test that the correct victim is selected for
|
||||
preemption based on priority and arrival time."""
|
||||
# This test verifies the priority-based victim selection logic
|
||||
# by checking the waiting queue order after adding requests with different
|
||||
@ -1746,7 +1743,7 @@ def test_priority_scheduling_waiting_queue_order():
|
||||
|
||||
|
||||
def test_priority_scheduling_fcfs_fallback():
|
||||
"""Test that FCFS behavior is maintained when all
|
||||
"""Test that FCFS behavior is maintained when all
|
||||
requests have same priority."""
|
||||
scheduler = create_scheduler_with_priority()
|
||||
|
||||
@ -1814,7 +1811,7 @@ def test_priority_scheduling_with_limited_slots():
|
||||
|
||||
|
||||
def test_priority_scheduling_heap_property():
|
||||
"""Test that the waiting queue maintains heap
|
||||
"""Test that the waiting queue maintains heap
|
||||
property for priority scheduling."""
|
||||
scheduler = create_scheduler_with_priority(
|
||||
max_num_seqs=1, # Only one request can run at a time
|
||||
@ -1860,39 +1857,3 @@ def test_priority_scheduling_heap_property():
|
||||
# Verify requests were scheduled in priority order (lowest value first)
|
||||
expected_priorities = sorted(priorities)
|
||||
assert scheduled_priorities == expected_priorities
|
||||
|
||||
|
||||
def test_schedule_skip_tokenizer_init():
|
||||
scheduler = create_scheduler(skip_tokenizer_init=True)
|
||||
requests = create_requests(num_requests=5)
|
||||
for request in requests:
|
||||
scheduler.add_request(request)
|
||||
output = scheduler.schedule()
|
||||
assert len(output.scheduled_new_reqs) == len(requests)
|
||||
assert output.grammar_bitmask is None
|
||||
|
||||
|
||||
def test_schedule_skip_tokenizer_init_structured_output_request():
|
||||
scheduler = create_scheduler(skip_tokenizer_init=True)
|
||||
guided_params = GuidedDecodingParams(regex="[0-9]+")
|
||||
sampling_params = SamplingParams(
|
||||
ignore_eos=False,
|
||||
max_tokens=16,
|
||||
guided_decoding=guided_params,
|
||||
)
|
||||
request = Request(
|
||||
request_id="0",
|
||||
prompt_token_ids=[0, 1],
|
||||
multi_modal_inputs=None,
|
||||
multi_modal_hashes=None,
|
||||
multi_modal_placeholders=None,
|
||||
sampling_params=sampling_params,
|
||||
pooling_params=None,
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
structured_output_request=StructuredOutputRequest(sampling_params),
|
||||
)
|
||||
scheduler.add_request(request)
|
||||
output = scheduler.schedule()
|
||||
assert len(output.scheduled_new_reqs) == 0
|
||||
assert len(scheduler.running) == 0
|
||||
assert len(scheduler.waiting) == 1
|
||||
|
@ -1,30 +1,19 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tests.conftest import VllmRunner
|
||||
|
||||
MODEL = "facebook/opt-125m"
|
||||
DTYPE = "half"
|
||||
|
||||
|
||||
def _vllm_model(
|
||||
apc: bool,
|
||||
vllm_runner: type[VllmRunner],
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
*,
|
||||
skip_tokenizer_init: bool = False,
|
||||
):
|
||||
def _vllm_model(apc: bool, vllm_runner, monkeypatch):
|
||||
"""Set up VllmRunner instance."""
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
return vllm_runner(
|
||||
@ -34,7 +23,6 @@ def _vllm_model(
|
||||
enforce_eager=True,
|
||||
enable_prefix_caching=apc,
|
||||
gpu_memory_utilization=0.5,
|
||||
skip_tokenizer_init=skip_tokenizer_init,
|
||||
)
|
||||
|
||||
|
||||
@ -57,27 +45,9 @@ def vllm_model_apc(vllm_runner, monkeypatch):
|
||||
yield vllm_model
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
# Function scope decouples tests & allows
|
||||
# env var adjustment via monkeypatch
|
||||
scope="function",
|
||||
# Prefix caching
|
||||
params=[False, True])
|
||||
def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch):
|
||||
"""VllmRunner test fixture with APC."""
|
||||
with _vllm_model(
|
||||
request.param,
|
||||
vllm_runner,
|
||||
monkeypatch,
|
||||
skip_tokenizer_init=True,
|
||||
) as vllm_model:
|
||||
yield vllm_model
|
||||
|
||||
|
||||
def _get_test_sampling_params(
|
||||
prompt_list: list[str],
|
||||
seed: Optional[int] = 42,
|
||||
structured_outputs: bool = False,
|
||||
) -> tuple[list[SamplingParams], list[int]]:
|
||||
"""Generate random sampling params for a batch."""
|
||||
|
||||
@ -92,34 +62,14 @@ def _get_test_sampling_params(
|
||||
n_list = [get_mostly_n_gt1() for _ in range(len(prompt_list))]
|
||||
# High temperature to maximize the chance of unique completions
|
||||
return [
|
||||
SamplingParams(
|
||||
temperature=0.95,
|
||||
top_p=0.95,
|
||||
n=n,
|
||||
seed=seed,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
regex="[0-9]+") if structured_outputs else None,
|
||||
) for n in n_list
|
||||
SamplingParams(temperature=0.95, top_p=0.95, n=n, seed=seed)
|
||||
for n in n_list
|
||||
], n_list
|
||||
|
||||
|
||||
def test_compatibility_with_skip_tokenizer_init(
|
||||
vllm_model_skip_tokenizer_init: VllmRunner,
|
||||
example_prompts: list[str],
|
||||
):
|
||||
# Case 1: Structured output request should raise an error.
|
||||
sampling_params_list, _ = _get_test_sampling_params(
|
||||
example_prompts,
|
||||
structured_outputs=True,
|
||||
)
|
||||
model: LLM = vllm_model_skip_tokenizer_init.model
|
||||
with pytest.raises(ValueError):
|
||||
_ = model.generate(example_prompts, sampling_params_list)
|
||||
|
||||
|
||||
def test_parallel_sampling(vllm_model, example_prompts) -> None:
|
||||
"""Test passes if parallel sampling `n>1` yields `n` unique completions.
|
||||
|
||||
|
||||
Args:
|
||||
vllm_model: VllmRunner instance under test.
|
||||
example_prompt: test fixture providing prompts for testing.
|
||||
|
@ -1,5 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import random
|
||||
from collections.abc import Callable
|
||||
|
@ -13,6 +13,7 @@ UNSUPPORTED_MODELS_V1 = [
|
||||
"openai/whisper-large-v3", # transcription
|
||||
"facebook/bart-large-cnn", # encoder decoder
|
||||
"state-spaces/mamba-130m-hf", # mamba1
|
||||
"hmellor/tiny-random-BambaForCausalLM", # hybrid
|
||||
"BAAI/bge-m3", # embedding
|
||||
]
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from vllm.v1.request import RequestStatus
|
||||
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import gc
|
||||
import tempfile
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import tempfile
|
||||
|
||||
import numpy as np
|
||||
|
@ -450,7 +450,6 @@ def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):
|
||||
|
||||
|
||||
def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
|
||||
torch.set_default_dtype(torch.float16)
|
||||
layer_0 = "model.layers.0.self_attn.attn"
|
||||
layer_1 = "model.layers.1.self_attn.attn"
|
||||
error_msg = f"{layer_1} must come before the current layer"
|
||||
@ -479,7 +478,6 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
|
||||
|
||||
|
||||
def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
|
||||
torch.set_default_dtype(torch.float16)
|
||||
layer_0 = "model.layers.0.self_attn.attn"
|
||||
layer_1 = "model.layers.1.self_attn.attn"
|
||||
invalid_layer = "model.layers.0.cross_attn.attn"
|
||||
@ -508,7 +506,6 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
|
||||
|
||||
|
||||
def test_init_kv_cache_with_kv_sharing_target_same_as_current():
|
||||
torch.set_default_dtype(torch.float16)
|
||||
layer_0 = "model.layers.0.self_attn.attn"
|
||||
layer_1 = "model.layers.1.self_attn.attn"
|
||||
error_msg = f"{layer_1} cannot be the same as the current layer"
|
||||
@ -537,7 +534,6 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():
|
||||
|
||||
|
||||
def test_init_kv_cache_without_kv_sharing():
|
||||
torch.set_default_dtype(torch.float16)
|
||||
layer_0 = "model.layers.0.self_attn.attn"
|
||||
layer_1 = "model.layers.1.self_attn.attn"
|
||||
vllm_config = get_vllm_config()
|
||||
@ -605,7 +601,6 @@ def test_init_kv_cache_without_kv_sharing():
|
||||
|
||||
|
||||
def test_init_kv_cache_with_kv_sharing_valid():
|
||||
torch.set_default_dtype(torch.float16)
|
||||
layer_0 = "model.layers.0.self_attn.attn"
|
||||
layer_1 = "model.layers.1.self_attn.attn"
|
||||
vllm_config = get_vllm_config()
|
||||
|
@ -1,6 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
@ -2,146 +2,51 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import sys
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SPDXStatus(Enum):
|
||||
"""SPDX header status enumeration"""
|
||||
EMPTY = "empty" # empty __init__.py
|
||||
COMPLETE = "complete"
|
||||
MISSING_LICENSE = "missing_license" # Only has copyright line
|
||||
MISSING_COPYRIGHT = "missing_copyright" # Only has license line
|
||||
MISSING_BOTH = "missing_both" # Completely missing
|
||||
|
||||
|
||||
FULL_SPDX_HEADER = (
|
||||
SPDX_HEADER = (
|
||||
"# SPDX-License-Identifier: Apache-2.0\n"
|
||||
"# SPDX-FileCopyrightText: Copyright contributors to the vLLM project")
|
||||
|
||||
LICENSE_LINE = "# SPDX-License-Identifier: Apache-2.0"
|
||||
COPYRIGHT_LINE = "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project" # noqa: E501
|
||||
SPDX_HEADER_PREFIX = "# SPDX-License-Identifier:"
|
||||
|
||||
|
||||
def check_spdx_header_status(file_path):
|
||||
"""Check SPDX header status of the file"""
|
||||
with open(file_path, encoding="UTF-8") as file:
|
||||
def check_spdx_header(file_path):
|
||||
with open(file_path, encoding='UTF-8') as file:
|
||||
lines = file.readlines()
|
||||
if not lines:
|
||||
# Empty file
|
||||
return SPDXStatus.EMPTY
|
||||
|
||||
# Skip shebang line
|
||||
start_idx = 0
|
||||
if lines and lines[0].startswith("#!"):
|
||||
start_idx = 1
|
||||
|
||||
has_license = False
|
||||
has_copyright = False
|
||||
|
||||
# Check all lines for SPDX headers (not just the first two)
|
||||
for i in range(start_idx, len(lines)):
|
||||
line = lines[i].strip()
|
||||
if line == LICENSE_LINE:
|
||||
has_license = True
|
||||
elif line == COPYRIGHT_LINE:
|
||||
has_copyright = True
|
||||
|
||||
# Determine status based on what we found
|
||||
if has_license and has_copyright:
|
||||
return SPDXStatus.COMPLETE
|
||||
elif has_license and not has_copyright:
|
||||
# Only has license line
|
||||
return SPDXStatus.MISSING_COPYRIGHT
|
||||
# Only has copyright line
|
||||
elif not has_license and has_copyright:
|
||||
return SPDXStatus.MISSING_LICENSE
|
||||
else:
|
||||
# Completely missing both lines
|
||||
return SPDXStatus.MISSING_BOTH
|
||||
# Empty file like __init__.py
|
||||
return True
|
||||
for line in lines:
|
||||
if line.strip().startswith(SPDX_HEADER_PREFIX):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def add_header(file_path, status):
|
||||
"""Add or supplement SPDX header based on status"""
|
||||
with open(file_path, "r+", encoding="UTF-8") as file:
|
||||
def add_header(file_path):
|
||||
with open(file_path, 'r+', encoding='UTF-8') as file:
|
||||
lines = file.readlines()
|
||||
file.seek(0, 0)
|
||||
file.truncate()
|
||||
|
||||
if status == SPDXStatus.MISSING_BOTH:
|
||||
# Completely missing, add complete header
|
||||
if lines and lines[0].startswith("#!"):
|
||||
# Preserve shebang line
|
||||
file.write(lines[0])
|
||||
file.write(FULL_SPDX_HEADER + "\n")
|
||||
file.writelines(lines[1:])
|
||||
else:
|
||||
# Add header directly
|
||||
file.write(FULL_SPDX_HEADER + "\n")
|
||||
file.writelines(lines)
|
||||
|
||||
elif status == SPDXStatus.MISSING_COPYRIGHT:
|
||||
# Only has license line, need to add copyright line
|
||||
# Find the license line and add copyright line after it
|
||||
for i, line in enumerate(lines):
|
||||
if line.strip() == LICENSE_LINE:
|
||||
# Insert copyright line after license line
|
||||
lines.insert(
|
||||
i + 1,
|
||||
f"{COPYRIGHT_LINE}\n",
|
||||
)
|
||||
break
|
||||
|
||||
file.writelines(lines)
|
||||
|
||||
elif status == SPDXStatus.MISSING_LICENSE:
|
||||
# Only has copyright line, need to add license line
|
||||
# Find the copyright line and add license line before it
|
||||
for i, line in enumerate(lines):
|
||||
if line.strip() == COPYRIGHT_LINE:
|
||||
# Insert license line before copyright line
|
||||
lines.insert(i, f"{LICENSE_LINE}\n")
|
||||
break
|
||||
if lines and lines[0].startswith("#!"):
|
||||
file.write(lines[0])
|
||||
file.write(SPDX_HEADER + '\n')
|
||||
file.writelines(lines[1:])
|
||||
else:
|
||||
file.write(SPDX_HEADER + '\n')
|
||||
file.writelines(lines)
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function"""
|
||||
files_missing_both = []
|
||||
files_missing_copyright = []
|
||||
files_missing_license = []
|
||||
|
||||
files_with_missing_header = []
|
||||
for file_path in sys.argv[1:]:
|
||||
status = check_spdx_header_status(file_path)
|
||||
if not check_spdx_header(file_path):
|
||||
files_with_missing_header.append(file_path)
|
||||
|
||||
if status == SPDXStatus.MISSING_BOTH:
|
||||
files_missing_both.append(file_path)
|
||||
elif status == SPDXStatus.MISSING_COPYRIGHT:
|
||||
files_missing_copyright.append(file_path)
|
||||
elif status == SPDXStatus.MISSING_LICENSE:
|
||||
files_missing_license.append(file_path)
|
||||
else:
|
||||
continue
|
||||
|
||||
# Collect all files that need fixing
|
||||
all_files_to_fix = (files_missing_both + files_missing_copyright +
|
||||
files_missing_license)
|
||||
if all_files_to_fix:
|
||||
if files_with_missing_header:
|
||||
print("The following files are missing the SPDX header:")
|
||||
if files_missing_both:
|
||||
for file_path in files_missing_both:
|
||||
print(f" {file_path}")
|
||||
add_header(file_path, SPDXStatus.MISSING_BOTH)
|
||||
for file_path in files_with_missing_header:
|
||||
print(f" {file_path}")
|
||||
add_header(file_path)
|
||||
|
||||
if files_missing_copyright:
|
||||
for file_path in files_missing_copyright:
|
||||
print(f" {file_path}")
|
||||
add_header(file_path, SPDXStatus.MISSING_COPYRIGHT)
|
||||
if files_missing_license:
|
||||
for file_path in files_missing_license:
|
||||
print(f" {file_path}")
|
||||
add_header(file_path, SPDXStatus.MISSING_LICENSE)
|
||||
|
||||
sys.exit(1 if all_files_to_fix else 0)
|
||||
sys.exit(1 if files_with_missing_header else 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
tools/pd_disagg/Justfile (new file, +88 lines)
@ -0,0 +1,88 @@
# Needed for the proxy server
vllm-directory := "/home/rshaw/vllm/"

PREFILL_GPU := "0,1,2,3"
DECODE_GPU := "4,5,6,7"

PREFILL_TP := env("PREFILL_TP", "1")
DECODE_TP := env("DECODE_TP", "1")

BLOCK_SIZE := env("BLOCK_SIZE", "128")

MODEL := "meta-llama/Llama-3.1-8B-Instruct"
PROXY_PORT := "8192"
PREFILL_PORT := "8100"
DECODE_PORT := "8200"
PREFILL_NIXL_SIDE_CHANNEL_PORT := "5557"
DECODE_NIXL_SIDE_CHANNEL_PORT := "5568"

prefill:
    VLLM_NIXL_SIDE_CHANNEL_PORT={{PREFILL_NIXL_SIDE_CHANNEL_PORT}} \
    CUDA_VISIBLE_DEVICES={{PREFILL_GPU}} \
    vllm serve {{MODEL}} \
        --port {{PREFILL_PORT}} \
        --tensor-parallel-size {{PREFILL_TP}} \
        --enforce-eager \
        --disable-log-requests \
        --block-size {{BLOCK_SIZE}} \
        --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'

decode:
    VLLM_NIXL_SIDE_CHANNEL_PORT={{DECODE_NIXL_SIDE_CHANNEL_PORT}} \
    CUDA_VISIBLE_DEVICES={{DECODE_GPU}} \
    vllm serve {{MODEL}} \
        --port {{DECODE_PORT}} \
        --tensor-parallel-size {{DECODE_TP}} \
        --enforce-eager \
        --disable-log-requests \
        --block-size {{BLOCK_SIZE}} \
        --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'

proxy:
    python "{{vllm-directory}}tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \
        --port {{PROXY_PORT}} \
        --prefiller-port {{PREFILL_PORT}} \
        --decoder-port {{DECODE_PORT}}

send_request:
    curl -X POST http://localhost:{{PROXY_PORT}}/v1/completions \
        -H "Content-Type: application/json" \
        -d '{ \
            "model": "{{MODEL}}", \
            "prompt": "Red Hat is the best open source company by far across Linux, K8s, and AI, and vLLM has the greatest community in open source AI software infrastructure. I love vLLM because", \
            "max_tokens": 150, \
            "temperature": 0.7 \
        }'

benchmark NUM_PROMPTS:
    python {{vllm-directory}}/benchmarks/benchmark_serving.py \
        --port {{PROXY_PORT}} \
        --model {{MODEL}} \
        --dataset-name random \
        --random-input-len 30000 \
        --random-output-len 10 \
        --num-prompts {{NUM_PROMPTS}} \
        --seed $(date +%s) \

benchmark_one INPUT_LEN:
    python {{vllm-directory}}benchmarks/benchmark_one_concurrent.py \
        --port {{PROXY_PORT}} \
        --model {{MODEL}} \
        --input-len {{INPUT_LEN}} \
        --output-len 1 \
        --num-requests 10 \
        --seed $(date +%s)

benchmark_one_no_pd INPUT_LEN:
    python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
        --port {{DECODE_PORT}} \
        --model {{MODEL}} \
        --input-len {{INPUT_LEN}} \
        --output-len 1 \
        --num-requests 10 \
        --seed $(date +%s)

eval:
    lm_eval --model local-completions --tasks gsm8k \
        --model_args model={{MODEL}},base_url=http://127.0.0.1:{{PROXY_PORT}}/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
        --limit 1000
@ -646,20 +646,6 @@ def cutlass_scaled_mm_supports_fp4(cuda_device_capability: int) -> bool:
    return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability)


def cutlass_blockwise_scaled_grouped_mm(
    output: torch.Tensor,
    a: torch.Tensor,
    b: torch.Tensor,
    scales_a: torch.Tensor,
    scales_b: torch.Tensor,
    problem_sizes: torch.Tensor,
    expert_offsets: torch.Tensor,
):
    torch.ops._C.cutlass_blockwise_scaled_grouped_mm(output, a, b, scales_a,
                                                     scales_b, problem_sizes,
                                                     expert_offsets)


def cutlass_scaled_fp4_mm(a: torch.Tensor, b: torch.Tensor,
                          block_scale_a: torch.Tensor,
                          block_scale_b: torch.Tensor, alpha: torch.Tensor,
@ -310,8 +310,7 @@ class MultiHeadAttention(nn.Module):
            # currently, only torch_sdpa is supported on rocm
            self.attn_backend = _Backend.TORCH_SDPA
        else:
            if backend in (_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1,
                           _Backend.FLEX_ATTENTION):
            if backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
                backend = _Backend.XFORMERS

            self.attn_backend = backend if backend in {
@ -4,7 +4,7 @@
import os
from contextlib import contextmanager
from functools import cache
from typing import Generator, Optional, Union
from typing import Generator, Optional, Type

import torch

@ -79,33 +79,6 @@ def get_global_forced_attn_backend() -> Optional[_Backend]:
    return forced_attn_backend


def supports_head_size(
    attn_backend: Union[str, type[AttentionBackend]],
    head_size: int,
) -> bool:
    if isinstance(attn_backend, str):
        try:
            attn_backend = resolve_obj_by_qualname(attn_backend)
        except ImportError:
            return False

    assert isinstance(attn_backend, type)

    # TODO: Update the interface once V0 is removed
    if get_supported_head_sizes := getattr(attn_backend,
                                           "get_supported_head_sizes", None):
        return head_size in get_supported_head_sizes()
    if validate_head_size := getattr(attn_backend, "validate_head_size", None):
        try:
            validate_head_size(head_size)
            return True
        except Exception:
            return False

    raise NotImplementedError(f"{attn_backend.__name__} does not support "
                              "head size validation")


def get_attn_backend(
    head_size: int,
    dtype: torch.dtype,
@ -114,7 +87,7 @@ def get_attn_backend(
    is_attention_free: bool,
    is_blocksparse: bool = False,
    use_mla: bool = False,
) -> type[AttentionBackend]:
) -> Type[AttentionBackend]:
    """Selects which attention backend to use and lazily imports it."""
    # Accessing envs.* behind an @lru_cache decorator can cause the wrong
    # value to be returned from the cache if the value changes between calls.
@ -142,7 +115,7 @@ def _cached_get_attn_backend(
    is_blocksparse: bool = False,
    use_v1: bool = False,
    use_mla: bool = False,
) -> type[AttentionBackend]:
) -> Type[AttentionBackend]:
    if is_blocksparse:
        logger.info("Using BlocksparseFlashAttention backend.")
        from vllm.attention.backends.blocksparse_attn import (
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import torch
import torch._inductor.pattern_matcher as pm
@ -466,9 +466,6 @@ class ModelConfig:
                "affect the random state of the Python process that "
                "launched vLLM.", self.seed)

        # Keep set served_model_name before maybe_model_redirect(self.model)
        self.served_model_name = get_served_model_name(self.model,
                                                       self.served_model_name)
        self.model = maybe_model_redirect(self.model)
        # The tokenizer is consistent with the model by default.
        if self.tokenizer is None:
@ -612,6 +609,8 @@ class ModelConfig:

        self.original_max_model_len = self.max_model_len
        self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
        self.served_model_name = get_served_model_name(self.model,
                                                       self.served_model_name)
        self.multimodal_config = self._init_multimodal_config()
        if not self.skip_tokenizer_init:
            self._verify_tokenizer_mode()
@ -1421,7 +1420,7 @@ class ModelConfig:

    @property
    def is_cross_encoder(self) -> bool:
        return self.task == "classify"
        return self.registry.is_cross_encoder_model(self.architectures)

    @property
    def use_mla(self) -> bool:
@ -2319,7 +2318,7 @@ class SchedulerConfig:

        if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
            logger.warning(
                "max_num_batched_tokens (%d) exceeds max_num_seqs "
                "max_num_batched_tokens (%d) exceeds max_num_seqs"
                "* max_model_len (%d). This may lead to unexpected behavior.",
                self.max_num_batched_tokens,
                self.max_num_seqs * self.max_model_len)
@ -4763,12 +4762,6 @@ class VllmConfig:
            if cls is not None:
                cls.verify_and_update_config(self)

        if self.model_config.task == "classify":
            # Maybe convert ForCausalLM into ForSequenceClassification model.
            from vllm.model_executor.models.adapters import (
                SequenceClassificationConfig)
            SequenceClassificationConfig.verify_and_update_config(self)

    def __str__(self):
        return (
            f"model={self.model_config.model!r},"
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
'''
Expert parallelism load balancer (EPLB).
'''
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Expert parallelism load balancer (EPLB) metrics and states.

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Expert parallelism load balancer (EPLB) for vLLM.

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
The actual execution of the rearrangement.

@ -97,10 +97,10 @@ def get_kv_connector_cache_layout():
    # used for faster transfer.
    vllm_config = get_current_vllm_config()
    kv_config = vllm_config.kv_transfer_config
    if kv_config is not None and vllm_config.model_config is None:
    if vllm_config.model_config is None or kv_config is None:
        logger.warning_once("Unable to detect current VLLM config. " \
                            "Defaulting to NHD kv cache layout.")
    elif kv_config is not None:
    else:
        use_mla = vllm_config.model_config.use_mla
        if not use_mla and kv_config.kv_connector == "NixlConnector":
            logger.info_once("NixlConnector detected. Setting KV cache " \
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import logging
import os
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import atexit
import ctypes
@ -16,7 +16,6 @@ from safetensors.torch import save as safetensors_save
from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
from vllm.logger import init_logger
from vllm.utils import join_host_port, make_zmq_path, split_host_port

logger = init_logger(__name__)
NONE_INT = -150886311
@ -80,19 +79,18 @@ class MooncakeTransferEngine:
            logger.error(
                "An error occurred while loading the configuration: %s", exc)
            raise
        prefill_host, base_prefill_port = split_host_port(
            self.config.prefill_url)
        decode_host, base_decode_port = split_host_port(self.config.decode_url)
        prefill_host, base_prefill_port = self.config.prefill_url.split(':')
        decode_host, base_decode_port = self.config.decode_url.split(':')

        # Avoid ports conflict when running prefill and decode on the same node
        if prefill_host == decode_host and \
                base_prefill_port == base_decode_port:
            base_decode_port = base_decode_port + 100
            base_decode_port = str(int(base_decode_port) + 100)

        prefill_port = base_prefill_port + self.local_rank
        decode_port = base_decode_port + self.local_rank
        self.prefill_url = join_host_port(prefill_host, prefill_port)
        self.decode_url = join_host_port(decode_host, decode_port)
        prefill_port = int(base_prefill_port) + self.local_rank
        decode_port = int(base_decode_port) + self.local_rank
        self.prefill_url = ':'.join([prefill_host, str(prefill_port)])
        self.decode_url = ':'.join([decode_host, str(decode_port)])

        self.initialize(self.prefill_url if kv_rank == 0 else self.decode_url,
                        self.config.metadata_server, self.config.protocol,
@ -112,30 +110,22 @@ class MooncakeTransferEngine:
        self._setup_metadata_sockets(kv_rank, prefill_host, base_prefill_port,
                                     decode_host, base_decode_port)

    def _setup_metadata_sockets(self, kv_rank: int, p_host: str, p_port: int,
                                d_host: str, d_port: int) -> None:
    def _setup_metadata_sockets(self, kv_rank: int, p_host: str, p_port: str,
                                d_host: str, d_port: str) -> None:
        """Set up ZeroMQ sockets for sending and receiving data."""
        # Offsets < 8 are left for initialization in case tp and pp are enabled
        p_rank_offset = p_port + 8 + self.local_rank * 2
        d_rank_offset = d_port + 8 + self.local_rank * 2
        p_rank_offset = int(p_port) + 8 + self.local_rank * 2
        d_rank_offset = int(d_port) + 8 + self.local_rank * 2
        if kv_rank == 0:
            self.sender_socket.bind(
                make_zmq_path("tcp", p_host, p_rank_offset + 1))
            self.receiver_socket.connect(
                make_zmq_path("tcp", d_host, d_rank_offset + 1))
            self.sender_ack.connect(
                make_zmq_path("tcp", d_host, d_rank_offset + 2))
            self.receiver_ack.bind(
                make_zmq_path("tcp", p_host, p_rank_offset + 2))
            self.sender_socket.bind(f"tcp://{p_host}:{p_rank_offset + 1}")
            self.receiver_socket.connect(f"tcp://{d_host}:{d_rank_offset + 1}")
            self.sender_ack.connect(f"tcp://{d_host}:{d_rank_offset + 2}")
            self.receiver_ack.bind(f"tcp://{p_host}:{p_rank_offset + 2}")
        else:
            self.receiver_socket.connect(
                make_zmq_path("tcp", p_host, p_rank_offset + 1))
            self.sender_socket.bind(
                make_zmq_path("tcp", d_host, d_rank_offset + 1))
            self.receiver_ack.bind(
                make_zmq_path("tcp", d_host, d_rank_offset + 2))
            self.sender_ack.connect(
                make_zmq_path("tcp", p_host, p_rank_offset + 2))
            self.receiver_socket.connect(f"tcp://{p_host}:{p_rank_offset + 1}")
            self.sender_socket.bind(f"tcp://{d_host}:{d_rank_offset + 1}")
            self.receiver_ack.bind(f"tcp://{d_host}:{d_rank_offset + 2}")
            self.sender_ack.connect(f"tcp://{p_host}:{p_rank_offset + 2}")

    def initialize(self, local_hostname: str, metadata_server: str,
                   protocol: str, device_name: str,
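One side of the hunk above builds and parses `host:port` strings by hand (`':'.join(...)`, `split(':')`, f-string ZMQ paths), while the other side goes through the `join_host_port`, `split_host_port`, and `make_zmq_path` helpers imported from `vllm.utils`. The sketch below is purely illustrative, not the `vllm.utils` implementation; it only shows the behaviour such helpers need to cover, in particular bracketed IPv6 literals, which naive `split(':')` cannot handle.

```python
# Illustrative only: stand-ins for the host/port helpers referenced above.
def join_host_port(host: str, port: int) -> str:
    # IPv6 literals must be bracketed, e.g. "[::1]:8000".
    if ":" in host and not host.startswith("["):
        return f"[{host}]:{port}"
    return f"{host}:{port}"

def split_host_port(url: str) -> tuple[str, int]:
    # Split on the last ':' so IPv6 addresses stay intact, then drop brackets.
    host, _, port = url.rpartition(":")
    return host.strip("[]"), int(port)

def make_zmq_path(scheme: str, host: str, port: int) -> str:
    return f"{scheme}://{join_host_port(host, port)}"

assert join_host_port("::1", 8000) == "[::1]:8000"
assert split_host_port("127.0.0.1:8000") == ("127.0.0.1", 8000)
assert make_zmq_path("tcp", "::1", 5570) == "tcp://[::1]:5570"
```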
@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections import OrderedDict
from typing import Optional

@ -1393,6 +1393,13 @@ class EngineArgs:
                               recommend_to_remove=False)
            return False

        # Only Fp16 and Bf16 dtypes since we only support FA.
        V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
        if model_config.dtype not in V1_SUPPORTED_DTYPES:
            _raise_or_fallback(feature_name=f"--dtype {model_config.dtype}",
                               recommend_to_remove=False)
            return False

        # No Mamba or Encoder-Decoder so far.
        if not model_config.is_v1_compatible:
            _raise_or_fallback(feature_name=model_config.architectures,
@ -28,8 +28,7 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam,
                               ChatCompletionToolMessageParam)
from openai.types.chat.chat_completion_content_part_input_audio_param import (
    InputAudio)
from PIL import Image
from pydantic import BaseModel, ConfigDict, TypeAdapter
from pydantic import TypeAdapter
# yapf: enable
from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
                          ProcessorMixin)
@ -92,25 +91,6 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
    """The type of the content part."""


class PILImage(BaseModel):
    """
    A PIL.Image.Image object.
    """
    image_pil: Image.Image
    model_config = ConfigDict(arbitrary_types_allowed=True)


class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
    """A simpler version of the param that only accepts a PIL image.

    Example:
    {
        "image_pil": ImageAsset('cherry_blossom').pil_image
    }
    """
    image_pil: Required[PILImage]


class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
    """A simpler version of the param that only accepts a plain image_url.
    This is supported by OpenAI API, although it is not documented.
@ -149,7 +129,6 @@ ChatCompletionContentPartParam: TypeAlias = Union[
    OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
    ChatCompletionContentPartInputAudioParam,
    ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
    CustomChatCompletionContentPILImageParam,
    CustomChatCompletionContentSimpleImageParam,
    ChatCompletionContentPartImageEmbedsParam,
    CustomChatCompletionContentSimpleAudioParam,
@ -652,10 +631,6 @@ class BaseMultiModalContentParser(ABC):
                           image_embeds: Union[str, dict[str, str]]) -> None:
        raise NotImplementedError

    @abstractmethod
    def parse_image_pil(self, image_pil: Image.Image) -> None:
        raise NotImplementedError

    @abstractmethod
    def parse_audio(self, audio_url: str) -> None:
        raise NotImplementedError
@ -702,10 +677,6 @@ class MultiModalContentParser(BaseMultiModalContentParser):

        self._add_placeholder(placeholder)

    def parse_image_pil(self, image_pil: Image.Image) -> None:
        placeholder = self._tracker.add("image", image_pil)
        self._add_placeholder(placeholder)

    def parse_audio(self, audio_url: str) -> None:
        audio = self._connector.fetch_audio(audio_url)

@ -762,13 +733,6 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
        placeholder = self._tracker.add("image_embeds", future)
        self._add_placeholder(placeholder)

    def parse_image_pil(self, image_pil: Image.Image) -> None:
        future: asyncio.Future[Image.Image] = asyncio.Future()
        future.set_result(image_pil)

        placeholder = self._tracker.add("image", future)
        self._add_placeholder(placeholder)

    def parse_audio(self, audio_url: str) -> None:
        audio_coro = self._connector.fetch_audio_async(audio_url)

@ -887,13 +851,12 @@ _TextParser = partial(cast, ChatCompletionContentPartTextParam)
_ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam)
_InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
_RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
_PILImageParser = partial(cast, CustomChatCompletionContentPILImageParam)
# Need to validate url objects
_ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
_AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
_VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python

_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio]

# Define a mapping from part types to their corresponding parsing functions.
MM_PARSER_MAP: dict[
@ -906,7 +869,6 @@ MM_PARSER_MAP: dict[
    lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
    "image_embeds":
    lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
    "image_pil": lambda part: _PILImageParser(part).get("image_pil", None),
    "audio_url":
    lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
    "input_audio":
@ -976,7 +938,7 @@ def _parse_chat_message_content_mm_part(


VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
                                       "image_embeds", "image_pil",
                                       "image_embeds",
                                       "audio_url", "input_audio", "video_url")


@ -1047,10 +1009,6 @@ def _parse_chat_message_content_part(
    else:
        return str_content

    if part_type == "image_pil":
        image_content = cast(Image.Image, content)
        mm_parser.parse_image_pil(image_content)
        return {'type': 'image'} if wrap_dicts else None
    if part_type == "image_url":
        str_content = cast(str, content)
        mm_parser.parse_image(str_content)
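The chat_utils hunks above add, on one side of the diff, a PIL-backed content part type (`image_pil`) alongside `image_url` and `image_embeds`, so an already-loaded `PIL.Image.Image` can be passed straight into a chat message. Assuming a build that includes that part type, a message carrying an in-memory image might look like the sketch below; the image path, model name, and the exact part shape (taken from the docstring example and the `image_pil` parser in the hunk) are assumptions, not guaranteed API.

```python
# Sketch only: assumes a vLLM build with the "image_pil" content part above;
# the image path and model name are placeholders.
from PIL import Image
from vllm import LLM

image = Image.open("cherry_blossom.jpg")  # placeholder path

messages = [{
    "role": "user",
    "content": [
        {"type": "text", "text": "What is shown in this image?"},
        {"type": "image_pil", "image_pil": image},
    ],
}]

llm = LLM(model="a-multimodal-model")  # placeholder model name
outputs = llm.chat(messages)
print(outputs[0].outputs[0].text)
```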
@ -1204,7 +1204,7 @@ class LLM:

        input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]

        pooling_params = PoolingParams(use_cross_encoder=True)
        pooling_params = PoolingParams()

        tokenization_kwargs: dict[str, Any] = {}
        _validate_truncation_size(self.llm_engine.model_config.max_model_len,
@ -910,8 +910,6 @@ TASK_HANDLERS: dict[str, dict[str, tuple]] = {
}

if envs.VLLM_SERVER_DEV_MODE:
    logger.warning("SECURITY WARNING: Development endpoints are enabled! "
                   "This should NOT be used in production!")

@router.get("/server_info")
async def show_server_info(raw_request: Request):
@ -229,6 +229,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
    logit_bias: Optional[dict[str, float]] = None
    logprobs: Optional[bool] = False
    top_logprobs: Optional[int] = 0
    # TODO(#9845): remove max_tokens when field is removed from OpenAI API
    max_tokens: Optional[int] = Field(
        default=None,
        deprecated=
@ -432,10 +433,23 @@ class ChatCompletionRequest(OpenAIBaseModel):
        }

    def to_beam_search_params(
            self, max_tokens: int,
            default_sampling_params: dict) -> BeamSearchParams:
        self,
        default_max_tokens: int,
        default_sampling_params: Optional[dict] = None
    ) -> BeamSearchParams:
        # TODO(#9845): remove max_tokens when field is removed from OpenAI API
        max_tokens = self.max_completion_tokens or self.max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}
        n = self.n if self.n is not None else 1

        # Use minimum of context window, user request & server limit.
        max_tokens = min(
            val for val in (default_max_tokens, max_tokens,
                            default_sampling_params.get("max_tokens", None))
            if val is not None)

        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
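The `to_beam_search_params` / `to_sampling_params` hunks in this file all resolve the effective token budget the same way: take the smallest of the context-window budget, the user-requested value, and any server-side default, skipping whichever of them are unset. A standalone sketch of that pattern is below; the helper name and signature are illustrative, not vLLM's.

```python
# Illustrative helper: smallest of the caps that are actually set.
from typing import Optional

def resolve_max_tokens(default_max_tokens: int,
                       requested: Optional[int],
                       server_default: Optional[int]) -> int:
    return min(v for v in (default_max_tokens, requested, server_default)
               if v is not None)

# A 4096-token context budget, a user asking for 512, no server default -> 512.
assert resolve_max_tokens(4096, 512, None) == 512
# The user did not set max_tokens -> fall back to the remaining context budget.
assert resolve_max_tokens(4096, None, None) == 4096
```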
@ -451,10 +465,21 @@ class ChatCompletionRequest(OpenAIBaseModel):

    def to_sampling_params(
        self,
        max_tokens: int,
        default_max_tokens: int,
        logits_processor_pattern: Optional[str],
        default_sampling_params: dict,
        default_sampling_params: Optional[dict] = None,
    ) -> SamplingParams:
        # TODO(#9845): remove max_tokens when field is removed from OpenAI API
        max_tokens = self.max_completion_tokens or self.max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}

        # Use minimum of context window, user request & server limit.
        max_tokens = min(
            val for val in (default_max_tokens, max_tokens,
                            default_sampling_params.get("max_tokens", None))
            if val is not None)

        # Default parameters
        if (repetition_penalty := self.repetition_penalty) is None:
@ -873,15 +898,22 @@ class CompletionRequest(OpenAIBaseModel):
        }

    def to_beam_search_params(
        self,
        max_tokens: int,
        default_sampling_params: Optional[dict] = None,
        self,
        default_max_tokens: int,
        default_sampling_params: Optional[dict] = None
    ) -> BeamSearchParams:
        max_tokens = self.max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}
        n = self.n if self.n is not None else 1

        # Use minimum of context window, user request & server limit.
        max_tokens = min(
            val for val in (default_max_tokens, max_tokens,
                            default_sampling_params.get("max_tokens", None))
            if val is not None)

        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get("temperature", 1.0)

@ -896,14 +928,21 @@ class CompletionRequest(OpenAIBaseModel):

    def to_sampling_params(
        self,
        max_tokens: int,
        default_max_tokens: int,
        logits_processor_pattern: Optional[str],
        default_sampling_params: Optional[dict] = None,
    ) -> SamplingParams:
        max_tokens = self.max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}

        # Use minimum of context window, user request & server limit.
        max_tokens = min(
            val for val in (default_max_tokens, max_tokens,
                            default_sampling_params.get("max_tokens", None))
            if val is not None)

        # Default parameters
        if (repetition_penalty := self.repetition_penalty) is None:
            repetition_penalty = default_sampling_params.get(
@ -1156,9 +1195,8 @@ class ScoreRequest(OpenAIBaseModel):

    # --8<-- [end:score-extra-params]

    def to_pooling_params(self, *, use_cross_encoder: bool = False):
        return PoolingParams(use_cross_encoder=use_cross_encoder,
                             additional_data=self.additional_data)
    def to_pooling_params(self):
        return PoolingParams(additional_data=self.additional_data)


class RerankRequest(OpenAIBaseModel):
@ -1183,9 +1221,8 @@ class RerankRequest(OpenAIBaseModel):

    # --8<-- [end:rerank-extra-params]

    def to_pooling_params(self, *, use_cross_encoder: bool = False):
        return PoolingParams(use_cross_encoder=use_cross_encoder,
                             additional_data=self.additional_data)
    def to_pooling_params(self):
        return PoolingParams(additional_data=self.additional_data)


class RerankDocument(BaseModel):
@ -1776,7 +1813,7 @@ class TranscriptionRequest(OpenAIBaseModel):
            self,
            default_max_tokens: int,
            default_sampling_params: Optional[dict] = None) -> SamplingParams:
        # TODO(#9845): remove max_tokens when field is removed from OpenAI API
        max_tokens = default_max_tokens

        if default_sampling_params is None:
@ -1992,7 +2029,7 @@ class TranslationRequest(OpenAIBaseModel):
            self,
            default_max_tokens: int,
            default_sampling_params: Optional[dict] = None) -> SamplingParams:
        # TODO(#9845): remove max_tokens when field is removed from OpenAI API
        max_tokens = default_max_tokens

        if default_sampling_params is None:
@ -34,7 +34,6 @@ from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
    MistralToolCall)
from vllm.entrypoints.utils import get_max_tokens
from vllm.logger import init_logger
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.reasoning import ReasoningParser, ReasoningParserManager
@ -234,22 +233,15 @@ class OpenAIServingChat(OpenAIServing):
        try:
            for i, engine_prompt in enumerate(engine_prompts):
                sampling_params: Union[SamplingParams, BeamSearchParams]

                if self.default_sampling_params is None:
                    self.default_sampling_params = {}

                max_tokens = get_max_tokens(
                    max_model_len=self.max_model_len,
                    request=request,
                    input_length=len(engine_prompt["prompt_token_ids"]),
                    default_sampling_params=self.default_sampling_params)

                default_max_tokens = self.max_model_len - len(
                    engine_prompt["prompt_token_ids"])
                if request.use_beam_search:
                    sampling_params = request.to_beam_search_params(
                        max_tokens, self.default_sampling_params)
                        default_max_tokens, self.default_sampling_params)
                else:
                    sampling_params = request.to_sampling_params(
                        max_tokens, self.model_config.logits_processor_pattern,
                        default_max_tokens,
                        self.model_config.logits_processor_pattern,
                        self.default_sampling_params)

                self._log_inputs(request_id,
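One side of the serving_chat and serving_completion hunks calls a `get_max_tokens` helper from `vllm.entrypoints.utils` instead of computing `max_model_len - prompt_length` inline. The helper's body is not shown in this diff; the sketch below is one plausible shape for it, combining the prompt-length budget with the request and server caps (the same min-over-set-values pattern as the earlier sketch). The parameter names here are hypothetical and differ from the real call, which passes the request object itself.

```python
# Hypothetical sketch of a get_max_tokens-style helper; not the actual
# vllm.entrypoints.utils implementation, which is not part of this diff.
from typing import Optional

def get_max_tokens(max_model_len: int, input_length: int,
                   requested_max_tokens: Optional[int],
                   default_sampling_params: dict) -> int:
    context_budget = max_model_len - input_length
    return min(v for v in (context_budget, requested_max_tokens,
                           default_sampling_params.get("max_tokens"))
               if v is not None)

# An 8192-token model with a 1000-token prompt and max_tokens=256 -> 256.
assert get_max_tokens(8192, 1000, 256, {}) == 256
```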
@ -33,7 +33,6 @@ from vllm.entrypoints.openai.serving_engine import (OpenAIServing,
                                                    is_text_tokens_prompt)
# yapf: enable
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.utils import get_max_tokens
from vllm.inputs.data import (EmbedsPrompt, TokensPrompt, is_embeds_prompt,
                              is_tokens_prompt)
from vllm.logger import init_logger
@ -161,22 +160,15 @@ class OpenAIServingCompletion(OpenAIServing):
                    input_length = len(engine_prompt["prompt_token_ids"])
                else:
                    assert_never(engine_prompt)

                if self.default_sampling_params is None:
                    self.default_sampling_params = {}

                max_tokens = get_max_tokens(
                    max_model_len=self.max_model_len,
                    request=request,
                    input_length=input_length,
                    default_sampling_params=self.default_sampling_params)
                default_max_tokens = self.max_model_len - input_length

                if request.use_beam_search:
                    sampling_params = request.to_beam_search_params(
                        max_tokens, self.default_sampling_params)
                        default_max_tokens, self.default_sampling_params)
                else:
                    sampling_params = request.to_sampling_params(
                        max_tokens, self.model_config.logits_processor_pattern,
                        default_max_tokens,
                        self.model_config.logits_processor_pattern,
                        self.default_sampling_params)

                request_id_item = f"{request_id}-{i}"
@ -25,7 +25,9 @@ from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
                                               PreTrainedTokenizer,
                                               PreTrainedTokenizerFast)
from vllm.utils import make_async, merge_async_iterators

logger = init_logger(__name__)
@ -48,7 +50,7 @@ class ServingScores(OpenAIServing):

    async def _embedding_score(
        self,
        tokenizer: AnyTokenizer,
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
        texts_1: list[str],
        texts_2: list[str],
        request: Union[RerankRequest, ScoreRequest],
@ -139,7 +141,7 @@ class ServingScores(OpenAIServing):

    async def _cross_encoding_score(
        self,
        tokenizer: AnyTokenizer,
        tokenizer: Union[AnyTokenizer],
        texts_1: list[str],
        texts_2: list[str],
        request: Union[RerankRequest, ScoreRequest],
@ -188,7 +190,7 @@ class ServingScores(OpenAIServing):
        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []

        pooling_params = request.to_pooling_params(use_cross_encoder=True)
        pooling_params = request.to_pooling_params()

        for i, engine_prompt in enumerate(engine_prompts):
            request_id_item = f"{request_id}-{i}"
Some files were not shown because too many files have changed in this diff.