Compare commits

...

43 Commits

Author SHA1 Message Date
a5dd03c1eb Revert "[V0 deprecation] Remove V0 CPU/XPU/TPU backends (#20412)"
This reverts commit e202dd2736bc575b11250b15311512d19d3225d5.
2025-07-06 14:02:36 -07:00
c18b3b8e8b [Bugfix] Add use_cross_encoder flag to use correct activation in ClassifierPooler (#20527)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-06 14:01:48 -07:00
9528e3a05e [BugFix][Spec Decode] Fix spec token ids in model runner (#20530)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-06 19:44:52 +00:00
9fb52e523a [V1] Support any head size for FlexAttention backend (#20467)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-06 09:54:36 -07:00
e202dd2736 [V0 deprecation] Remove V0 CPU/XPU/TPU backends (#20412)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2025-07-06 08:48:13 -07:00
43813e6361 [Misc] call the pre-defined func (#20518)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-06 10:25:29 +00:00
cede942b87 [Benchmark] Add support for multiple batch size benchmark through CLI in benchmark_moe.py (#20516)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-07-06 09:20:11 +00:00
fe1e924811 [Frontend] Support image object in llm.chat (#19635)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Signed-off-by: Flora Feng <4florafeng@gmail.com>
2025-07-06 06:47:13 +00:00
4548c03c50 [TPU][Bugfix] fix the MoE OOM issue (#20339)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-07-05 21:19:09 -07:00
40b86aa05e [BugFix] Fix: ImportError when building on hopper systems (#20513)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-07-06 12:17:30 +08:00
432870829d [Bugfix] Fix missing per_act_token parameter in compressed_tensors_moe (#20509)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-07-06 12:08:30 +08:00
f73d02aadc [BUG] Fix #20484. Support empty sequence in cuda penalty kernel (#20491)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
2025-07-05 19:38:02 -07:00
c5ebe040ac test_attention compat with coming xformers change (#20487)
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-05 19:37:59 -07:00
8d763cb891 [Misc] remove unused import (#20517)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-05 19:17:06 -07:00
cf4cd53982 [Misc] Add logger.exception for TPU information collection failures (#20510)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-05 07:24:32 -07:00
32c9be2200 [v1] Re-add fp32 support to v1 engine through FlexAttention (#19754)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-05 09:41:10 +00:00
8aeaa910a2 Fix unknown attribute of topk_indices_dtype in CompressedTensorsW8A8Fp8MoECutlassMethod (#20507)
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-07-05 14:03:20 +08:00
906e05d840 [Misc] Remove the unused LoRA test code (#20494)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-05 13:48:16 +08:00
ef9a2990ae [doc] small fix (#20506)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-04 20:56:39 -07:00
7e90870491 [Misc] Add security warning for development mode endpoints (#20508)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-04 20:52:13 -07:00
d3f05c9248 [Doc] fix mutltimodal_inputs.md gh examples link (#20497)
Signed-off-by: Guy Stone <guys@spotify.com>
2025-07-04 16:41:35 -07:00
c108781c85 [CI Bugfix] Fix pre-commit failures on main (#20502)
2025-07-04 14:17:30 -07:00
3d184b95b8 [feat]: CUTLASS block scaled group gemm for SM100 (#19757)
Signed-off-by: Duncan Moss <djm.moss@gmail.com>
Co-authored-by: Duncan Moss <dmoss@nvidia.com>
2025-07-04 12:58:04 -06:00
2f35a022e6 Enable V1 for Hybrid SSM/Attention Models (#20016)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Stanislaw Wozniak <stw@zurich.ibm.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
2025-07-04 17:46:53 +00:00
ffe00ef77a [Misc] Small: Remove global media connector. Each test should have its own test connector object. (#20395)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-07-04 08:15:03 -07:00
5561681d04 [CI] add kvcache-connector dependency definition and add into CI build (#18193)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
2025-07-04 06:49:18 -07:00
fbd62d8750 [Doc] Fix classification table in list of supported models (#20489)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-04 06:08:02 -07:00
2e26f9156a [Model][3/N] Automatic conversion of CrossEncoding model (#20168)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-07-04 05:47:39 -07:00
9e5452ee34 [Bug][Frontend] Fix structure of transcription's decoder_prompt (#18809)
Signed-off-by: sangbumlikeagod <oironese@naver.com>
2025-07-04 11:28:07 +00:00
0e3fe896e2 Support Llama 4 for fused_marlin_moe (#20457)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-04 07:55:10 +00:00
1caca5a589 [Misc] Add SPDX-FileCopyrightText (#20428)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-04 07:40:42 +00:00
783921d889 [Perf] Optimize Vectorization Utils for Int 8 Quantization Kernels (#20331)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-04 15:06:24 +08:00
4a98edff1f [Structured Outputs][V1] Skipping with models doesn't contain tokenizers (#20365)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-07-04 15:05:49 +08:00
a7bab0c9e5 [Misc] small update (#20462)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-03 20:33:44 -07:00
25950dca9b Add ignore consolidated file in mistral example code (#20420)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-07-04 02:55:07 +00:00
a4113b035c [Platform] Add custom default max tokens (#18557)
Signed-off-by: Gabriel Marinho <gmarinho@ibm.com>
2025-07-04 10:50:17 +08:00
7e1665b089 [Misc] Change warn_for_unimplemented_methods to debug (#20455)
2025-07-04 02:35:08 +00:00
8d1096e7db [Bugfix] Register reducer even if transformers_modules not available (#19510)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-07-03 22:08:12 +00:00
8d775dd30a [Misc] Fix Unable to detect current VLLM config. Defaulting to NHD kv cache layout warning (#20400)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-07-03 14:56:09 -07:00
78fe77534b [Kernel] Enable fp8 support for pplx and BatchedTritonExperts. (#18864)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-07-03 14:55:40 -07:00
2f2fcb31b8 [Misc] Remove _maybe_ignore_quant_config from GLM4.1v (#20432)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
2025-07-03 21:41:13 +00:00
1dba2c4ebe [Misc] adjust for ipv6 for mookcacke url parse (#20107)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-07-03 20:27:17 +00:00
71d6de3a26 [Misc] Clean up InternVL family config registration (#19992)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-07-03 20:01:47 +00:00
181 changed files with 4116 additions and 1499 deletions

View File

@ -52,7 +52,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow"

View File

@ -107,10 +107,9 @@ fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/stest_attention_selector.py \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_blocksparse_attention.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \

.github/CODEOWNERS vendored
View File

@ -16,7 +16,7 @@
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm
/vllm/entrypoints @aarnphm
CMakeLists.txt @tlrmchlsmth
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people

View File

@ -68,7 +68,7 @@ jobs:
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
- name: curl test
run: |

View File

@ -259,7 +259,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -615,6 +615,26 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"in CUDA target architectures.")
endif()
endif()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
"not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
"if you intend on running FP8 quantized MoE models on Blackwell.")
else()
message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
"in CUDA target architectures")
endif()
endif()
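
The new block above gates the SM100 blockwise-scaled group GEMM on a CUDA 12.8+ compiler and a 10.0a target architecture. As a rough, illustrative runtime check (not part of the CMake logic, and only an assumption about how the build maps to devices), one can compare the CUDA build version and device compute capability:

```python
# Illustrative sketch: the SM100 MoE kernels need a CUDA 12.8+ build and a
# compute capability 10.x (Blackwell) device. Not part of the build system.
import torch

cuda_version = tuple(int(x) for x in torch.version.cuda.split("."))  # e.g. (12, 8)
major, minor = torch.cuda.get_device_capability()
print("CUDA build:", cuda_version, "device capability:", (major, minor))
print("SM100 MoE kernels usable:", cuda_version >= (12, 8) and major == 10)
```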
#
# Machete kernels

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools

View File

@ -620,7 +620,7 @@ def main(args: argparse.Namespace):
4096,
]
else:
batch_sizes = [args.batch_size]
batch_sizes = args.batch_size
use_deep_gemm = bool(args.use_deep_gemm)
@ -728,7 +728,7 @@ if __name__ == "__main__":
)
parser.add_argument("--use-deep-gemm", action="store_true")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--batch-size", type=int, nargs="+", required=False)
parser.add_argument("--tune", action="store_true")
parser.add_argument("--trust-remote-code", action="store_true")
parser.add_argument("--model-prefix", type=str, required=False)

View File

@ -45,7 +45,6 @@
#include "cute/algorithm/functional.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cute/algorithm/gemm.hpp"
#include "cute/tensor_predicate.hpp"
#include "cute/numeric/arithmetic_tuple.hpp"
#include "cutlass_extensions/gemm/dispatch_policy.hpp"

View File

@ -162,10 +162,11 @@ __global__ void dynamic_scaled_int8_quant_kernel(
// calculate for absmax
float thread_max = 0.f;
for (int i = tid; i < hidden_size; i += stride) {
const auto v = fabsf(static_cast<float>(row_in[i]));
thread_max = fmaxf(thread_max, v);
}
vectorize_read_with_alignment<16>(
row_in, hidden_size, tid, stride, [&] __device__(const scalar_t& src) {
const float v = fabsf(static_cast<float>(src));
thread_max = fmaxf(thread_max, v);
});
using BlockReduce = cub::BlockReduce<float, 256>;
__shared__ typename BlockReduce::TempStorage tmp;
float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
@ -232,9 +233,10 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
// 1. calculate min & max
MinMax thread_mm;
for (int i = tid; i < hidden_size; i += stride) {
thread_mm += static_cast<float>(row_in[i]);
}
vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
[&] __device__(const scalar_t& src) {
thread_mm += static_cast<float>(src);
});
using BlockReduce = cub::BlockReduce<MinMax, 256>;
__shared__ typename BlockReduce::TempStorage tmp;
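
The math these kernels compute is unchanged by this diff; only the absmax/min-max reads are now vectorized. For orientation, a rough PyTorch sketch of the symmetric (non-azp) per-row dynamic int8 quantization, under the usual absmax convention, looks like this:

```python
# Rough reference for per-row dynamic int8 quantization (symmetric variant
# only); an illustrative sketch, not the kernel's exact numerics.
import torch

def dynamic_scaled_int8_quant_ref(x: torch.Tensor):
    # x: (num_tokens, hidden_size), floating point
    absmax = x.abs().amax(dim=-1, keepdim=True)
    scale = absmax.clamp(min=1e-10) / 127.0          # avoid divide-by-zero
    q = torch.round(x / scale).clamp(-128, 127).to(torch.int8)
    return q, scale.squeeze(-1)
```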

View File

@ -51,7 +51,8 @@ struct cutlass_3x_gemm {
// These are the minimum alignments needed for the kernels to compile
static constexpr int AlignmentAB =
128 / cutlass::sizeof_bits<ElementAB>::value;
static constexpr int AlignmentCD = 4;
static constexpr int AlignmentCD =
128 / cutlass::sizeof_bits<ElementD>::value;
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
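
The replacement expresses the C/D alignment in elements per 128-bit access instead of a hard-coded 4, so it scales with the output element width. A quick sanity check of the arithmetic (illustrative only):

```python
# 128 / sizeof_bits<ElementD>: number of elements covered by a 128-bit access.
for name, bits in [("fp32", 32), ("fp16/bf16", 16), ("fp8", 8)]:
    print(f"{name:>9}: AlignmentCD = {128 // bits}")  # 4, 8, 16
```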

View File

@ -0,0 +1,374 @@
#include "core/registration.h"
#include <torch/all.h>
#include <cutlass/arch/arch.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/group_array_problem_shape.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/gett.hpp"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include <cassert>
using namespace cute;
template <typename ElementAB, typename ElementC, typename ElementAccumulator,
typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
__global__ void get_ggemm_starts(
int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets,
ElementC** out_offsets, ElementAccumulator** a_scale_offsets,
ElementAccumulator** b_scale_offsets, ElementAB* a_base_as_int,
ElementAB* b_base_as_int, ElementC* out_base_as_int,
ElementAccumulator* a_scale_base_as_int,
ElementAccumulator* b_scale_base_as_int, LayoutSFA* layout_sfa_base_as_int,
LayoutSFB* layout_sfb_base_as_int, int* problem_sizes) {
int expert_id = threadIdx.x;
if (expert_id >= gridDim.x * blockDim.x) {
return;
}
int m = problem_sizes[expert_id * 3];
int n = problem_sizes[expert_id * 3 + 1];
int k = problem_sizes[expert_id * 3 + 2];
int32_t expert_offset = expert_offsets[expert_id];
int a_stride = expert_offset * k;
int b_stride = expert_id * k * n;
int a_scale_stride = expert_offset * k / 128;
int b_scale_stride = expert_id * k * n / 128 / 128;
a_offsets[expert_id] = a_base_as_int + a_stride;
b_offsets[expert_id] = b_base_as_int + b_stride;
out_offsets[expert_id] = out_base_as_int + expert_offset * n;
a_scale_offsets[expert_id] = a_scale_base_as_int + a_scale_stride;
b_scale_offsets[expert_id] = b_scale_base_as_int + b_scale_stride;
LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id;
LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id;
*layout_sfa_ptr =
ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1));
*layout_sfb_ptr =
ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1));
}
#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, \
ScaleConfig) \
else if (out_tensors.dtype() == TENSOR_C_TYPE) { \
get_ggemm_starts<cutlass::float_e4m3_t, C_TYPE, float, LayoutSFA, \
LayoutSFB, ScaleConfig><<<1, num_experts, 0, stream>>>( \
static_cast<int32_t*>(expert_offsets.data_ptr()), \
static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()), \
static_cast<cutlass::float_e4m3_t**>(b_ptrs.data_ptr()), \
static_cast<C_TYPE**>(out_ptrs.data_ptr()), \
static_cast<float**>(a_scales_ptrs.data_ptr()), \
static_cast<float**>(b_scales_ptrs.data_ptr()), \
static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()), \
static_cast<cutlass::float_e4m3_t*>(b_tensors.data_ptr()), \
static_cast<C_TYPE*>(out_tensors.data_ptr()), \
static_cast<float*>(a_scales.data_ptr()), \
static_cast<float*>(b_scales.data_ptr()), \
reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()), \
reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()), \
static_cast<int*>(problem_sizes.data_ptr())); \
}
template <typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
void run_get_ggemm_starts(
torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
torch::Tensor const& a_tensors, torch::Tensor const& b_tensors,
torch::Tensor out_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& layout_sfa,
torch::Tensor const& layout_sfb, torch::Tensor const& problem_sizes) {
TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0);
TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0);
int num_experts = (int)expert_offsets.size(0);
auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
if (false) {
}
__CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA,
LayoutSFB, ScaleConfig)
__CALL_GET_STARTS_KERNEL(torch::kFloat16, cutlass::half_t, LayoutSFA,
LayoutSFB, ScaleConfig)
else {
TORCH_CHECK(false, "Unsupported output tensor type");
}
}
template <typename OutType, typename ScheduleConfig, typename LayoutD>
void run_blockwise_scaled_group_mm(
torch::Tensor& out_ptrs, const torch::Tensor& a_ptrs,
const torch::Tensor& b_ptrs, const torch::Tensor& a_scales_ptrs,
const torch::Tensor& b_scales_ptrs, const torch::Tensor& stride_a,
const torch::Tensor& stride_b, const torch::Tensor& stride_c,
const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb,
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;
// Types
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = OutType;
using ElementD = ElementC;
using ElementAccumulator = float;
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = LayoutD;
// Alignments
static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
using ArchTag = cutlass::arch::Sm100;
using OperatorClass = cutlass::arch::OpClassTensorOp;
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass, typename ScheduleConfig::MmaTileShape,
typename ScheduleConfig::ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
ElementAccumulator, void, LayoutC*, AlignmentC, ElementD, LayoutC*,
AlignmentC, typename ScheduleConfig::EpilogueSchedule>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass, ElementA,
cute::tuple<LayoutA*, typename ScheduleConfig::LayoutSFA*>,
AlignmentA, ElementB,
cute::tuple<LayoutB*, typename ScheduleConfig::LayoutSFB*>,
AlignmentB, ElementAccumulator, typename ScheduleConfig::MmaTileShape,
typename ScheduleConfig::ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
sizeof(typename CollectiveEpilogue::SharedStorage))>,
typename ScheduleConfig::KernelSchedule>::CollectiveOp;
using GemmKernel =
cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
CollectiveEpilogue, void>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using StrideA = typename Gemm::GemmKernel::InternalStrideA;
using StrideB = typename Gemm::GemmKernel::InternalStrideB;
using StrideC = typename Gemm::GemmKernel::InternalStrideC;
using StrideD = typename Gemm::GemmKernel::InternalStrideD;
using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
int num_experts = (int)expert_offsets.size(0);
Gemm gemm_op;
// Mainloop Arguments
typename GemmKernel::MainloopArguments mainloop_args{
static_cast<const ElementA**>(a_ptrs.data_ptr()),
static_cast<StrideA*>(stride_a.data_ptr()),
static_cast<const ElementB**>(b_ptrs.data_ptr()),
static_cast<StrideB*>(stride_b.data_ptr()),
static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
reinterpret_cast<typename ScheduleConfig::LayoutSFA*>(
layout_sfa.data_ptr()),
static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
reinterpret_cast<typename ScheduleConfig::LayoutSFB*>(
layout_sfb.data_ptr())};
cutlass::KernelHardwareInfo hw_info;
hw_info.device_id = a_ptrs.get_device();
hw_info.sm_count =
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
hw_info.device_id);
// Epilogue Arguments
typename GemmKernel::EpilogueArguments epilogue_args{
{}, // epilogue.thread
nullptr,
static_cast<StrideC*>(stride_c.data_ptr()),
static_cast<ElementD**>(out_ptrs.data_ptr()),
static_cast<StrideC*>(stride_c.data_ptr())};
UnderlyingProblemShape* problem_sizes_as_shapes =
static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
// Gemm Arguments
typename GemmKernel::Arguments args{
cutlass::gemm::GemmUniversalMode::kGrouped,
{num_experts, problem_sizes_as_shapes, nullptr},
mainloop_args,
epilogue_args,
hw_info};
at::cuda::CUDAGuard device_guard{(char)a_ptrs.device().index()};
const cudaStream_t stream =
at::cuda::getCurrentCUDAStream(a_ptrs.get_device());
auto can_implement_status = gemm_op.can_implement(args);
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
"Failed to implement GEMM");
size_t workspace_size = gemm_op.get_workspace_size(args);
auto const workspace_options =
torch::TensorOptions().dtype(torch::kUInt8).device(a_ptrs.device());
auto workspace = torch::empty(workspace_size, workspace_options);
auto status = gemm_op.initialize(args, workspace.data_ptr(), stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
status = gemm_op.run(stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
}
template <typename OutType>
void blockwise_scaled_group_mm_dispatch_shape(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& scales_a, const torch::Tensor& scales_b,
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
struct MmaConfig {
using ElementA = cutlass::float_e4m3_t;
using KernelSchedule =
cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100;
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>;
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
using LayoutC = cutlass::layout::RowMajor;
using MmaTileShape = Shape<_128, _128, _128>;
using ClusterShape = Shape<_1, _1, _1>;
};
int num_experts = (int)expert_offsets.size(0);
auto a_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto b_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto out_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto a_scales_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto b_scales_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto layout_sfa = torch::empty(
{num_experts, 5},
torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
auto layout_sfb = torch::empty(
{num_experts, 5},
torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
auto stride_a = torch::full(
{num_experts}, a.size(1),
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto stride_b = torch::full(
{num_experts}, a.size(1),
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto stride_c = torch::full(
{num_experts}, output.size(1),
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
torch::TensorOptions options_int =
torch::TensorOptions().dtype(torch::kInt64).device(a.device());
run_get_ggemm_starts<typename MmaConfig::LayoutSFA,
typename MmaConfig::LayoutSFB,
typename MmaConfig::ScaleConfig>(
expert_offsets, a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, a,
b, output, scales_a, scales_b, layout_sfa, layout_sfb, problem_sizes);
run_blockwise_scaled_group_mm<OutType, MmaConfig,
typename MmaConfig::LayoutC>(
out_ptrs, a_ptrs, b_ptrs, a_scales_ptrs, b_scales_ptrs, stride_a,
stride_b, stride_c, layout_sfa, layout_sfb, problem_sizes,
expert_offsets);
}
void cutlass_blockwise_scaled_grouped_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& scales_a, const torch::Tensor& scales_b,
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
TORCH_CHECK(problem_sizes.size(1) == 3,
"problem_sizes must have shape (num_experts, 3)");
TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
"Number of experts in problem_sizes must match expert_offsets");
TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
"problem_sizes must be int32");
TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn,
"a must be kFloat8_e4m3fn");
TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn,
"b must be kFloat8_e4m3fn");
TORCH_CHECK(output.scalar_type() == torch::kBFloat16 ||
output.scalar_type() == torch::kHalf,
"output must be bfloat16 or half");
TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32,
"scales_a must be float32");
TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32,
"scales_b must be float32");
TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32,
"expert_offsets must be int32");
TORCH_CHECK(output.dim() == 2, "output must be 2D tensor");
TORCH_CHECK(a.dim() == 2, "a must be 2D tensor");
TORCH_CHECK(b.dim() == 3, "b must be 3D tensor");
TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor");
TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor");
TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
TORCH_CHECK(problem_sizes.size(1) == 3,
"problem_sizes must have shape (num_experts, 3)");
TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
"Number of experts in problem_sizes must match expert_offsets");
TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
"problem_sizes must be int32");
TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor");
#if defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100
if (output.scalar_type() == torch::kBFloat16) {
blockwise_scaled_group_mm_dispatch_shape<cutlass::bfloat16_t>(
output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
} else if (output.scalar_type() == torch::kFloat16) {
blockwise_scaled_group_mm_dispatch_shape<cutlass::half_t>(
output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
} else {
TORCH_CHECK(false, "Unsupported output tensor type");
}
#endif
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("cutlass_blockwise_scaled_grouped_mm",
&cutlass_blockwise_scaled_grouped_mm);
}
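
The op is registered for CUDA above. The following is a hedged Python sketch of how it might be invoked, with dtypes and shapes inferred from the `TORCH_CHECK`s and the pointer arithmetic in `get_ggemm_starts`; treat the layout details and the `torch.ops._C` namespace as assumptions, not the canonical calling contract.

```python
# Hypothetical calling sketch for the new grouped GEMM op. Shapes are inferred
# from this file's checks, not taken from the Python-side MoE integration.
import torch

E, M, N, K = 4, 256, 512, 1024            # experts, total tokens, out dim, in dim
a = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)
b = torch.randn(E, N, K, device="cuda").to(torch.float8_e4m3fn)
scales_a = torch.rand(M, K // 128, device="cuda", dtype=torch.float32)
scales_b = torch.rand(E, N // 128, K // 128, device="cuda", dtype=torch.float32)
out = torch.empty(M, N, device="cuda", dtype=torch.bfloat16)
# per-expert (m, n, k) and running row offsets into `a` / `out`
problem_sizes = torch.tensor([[M // E, N, K]] * E, device="cuda", dtype=torch.int32)
expert_offsets = torch.arange(0, M, M // E, device="cuda", dtype=torch.int32)

# assumes the extension registers its ops under torch.ops._C
torch.ops._C.cutlass_blockwise_scaled_grouped_mm(
    out, a, b, scales_a, scales_b, problem_sizes, expert_offsets)
```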

View File

@ -38,7 +38,6 @@
#include "cute/atom/mma_atom.hpp"
#include "cute/atom/copy_traits_sm90_tma.hpp"
#include "cute/algorithm/gemm.hpp"
#include "cute/tensor_predicate.hpp"
#include "cute/numeric/arithmetic_tuple.hpp"
#include "cutlass/pipeline/pipeline.hpp"
#include "cutlass/transform/collective/sm90_wgmma_transpose.hpp"

View File

@ -27,6 +27,26 @@ __device__ inline void vectorize_with_alignment(
constexpr int WIDTH = VEC_SIZE * sizeof(InT); // eg: 64 B
uintptr_t addr = reinterpret_cast<uintptr_t>(in);
// fast path when the whole region is already aligned
// Note: currently the output is guaranteed to be the same as the input, so we
// don't check it here; this comment is kept for future reference.
bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0);
if (can_vec) {
int num_vec = len / VEC_SIZE;
using vin_t = vec_n_t<InT, VEC_SIZE>;
using vout_t = vec_n_t<OutT, VEC_SIZE>;
auto* v_in = reinterpret_cast<const vin_t*>(in);
auto* v_out = reinterpret_cast<vout_t*>(out);
for (int i = tid; i < num_vec; i += stride) {
vout_t tmp;
vec_op(tmp, v_in[i]);
v_out[i] = tmp;
}
return;
}
int misalignment_offset = addr & (WIDTH - 1); // addr % 64
int alignment_bytes = WIDTH - misalignment_offset; // 64 - (addr % 64)
int prefix_elems = alignment_bytes & (WIDTH - 1); // handle 64
@ -72,4 +92,81 @@ __device__ __forceinline__ void vectorize_with_alignment(const InT* in,
std::forward<ScaOp>(scalar_op));
}
template <int VEC_SIZE, typename InT, typename ScaOp>
struct DefaultReadVecOp {
ScaOp scalar_op;
__device__ __forceinline__ void operator()(
const vec_n_t<InT, VEC_SIZE>& src) const {
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
scalar_op(src.val[i]);
}
}
};
// read-only version: iterate over the input with alignment guarantees
template <int VEC_SIZE, typename InT, typename VecOp, typename ScaOp>
__device__ inline void vectorize_read_with_alignment(const InT* in, int len,
int tid, int stride,
VecOp&& vec_op,
ScaOp&& scalar_op) {
static_assert(VEC_SIZE > 0 && (VEC_SIZE & (VEC_SIZE - 1)) == 0,
"VEC_SIZE must be a positive power-of-two");
constexpr int WIDTH = VEC_SIZE * sizeof(InT);
uintptr_t addr = reinterpret_cast<uintptr_t>(in);
// fast path when the whole region is already aligned
bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0);
if (can_vec) {
int num_vec = len / VEC_SIZE;
using vin_t = vec_n_t<InT, VEC_SIZE>;
auto* v_in = reinterpret_cast<const vin_t*>(in);
for (int i = tid; i < num_vec; i += stride) {
vec_op(v_in[i]);
}
return;
}
int misalignment_offset = addr & (WIDTH - 1);
int alignment_bytes = WIDTH - misalignment_offset;
int prefix_elems = alignment_bytes & (WIDTH - 1);
prefix_elems /= sizeof(InT);
prefix_elems = min(prefix_elems, len);
// 1. handle the possibly unaligned prefix with scalar access.
for (int i = tid; i < prefix_elems; i += stride) {
scalar_op(in[i]);
}
in += prefix_elems;
len -= prefix_elems;
int num_vec = len / VEC_SIZE;
using vin_t = vec_n_t<InT, VEC_SIZE>;
auto* v_in = reinterpret_cast<const vin_t*>(in);
// 2. vectorized traversal of the main aligned region.
for (int i = tid; i < num_vec; i += stride) {
vec_op(v_in[i]);
}
// 3. handle remaining tail elements.
int tail_start = num_vec * VEC_SIZE;
for (int i = tid + tail_start; i < len; i += stride) {
scalar_op(in[i]);
}
}
// overload that requires only a scalar_op
template <int VEC_SIZE, typename InT, typename ScaOp>
__device__ __forceinline__ void vectorize_read_with_alignment(
const InT* in, int len, int tid, int stride, ScaOp&& scalar_op) {
using Vec = DefaultReadVecOp<VEC_SIZE, InT, std::decay_t<ScaOp>>;
vectorize_read_with_alignment<VEC_SIZE>(in, len, tid, stride, Vec{scalar_op},
std::forward<ScaOp>(scalar_op));
}
} // namespace vllm

View File

@ -59,6 +59,8 @@ void apply_repetition_penalties_(
int vocab_size = logits.size(-1);
int num_seqs = logits.size(0);
if (num_seqs == 0) return;
// Get number of SMs on the current device
int sms = 0;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount,

View File

@ -79,7 +79,8 @@ struct cutlass_sparse_3x_gemm {
// These are the minimum alignments needed for the kernels to compile
static constexpr int AlignmentAB =
128 / cutlass::sizeof_bits<ElementAB>::value;
static constexpr int AlignmentCD = 4;
static constexpr int AlignmentCD =
128 / cutlass::sizeof_bits<ElementD>::value;
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<

View File

@ -393,6 +393,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
{stride_tag});
ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);
// cutlass blockwise scaled group GEMM
ops.def(
"cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, "
"Tensor scales_a, Tensor scales_b, "
"Tensor problem_sizes, Tensor expert_offsets) -> ()",
{stride_tag});
// conditionally compiled so impl registration is in source file
// cutlass nvfp4 block scaled group GEMM
ops.def(
"cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b,"

View File

@ -1,3 +1,4 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
@ -62,12 +63,16 @@ ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
ARG PIP_KEYRING_PROVIDER=disabled
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}
# Flag that enables building KV-connector dependency libs into docker images
ARG INSTALL_KV_CONNECTORS=false
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM ${BUILD_BASE_IMAGE} AS base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ENV DEBIAN_FRONTEND=noninteractive
ARG DEADSNAKES_MIRROR_URL
@ -276,6 +281,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
FROM ${FINAL_BASE_IMAGE} AS vllm-base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG INSTALL_KV_CONNECTORS=false
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM
@ -485,6 +491,7 @@ RUN mv mkdocs.yaml test_docs/
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@ -493,8 +500,13 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
COPY requirements/kv_connectors.txt requirements/kv_connectors.txt
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
uv pip install --system -r requirements/kv_connectors.txt; \
fi; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
else \

View File

@ -14,7 +14,7 @@ Before setting up the incremental build:
VLLM_USE_PRECOMPILED=1 uv pip install -U -e . --torch-backend=auto
```
2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu/cuda.inc.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
3. **Build Tools:** It is highly recommended to install `ccache` for fast rebuilds by caching compilation results (e.g., `sudo apt install ccache` or `conda install ccache`). Also, ensure the core build dependencies like `cmake` and `ninja` are installed. These are installable through `requirements/build.txt` or your system's package manager.

View File

@ -101,6 +101,49 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
```python
import torch
from vllm import LLM
from vllm.assets.image import ImageAsset
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image_url = "https://picsum.photos/id/32/512/512"
image_pil = ImageAsset('cherry_blossom').pil_image
image_embeds = torch.load(...)
conversation = [
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": "Hello"},
{"role": "assistant", "content": "Hello! How can I assist you today?"},
{
"role": "user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
},{
"type": "image_pil",
"image_pil": image_pil
}, {
"type": "image_embeds",
"image_embeds": image_embeds
}, {
"type": "text",
"text": "What's in these images?"
}],
},
]
# Perform inference and log output.
outputs = llm.chat(conversation)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
??? Code
@ -228,7 +271,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
For certain models, we provide alternative chat templates inside <gh-dir:vllm/examples>.
For certain models, we provide alternative chat templates inside <gh-dir:examples>.
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
### Image Inputs

View File

@ -470,6 +470,7 @@ Specified using `--task classify`.
|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------|-----------------------|
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | |
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ |
If your model is not in the above list, we will try to automatically convert the model using
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
@ -477,12 +478,20 @@ If your model is not in the above list, we will try to automatically convert the
Specified using `--task score`.
| Architecture | Models | Example HF Models | [V1](gh-issue:8779) |
|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|-----------------------|
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | |
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ |
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | |
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | |
| Architecture | Models | Example HF Models | [V1](gh-issue:8779) |
|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|---------------------|
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | |
| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ |
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ |
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | |
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | |
!!! note
Load the official original `mxbai-rerank-v2` by using the following command.
```bash
vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}'
```
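
The same overrides can presumably be passed through the offline API as well; a hedged sketch, assuming the `hf_overrides` constructor argument and `LLM.score` behave as in current vLLM releases:

```python
# Hedged offline-inference equivalent of the serve command above (assumed API).
from vllm import LLM

llm = LLM(
    model="mixedbread-ai/mxbai-rerank-base-v2",
    task="score",
    hf_overrides={
        "architectures": ["Qwen2ForSequenceClassification"],
        "classifier_from_token": ["0", "1"],
        "method": "from_2_way_softmax",
    },
)
outputs = llm.score("What is the capital of France?",
                    ["Paris is the capital of France.", "The sky is blue."])
print([o.outputs.score for o in outputs])
```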
!!! note
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: <gh-file:examples/offline_inference/qwen3_reranker.py>.
@ -490,6 +499,7 @@ Specified using `--task score`.
```bash
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
```
[](){ #supported-mm-models }
## List of Multimodal Language Models
@ -616,9 +626,6 @@ Specified using `--task generate`.
!!! note
Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently.
!!! note
`h2oai/h2ovl-mississippi-2b` will be available in V1 once we support head size 80.
!!! note
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
@ -661,11 +668,8 @@ Specified using `--task generate`.
Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
!!! note
To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from source via
`pip install git+https://github.com/huggingface/transformers.git`.
Read audio from video pre-processing is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
`--mm-processor-kwargs '{"use_audio_in_video": true}'`.
For Qwen2.5-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`)
is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
#### Transcription

View File

@ -6,6 +6,7 @@ import argparse
from vllm import LLM
from vllm.sampling_params import SamplingParams
from vllm.assets.image import ImageAsset
# This script is an offline demo for running Mistral-Small-3.1
#
@ -71,14 +72,16 @@ def run_simple_demo(args: argparse.Namespace):
)
prompt = "Describe this image in one sentence."
image_url = "https://picsum.photos/id/237/200/300"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": image_url}},
{
"type": "image_pil",
"image_pil": ImageAsset("cherry_blossom").pil_image,
},
],
},
]

View File

@ -57,7 +57,10 @@ Once you have collected your profiles with this script, you can visualize them u
Here are most likely the dependencies you need to install:
```bash
pip install tensorflow-cpu tensorboard-plugin-profile etils importlib_resources
pip install tensorflow-cpu \
tensorboard-plugin-profile \
etils \
importlib_resources
```
Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser:

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers import AutoTokenizer

View File

@ -98,7 +98,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
prompts = [f"Question: {question} Answer:" for question in questions]
engine_args = EngineArgs(
model="Salesforce/blip2-opt-6.7b",
model="Salesforce/blip2-opt-2.7b",
limit_mm_per_prompt={modality: 1},
)
@ -677,6 +677,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2,
tensor_parallel_size=2,
limit_mm_per_prompt={modality: 1},
ignore_patterns=["consolidated.safetensors"],
)
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@ -970,7 +971,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
)
# Qwen
# Qwen-VL
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"

View File

@ -505,6 +505,7 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
max_num_seqs=2,
tensor_parallel_size=2,
limit_mm_per_prompt={"image": len(image_urls)},
ignore_patterns=["consolidated.safetensors"],
)
placeholders = "[IMG]" * len(image_urls)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import socket

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from typing import Optional

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call

View File

@ -13,13 +13,15 @@ vllm serve Qwen/Qwen2.5-3B-Instruct
To serve a reasoning model, you can use the following command:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
--reasoning-parser deepseek_r1
```
If you want to run this script standalone with `uv`, you can use the following:
```bash
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs structured-output
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
structured-output
```
See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
@ -44,7 +46,9 @@ uv run structured_outputs.py --stream
Run certain constraints, for example `structural_tag` and `regex`, streaming:
```bash
uv run structured_outputs.py --constraint structural_tag regex --stream
uv run structured_outputs.py \
--constraint structural_tag regex \
--stream
```
Run all constraints, with reasoning models and streaming:

View File

@ -202,7 +202,7 @@ def parse_args():
def deserialize():
def deserialize(args, tensorizer_config):
if args.lora_path:
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
llm = LLM(model=args.model,
@ -242,7 +242,7 @@ def deserialize():
return llm
if __name__ == '__main__':
def main():
args = parse_args()
s3_access_key_id = (getattr(args, 's3_access_key_id', None)
@ -260,8 +260,6 @@ if __name__ == '__main__':
model_ref = args.model
model_name = model_ref.split("/")[1]
if args.command == "serialize" or args.command == "deserialize":
keyfile = args.keyfile
else:
@ -309,6 +307,10 @@ if __name__ == '__main__':
encryption_keyfile = keyfile,
**credentials
)
deserialize()
deserialize(args, tensorizer_config)
else:
raise ValueError("Either serialize or deserialize must be specified.")
if __name__ == "__main__":
main()

View File

@ -0,0 +1 @@
lmcache

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import pytest

View File

@ -0,0 +1,57 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import sys
from unittest.mock import patch
from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM
def test_mp_reducer(monkeypatch):
"""
Test that _reduce_config reducer is registered when AsyncLLM is instantiated
without transformers_modules. This is a regression test for
https://github.com/vllm-project/vllm/pull/18640.
"""
# Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
monkeypatch.setenv('VLLM_USE_V1', '1')
# Ensure transformers_modules is not in sys.modules
if 'transformers_modules' in sys.modules:
del sys.modules['transformers_modules']
with patch('multiprocessing.reducer.register') as mock_register:
engine_args = AsyncEngineArgs(
model="facebook/opt-125m",
max_model_len=32,
gpu_memory_utilization=0.1,
disable_log_stats=True,
disable_log_requests=True,
)
async_llm = AsyncLLM.from_engine_args(
engine_args,
start_engine_loop=False,
)
assert mock_register.called, (
"multiprocessing.reducer.register should have been called")
vllm_config_registered = False
for call_args in mock_register.call_args_list:
# Verify that a reducer for VllmConfig was registered
if len(call_args[0]) >= 2 and call_args[0][0] == VllmConfig:
vllm_config_registered = True
reducer_func = call_args[0][1]
assert callable(
reducer_func), "Reducer function should be callable"
break
assert vllm_config_registered, (
"VllmConfig should have been registered to multiprocessing.reducer"
)
async_llm.shutdown()

View File

@ -37,7 +37,6 @@ async def test_basic_audio(mary_had_lamb):
model_name = "openai/whisper-large-v3-turbo"
server_args = ["--enforce-eager"]
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
prompt = "THE FIRST WORDS I SPOKE"
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
transcription = await client.audio.transcriptions.create(
@ -48,16 +47,6 @@ async def test_basic_audio(mary_had_lamb):
temperature=0.0)
out = json.loads(transcription)['text']
assert "Mary had a little lamb," in out
# This should "force" whisper to continue prompt in all caps
transcription_wprompt = await client.audio.transcriptions.create(
model=model_name,
file=mary_had_lamb,
language="en",
response_format="text",
prompt=prompt,
temperature=0.0)
out_capital = json.loads(transcription_wprompt)['text']
assert prompt not in out_capital
@pytest.mark.asyncio
@ -238,3 +227,31 @@ async def test_sampling_params(mary_had_lamb):
extra_body=dict(seed=42))
assert greedy_transcription.text != transcription.text
@pytest.mark.asyncio
async def test_audio_prompt(mary_had_lamb):
model_name = "openai/whisper-large-v3-turbo"
server_args = ["--enforce-eager"]
prompt = "This is a speech, recorded in a phonograph."
with RemoteOpenAIServer(model_name, server_args) as remote_server:
# Supplying a prompt should not cause parts of the original transcription to be omitted.
prefix = "The first words I spoke in the original phonograph"
client = remote_server.get_async_client()
transcription = await client.audio.transcriptions.create(
model=model_name,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0)
out = json.loads(transcription)['text']
assert prefix in out
transcription_wprompt = await client.audio.transcriptions.create(
model=model_name,
file=mary_had_lamb,
language="en",
response_format="text",
prompt=prompt,
temperature=0.0)
out_prompt = json.loads(transcription_wprompt)['text']
assert prefix in out_prompt

View File

@ -264,10 +264,8 @@ def test_parse_chat_messages_multiple_images(
"url": image_url
}
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
"type": "image_pil",
"image_pil": ImageAsset('cherry_blossom').pil_image
}, {
"type": "text",
"text": "What's in these images?"
@ -303,10 +301,8 @@ async def test_parse_chat_messages_multiple_images_async(
"url": image_url
}
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
"type": "image_pil",
"image_pil": ImageAsset('cherry_blossom').pil_image
}, {
"type": "text",
"text": "What's in these images?"

View File

@ -450,7 +450,8 @@ def test_multi_query_kv_attention(
start += seq_len
# xformers.AttentionBias to Tensor for use in reference impl.
alibi_bias = [
b.materialize(b.shape, device=device).squeeze() for b in attn_bias
b.materialize((1, num_query_heads, i, i), device=device).squeeze()
for b, i in zip(attn_bias, seq_lens)
]
else:
attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)

View File

@ -171,7 +171,7 @@ def test_env(
expected = "FLASHINFER_VLLM_V1" if use_v1 else name
assert backend.get_name() == expected
else:
backend = get_attn_backend(16,
backend = get_attn_backend(32,
torch.float16,
torch.float16,
block_size,
@ -180,6 +180,45 @@ def test_env(
expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
assert backend.get_name() == expected
if use_v1:
backend = get_attn_backend(16,
torch.float16,
torch.float16,
block_size,
False,
use_mla=use_mla)
assert backend.get_name() == "FLEX_ATTENTION", (
"Should fallback to FlexAttention if head size is "
"not supported by FlashAttention")
@pytest.mark.parametrize("device", ["cpu", "cuda"])
@pytest.mark.parametrize("use_v1", [True, False])
def test_fp32_fallback(
device: str,
use_v1: bool,
monkeypatch: pytest.MonkeyPatch,
):
"""Test attention backend selection with fp32."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
if device == "cpu":
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float32, torch.float32,
16, False)
assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
if use_v1 else "TORCH_SDPA")
elif device == "cuda":
with patch("vllm.attention.selector.current_platform",
CudaPlatform()):
backend = get_attn_backend(16, torch.float32, torch.float32,
16, False)
assert (backend.get_name() == "FLEX_ATTENTION"
if use_v1 else "XFORMERS")
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
"""Test FlashAttn validation."""

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
DeepEP test utilities
"""
@ -137,8 +138,7 @@ def make_deepep_ht_a2a(pg: ProcessGroup,
low_latency_mode=low_latency_mode,
num_qps_per_rank=num_qps_per_rank)
return DeepEPHTPrepareAndFinalize(buffer=buffer,
world_size=pgi.world_size,
rank=pgi.rank,
num_dispatchers=pgi.world_size,
dp_size=dp_size,
rank_expert_offset=pgi.rank *
ht_args.num_local_experts)
@ -146,7 +146,6 @@ def make_deepep_ht_a2a(pg: ProcessGroup,
def make_deepep_ll_a2a(pg: ProcessGroup,
pgi: ProcessGroupInfo,
dp_size: int,
deepep_ll_args: DeepEPLLArgs,
q_dtype: Optional[torch.dtype] = None,
block_shape: Optional[list[int]] = None):
@ -166,8 +165,7 @@ def make_deepep_ll_a2a(pg: ProcessGroup,
return DeepEPLLPrepareAndFinalize(
buffer=buffer,
world_size=pgi.world_size,
dp_size=dp_size,
num_dispatchers=pgi.world_size,
max_tokens_per_rank=deepep_ll_args.max_tokens_per_rank,
use_fp8_dispatch=deepep_ll_args.use_fp8_dispatch,
)
@ -186,5 +184,4 @@ def make_deepep_a2a(pg: ProcessGroup,
block_shape)
assert deepep_ll_args is not None
return make_deepep_ll_a2a(pg, pgi, dp_size, deepep_ll_args, q_dtype,
block_shape)
return make_deepep_ll_a2a(pg, pgi, deepep_ll_args, q_dtype, block_shape)

View File

@ -10,7 +10,7 @@ import triton.language as tl
from tests.kernels.moe.utils import (batched_moe,
make_quantized_test_activations,
make_test_weights, triton_moe)
make_test_weights, naive_batched_moe)
from tests.kernels.quant_utils import native_batched_masked_quant_matmul
from tests.kernels.utils import torch_experts
from vllm.config import VllmConfig, set_current_vllm_config
@ -33,12 +33,10 @@ MNK_FACTORS = [
(45, 512, 512),
(45, 1024, 128),
(45, 1024, 2048),
(64, 128, 128),
(64, 512, 512),
(64, 1024, 2048),
(222, 128, 128),
(222, 128, 2048),
(222, 512, 512),
(222, 1024, 128),
(222, 1024, 2048),
]
@ -95,11 +93,12 @@ class BatchedMMTensors:
@pytest.mark.parametrize("max_tokens_per_expert",
[32, 64, 128, 192, 224, 256, 512])
@pytest.mark.parametrize("K", [128, 256, 1024])
@pytest.mark.parametrize("N", [128, 256, 512, 1024])
@pytest.mark.parametrize("dtype",
[torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("block_shape", [None])
@pytest.mark.parametrize("per_act_token_quant", [False])
@pytest.mark.parametrize("N", [128, 256, 1024])
@pytest.mark.parametrize(
"dtype",
[torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("per_act_token_quant", [False, True])
def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
N: int, dtype: torch.dtype,
block_shape: Optional[list[int]],
@ -134,7 +133,8 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
in_dtype=act_dtype,
quant_dtype=quant_dtype,
block_shape=block_shape,
per_act_token_quant=per_act_token_quant)
per_act_token_quant=per_act_token_quant,
)
B, B_q, B_scale, _, _, _ = make_test_weights(
num_experts,
@ -143,6 +143,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
in_dtype=act_dtype,
quant_dtype=quant_dtype,
block_shape=block_shape,
per_act_token_quant=per_act_token_quant,
)
out_shape = (num_experts, max_tokens_per_expert, N)
@ -177,6 +178,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
"BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32
},
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
)
@ -185,15 +187,13 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
B,
ref_output,
num_expert_tokens,
None,
None,
None,
)
q_ref_output = native_batched_masked_quant_matmul(A_q, B_q, q_ref_output,
num_expert_tokens,
A_scale, B_scale,
block_shape)
block_shape,
per_act_token_quant)
rtol, atol = {
torch.float16: (6e-2, 6e-2),
@ -201,16 +201,17 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
torch.float32: (1e-2, 1e-2),
}[test_output.dtype]
torch.testing.assert_close(ref_output, test_output, atol=atol, rtol=rtol)
torch.testing.assert_close(ref_output, q_ref_output, atol=atol, rtol=rtol)
torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol)
@pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("per_act_token_quant", [False])
@pytest.mark.parametrize("block_shape", [None])
@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
@pytest.mark.parametrize("per_act_token_quant", [False, True])
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("input_scales", [False])
def test_fused_moe_batched_experts(
m: int,
n: int,
@ -220,15 +221,19 @@ def test_fused_moe_batched_experts(
dtype: torch.dtype,
per_act_token_quant: bool,
block_shape: Optional[list[int]],
input_scales: bool,
):
current_platform.seed_everything(7)
use_fp8_w8a8 = dtype == torch.float8_e4m3fn
if topk > e:
pytest.skip("topk > e")
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
pytest.skip("Skip quantization test for non-quantized type")
if per_act_token_quant and block_shape is not None or topk > e:
if per_act_token_quant and block_shape is not None:
pytest.skip("Skip illegal quantization test.")
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
@ -241,27 +246,26 @@ def test_fused_moe_batched_experts(
act_dtype = dtype
quant_dtype = None
_, w1, w1_s, _, w2, w2_s = make_test_weights(e,
n,
k,
block_shape=block_shape,
in_dtype=act_dtype,
quant_dtype=quant_dtype)
w1_16, w1, w1_s, w2_16, w2, w2_s = make_test_weights(
e,
n,
k,
block_shape=block_shape,
in_dtype=act_dtype,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
)
if input_scales and quant_dtype is not None:
a1_scale = torch.tensor(1, device="cuda", dtype=torch.float32)
a2_scale = torch.tensor(1, device="cuda", dtype=torch.float32)
else:
a1_scale = None
a2_scale = None
with set_current_vllm_config(vllm_config):
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
batched_output = batched_moe(
a,
w1,
w2,
topk_weight,
topk_ids,
w1_scale=w1_s,
w2_scale=w2_s,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
)
baseline_output = torch_experts(
a,
w1,
@ -270,11 +274,14 @@ def test_fused_moe_batched_experts(
topk_ids,
w1_scale=w1_s,
w2_scale=w2_s,
a1_scale=a1_scale,
a2_scale=a2_scale,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape)
block_shape=block_shape,
)
triton_output = triton_moe(
batched_output = naive_batched_moe(
a,
w1,
w2,
@ -282,14 +289,31 @@ def test_fused_moe_batched_experts(
topk_ids,
w1_scale=w1_s,
w2_scale=w2_s,
a1_scale=a1_scale,
a2_scale=a2_scale,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
)
torch.testing.assert_close(triton_output,
triton_output = batched_moe(
a,
w1,
w2,
topk_weight,
topk_ids,
w1_scale=w1_s,
w2_scale=w2_s,
a1_scale=a1_scale,
a2_scale=a2_scale,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
)
torch.testing.assert_close(batched_output,
baseline_output,
atol=2e-2,
atol=3e-2,
rtol=2e-2)
torch.testing.assert_close(triton_output,
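The broadened parametrization above adds fp8 activations with per-token and [128, 128] block scales. A rough, self-contained illustration of the scale granularity each flag implies for an (M, K) activation — shapes and the 448.0 fp8 e4m3 bound here are illustrative assumptions; the tests themselves go through vLLM's quantization utilities:

import torch

M, K = 64, 256  # illustrative sizes, not taken from the test
a = torch.randn(M, K)

# per_act_token_quant=True: one dynamic scale per token (row) -> shape (M, 1).
per_token_scale = a.abs().amax(dim=1, keepdim=True) / 448.0

# block_shape=[128, 128]: activations get one scale per 128-wide group
# along K -> shape (M, K // 128); weights use per-128x128-tile scales.
per_group_scale = a.view(M, K // 128, 128).abs().amax(dim=2) / 448.0

# per_act_token_quant=True combined with a block_shape is treated as an
# illegal combination, which is why the tests skip it.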

View File

@ -0,0 +1,116 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# DeepGEMM Style Cutlass Grouped GEMM Test
# See https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py
import random
import pytest
import torch
from tests.kernels.utils import baseline_scaled_mm
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
def cdiv(a, b):
return (a + b - 1) // b
def per_token_cast_to_fp8(
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
assert x.dim() == 2
m, n = x.shape
pad_size = (128 - (n % 128)) % 128
x = torch.nn.functional.pad(x,
(0, pad_size), value=0) if pad_size > 0 else x
x_view = x.view(m, -1, 128)
x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
fp8_data = (x_view *
(448.0 / x_amax.unsqueeze(2))).to(dtype=torch.float8_e4m3fn)
return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)
def per_block_cast_to_fp8(
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
assert x.dim() == 2
m, n = x.shape
x_padded = torch.zeros((cdiv(m, 128) * 128, cdiv(n, 128) * 128),
device=x.device,
dtype=x.dtype)
x_padded[:m, :n] = x
x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
x_scaled = (x_view * (448.0 / x_amax)).to(dtype=torch.float8_e4m3fn)
return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
x_amax / 448.0).view(x_view.size(0), x_view.size(2))
@pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [
(4, 8192, 7168, 4096),
(4, 8192, 2048, 7168),
(8, 4096, 7168, 4096),
(8, 4096, 2048, 7168),
(32, 1024, 7168, 4096),
(32, 1024, 2048, 7168),
])
@pytest.mark.parametrize("out_dtype", [torch.float16])
@pytest.mark.skipif(
(lambda x: x is None or x.to_int() != 100)(
current_platform.get_device_capability()),
reason="Block Scaled Grouped GEMM is only supported on SM100.")
def test_cutlass_grouped_gemm(
num_groups: int,
expected_m_per_group: int,
k: int,
n: int,
out_dtype: torch.dtype,
):
device = "cuda"
alignment = 128
group_ms = [
int(expected_m_per_group * random.uniform(0.7, 1.3))
for _ in range(num_groups)
]
m = sum([cdiv(m, alignment) * alignment for m in group_ms])
x = torch.randn((m, k), device=device, dtype=out_dtype)
y = torch.randn((num_groups, n, k), device=device, dtype=out_dtype)
out = torch.empty((m, n), device=device, dtype=out_dtype)
ref_out = torch.randn((m, n), device=device, dtype=out_dtype)
ep_offset = [0] + [sum(group_ms[:i]) for i in range(1, num_groups)] + [m]
pb_size = []
for i in range(num_groups):
pb_size.append([ep_offset[i + 1] - ep_offset[i], n, k])
problem_sizes = torch.tensor(pb_size, device=device, dtype=torch.int32)
expert_offsets = torch.tensor(ep_offset, device=device, dtype=torch.int32)
x_fp8 = per_token_cast_to_fp8(x)
y_fp8 = (torch.empty_like(y, dtype=torch.float8_e4m3fn),
torch.empty((num_groups, cdiv(n, 128), k // 128),
device=device,
dtype=torch.float))
for i in range(num_groups):
y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i])
for i in range(num_groups):
a = x_fp8[0][ep_offset[i]:ep_offset[i + 1]]
a_scale = x_fp8[1][ep_offset[i]:ep_offset[i + 1]]
b = y_fp8[0][i].t()
b_scale = y_fp8[1][i].t()
baseline = baseline_scaled_mm(a, b, a_scale, b_scale, out_dtype)
ref_out[ep_offset[i]:ep_offset[i + 1]] = baseline
ops.cutlass_blockwise_scaled_grouped_mm(
out,
x_fp8[0],
y_fp8[0],
x_fp8[1],
y_fp8[1],
problem_sizes,
expert_offsets[:-1],
)
torch.testing.assert_close(ref_out, out, atol=5e-1, rtol=1e-3)
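As a quick sanity check of what per_token_cast_to_fp8 above produces, a round-trip sketch reusing the helper exactly as defined in this test file (tensor sizes and tolerances are illustrative):

import torch

x = torch.randn(4, 256, device="cuda", dtype=torch.float16)
x_fp8, x_scale = per_token_cast_to_fp8(x)  # fp8 data plus one scale per 128-wide group
x_dq = (x_fp8.float().view(4, -1, 128) * x_scale.unsqueeze(2)).view(4, 256)
torch.testing.assert_close(x_dq, x.float(), atol=2e-1, rtol=1e-1)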

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test DeepEP + DeepGEMM integration
DeepGEMM are gemm kernels specialized for the
@ -148,8 +149,7 @@ def make_ll_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo,
fused_experts = BatchedDeepGemmExperts(
max_num_tokens=max_tokens_per_rank,
world_size=pgi.world_size,
dp_size=dp_size,
num_dispatchers=pgi.world_size // dp_size,
block_shape=test_config.block_size,
per_act_token_quant=test_config.per_act_token_quant)
mk = FusedMoEModularKernel(prepare_finalize=a2a,

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test deepep dispatch-combine logic
"""
@ -154,12 +155,13 @@ def make_modular_kernel(
deepep_ht_args = ht_args,
deepep_ll_args = ll_args)
num_dispatchers = pgi.world_size // dp_size
if low_latency_mode:
assert not per_act_token_quant, "not supported in ll mode"
fused_experts = BatchedTritonExperts(
max_num_tokens=MAX_TOKENS_PER_RANK,
world_size=pgi.world_size,
dp_size=dp_size,
num_dispatchers=num_dispatchers,
use_fp8_w8a8=is_quantized,
use_int8_w8a8=False,
use_int8_w8a16=False,

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Unit-test DeepGEMM FP8 kernels (no DeepEP).
Compare DeepGEMM path against the Triton fallback inside vLLM's fused_experts.

View File

@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEModularKernel)
from vllm.platforms import current_platform
from vllm.utils import cdiv
from .parallel_utils import ProcessGroupInfo, parallel_launch
@ -112,18 +113,21 @@ def pplx_cutlass_moe(
w2_scale = w2_scale.to(device)
a1_scale = a1_scale.to(device)
assert num_experts % world_size == 0
num_local_experts = cdiv(num_experts, world_size)
num_dispatchers = pgi.world_size // dp_size
prepare_finalize = PplxPrepareAndFinalize(
ata,
max_num_tokens,
pgi.world_size,
rank,
dp_size,
)
max_num_tokens=max_num_tokens,
num_local_experts=num_local_experts,
num_dispatchers=num_dispatchers)
experts = CutlassExpertsFp8((num_experts + world_size - 1) // world_size,
experts = CutlassExpertsFp8(num_local_experts,
out_dtype,
per_act_token,
per_out_ch,
num_dispatchers=num_dispatchers,
use_batched_format=True)
fused_cutlass_experts = FusedMoEModularKernel(
@ -181,35 +185,40 @@ def _pplx_moe(
per_out_ch: bool,
use_internode: bool,
):
if use_internode:
uid = nvshmem_get_unique_id(
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
torch.distributed.broadcast(uid, src=0)
nvshmem_init(uid, pgi.rank, pgi.world_size)
else:
group_ranks = list(range(pgi.world_size))
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
group_name = cpu_group.group_name
try:
if use_internode:
uid = nvshmem_get_unique_id(
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
torch.distributed.broadcast(uid, src=0)
nvshmem_init(uid, pgi.rank, pgi.world_size)
else:
group_ranks = list(range(pgi.world_size))
cpu_group = torch.distributed.new_group(group_ranks,
backend="gloo")
group_name = cpu_group.group_name
with set_current_vllm_config(vllm_config):
torch_output = torch_experts(a_full, w1_full, w2_full, topk_weights,
topk_ids)
pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale,
w2_scale, topk_weights, topk_ids,
a1_scale, out_dtype, per_act_token,
per_out_ch, group_name)
with set_current_vllm_config(vllm_config):
torch_output = torch_experts(a_full, w1_full, w2_full,
topk_weights, topk_ids)
pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale,
w2_scale, topk_weights, topk_ids,
a1_scale, out_dtype, per_act_token,
per_out_ch, group_name)
torch_output = chunk_by_rank(torch_output, pgi.rank,
pgi.world_size).to(pplx_output.device)
torch_output = chunk_by_rank(torch_output, pgi.rank,
pgi.world_size).to(pplx_output.device)
# Uncomment if more debugging is needed
# print("PPLX OUT:", pplx_output)
# print("TORCH OUT:", torch_output)
# Uncomment if more debugging is needed
# print("PPLX OUT:", pplx_output)
# print("TORCH OUT:", torch_output)
torch.testing.assert_close(pplx_output, torch_output, atol=0.05, rtol=0)
if use_internode:
nvshmem_finalize()
torch.testing.assert_close(pplx_output,
torch_output,
atol=0.05,
rtol=0)
finally:
if use_internode:
nvshmem_finalize()
@pytest.mark.parametrize("m", [2, 224])

View File

@ -4,7 +4,10 @@
Run `pytest tests/kernels/test_pplx_moe.py`.
"""
from typing import Optional
import itertools
import textwrap
import traceback
from typing import Callable, Optional
import pytest
import torch
@ -19,12 +22,13 @@ except ImportError:
has_pplx = False
from tests.kernels.moe.utils import make_test_weights, naive_batched_moe
from tests.kernels.quant_utils import dequant
from tests.kernels.utils import torch_experts
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe import fused_topk, override_config
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts)
BatchedTritonExperts)
from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEModularKernel)
@ -38,22 +42,22 @@ requires_pplx = pytest.mark.skipif(
reason="Requires PPLX kernels",
)
PPLX_PREPARE_COMBOS = [(4, 128, 128), (32, 1024, 512), (64, 1024, 512),
(222, 2048, 1024)]
PPLX_MOE_COMBOS = [
(1, 128, 128),
PPLX_COMBOS = [
# TODO: figure out why this fails; it seems to be a test problem
#(1, 128, 128),
(2, 128, 512),
(3, 1024, 2048),
(32, 128, 1024),
(4, 128, 128),
(32, 1024, 512),
(45, 512, 2048),
(64, 1024, 1024),
(222, 1024, 2048),
(64, 1024, 512),
(222, 2048, 1024),
(256, 1408, 2048),
]
NUM_EXPERTS = [8, 64]
EP_SIZE = [1, 4]
TOP_KS = [1, 2, 6]
DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
@ -169,9 +173,11 @@ def test_fused_moe_batched_experts(
with set_current_vllm_config(vllm_config):
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
baseline_output = torch_experts(a, w1, w2, topk_weight, topk_ids)
baseline_output = torch_experts(a, w1, w2, topk_weight,
topk_ids) # only for baseline
torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids)
batched_output = naive_batched_moe(a, w1, w2, topk_weight, topk_ids)
batched_output = naive_batched_moe(
a, w1, w2, topk_weight, topk_ids) # pick torch_experts or this
torch.testing.assert_close(baseline_output,
torch_output,
@ -183,6 +189,63 @@ def test_fused_moe_batched_experts(
rtol=0)
def create_pplx_prepare_finalize(
num_tokens: int,
hidden_dim: int,
topk: int,
num_experts: int,
rank: int,
dp_size: int,
world_size: int,
in_dtype: torch.dtype,
quant_dtype: Optional[torch.dtype],
block_shape: Optional[list[int]],
per_act_token_quant: bool,
group_name: Optional[str],
):
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes)
max_num_tokens = max(rank_chunk(num_tokens, 0, world_size), 1)
num_local_experts = rank_chunk(num_experts, 0, world_size)
hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
max_num_tokens,
hidden_dim,
in_dtype,
quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
)
args = dict(
max_num_tokens=max_num_tokens,
num_experts=num_experts,
experts_per_token=topk,
rank=rank,
world_size=world_size,
dp_size=dp_size,
hidden_dim=hidden_dim,
hidden_dim_bytes=hidden_dim_bytes,
hidden_dim_scale_bytes=scale_bytes,
)
if group_name is None:
ata = AllToAll.internode(**args)
else:
args["group_name"] = group_name
ata = AllToAll.intranode(**args)
prepare_finalize = PplxPrepareAndFinalize(
ata,
max_num_tokens=max_num_tokens,
num_local_experts=num_local_experts,
num_dispatchers=world_size // dp_size,
)
return prepare_finalize, ata
def rank_chunk(num: int, r: int, w: int) -> int:
rem = num % w
return (num // w) + (1 if r < rem else 0)
@ -193,6 +256,35 @@ def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
return t[(r * chunk):(r + 1) * chunk]
def maybe_chunk_by_rank(t: Optional[torch.Tensor], r: int,
w: int) -> Optional[torch.Tensor]:
if t is not None:
return chunk_by_rank(t, r, w)
else:
return t
def chunk_scales_by_rank(t: Optional[torch.Tensor], r: int,
w: int) -> Optional[torch.Tensor]:
if t is not None and t.numel() > 1:
chunk = rank_chunk(t.shape[0], r, w)
return t[(r * chunk):(r + 1) * chunk]
else:
return t
def chunk_scales(t: Optional[torch.Tensor], start: int,
end: int) -> Optional[torch.Tensor]:
if t is not None and t.numel() > 1:
return t[start:end]
else:
return t
def dummy_work(a: torch.Tensor) -> torch.Tensor:
return a * 1.1
def pplx_prepare_finalize(
pgi: ProcessGroupInfo,
dp_size: int,
@ -200,11 +292,11 @@ def pplx_prepare_finalize(
topk_weight: torch.Tensor,
topk_ids: torch.Tensor,
num_experts: int,
quant_dtype: Optional[torch.dtype],
block_shape: Optional[list[int]],
per_act_token_quant: bool,
group_name: Optional[str],
) -> torch.Tensor:
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
PplxPrepareAndFinalize)
assert torch.cuda.current_device() == pgi.local_rank
topk = topk_ids.shape[1]
@ -212,60 +304,66 @@ def pplx_prepare_finalize(
device = pgi.device
rank = pgi.rank
world_size = pgi.world_size
max_num_tokens = rank_chunk(num_tokens, 0, world_size)
args = dict(
max_num_tokens=max_num_tokens,
num_experts=num_experts,
experts_per_token=topk,
rank=rank,
world_size=world_size,
dp_size=dp_size,
hidden_dim=hidden_dim,
hidden_dim_bytes=hidden_dim * a.dtype.itemsize,
hidden_dim_scale_bytes=0,
)
if group_name is None:
ata = AllToAll.internode(**args)
else:
args["group_name"] = group_name
ata = AllToAll.intranode(**args)
topk_ids = topk_ids.to(dtype=torch.uint32)
prepare_finalize = PplxPrepareAndFinalize(
ata,
max_num_tokens,
world_size,
prepare_finalize, ata = create_pplx_prepare_finalize(
num_tokens,
hidden_dim,
topk,
num_experts,
rank,
dp_size,
world_size,
a.dtype,
quant_dtype,
block_shape,
per_act_token_quant,
group_name,
)
assert a.shape[0] == topk_ids.shape[0]
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
assert a_chunk.shape[0] == chunk_topk_ids.shape[0]
out = torch.full(
a_chunk.shape,
torch.nan,
dtype=a.dtype,
device=device,
)
if (quant_dtype is not None and not per_act_token_quant
and block_shape is None):
a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
else:
a1_scale = None
a2_scale = None
b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare(
a_chunk,
None,
None,
a1_scale,
a2_scale,
chunk_topk_weight,
chunk_topk_ids,
num_experts,
None,
False,
FusedMoEQuantConfig(),
FusedMoEQuantConfig(
quant_dtype,
per_act_token_quant,
False,
block_shape,
),
)
b_a = b_a * 1.5
out = torch.full(
(max_num_tokens, hidden_dim),
torch.nan,
dtype=a.dtype,
device=device,
)
b_a = dummy_work(
dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype))
prepare_finalize.finalize(
out,
@ -291,70 +389,96 @@ def _pplx_prepare_finalize(
score: torch.Tensor,
topk: torch.Tensor,
num_experts: int,
quant_dtype: Optional[torch.dtype],
block_shape: Optional[list[int]],
per_act_token_quant: bool,
use_internode: bool,
):
if use_internode:
uid = nvshmem_get_unique_id(
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
torch.distributed.broadcast(uid, src=0)
nvshmem_init(uid, pgi.rank, pgi.world_size)
group_name = None
else:
group_ranks = list(range(pgi.world_size))
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
group_name = cpu_group.group_name
try:
if use_internode:
uid = nvshmem_get_unique_id(
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
torch.distributed.broadcast(uid, src=0)
nvshmem_init(uid, pgi.rank, pgi.world_size)
group_name = None
else:
group_ranks = list(range(pgi.world_size))
cpu_group = torch.distributed.new_group(group_ranks,
backend="gloo")
group_name = cpu_group.group_name
device = pgi.device
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
m, k = a.shape
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
k = a.shape[1]
a_rep = torch.repeat_interleave(dummy_work(a), topk, dim=0)
a_rep = torch.repeat_interleave(a, topk, dim=0).to(device)
torch_output = (a_rep.view(m, topk, k) *
topk_weight.view(m, topk, 1).to(a_rep.dtype)).sum(
dim=1)
torch_output = (a_rep.view(-1, topk, k) * 1.5 *
topk_weight.view(-1, topk, 1).to(device)).sum(dim=1).to(
a.dtype)
pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight,
topk_ids, num_experts, quant_dtype,
block_shape, per_act_token_quant,
group_name)
pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight, topk_ids,
num_experts, group_name)
torch_output = chunk_by_rank(torch_output, pgi.rank,
pgi.world_size).to(pgi.device)
torch_output = chunk_by_rank(torch_output, pgi.rank,
pgi.world_size).to(pplx_output.device)
torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0)
if use_internode:
nvshmem_finalize()
torch.testing.assert_close(pplx_output,
torch_output,
atol=3e-2,
rtol=3e-2)
finally:
if use_internode:
nvshmem_finalize()
# TODO (bnell): this test point does not work for odd M due to how the test is
# written, not due to limitations of the pplx kernels. The pplx_moe
# test below is able to deal with odd M.
# TODO (bnell) add fp8 tests
@pytest.mark.parametrize("mnk", PPLX_PREPARE_COMBOS)
@pytest.mark.parametrize("mnk", PPLX_COMBOS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("per_act_token_quant", [False, True])
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("use_internode", [False])
@pytest.mark.optional
@requires_pplx
def test_pplx_prepare_finalize(
def test_pplx_prepare_finalize_slow(
mnk: tuple[int, int, int],
e: int,
topk: int,
dtype: torch.dtype,
world_dp_size: tuple[int, int],
per_act_token_quant: bool,
block_shape: Optional[list[int]],
use_internode: bool,
):
if dtype == torch.float8_e4m3fn:
use_fp8_w8a8 = True
act_dtype = torch.bfloat16
quant_dtype = dtype
else:
use_fp8_w8a8 = False
act_dtype = dtype
quant_dtype = None
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
pytest.skip("Skip quantization test for non-quantized type")
if per_act_token_quant and block_shape is not None:
pytest.skip("Skip illegal quantization combination")
current_platform.seed_everything(7)
m, n, k = mnk
world_size, dp_size = world_dp_size
device = "cuda"
a = torch.randn((m, k), device=device, dtype=dtype) / 10
score = torch.randn((m, e), device=device, dtype=dtype)
a = torch.randn((m, k), device=device, dtype=act_dtype) / 10
score = torch.randn((m, e), device=device, dtype=act_dtype)
parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, score,
topk, e, use_internode)
topk, e, quant_dtype, block_shape, per_act_token_quant,
use_internode)
def pplx_moe(
@ -369,84 +493,62 @@ def pplx_moe(
topk_ids: torch.Tensor,
w1_scale: Optional[torch.Tensor] = None,
w2_scale: Optional[torch.Tensor] = None,
qtype: Optional[torch.dtype] = None,
a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None,
quant_dtype: Optional[torch.dtype] = None,
per_act_token_quant=False,
block_shape: Optional[list[int]] = None,
use_compile: bool = False,
use_cudagraphs: bool = True,
) -> torch.Tensor:
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes)
device = torch.device("cuda", rank)
hidden_dim = a.shape[1]
num_tokens, hidden_dim = a.shape
num_experts = w1.shape[0]
topk = topk_ids.shape[1]
max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 64)
max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 16)
hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
max_num_tokens,
prepare_finalize, ata = create_pplx_prepare_finalize(
num_tokens,
hidden_dim,
topk,
num_experts,
rank,
dp_size,
world_size,
a.dtype,
qtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
quant_dtype,
block_shape,
per_act_token_quant,
group_name,
)
args = dict(
max_num_tokens=max_num_tokens,
num_experts=num_experts,
experts_per_token=topk,
rank=rank,
world_size=world_size,
dp_size=dp_size,
hidden_dim=hidden_dim,
hidden_dim_bytes=hidden_dim_bytes,
hidden_dim_scale_bytes=scale_bytes,
)
if group_name is None:
ata = AllToAll.internode(**args)
else:
args["group_name"] = group_name
ata = AllToAll.intranode(**args)
topk_ids = topk_ids.to(dtype=torch.uint32)
prepare_finalize = PplxPrepareAndFinalize(
ata,
max_num_tokens,
world_size,
rank,
dp_size,
experts = BatchedTritonExperts(
max_num_tokens=max_num_tokens,
num_dispatchers=prepare_finalize.num_dispatchers(),
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
block_shape=block_shape,
per_act_token_quant=per_act_token_quant,
)
experts = BatchedTritonExperts(max_num_tokens=max_num_tokens,
world_size=world_size,
dp_size=dp_size,
use_fp8_w8a8=qtype == torch.float8_e4m3fn,
block_shape=block_shape)
fused_experts = FusedMoEModularKernel(
prepare_finalize,
experts,
)
# Note: workers with the same dp_rank must use the exact same inputs.
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
a_chunk = chunk_by_rank(a, rank, world_size)
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size)
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size)
# Chunking weights like this only works for batched format
w1_chunk = chunk_by_rank(w1, rank, world_size).to(device)
w2_chunk = chunk_by_rank(w2, rank, world_size).to(device)
if w1_scale is not None:
w1_scale_chunk = chunk_by_rank(w1_scale, rank, world_size).to(device)
w2_scale_chunk = chunk_by_rank(w2_scale, rank, world_size).to(device)
else:
w1_scale_chunk = None
w2_scale_chunk = None
w1_chunk = chunk_by_rank(w1, rank, world_size)
w2_chunk = chunk_by_rank(w2, rank, world_size)
w1_scale_chunk = maybe_chunk_by_rank(w1_scale, rank, world_size)
w2_scale_chunk = maybe_chunk_by_rank(w2_scale, rank, world_size)
a1_scale_chunk = chunk_scales_by_rank(a1_scale, rank, world_size)
a2_scale_chunk = chunk_scales_by_rank(a2_scale, rank, world_size)
# Note: for now use_compile will error out if the problem size is
# large enough to trigger chunking. I'm leaving the flag and
@ -468,6 +570,8 @@ def pplx_moe(
chunk_topk_ids,
w1_scale=w1_scale_chunk,
w2_scale=w2_scale_chunk,
a1_scale=a1_scale_chunk,
a2_scale=a2_scale_chunk,
global_num_experts=num_experts)
if use_cudagraphs:
@ -482,6 +586,8 @@ def pplx_moe(
chunk_topk_ids,
w1_scale=w1_scale_chunk,
w2_scale=w2_scale_chunk,
a1_scale=a1_scale_chunk,
a2_scale=a2_scale_chunk,
global_num_experts=num_experts)
torch.cuda.synchronize()
@ -494,48 +600,6 @@ def pplx_moe(
return out
def _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids):
assert torch.cuda.current_device() == pgi.local_rank
num_experts = w1.shape[0]
device = pgi.device
rank = pgi.rank
world_size = pgi.world_size
max_num_tokens = rank_chunk(a.shape[0], 0, world_size)
prepare_finalize = BatchedPrepareAndFinalize(
max_num_tokens=max_num_tokens,
world_size=world_size,
dp_size=dp_size,
rank=rank,
)
experts = NaiveBatchedExperts(max_num_tokens=a.shape[0],
world_size=1,
dp_size=1)
fused_experts = FusedMoEModularKernel(
prepare_finalize,
experts,
)
# Note: workers with the same dp_rank must use the exact same inputs.
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
out = fused_experts(
a_chunk,
# Chunking weights like this only works for batched format
chunk_by_rank(w1, rank, world_size).to(device),
chunk_by_rank(w2, rank, world_size).to(device),
chunk_topk_weight,
chunk_topk_ids,
global_num_experts=num_experts)
return out
def _pplx_moe(
pgi: ProcessGroupInfo,
dp_size: int,
@ -544,75 +608,130 @@ def _pplx_moe(
w2: torch.Tensor,
score: torch.Tensor,
topk: int,
num_experts: int,
w1_s: Optional[torch.Tensor] = None,
w2_s: Optional[torch.Tensor] = None,
qtype: Optional[torch.dtype] = None,
quant_dtype: Optional[torch.dtype] = None,
per_act_token_quant: bool = False,
block_shape: Optional[list[int]] = None,
use_internode: bool = False,
):
if use_internode:
uid = nvshmem_get_unique_id(
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
torch.distributed.broadcast(uid, src=0)
nvshmem_init(uid, pgi.rank, pgi.world_size)
group_name = None
else:
group_ranks = list(range(pgi.world_size))
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
group_name = cpu_group.group_name
try:
if use_internode:
uid = nvshmem_get_unique_id(
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
torch.distributed.broadcast(uid, src=0)
nvshmem_init(uid, pgi.rank, pgi.world_size)
group_name = None
else:
group_ranks = list(range(pgi.world_size))
cpu_group = torch.distributed.new_group(group_ranks,
backend="gloo")
group_name = cpu_group.group_name
m, k = a.shape
e, _, n = w2.shape
m, k = a.shape
e, _, n = w2.shape
moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)
moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)
device = torch.device("cuda", pgi.rank)
a = a.to(device)
w1 = w1.to(device)
w2 = w2.to(device)
w1_s = w1_s.to(device) if w1_s is not None else None
w2_s = w2_s.to(device) if w2_s is not None else None
device = torch.device("cuda", pgi.rank)
rank = pgi.rank
world_size = pgi.world_size
with set_current_vllm_config(vllm_config), override_config(moe_config):
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
torch_output = torch_experts(a,
w1,
w2,
topk_weight,
topk_ids,
w1_scale=w1_s,
w2_scale=w2_s,
quant_dtype=qtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape)
pplx_output = pplx_moe(group_name, pgi.rank, pgi.world_size, dp_size,
a, w1, w2, topk_weight, topk_ids, w1_s, w2_s,
qtype, per_act_token_quant, block_shape)
# TODO (bnell): fix + re-enable
#batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight,
# topk_ids)
a = a.to(device)
w1 = w1.to(device)
w2 = w2.to(device)
w1_s = w1_s.to(device) if w1_s is not None else None
w2_s = w2_s.to(device) if w2_s is not None else None
torch_output = chunk_by_rank(torch_output, pgi.rank,
pgi.world_size).to(pplx_output.device)
if (quant_dtype is not None and not per_act_token_quant
and block_shape is None):
a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
else:
a1_scale = None
a2_scale = None
torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0)
#torch.testing.assert_close(batched_output, torch_output, atol=2e-2, rtol=0)
with set_current_vllm_config(vllm_config), override_config(moe_config):
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
if use_internode:
nvshmem_finalize()
torch_output = torch_experts(
a,
w1,
w2,
topk_weight,
topk_ids,
w1_scale=w1_s,
w2_scale=w2_s,
a1_scale=a1_scale,
a2_scale=a2_scale,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
)
batched_output = naive_batched_moe(
a,
w1,
w2,
topk_weight,
topk_ids,
w1_scale=w1_s,
w2_scale=w2_s,
a1_scale=a1_scale,
a2_scale=a2_scale,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
)
pplx_output = pplx_moe(
group_name,
rank,
world_size,
dp_size,
a,
w1,
w2,
topk_weight,
topk_ids,
w1_scale=w1_s,
w2_scale=w2_s,
a1_scale=a1_scale,
a2_scale=a2_scale,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
)
chunked_batch_output = chunk_by_rank(
batched_output, pgi.rank, pgi.world_size).to(pplx_output.device)
torch.testing.assert_close(batched_output,
torch_output,
atol=3e-2,
rtol=3e-2)
torch.testing.assert_close(pplx_output,
chunked_batch_output,
atol=3e-2,
rtol=3e-2)
finally:
if use_internode:
nvshmem_finalize()
@pytest.mark.parametrize("mnk", PPLX_MOE_COMBOS)
@pytest.mark.parametrize("mnk", PPLX_COMBOS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("per_act_token_quant", [False, True])
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("use_internode", [False])
@pytest.mark.optional
@requires_pplx
def test_pplx_moe(
def test_pplx_moe_slow(
mnk: tuple[int, int, int],
e: int,
topk: int,
@ -633,18 +752,143 @@ def test_pplx_moe(
use_fp8_w8a8 = False
quant_dtype = None
if not use_fp8_w8a8 and per_act_token_quant and block_shape is not None:
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
pytest.skip("Skip quantization test for non-quantized type")
if per_act_token_quant and block_shape is not None:
pytest.skip("Skip illegal quantization combination")
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
_, w1, w1_s, _, w2, w2_s = make_test_weights(e,
n,
k,
quant_dtype=quant_dtype,
block_shape=block_shape)
_, w1, w1_s, _, w2, w2_s = make_test_weights(
e,
n,
k,
quant_dtype=quant_dtype,
block_shape=block_shape,
per_act_token_quant=per_act_token_quant,
)
parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk,
parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, e,
w1_s, w2_s, quant_dtype, per_act_token_quant, block_shape,
use_internode)
def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
make_weights: bool, test_fn: Callable):
def format_result(msg, ex=None):
if ex is not None:
x = str(ex)
newx = x.strip(" \n\t")[:16]
if len(newx) < len(x):
newx = newx + " ..."
prefix = "E\t"
print(f"{textwrap.indent(traceback.format_exc(), prefix)}")
print(f"FAILED {msg} - {newx}\n")
else:
print(f"PASSED {msg}")
current_platform.seed_everything(7)
combos = itertools.product(PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES,
[False, True], [None, [128, 128]])
exceptions = []
count = 0
for mnk, e, topk, dtype, per_act_token_quant, block_shape in combos:
count = count + 1
m, n, k = mnk
if dtype == torch.float8_e4m3fn:
use_fp8_w8a8 = True
quant_dtype = dtype
else:
use_fp8_w8a8 = False
quant_dtype = None
test_desc = (f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, "
f"dtype={dtype}, per_act_token={per_act_token_quant}, "
f"block_shape={block_shape}")
if not use_fp8_w8a8 and (per_act_token_quant
or block_shape is not None):
print(
f"{test_desc} - Skip quantization test for non-quantized type."
)
continue
if per_act_token_quant and block_shape is not None:
print(f"{test_desc} - Skip illegal quantization combination.")
continue
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
args = dict()
if make_weights:
_, w1, w1_s, _, w2, w2_s = make_test_weights(
e,
n,
k,
quant_dtype=quant_dtype,
block_shape=block_shape,
per_act_token_quant=per_act_token_quant,
)
args["w1"] = w1
args["w2"] = w2
args["w1_s"] = w1_s
args["w2_s"] = w2_s
try:
test_fn(
pgi=pgi,
dp_size=dp_size,
a=a,
score=score,
topk=topk,
num_experts=e,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
use_internode=use_internode,
**args,
)
format_result(test_desc)
except Exception as ex:
format_result(test_desc, ex)
exceptions.append(ex)
if len(exceptions) > 0:
raise RuntimeError(
f"{len(exceptions)} of {count} tests failed in child process, "
f"rank={pgi.rank}.")
else:
print(f"{count} of {count} tests passed in child process, "
f"rank={pgi.rank}.")
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("use_internode", [False])
@requires_pplx
def test_pplx_prepare_finalize(
world_dp_size: tuple[int, int],
use_internode: bool,
):
current_platform.seed_everything(7)
world_size, dp_size = world_dp_size
parallel_launch(world_size * dp_size, _pplx_test_loop, dp_size,
use_internode, False, _pplx_prepare_finalize)
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("use_internode", [False])
@requires_pplx
def test_pplx_moe(
world_dp_size: tuple[int, int],
use_internode: bool,
):
current_platform.seed_everything(7)
world_size, dp_size = world_dp_size
parallel_launch(world_size, _pplx_test_loop, dp_size, use_internode, True,
_pplx_moe)
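The rank_chunk/chunk_by_rank helpers used throughout this file split rows across ranks, with the first num % world_size ranks receiving one extra row. A worked example of the arithmetic (values are illustrative; assumes rank_chunk from this file is in scope):

# rank_chunk(num, r, w) == num // w, plus 1 for ranks below num % w.
assert rank_chunk(10, 0, 4) == 3  # 10 % 4 == 2, so ranks 0 and 1 get an extra row
assert rank_chunk(10, 1, 4) == 3
assert rank_chunk(10, 2, 4) == 2
assert rank_chunk(10, 3, 4) == 2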

View File

@ -63,13 +63,12 @@ def batched_moe(
fused_experts = FusedMoEModularKernel(
BatchedPrepareAndFinalize(max_num_tokens,
world_size=1,
dp_size=1,
num_dispatchers=1,
num_local_experts=w1.shape[0],
rank=0),
BatchedTritonExperts(
max_num_tokens=max_num_tokens,
world_size=1,
dp_size=1,
num_dispatchers=1,
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
@ -105,13 +104,12 @@ def naive_batched_moe(
fused_experts = FusedMoEModularKernel(
BatchedPrepareAndFinalize(max_num_tokens,
world_size=1,
dp_size=1,
num_dispatchers=1,
num_local_experts=w1.shape[0],
rank=0),
NaiveBatchedExperts(
max_num_tokens=max_num_tokens,
dp_size=1,
world_size=1,
num_dispatchers=1,
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,

View File

@ -277,6 +277,24 @@ def dequant(
return t.to(out_dtype)
def batched_dequant(
t: torch.Tensor,
scale: Optional[torch.Tensor],
block_shape: Optional[list[int]],
per_act_token_quant: bool,
out_dtype: Optional[torch.dtype] = torch.float32,
) -> torch.Tensor:
if scale is not None:
assert t.shape[0] == scale.shape[0]
out = torch.empty_like(t, dtype=out_dtype)
for e in range(t.shape[0]):
out[e] = dequant(t[e], scale[e], block_shape, per_act_token_quant,
out_dtype)
return out
return t.to(out_dtype)
def native_batched_masked_quant_matmul(
A: torch.Tensor,
B: torch.Tensor,

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
@ -74,3 +75,51 @@ def test_apply_repetition_penalties(
# Test the operator by applying the opcheck utility
opcheck(torch.ops._C.apply_repetition_penalties_,
(logits.clone(), prompt_mask, output_mask, repetition_penalties))
@pytest.mark.skipif(not current_platform.is_cuda(),
reason="This test for checking CUDA kernel")
@torch.inference_mode()
def test_apply_repetition_penalties_zero_seqs() -> None:
"""
Test the apply_repetition_penalties custom op with num_seqs=0
against a reference implementation.
"""
num_seqs = 0
vocab_size = 17
repetition_penalty = 1.05
dtype = torch.float32
seed = 0
current_platform.seed_everything(seed)
torch.set_default_device("cuda:0")
# Create test data
logits = torch.randn(num_seqs, vocab_size, dtype=dtype)
# Create masks with some random tokens marked as repeated
prompt_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
# No tokens to mark as repeated since num_seqs=0
# Create repetition penalties tensor
repetition_penalties = torch.full((num_seqs, ),
repetition_penalty,
dtype=dtype)
# Run all three implementations
logits_torch = logits.clone()
logits_cuda = logits.clone()
apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask,
repetition_penalties)
apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask,
repetition_penalties)
# Compare all outputs to reference
torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3)
# Test the operator by applying the opcheck utility
opcheck(torch.ops._C.apply_repetition_penalties_,
(logits.clone(), prompt_mask, output_mask, repetition_penalties))

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Integration tests for FlexAttention backend vs default backend"""
import random

View File

@ -1094,6 +1094,8 @@ def torch_experts(
if expert_map is not None:
topk_ids = expert_map[topk_ids]
f32 = torch.float32
for i in range(num_experts):
mask = topk_ids == i
if mask.sum():
@ -1109,7 +1111,8 @@ def torch_experts(
out.dtype)
tmp2 = SiluAndMul()(tmp1)
tmp2, b_scale = moe_kernel_quantize_input(
tmp2, None, quant_dtype, per_act_token_quant, block_shape)
tmp2, a2_scale, quant_dtype, per_act_token_quant,
block_shape)
out[mask] = native_w8a8_block_matmul(tmp2, w2[i], b_scale,
w2_scale[i], block_shape,
@ -1117,7 +1120,6 @@ def torch_experts(
else:
assert (a_scale is not None and w1_scale is not None
and w2_scale is not None)
f32 = torch.float32
scales = a_scale if a_scale.numel() == 1 else a_scale[mask]
tmp1 = a[mask].to(f32) * scales
w1_dq = (w1[i].to(f32) * w1_scale[i]).transpose(0, 1)
@ -1126,8 +1128,8 @@ def torch_experts(
w2_dq = (w2[i].to(f32) * w2_scale[i]).transpose(0, 1)
out[mask] = (tmp2 @ w2_dq).to(out.dtype)
return (out.view(M, -1, w2.shape[1]) *
topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1)
return (out.view(M, -1, w2.shape[1]).to(f32) *
topk_weight.view(M, -1, 1)).sum(dim=1).to(out.dtype)
def torch_moe(a: torch.Tensor,

View File

@ -249,23 +249,6 @@ def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
model_runner.model)
@pytest.fixture(params=[True, False])
def run_with_both_engines_lora(request, monkeypatch):
# Automatically runs tests twice, once with V1 and once without
use_v1 = request.param
# Tests decorated with `@skip_v1` are only run without v1
skip_v1 = request.node.get_closest_marker("skip_v1")
if use_v1:
if skip_v1:
pytest.skip("Skipping test on vllm V1")
monkeypatch.setenv('VLLM_USE_V1', '1')
else:
monkeypatch.setenv('VLLM_USE_V1', '0')
yield
@pytest.fixture
def reset_default_device():
"""

View File

@ -3,6 +3,7 @@
import pytest
from tests.models.registry import HF_EXAMPLE_MODELS
from tests.utils import multi_gpu_test
from vllm.engine.arg_utils import EngineArgs
from vllm.sampling_params import SamplingParams
@ -19,31 +20,55 @@ pytestmark = pytest.mark.hybrid_model
SSM_MODELS = [
"state-spaces/mamba-130m-hf",
"tiiuae/falcon-mamba-tiny-dev",
# TODO: Compare to a Mamba2 model. The HF transformers implementation of
# Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
# doesn't compare vLLM output with HF output.
# See https://github.com/huggingface/transformers/pull/35943
"mistralai/Mamba-Codestral-7B-v0.1",
]
HYBRID_MODELS = [
"ai21labs/Jamba-tiny-dev",
# NOTE: Currently the test fails due to an HF transformers issue fixed in:
# https://github.com/huggingface/transformers/pull/39033
# We will enable vLLM test for Granite after next HF transformers release.
# "ibm-granite/granite-4.0-tiny-preview",
# NOTE: Running Plamo2 with the transformers implementation requires installing
# the causal-conv1d package, which is not listed as a test dependency as it's
# not compatible with pip-compile.
"pfnet/plamo-2-1b",
"Zyphra/Zamba2-1.2B-instruct",
"hmellor/tiny-random-BambaForCausalLM",
"ibm-ai-platform/Bamba-9B-v1",
"nvidia/Nemotron-H-8B-Base-8K",
"ibm-granite/granite-4.0-tiny-preview",
"tiiuae/Falcon-H1-0.5B-Base",
]
HF_UNSUPPORTED_MODELS = [
# The HF transformers implementation of
# Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
# doesn't compare vLLM output with HF output.
# See https://github.com/huggingface/transformers/pull/35943
"mistralai/Mamba-Codestral-7B-v0.1",
# Note: I'm not seeing the same output from vLLM V0 vs. HF transformers
# for Nemotron-H-8B; we currently only compare vLLM V0 vs. vLLM V1
"nvidia/Nemotron-H-8B-Base-8K",
# NOTE: Currently the test fails due to HF transformers issue fixed in:
# https://github.com/huggingface/transformers/pull/39033
# We will enable vLLM test for Granite after next HF transformers release.
"ibm-granite/granite-4.0-tiny-preview",
]
V1_SUPPORTED_MODELS = [
"mistralai/Mamba-Codestral-7B-v0.1",
"ibm-ai-platform/Bamba-9B-v1",
"Zyphra/Zamba2-1.2B-instruct",
"nvidia/Nemotron-H-8B-Base-8K",
"ibm-granite/granite-4.0-tiny-preview",
"tiiuae/Falcon-H1-0.5B-Base",
]
ATTN_BLOCK_SIZES = {
"ibm-ai-platform/Bamba-9B-v1": 528,
"Zyphra/Zamba2-1.2B-instruct": 80,
"nvidia/Nemotron-H-8B-Base-8K": 528,
"ibm-granite/granite-4.0-tiny-preview": 400,
"tiiuae/Falcon-H1-0.5B-Base": 800,
}
# Avoid OOM
MAX_NUM_SEQS = 4
@ -60,8 +85,16 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
pass
with hf_runner(model) as hf_model:
if model != "mistralai/Mamba-Codestral-7B-v0.1":
if model not in HF_UNSUPPORTED_MODELS:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
else:
@ -72,12 +105,21 @@ def test_models(
example_prompts, max_tokens, num_logprobs)
if model in V1_SUPPORTED_MODELS:
if model in HYBRID_MODELS and model in ATTN_BLOCK_SIZES:
block_size = ATTN_BLOCK_SIZES[model]
else:
block_size = 16
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
if model in HYBRID_MODELS:
# required due to reorder_batch behaviour
m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
enforce_eager=True,
enable_prefix_caching=False) as vllm_model:
enable_prefix_caching=False,
block_size=block_size) as vllm_model:
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
else:
@ -111,6 +153,14 @@ def test_batching(
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
pass
for_loop_outputs = []
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
for prompt in example_prompts:

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Optional
import pytest
@ -74,6 +75,13 @@ def test_models(
vllm_extra_kwargs["override_pooler_config"] = \
PoolerConfig(pooling_type="MEAN", normalize=False)
max_model_len: Optional[int] = 512
if model in [
"sentence-transformers/all-MiniLM-L12-v2",
"sentence-transformers/stsb-roberta-base-v2"
]:
max_model_len = None
# The example_prompts end with "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"
# sentence_transformers will strip the input texts, see:
@ -87,7 +95,7 @@ def test_models(
with vllm_runner(model,
task="embed",
max_model_len=512,
max_model_len=max_model_len,
**vllm_extra_kwargs) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts)

View File

@ -56,10 +56,16 @@ MODELS = [
enable_test=False),
]
V1FlashAttentionImpNotSupported = [
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-modernbert-base"
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo,
monkeypatch) -> None:
if model_info.name in V1FlashAttentionImpNotSupported:
monkeypatch.setenv("VLLM_USE_V1", "0")
vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel":
@ -71,8 +77,10 @@ def test_embed_models_mteb(hf_runner, vllm_runner,
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
model_info: EmbedModelInfo, example_prompts,
monkeypatch) -> None:
if model_info.name in V1FlashAttentionImpNotSupported:
monkeypatch.setenv("VLLM_USE_V1", "0")
vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel":

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from ...utils import EmbedModelInfo

View File

@ -0,0 +1,84 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import pytest
import torch
from tests.conftest import HfRunner
from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
RERANK_MODELS = [
RerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
dtype="float32",
enable_test=True),
RerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",
dtype="float32",
enable_test=False)
]
class MxbaiRerankerHfRunner(HfRunner):
def __init__(self,
model_name: str,
dtype: str = "auto",
*args: Any,
**kwargs: Any) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
padding_side='left')
self.yes_loc = self.tokenizer.convert_tokens_to_ids("1")
self.no_loc = self.tokenizer.convert_tokens_to_ids("0")
def predict(self, prompts: list[list[str]], *args,
**kwargs) -> torch.Tensor:
def process_inputs(pairs):
inputs = self.tokenizer(pairs,
padding=False,
truncation='longest_first',
return_attention_mask=False)
for i, ele in enumerate(inputs['input_ids']):
inputs['input_ids'][i] = ele
inputs = self.tokenizer.pad(inputs,
padding=True,
return_tensors="pt")
for key in inputs:
inputs[key] = inputs[key].to(self.model.device)
return inputs
@torch.no_grad()
def compute_logits(inputs):
logits = self.model(**inputs).logits[:, -1, :]
yes_logits = logits[:, self.yes_loc]
no_logits = logits[:, self.no_loc]
logits = yes_logits - no_logits
scores = logits.float().sigmoid()
return scores
scores = []
for prompt in prompts:
inputs = process_inputs([prompt])
score = compute_logits(inputs)
scores.append(score[0].item())
return torch.Tensor(scores)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "Qwen2ForSequenceClassification":
vllm_extra_kwargs["hf_overrides"] = {
"architectures": ["Qwen2ForSequenceClassification"],
"classifier_from_token": ["0", "1"],
"method": "from_2_way_softmax",
}
mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info,
vllm_extra_kwargs)

View File

@ -33,9 +33,6 @@ if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
REQUIRES_V0_MODELS = [
# V1 Test: no way to fall back for head_dim = 80
# https://github.com/vllm-project/vllm/issues/14524
"qwen_vl",
# V1 Test: not enough KV cache space in CI.
"fuyu",
]
@ -221,8 +218,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=32)],
),
"blip2": VLMTestInfo(
# TODO: Change back to 2.7b once head_dim = 80 is supported
models=["Salesforce/blip2-opt-6.7b"],
models=["Salesforce/blip2-opt-2.7b"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
img_idx_to_prompt=lambda idx: "",
@ -340,8 +336,7 @@ VLM_TEST_SETTINGS = {
"h2ovl": VLMTestInfo(
models = [
"h2oai/h2ovl-mississippi-800m",
# TODO: Re-enable once head_dim = 80 is supported
# "h2oai/h2ovl-mississippi-2b",
"h2oai/h2ovl-mississippi-2b",
],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501

View File

@ -83,7 +83,7 @@ MODELS = [
QWEN2_CONFIG,
PHI3_CONFIG,
GPT2_CONFIG,
# STABLELM_CONFIG, # enable this when v1 supports head_size=80
STABLELM_CONFIG,
DOLPHIN_CONFIG,
# STARCODER_CONFIG, # broken
]

View File

@ -169,7 +169,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501
"Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501
"FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
"FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-1.5B-Instruct",
"FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base",
min_transformers_version="4.53"),
"GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
"Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
@ -240,8 +240,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",
trust_remote_code=True),
"PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2", v0_only=True),
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
"Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
# Blocksparse attention not supported in V1 yet
"Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct",
trust_remote_code=True,
v0_only=True),
@ -258,10 +259,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
"Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"), # noqa: E501
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501
v0_only=True),
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t",
v0_only=True),
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"), # noqa: E501
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
"Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
"SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
"TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
@ -330,8 +329,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b", # noqa: E501
extras={"6b": "Salesforce/blip2-opt-6.7b"}, # noqa: E501
v0_only=True),
extras={"6b": "Salesforce/blip2-opt-6.7b"}), # noqa: E501
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501
"DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501
extras={"fork": "Isotr0py/deepseek-vl2-tiny"}, # noqa: E501
@ -359,8 +357,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True),
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
trust_remote_code=True,
v0_only=True),
trust_remote_code=True),
"Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
max_model_len=10240),
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",

View File

@ -22,7 +22,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
model_info.check_transformers_version(on_fail="skip")
# FIXME: Possible memory leak in the previous tests?
if model_arch == "GraniteSpeechForConditionalGeneration":
if model_arch in ("GraniteSpeechForConditionalGeneration",
"KimiVLForConditionalGeneration"):
pytest.skip("Avoid OOM")
# Avoid OOM and reduce initialization time by only using 1 layer

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright © 2025, Oracle and/or its affiliates.
"""Tests RTN quantization startup and generation,
doesn't test correctness

View File

@ -20,10 +20,11 @@ from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache,
MemorySnapshot, PlaceholderModule, StoreBoolean,
bind_kv_cache, common_broadcastable_dtype,
deprecate_kwargs, get_open_port, is_lossless_cast,
make_zmq_path, make_zmq_socket, memory_profiling,
merge_async_iterators, sha256, split_zmq_path,
supports_kw, swap_dict_values)
deprecate_kwargs, get_open_port, get_tcp_uri,
is_lossless_cast, join_host_port, make_zmq_path,
make_zmq_socket, memory_profiling,
merge_async_iterators, sha256, split_host_port,
split_zmq_path, supports_kw, swap_dict_values)
from .utils import create_new_process_for_each_test, error_on_warning
@ -876,3 +877,44 @@ def test_make_zmq_socket_ipv6():
def test_make_zmq_path():
assert make_zmq_path("tcp", "127.0.0.1", "5555") == "tcp://127.0.0.1:5555"
assert make_zmq_path("tcp", "::1", "5555") == "tcp://[::1]:5555"
def test_get_tcp_uri():
assert get_tcp_uri("127.0.0.1", 5555) == "tcp://127.0.0.1:5555"
assert get_tcp_uri("::1", 5555) == "tcp://[::1]:5555"
def test_split_host_port():
# valid ipv4
assert split_host_port("127.0.0.1:5555") == ("127.0.0.1", 5555)
# invalid ipv4
with pytest.raises(ValueError):
# multi colon
assert split_host_port("127.0.0.1::5555")
with pytest.raises(ValueError):
# trailing colon
assert split_host_port("127.0.0.1:5555:")
with pytest.raises(ValueError):
# no colon
assert split_host_port("127.0.0.15555")
with pytest.raises(ValueError):
# non-int port
assert split_host_port("127.0.0.1:5555a")
# valid ipv6
assert split_host_port("[::1]:5555") == ("::1", 5555)
# invalid ipv6
with pytest.raises(ValueError):
# multi colon
assert split_host_port("[::1]::5555")
with pytest.raises(IndexError):
# no colon
assert split_host_port("[::1]5555")
with pytest.raises(ValueError):
# non-int port
assert split_host_port("[::1]:5555a")
def test_join_host_port():
assert join_host_port("127.0.0.1", 5555) == "127.0.0.1:5555"
assert join_host_port("::1", 5555) == "[::1]:5555"

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
import json

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json

View File

@ -9,7 +9,7 @@ import torch
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
SchedulerConfig, SpeculativeConfig, VllmConfig)
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
from vllm.v1.core.sched.scheduler import Scheduler
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
@ -17,6 +17,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
from vllm.v1.structured_output.request import StructuredOutputRequest
EOS_TOKEN_ID = 50256
@ -33,6 +34,7 @@ def create_scheduler(
block_size: int = 16,
max_model_len: Optional[int] = None,
num_speculative_tokens: Optional[int] = None,
skip_tokenizer_init: bool = False,
) -> Scheduler:
'''Create scheduler under test.
@ -65,6 +67,7 @@ def create_scheduler(
trust_remote_code=True,
dtype="float16",
seed=42,
skip_tokenizer_init=skip_tokenizer_init,
)
# Cache config, optionally force APC
kwargs_cache = ({} if enable_prefix_caching is None else {
@ -186,7 +189,7 @@ def test_get_num_unfinished_requests():
])
def test_schedule(enable_prefix_caching: Optional[bool],
prompt_logprobs: Optional[int]):
'''Test scheduling.
Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
'''
scheduler = create_scheduler(enable_prefix_caching=enable_prefix_caching)
@ -1408,7 +1411,7 @@ def create_requests_with_priority(
def test_priority_scheduling_basic_ordering():
"""Test that requests are scheduled in priority order
"""Test that requests are scheduled in priority order
(lower value = higher priority)."""
scheduler = create_scheduler_with_priority()
@ -1437,7 +1440,7 @@ def test_priority_scheduling_basic_ordering():
def test_priority_scheduling_arrival_time_tiebreaker():
"""Test that arrival time is used
"""Test that arrival time is used
as tiebreaker when priorities are equal."""
scheduler = create_scheduler_with_priority()
@ -1495,7 +1498,7 @@ def test_priority_scheduling_mixed_priority_and_arrival():
def test_priority_scheduling_preemption():
"""Test that priority scheduling preempts
"""Test that priority scheduling preempts
lower priority requests when memory is constrained."""
# Create scheduler with very limited memory to force preemption
scheduler = create_scheduler_with_priority(
@ -1576,7 +1579,7 @@ def test_priority_scheduling_preemption():
def test_priority_scheduling_no_preemption_when_space_available():
"""Test that preemption doesn't happen
"""Test that preemption doesn't happen
when there's space for new requests."""
scheduler = create_scheduler_with_priority(
max_num_seqs=3, # Allow 3 concurrent requests
@ -1626,7 +1629,7 @@ def test_priority_scheduling_no_preemption_when_space_available():
def test_priority_scheduling_preemption_victim_selection():
"""Test that the correct victim is selected for
"""Test that the correct victim is selected for
preemption based on priority and arrival time."""
# This test verifies the priority-based victim selection logic
# by checking the waiting queue order after adding requests with different
@ -1743,7 +1746,7 @@ def test_priority_scheduling_waiting_queue_order():
def test_priority_scheduling_fcfs_fallback():
"""Test that FCFS behavior is maintained when all
"""Test that FCFS behavior is maintained when all
requests have same priority."""
scheduler = create_scheduler_with_priority()
@ -1811,7 +1814,7 @@ def test_priority_scheduling_with_limited_slots():
def test_priority_scheduling_heap_property():
"""Test that the waiting queue maintains heap
"""Test that the waiting queue maintains heap
property for priority scheduling."""
scheduler = create_scheduler_with_priority(
max_num_seqs=1, # Only one request can run at a time
@ -1857,3 +1860,39 @@ def test_priority_scheduling_heap_property():
# Verify requests were scheduled in priority order (lowest value first)
expected_priorities = sorted(priorities)
assert scheduled_priorities == expected_priorities
def test_schedule_skip_tokenizer_init():
scheduler = create_scheduler(skip_tokenizer_init=True)
requests = create_requests(num_requests=5)
for request in requests:
scheduler.add_request(request)
output = scheduler.schedule()
assert len(output.scheduled_new_reqs) == len(requests)
assert output.grammar_bitmask is None
def test_schedule_skip_tokenizer_init_structured_output_request():
scheduler = create_scheduler(skip_tokenizer_init=True)
guided_params = GuidedDecodingParams(regex="[0-9]+")
sampling_params = SamplingParams(
ignore_eos=False,
max_tokens=16,
guided_decoding=guided_params,
)
request = Request(
request_id="0",
prompt_token_ids=[0, 1],
multi_modal_inputs=None,
multi_modal_hashes=None,
multi_modal_placeholders=None,
sampling_params=sampling_params,
pooling_params=None,
eos_token_id=EOS_TOKEN_ID,
structured_output_request=StructuredOutputRequest(sampling_params),
)
scheduler.add_request(request)
output = scheduler.schedule()
assert len(output.scheduled_new_reqs) == 0
assert len(scheduler.running) == 0
assert len(scheduler.waiting) == 1

View File

@ -1,19 +1,30 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random
from typing import Optional
from typing import TYPE_CHECKING, Optional
import pytest
from vllm import LLM, SamplingParams
from vllm import LLM
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector
if TYPE_CHECKING:
from tests.conftest import VllmRunner
MODEL = "facebook/opt-125m"
DTYPE = "half"
def _vllm_model(apc: bool, vllm_runner, monkeypatch):
def _vllm_model(
apc: bool,
vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
*,
skip_tokenizer_init: bool = False,
):
"""Set up VllmRunner instance."""
monkeypatch.setenv("VLLM_USE_V1", "1")
return vllm_runner(
@ -23,6 +34,7 @@ def _vllm_model(apc: bool, vllm_runner, monkeypatch):
enforce_eager=True,
enable_prefix_caching=apc,
gpu_memory_utilization=0.5,
skip_tokenizer_init=skip_tokenizer_init,
)
@ -45,9 +57,27 @@ def vllm_model_apc(vllm_runner, monkeypatch):
yield vllm_model
@pytest.fixture(
# Function scope decouples tests & allows
# env var adjustment via monkeypatch
scope="function",
# Prefix caching
params=[False, True])
def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch):
"""VllmRunner test fixture with APC."""
with _vllm_model(
request.param,
vllm_runner,
monkeypatch,
skip_tokenizer_init=True,
) as vllm_model:
yield vllm_model
def _get_test_sampling_params(
prompt_list: list[str],
seed: Optional[int] = 42,
structured_outputs: bool = False,
) -> tuple[list[SamplingParams], list[int]]:
"""Generate random sampling params for a batch."""
@ -62,14 +92,34 @@ def _get_test_sampling_params(
n_list = [get_mostly_n_gt1() for _ in range(len(prompt_list))]
# High temperature to maximize the chance of unique completions
return [
SamplingParams(temperature=0.95, top_p=0.95, n=n, seed=seed)
for n in n_list
SamplingParams(
temperature=0.95,
top_p=0.95,
n=n,
seed=seed,
guided_decoding=GuidedDecodingParams(
regex="[0-9]+") if structured_outputs else None,
) for n in n_list
], n_list
def test_compatibility_with_skip_tokenizer_init(
vllm_model_skip_tokenizer_init: VllmRunner,
example_prompts: list[str],
):
# Case 1: Structured output request should raise an error.
sampling_params_list, _ = _get_test_sampling_params(
example_prompts,
structured_outputs=True,
)
model: LLM = vllm_model_skip_tokenizer_init.model
with pytest.raises(ValueError):
_ = model.generate(example_prompts, sampling_params_list)
def test_parallel_sampling(vllm_model, example_prompts) -> None:
"""Test passes if parallel sampling `n>1` yields `n` unique completions.
Args:
vllm_model: VllmRunner instance under test.
example_prompt: test fixture providing prompts for testing.

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
from collections.abc import Callable

View File

@ -13,7 +13,6 @@ UNSUPPORTED_MODELS_V1 = [
"openai/whisper-large-v3", # transcription
"facebook/bart-large-cnn", # encoder decoder
"state-spaces/mamba-130m-hf", # mamba1
"hmellor/tiny-random-BambaForCausalLM", # hybrid
"BAAI/bge-m3", # embedding
]

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.v1.request import RequestStatus

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import tempfile

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
import numpy as np

View File

@ -450,6 +450,7 @@ def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):
def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
error_msg = f"{layer_1} must come before the current layer"
@ -478,6 +479,7 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
invalid_layer = "model.layers.0.cross_attn.attn"
@ -506,6 +508,7 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
def test_init_kv_cache_with_kv_sharing_target_same_as_current():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
error_msg = f"{layer_1} cannot be the same as the current layer"
@ -534,6 +537,7 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():
def test_init_kv_cache_without_kv_sharing():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
vllm_config = get_vllm_config()
@ -601,6 +605,7 @@ def test_init_kv_cache_without_kv_sharing():
def test_init_kv_cache_with_kv_sharing_valid():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
vllm_config = get_vllm_config()

View File

@ -1,5 +1,6 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import sys

View File

@ -2,51 +2,146 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import sys
from enum import Enum
SPDX_HEADER = (
class SPDXStatus(Enum):
"""SPDX header status enumeration"""
EMPTY = "empty" # empty __init__.py
COMPLETE = "complete"
MISSING_LICENSE = "missing_license" # Only has copyright line
MISSING_COPYRIGHT = "missing_copyright" # Only has license line
MISSING_BOTH = "missing_both" # Completely missing
FULL_SPDX_HEADER = (
"# SPDX-License-Identifier: Apache-2.0\n"
"# SPDX-FileCopyrightText: Copyright contributors to the vLLM project")
SPDX_HEADER_PREFIX = "# SPDX-License-Identifier:"
LICENSE_LINE = "# SPDX-License-Identifier: Apache-2.0"
COPYRIGHT_LINE = "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project" # noqa: E501
def check_spdx_header(file_path):
with open(file_path, encoding='UTF-8') as file:
def check_spdx_header_status(file_path):
"""Check SPDX header status of the file"""
with open(file_path, encoding="UTF-8") as file:
lines = file.readlines()
if not lines:
# Empty file like __init__.py
return True
for line in lines:
if line.strip().startswith(SPDX_HEADER_PREFIX):
return True
return False
# Empty file
return SPDXStatus.EMPTY
# Skip shebang line
start_idx = 0
if lines and lines[0].startswith("#!"):
start_idx = 1
has_license = False
has_copyright = False
# Check all lines for SPDX headers (not just the first two)
for i in range(start_idx, len(lines)):
line = lines[i].strip()
if line == LICENSE_LINE:
has_license = True
elif line == COPYRIGHT_LINE:
has_copyright = True
# Determine status based on what we found
if has_license and has_copyright:
return SPDXStatus.COMPLETE
elif has_license and not has_copyright:
# Only has license line
return SPDXStatus.MISSING_COPYRIGHT
# Only has copyright line
elif not has_license and has_copyright:
return SPDXStatus.MISSING_LICENSE
else:
# Completely missing both lines
return SPDXStatus.MISSING_BOTH
def add_header(file_path):
with open(file_path, 'r+', encoding='UTF-8') as file:
def add_header(file_path, status):
"""Add or supplement SPDX header based on status"""
with open(file_path, "r+", encoding="UTF-8") as file:
lines = file.readlines()
file.seek(0, 0)
if lines and lines[0].startswith("#!"):
file.write(lines[0])
file.write(SPDX_HEADER + '\n')
file.writelines(lines[1:])
else:
file.write(SPDX_HEADER + '\n')
file.truncate()
if status == SPDXStatus.MISSING_BOTH:
# Completely missing, add complete header
if lines and lines[0].startswith("#!"):
# Preserve shebang line
file.write(lines[0])
file.write(FULL_SPDX_HEADER + "\n")
file.writelines(lines[1:])
else:
# Add header directly
file.write(FULL_SPDX_HEADER + "\n")
file.writelines(lines)
elif status == SPDXStatus.MISSING_COPYRIGHT:
# Only has license line, need to add copyright line
# Find the license line and add copyright line after it
for i, line in enumerate(lines):
if line.strip() == LICENSE_LINE:
# Insert copyright line after license line
lines.insert(
i + 1,
f"{COPYRIGHT_LINE}\n",
)
break
file.writelines(lines)
elif status == SPDXStatus.MISSING_LICENSE:
# Only has copyright line, need to add license line
# Find the copyright line and add license line before it
for i, line in enumerate(lines):
if line.strip() == COPYRIGHT_LINE:
# Insert license line before copyright line
lines.insert(i, f"{LICENSE_LINE}\n")
break
file.writelines(lines)
def main():
files_with_missing_header = []
"""Main function"""
files_missing_both = []
files_missing_copyright = []
files_missing_license = []
for file_path in sys.argv[1:]:
if not check_spdx_header(file_path):
files_with_missing_header.append(file_path)
status = check_spdx_header_status(file_path)
if files_with_missing_header:
if status == SPDXStatus.MISSING_BOTH:
files_missing_both.append(file_path)
elif status == SPDXStatus.MISSING_COPYRIGHT:
files_missing_copyright.append(file_path)
elif status == SPDXStatus.MISSING_LICENSE:
files_missing_license.append(file_path)
else:
continue
# Collect all files that need fixing
all_files_to_fix = (files_missing_both + files_missing_copyright +
files_missing_license)
if all_files_to_fix:
print("The following files are missing the SPDX header:")
for file_path in files_with_missing_header:
print(f" {file_path}")
add_header(file_path)
if files_missing_both:
for file_path in files_missing_both:
print(f" {file_path}")
add_header(file_path, SPDXStatus.MISSING_BOTH)
sys.exit(1 if files_with_missing_header else 0)
if files_missing_copyright:
for file_path in files_missing_copyright:
print(f" {file_path}")
add_header(file_path, SPDXStatus.MISSING_COPYRIGHT)
if files_missing_license:
for file_path in files_missing_license:
print(f" {file_path}")
add_header(file_path, SPDXStatus.MISSING_LICENSE)
sys.exit(1 if all_files_to_fix else 0)
if __name__ == "__main__":

View File

@ -646,6 +646,20 @@ def cutlass_scaled_mm_supports_fp4(cuda_device_capability: int) -> bool:
return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability)
def cutlass_blockwise_scaled_grouped_mm(
output: torch.Tensor,
a: torch.Tensor,
b: torch.Tensor,
scales_a: torch.Tensor,
scales_b: torch.Tensor,
problem_sizes: torch.Tensor,
expert_offsets: torch.Tensor,
):
torch.ops._C.cutlass_blockwise_scaled_grouped_mm(output, a, b, scales_a,
scales_b, problem_sizes,
expert_offsets)
def cutlass_scaled_fp4_mm(a: torch.Tensor, b: torch.Tensor,
block_scale_a: torch.Tensor,
block_scale_b: torch.Tensor, alpha: torch.Tensor,

View File

@ -310,7 +310,8 @@ class MultiHeadAttention(nn.Module):
# currently, only torch_sdpa is supported on rocm
self.attn_backend = _Backend.TORCH_SDPA
else:
if backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
if backend in (_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1,
_Backend.FLEX_ATTENTION):
backend = _Backend.XFORMERS
self.attn_backend = backend if backend in {

View File

@ -4,7 +4,7 @@
import os
from contextlib import contextmanager
from functools import cache
from typing import Generator, Optional, Type
from typing import Generator, Optional, Union
import torch
@ -79,6 +79,33 @@ def get_global_forced_attn_backend() -> Optional[_Backend]:
return forced_attn_backend
def supports_head_size(
attn_backend: Union[str, type[AttentionBackend]],
head_size: int,
) -> bool:
if isinstance(attn_backend, str):
try:
attn_backend = resolve_obj_by_qualname(attn_backend)
except ImportError:
return False
assert isinstance(attn_backend, type)
# TODO: Update the interface once V0 is removed
if get_supported_head_sizes := getattr(attn_backend,
"get_supported_head_sizes", None):
return head_size in get_supported_head_sizes()
if validate_head_size := getattr(attn_backend, "validate_head_size", None):
try:
validate_head_size(head_size)
return True
except Exception:
return False
raise NotImplementedError(f"{attn_backend.__name__} does not support "
"head size validation")
def get_attn_backend(
head_size: int,
dtype: torch.dtype,
@ -87,7 +114,7 @@ def get_attn_backend(
is_attention_free: bool,
is_blocksparse: bool = False,
use_mla: bool = False,
) -> Type[AttentionBackend]:
) -> type[AttentionBackend]:
"""Selects which attention backend to use and lazily imports it."""
# Accessing envs.* behind an @lru_cache decorator can cause the wrong
# value to be returned from the cache if the value changes between calls.
@ -115,7 +142,7 @@ def _cached_get_attn_backend(
is_blocksparse: bool = False,
use_v1: bool = False,
use_mla: bool = False,
) -> Type[AttentionBackend]:
) -> type[AttentionBackend]:
if is_blocksparse:
logger.info("Using BlocksparseFlashAttention backend.")
from vllm.attention.backends.blocksparse_attn import (

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import torch._inductor.pattern_matcher as pm

View File

@ -466,6 +466,9 @@ class ModelConfig:
"affect the random state of the Python process that "
"launched vLLM.", self.seed)
# Keep set served_model_name before maybe_model_redirect(self.model)
self.served_model_name = get_served_model_name(self.model,
self.served_model_name)
self.model = maybe_model_redirect(self.model)
# The tokenizer is consistent with the model by default.
if self.tokenizer is None:
@ -609,8 +612,6 @@ class ModelConfig:
self.original_max_model_len = self.max_model_len
self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
self.served_model_name = get_served_model_name(self.model,
self.served_model_name)
self.multimodal_config = self._init_multimodal_config()
if not self.skip_tokenizer_init:
self._verify_tokenizer_mode()
@ -1420,7 +1421,7 @@ class ModelConfig:
@property
def is_cross_encoder(self) -> bool:
return self.registry.is_cross_encoder_model(self.architectures)
return self.task == "classify"
@property
def use_mla(self) -> bool:
@ -2318,7 +2319,7 @@ class SchedulerConfig:
if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
logger.warning(
"max_num_batched_tokens (%d) exceeds max_num_seqs"
"max_num_batched_tokens (%d) exceeds max_num_seqs "
"* max_model_len (%d). This may lead to unexpected behavior.",
self.max_num_batched_tokens,
self.max_num_seqs * self.max_model_len)
@ -4762,6 +4763,12 @@ class VllmConfig:
if cls is not None:
cls.verify_and_update_config(self)
if self.model_config.task == "classify":
# Maybe convert ForCausalLM into ForSequenceClassification model.
from vllm.model_executor.models.adapters import (
SequenceClassificationConfig)
SequenceClassificationConfig.verify_and_update_config(self)
def __str__(self):
return (
f"model={self.model_config.model!r},"

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
'''
Expert parallelism load balancer (EPLB).
'''

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Expert parallelism load balancer (EPLB) metrics and states.

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Expert parallelism load balancer (EPLB) for vLLM.

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
The actual execution of the rearrangement.

View File

@ -97,10 +97,10 @@ def get_kv_connector_cache_layout():
# used for faster transfer.
vllm_config = get_current_vllm_config()
kv_config = vllm_config.kv_transfer_config
if vllm_config.model_config is None or kv_config is None:
if kv_config is not None and vllm_config.model_config is None:
logger.warning_once("Unable to detect current VLLM config. " \
"Defaulting to NHD kv cache layout.")
else:
elif kv_config is not None:
use_mla = vllm_config.model_config.use_mla
if not use_mla and kv_config.kv_connector == "NixlConnector":
logger.info_once("NixlConnector detected. Setting KV cache " \

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
import os

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import atexit
import ctypes

View File

@ -16,6 +16,7 @@ from safetensors.torch import save as safetensors_save
from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
from vllm.logger import init_logger
from vllm.utils import join_host_port, make_zmq_path, split_host_port
logger = init_logger(__name__)
NONE_INT = -150886311
@ -79,18 +80,19 @@ class MooncakeTransferEngine:
logger.error(
"An error occurred while loading the configuration: %s", exc)
raise
prefill_host, base_prefill_port = self.config.prefill_url.split(':')
decode_host, base_decode_port = self.config.decode_url.split(':')
prefill_host, base_prefill_port = split_host_port(
self.config.prefill_url)
decode_host, base_decode_port = split_host_port(self.config.decode_url)
# Avoid ports conflict when running prefill and decode on the same node
if prefill_host == decode_host and \
base_prefill_port == base_decode_port:
base_decode_port = str(int(base_decode_port) + 100)
base_decode_port = base_decode_port + 100
prefill_port = int(base_prefill_port) + self.local_rank
decode_port = int(base_decode_port) + self.local_rank
self.prefill_url = ':'.join([prefill_host, str(prefill_port)])
self.decode_url = ':'.join([decode_host, str(decode_port)])
prefill_port = base_prefill_port + self.local_rank
decode_port = base_decode_port + self.local_rank
self.prefill_url = join_host_port(prefill_host, prefill_port)
self.decode_url = join_host_port(decode_host, decode_port)
self.initialize(self.prefill_url if kv_rank == 0 else self.decode_url,
self.config.metadata_server, self.config.protocol,
@ -110,22 +112,30 @@ class MooncakeTransferEngine:
self._setup_metadata_sockets(kv_rank, prefill_host, base_prefill_port,
decode_host, base_decode_port)
def _setup_metadata_sockets(self, kv_rank: int, p_host: str, p_port: str,
d_host: str, d_port: str) -> None:
def _setup_metadata_sockets(self, kv_rank: int, p_host: str, p_port: int,
d_host: str, d_port: int) -> None:
"""Set up ZeroMQ sockets for sending and receiving data."""
# Offsets < 8 are left for initialization in case tp and pp are enabled
p_rank_offset = int(p_port) + 8 + self.local_rank * 2
d_rank_offset = int(d_port) + 8 + self.local_rank * 2
p_rank_offset = p_port + 8 + self.local_rank * 2
d_rank_offset = d_port + 8 + self.local_rank * 2
if kv_rank == 0:
self.sender_socket.bind(f"tcp://{p_host}:{p_rank_offset + 1}")
self.receiver_socket.connect(f"tcp://{d_host}:{d_rank_offset + 1}")
self.sender_ack.connect(f"tcp://{d_host}:{d_rank_offset + 2}")
self.receiver_ack.bind(f"tcp://{p_host}:{p_rank_offset + 2}")
self.sender_socket.bind(
make_zmq_path("tcp", p_host, p_rank_offset + 1))
self.receiver_socket.connect(
make_zmq_path("tcp", d_host, d_rank_offset + 1))
self.sender_ack.connect(
make_zmq_path("tcp", d_host, d_rank_offset + 2))
self.receiver_ack.bind(
make_zmq_path("tcp", p_host, p_rank_offset + 2))
else:
self.receiver_socket.connect(f"tcp://{p_host}:{p_rank_offset + 1}")
self.sender_socket.bind(f"tcp://{d_host}:{d_rank_offset + 1}")
self.receiver_ack.bind(f"tcp://{d_host}:{d_rank_offset + 2}")
self.sender_ack.connect(f"tcp://{p_host}:{p_rank_offset + 2}")
self.receiver_socket.connect(
make_zmq_path("tcp", p_host, p_rank_offset + 1))
self.sender_socket.bind(
make_zmq_path("tcp", d_host, d_rank_offset + 1))
self.receiver_ack.bind(
make_zmq_path("tcp", d_host, d_rank_offset + 2))
self.sender_ack.connect(
make_zmq_path("tcp", p_host, p_rank_offset + 2))
def initialize(self, local_hostname: str, metadata_server: str,
protocol: str, device_name: str,

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections import OrderedDict
from typing import Optional

View File

@ -1393,13 +1393,6 @@ class EngineArgs:
recommend_to_remove=False)
return False
# Only Fp16 and Bf16 dtypes since we only support FA.
V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
if model_config.dtype not in V1_SUPPORTED_DTYPES:
_raise_or_fallback(feature_name=f"--dtype {model_config.dtype}",
recommend_to_remove=False)
return False
# No Mamba or Encoder-Decoder so far.
if not model_config.is_v1_compatible:
_raise_or_fallback(feature_name=model_config.architectures,
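Editorial note: with the fp16/bf16 gate above removed, a float32 model can be loaded on the V1 engine (served via FlexAttention per the commit log). A minimal sketch, with model choice and eager mode as assumptions:

# Hedged sketch: float32 on the V1 engine.
import os
from vllm import LLM, SamplingParams

os.environ["VLLM_USE_V1"] = "1"   # force the V1 engine
llm = LLM(model="facebook/opt-125m", dtype="float32", enforce_eager=True)
out = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=8))
print(out[0].outputs[0].text)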

View File

@ -28,7 +28,8 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam,
ChatCompletionToolMessageParam)
from openai.types.chat.chat_completion_content_part_input_audio_param import (
InputAudio)
from pydantic import TypeAdapter
from PIL import Image
from pydantic import BaseModel, ConfigDict, TypeAdapter
# yapf: enable
from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
ProcessorMixin)
@ -91,6 +92,25 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
"""The type of the content part."""
class PILImage(BaseModel):
"""
A PIL.Image.Image object.
"""
image_pil: Image.Image
model_config = ConfigDict(arbitrary_types_allowed=True)
class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
"""A simpler version of the param that only accepts a PIL image.
Example:
{
"image_pil": ImageAsset('cherry_blossom').pil_image
}
"""
image_pil: Required[PILImage]
class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
"""A simpler version of the param that only accepts a plain image_url.
This is supported by OpenAI API, although it is not documented.
@ -129,6 +149,7 @@ ChatCompletionContentPartParam: TypeAlias = Union[
OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
ChatCompletionContentPartInputAudioParam,
ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
CustomChatCompletionContentPILImageParam,
CustomChatCompletionContentSimpleImageParam,
ChatCompletionContentPartImageEmbedsParam,
CustomChatCompletionContentSimpleAudioParam,
@ -631,6 +652,10 @@ class BaseMultiModalContentParser(ABC):
image_embeds: Union[str, dict[str, str]]) -> None:
raise NotImplementedError
@abstractmethod
def parse_image_pil(self, image_pil: Image.Image) -> None:
raise NotImplementedError
@abstractmethod
def parse_audio(self, audio_url: str) -> None:
raise NotImplementedError
@ -677,6 +702,10 @@ class MultiModalContentParser(BaseMultiModalContentParser):
self._add_placeholder(placeholder)
def parse_image_pil(self, image_pil: Image.Image) -> None:
placeholder = self._tracker.add("image", image_pil)
self._add_placeholder(placeholder)
def parse_audio(self, audio_url: str) -> None:
audio = self._connector.fetch_audio(audio_url)
@ -733,6 +762,13 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
placeholder = self._tracker.add("image_embeds", future)
self._add_placeholder(placeholder)
def parse_image_pil(self, image_pil: Image.Image) -> None:
future: asyncio.Future[Image.Image] = asyncio.Future()
future.set_result(image_pil)
placeholder = self._tracker.add("image", future)
self._add_placeholder(placeholder)
def parse_audio(self, audio_url: str) -> None:
audio_coro = self._connector.fetch_audio_async(audio_url)
@ -851,12 +887,13 @@ _TextParser = partial(cast, ChatCompletionContentPartTextParam)
_ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam)
_InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
_RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
_PILImageParser = partial(cast, CustomChatCompletionContentPILImageParam)
# Need to validate url objects
_ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
_AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
_VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio]
_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
# Define a mapping from part types to their corresponding parsing functions.
MM_PARSER_MAP: dict[
@ -869,6 +906,7 @@ MM_PARSER_MAP: dict[
lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
"image_embeds":
lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
"image_pil": lambda part: _PILImageParser(part).get("image_pil", None),
"audio_url":
lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
"input_audio":
@ -938,7 +976,7 @@ def _parse_chat_message_content_mm_part(
VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
"image_embeds",
"image_embeds", "image_pil",
"audio_url", "input_audio", "video_url")
@ -1009,6 +1047,10 @@ def _parse_chat_message_content_part(
else:
return str_content
if part_type == "image_pil":
image_content = cast(Image.Image, content)
mm_parser.parse_image_pil(image_content)
return {'type': 'image'} if wrap_dicts else None
if part_type == "image_url":
str_content = cast(str, content)
mm_parser.parse_image(str_content)
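Editorial note: tying the new "image_pil" content part together, a hedged end-to-end example of passing a PIL image directly to llm.chat; the model name and image path are assumptions, the part-type key matches the parser additions above:

# Hedged sketch of the new image_pil chat content part.
from PIL import Image
from vllm import LLM

llm = LLM(model="llava-hf/llava-1.5-7b-hf")   # assumed multimodal model
image = Image.open("cherry_blossom.jpg")      # any local PIL image

outputs = llm.chat([{
    "role": "user",
    "content": [
        {"type": "image_pil", "image_pil": image},
        {"type": "text", "text": "What is shown in this image?"},
    ],
}])
print(outputs[0].outputs[0].text)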

View File

@ -1204,7 +1204,7 @@ class LLM:
input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
pooling_params = PoolingParams()
pooling_params = PoolingParams(use_cross_encoder=True)
tokenization_kwargs: dict[str, Any] = {}
_validate_truncation_size(self.llm_engine.model_config.max_model_len,

View File

@ -910,6 +910,8 @@ TASK_HANDLERS: dict[str, dict[str, tuple]] = {
}
if envs.VLLM_SERVER_DEV_MODE:
logger.warning("SECURITY WARNING: Development endpoints are enabled! "
"This should NOT be used in production!")
@router.get("/server_info")
async def show_server_info(raw_request: Request):

View File

@ -229,7 +229,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
logit_bias: Optional[dict[str, float]] = None
logprobs: Optional[bool] = False
top_logprobs: Optional[int] = 0
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
max_tokens: Optional[int] = Field(
default=None,
deprecated=
@ -433,23 +432,10 @@ class ChatCompletionRequest(OpenAIBaseModel):
}
def to_beam_search_params(
self,
default_max_tokens: int,
default_sampling_params: Optional[dict] = None
) -> BeamSearchParams:
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
max_tokens = self.max_completion_tokens or self.max_tokens
self, max_tokens: int,
default_sampling_params: dict) -> BeamSearchParams:
if default_sampling_params is None:
default_sampling_params = {}
n = self.n if self.n is not None else 1
# Use minimum of context window, user request & server limit.
max_tokens = min(
val for val in (default_max_tokens, max_tokens,
default_sampling_params.get("max_tokens", None))
if val is not None)
if (temperature := self.temperature) is None:
temperature = default_sampling_params.get(
"temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
@ -465,21 +451,10 @@ class ChatCompletionRequest(OpenAIBaseModel):
def to_sampling_params(
self,
default_max_tokens: int,
max_tokens: int,
logits_processor_pattern: Optional[str],
default_sampling_params: Optional[dict] = None,
default_sampling_params: dict,
) -> SamplingParams:
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
max_tokens = self.max_completion_tokens or self.max_tokens
if default_sampling_params is None:
default_sampling_params = {}
# Use minimum of context window, user request & server limit.
max_tokens = min(
val for val in (default_max_tokens, max_tokens,
default_sampling_params.get("max_tokens", None))
if val is not None)
# Default parameters
if (repetition_penalty := self.repetition_penalty) is None:
@ -898,22 +873,15 @@ class CompletionRequest(OpenAIBaseModel):
}
def to_beam_search_params(
self,
default_max_tokens: int,
default_sampling_params: Optional[dict] = None
self,
max_tokens: int,
default_sampling_params: Optional[dict] = None,
) -> BeamSearchParams:
max_tokens = self.max_tokens
if default_sampling_params is None:
default_sampling_params = {}
n = self.n if self.n is not None else 1
# Use minimum of context window, user request & server limit.
max_tokens = min(
val for val in (default_max_tokens, max_tokens,
default_sampling_params.get("max_tokens", None))
if val is not None)
if (temperature := self.temperature) is None:
temperature = default_sampling_params.get("temperature", 1.0)
@ -928,21 +896,14 @@ class CompletionRequest(OpenAIBaseModel):
def to_sampling_params(
self,
default_max_tokens: int,
max_tokens: int,
logits_processor_pattern: Optional[str],
default_sampling_params: Optional[dict] = None,
) -> SamplingParams:
max_tokens = self.max_tokens
if default_sampling_params is None:
default_sampling_params = {}
# Use minimum of context window, user request & server limit.
max_tokens = min(
val for val in (default_max_tokens, max_tokens,
default_sampling_params.get("max_tokens", None))
if val is not None)
# Default parameters
if (repetition_penalty := self.repetition_penalty) is None:
repetition_penalty = default_sampling_params.get(
@ -1195,8 +1156,9 @@ class ScoreRequest(OpenAIBaseModel):
# --8<-- [end:score-extra-params]
def to_pooling_params(self):
return PoolingParams(additional_data=self.additional_data)
def to_pooling_params(self, *, use_cross_encoder: bool = False):
return PoolingParams(use_cross_encoder=use_cross_encoder,
additional_data=self.additional_data)
class RerankRequest(OpenAIBaseModel):
@ -1221,8 +1183,9 @@ class RerankRequest(OpenAIBaseModel):
# --8<-- [end:rerank-extra-params]
def to_pooling_params(self):
return PoolingParams(additional_data=self.additional_data)
def to_pooling_params(self, *, use_cross_encoder: bool = False):
return PoolingParams(use_cross_encoder=use_cross_encoder,
additional_data=self.additional_data)
class RerankDocument(BaseModel):
@ -1813,7 +1776,7 @@ class TranscriptionRequest(OpenAIBaseModel):
self,
default_max_tokens: int,
default_sampling_params: Optional[dict] = None) -> SamplingParams:
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
max_tokens = default_max_tokens
if default_sampling_params is None:
@ -2029,7 +1992,7 @@ class TranslationRequest(OpenAIBaseModel):
self,
default_max_tokens: int,
default_sampling_params: Optional[dict] = None) -> SamplingParams:
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
max_tokens = default_max_tokens
if default_sampling_params is None:

View File

@ -34,6 +34,7 @@ from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
MistralToolCall)
from vllm.entrypoints.utils import get_max_tokens
from vllm.logger import init_logger
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.reasoning import ReasoningParser, ReasoningParserManager
@ -233,15 +234,22 @@ class OpenAIServingChat(OpenAIServing):
try:
for i, engine_prompt in enumerate(engine_prompts):
sampling_params: Union[SamplingParams, BeamSearchParams]
default_max_tokens = self.max_model_len - len(
engine_prompt["prompt_token_ids"])
if self.default_sampling_params is None:
self.default_sampling_params = {}
max_tokens = get_max_tokens(
max_model_len=self.max_model_len,
request=request,
input_length=len(engine_prompt["prompt_token_ids"]),
default_sampling_params=self.default_sampling_params)
if request.use_beam_search:
sampling_params = request.to_beam_search_params(
default_max_tokens, self.default_sampling_params)
max_tokens, self.default_sampling_params)
else:
sampling_params = request.to_sampling_params(
default_max_tokens,
self.model_config.logits_processor_pattern,
max_tokens, self.model_config.logits_processor_pattern,
self.default_sampling_params)
self._log_inputs(request_id,
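Editorial note: the inline clamping logic removed from protocol.py appears to move into the new get_max_tokens helper imported here; a hedged reconstruction of what it presumably computes (the real signature in vllm.entrypoints.utils may differ):

# Hedged reconstruction based on the removed inline logic.
def get_max_tokens(max_model_len: int, request, input_length: int,
                   default_sampling_params: dict) -> int:
    # Chat uses max_completion_tokens, falling back to the deprecated max_tokens.
    requested = getattr(request, "max_completion_tokens", None) or request.max_tokens
    # Minimum of remaining context window, user request and server default.
    return min(val for val in (max_model_len - input_length, requested,
                               default_sampling_params.get("max_tokens"))
               if val is not None)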

View File

@ -33,6 +33,7 @@ from vllm.entrypoints.openai.serving_engine import (OpenAIServing,
is_text_tokens_prompt)
# yapf: enable
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.utils import get_max_tokens
from vllm.inputs.data import (EmbedsPrompt, TokensPrompt, is_embeds_prompt,
is_tokens_prompt)
from vllm.logger import init_logger
@ -160,15 +161,22 @@ class OpenAIServingCompletion(OpenAIServing):
input_length = len(engine_prompt["prompt_token_ids"])
else:
assert_never(engine_prompt)
default_max_tokens = self.max_model_len - input_length
if self.default_sampling_params is None:
self.default_sampling_params = {}
max_tokens = get_max_tokens(
max_model_len=self.max_model_len,
request=request,
input_length=input_length,
default_sampling_params=self.default_sampling_params)
if request.use_beam_search:
sampling_params = request.to_beam_search_params(
default_max_tokens, self.default_sampling_params)
max_tokens, self.default_sampling_params)
else:
sampling_params = request.to_sampling_params(
default_max_tokens,
self.model_config.logits_processor_pattern,
max_tokens, self.model_config.logits_processor_pattern,
self.default_sampling_params)
request_id_item = f"{request_id}-{i}"

View File

@ -25,9 +25,7 @@ from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
PreTrainedTokenizer,
PreTrainedTokenizerFast)
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import make_async, merge_async_iterators
logger = init_logger(__name__)
@ -50,7 +48,7 @@ class ServingScores(OpenAIServing):
async def _embedding_score(
self,
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
tokenizer: AnyTokenizer,
texts_1: list[str],
texts_2: list[str],
request: Union[RerankRequest, ScoreRequest],
@ -141,7 +139,7 @@ class ServingScores(OpenAIServing):
async def _cross_encoding_score(
self,
tokenizer: Union[AnyTokenizer],
tokenizer: AnyTokenizer,
texts_1: list[str],
texts_2: list[str],
request: Union[RerankRequest, ScoreRequest],
@ -190,7 +188,7 @@ class ServingScores(OpenAIServing):
# Schedule the request and get the result generator.
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
pooling_params = request.to_pooling_params()
pooling_params = request.to_pooling_params(use_cross_encoder=True)
for i, engine_prompt in enumerate(engine_prompts):
request_id_item = f"{request_id}-{i}"

View File

@ -6,6 +6,7 @@ from typing import Union
import regex as re
from vllm.entrypoints.chat_utils import random_tool_call_id
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaFunctionCall, DeltaMessage,
DeltaToolCall,
@ -15,7 +16,6 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser, ToolParserManager)
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import random_uuid
logger = init_logger(__name__)
@ -267,7 +267,7 @@ class DeepSeekV3ToolParser(ToolParser):
DeltaToolCall(
index=self.current_tool_id,
type="function",
id=f"chatcmpl-tool-{random_uuid()}",
id=random_tool_call_id(),
function=DeltaFunctionCall(
name=function_name).model_dump(
exclude_none=True),

View File

@ -1,11 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
import json
from collections.abc import Sequence
from typing import Any, Dict, List, Optional, Union
from typing import Any, Optional, Union
import regex as re
from vllm.entrypoints.chat_utils import random_tool_call_id
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaFunctionCall, DeltaMessage,
DeltaToolCall,
@ -224,7 +226,7 @@ class xLAMToolParser(ToolParser):
function_name = name_match.group(1)
# The test expects us to send just the name first
tool_id = f"chatcmpl-tool-{random_uuid()}"
tool_id = random_tool_call_id()
delta = DeltaMessage(tool_calls=[
DeltaToolCall(
index=0,
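Editorial note: both tool parsers now share a helper from vllm.entrypoints.chat_utils; judging from the literal strings it replaces, it presumably boils down to something like the sketch below (not the actual implementation):

# Hedged sketch of the shared helper, inferred from the replaced f-strings.
from vllm.utils import random_uuid

def random_tool_call_id() -> str:
    return f"chatcmpl-tool-{random_uuid()}"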

Some files were not shown because too many files have changed in this diff.