Compare commits

...

162 Commits

Author SHA1 Message Date
220d694080 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-24 01:00:20 +00:00
70e06dd574 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-24 00:46:46 +00:00
7954461d4c updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 23:03:42 +00:00
a10da86677 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:56:53 +00:00
284d5df45b added __init__.py
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:50:20 +00:00
d5b0db449e added __init__.py
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:44:36 +00:00
66349c33a1 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:36:57 +00:00
28d0396ff1 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 21:54:04 +00:00
2f29ae383a added files
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 21:45:01 +00:00
cf64b0e6a7 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 21:44:14 +00:00
f51f182d64 pre-commit
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 20:18:50 +00:00
79e465f557 fix pre-commit
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-23 09:38:55 -04:00
2ba687d39f updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:52:06 -04:00
5d57896e2c updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:51:53 -04:00
f6f008ca1d cleanup
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:51:21 -04:00
24cbbe4778 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:50:48 -04:00
2fec6e0b5c working?
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:45:00 -04:00
47a3f26b2a updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:36:52 -04:00
144162fc8c Merge branch 'main' into rob-fixes 2025-03-22 17:12:53 -04:00
522279ebb9 Stash
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 17:12:21 -04:00
b877031d80 Remove openvino support in favor of external plugin (#15339)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-22 14:06:39 -07:00
85687b43e7 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 17:00:46 -04:00
120bbdfd82 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 15:58:51 -04:00
2ceb7bc534 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 13:25:05 -04:00
9f7fb5ec84 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 13:22:00 -04:00
a8a621e419 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 13:11:50 -04:00
dd861b992f [BugFix][Typing] Fix Imprecise Type Annotations (#15208)
Signed-off-by: Wang Ran (汪然) <wrran@outlook.com>
2025-03-22 09:05:03 -07:00
eb63ea1e18 [V1] Add disable-any-whitespace option support for xgrammar (#15316)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-22 15:56:17 +00:00
2f4bd358f1 [Model] Support Tele-FLM Model (#15023)
Signed-off-by: Naitong Yu <ntyu@baai.ac.cn>
Signed-off-by: jiangxin <horizon94@outlook.com>
Co-authored-by: Jason Fang <jasonfang3900@gmail.com>
Co-authored-by: jiangxin <horizon94@outlook.com>
2025-03-22 02:04:44 -07:00
8a8b30eac1 [Bugfix] LoRA V0 - Fix case where max_num_seqs is between cudagraph capture sizes (#15308)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-03-22 02:03:32 -07:00
2fa0e1396b [Bugfix] Fix torch.compile raise FileNotFoundError (#15278)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-22 13:49:34 +08:00
1c2bec0f82 [Doc] add load_format items in docs (#14804)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
2025-03-21 22:36:43 -07:00
ec870fba9a [FEAT] [ROCm]: Add AITER RMS Norm (Layer Norm) Feature (#14959)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-03-21 22:36:14 -07:00
df1430265c [Bugfix][V0] Multi-sequence logprobs streaming edge case (#15259)
Signed-off-by: Andy Lo <andy@mistral.ai>
2025-03-21 22:35:37 -07:00
4c69e228b3 [Misc] Increase RayDistributedExecutor RAY_CGRAPH_get_timeout (#15301)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-03-21 22:25:43 -07:00
790b79750b [Build/CI] Fix env var typo (#15305)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-21 22:28:46 +00:00
cfbb8c930f [TPU][V1] MHA Pallas backend (#15288)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-03-21 08:50:39 -07:00
baec0d4de9 Revert "[Feature] specify model in config.yaml (#14855)" (#15293)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-21 08:30:23 -07:00
c21b99b912 [Bugfix][VLM] fix llava processor (#15285)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-03-21 05:14:36 -07:00
93a00d7dde [v1] Refactor KVCacheConfig (#14079)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-03-21 04:56:27 -07:00
61e8c18350 [Misc] Add cProfile helpers (#15074)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-21 04:56:09 -07:00
8afcd0f633 [Bugfix] Fix broken kernel test due to missing rename for v1 Triton backend (#15282)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-21 11:42:06 +00:00
91ca929dc7 [V1] Fix wrong import path of get_flash_attn_version (#15280)
Signed-off-by: Lehua Ding <lehuading@tencent.com>
2025-03-21 03:54:11 -07:00
84e00adc8a [Bugfix] Fix incorrect resolving order for transformers fallback (#15279)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-21 03:54:08 -07:00
47c7126213 [Misc] Add attention mask pre-computation optimization back to Qwen2.5-VL (#15273)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-21 10:32:33 +00:00
a989ca2bf6 [Bugfix] Add int8 torch dtype for KVCache (#15260)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-03-21 08:58:28 +00:00
0fa3970deb [Feature] specify model in config.yaml (#14855)
Signed-off-by: weizeng <weizeng@roblox.com>
2025-03-21 00:26:03 -07:00
da6ea29f7a [V1] Avoid redundant input processing in n>1 case (#14985)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-20 22:24:10 -07:00
7297941b38 [Doc] Update LWS docs (#15163)
Signed-off-by: Edwinhr716 <Edandres249@gmail.com>
2025-03-20 21:18:47 -07:00
f8a08cb90d [V1] Enable Triton(ROCm) Attention backend for Nvidia GPUs (#14071)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-21 03:14:19 +00:00
b15fd2be2a [Hardware][TPU] Add check for no additional graph compilation during runtime (#14710)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-03-21 03:05:28 +00:00
e588ac237c Add an example for reproducibility (#15262)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-20 19:55:47 -07:00
5df2da5b97 [Misc] Better RayExecutor and multiprocessing compatibility (#14705)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
2025-03-20 19:27:46 -07:00
11b986b3fb [Docs] Trim the latest news in README (#15261)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-20 19:24:21 -07:00
296f927f24 [Model] RE: Mamba2 Prefill Performance Tweaks: Fixing Flurry of Unnecessary Memory Copies (#14857)
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
2025-03-20 19:21:08 -07:00
0032903a5b [Bugfix] detect alibi and revert to FA2 (#15231)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2025-03-20 19:20:16 -07:00
47195057e9 [V1][TPU] Speed up top-k on TPU by using torch.topk (#15242)
Signed-off-by: Hyesoo Yang <hyeygit@gmail.com>
2025-03-20 19:19:40 -07:00
6edbfa924d Mention extra_body as a way to pass vLLM-only parameters using the OpenAI client (#15240)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-20 19:18:36 -07:00
1e508343e1 [Bugfix] Fix incorrect qwen2.5-vl attention mask pre-computation (#15200)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-20 19:18:04 -07:00
2e0b4cfde0 [ROCM] Upgrade torch to 2.6 (#15244)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-03-20 19:17:33 -07:00
10f55fe6c5 [Misc] Clean up the BitsAndBytes arguments (#15140)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-20 19:17:12 -07:00
d3ccbd6350 Fix CUDA kernel index data type in vllm/csrc/quantization/fused_kernels/layernorm_utils.cuh +10 (#15159)
Signed-off-by: Lu Fang <lufang@fb.com>
Co-authored-by: Richard Barnes <rbarnes@meta.com>
2025-03-21 10:01:11 +08:00
0cfe7d386d [CI/Build] LoRA : make add_lora_test safer (#15181)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-03-21 09:28:53 +08:00
0c6f5023c3 [V1] Scheduler Refactoring [1/N] - Add Scheduler Interface (#15250)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-03-20 17:50:43 -07:00
06dd08256f Enforce that TP > 1 is not supported for Mamba2 if Quantization is Enabled. (#14617)
Signed-off-by: Yu Chin Fabian Lim <flim@sg.ibm.com>
2025-03-21 00:44:37 +00:00
b89d89f456 fix rebase
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:32:21 +08:00
8355358fb3 add unlimited HWM
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
c0b1443345 fix mypy
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
d35dace985 refactor zmq msg to object
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
912031ceb5 refactor disagg
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
4f13e89143 fix SIM105
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
b9a7dbe769 remove default socket address value
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
0cb2e05256 change log level and fix some comments
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
d6945ecdf0 change disagg_prefill example to use zmq
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
298298f97d remove invalid zmq benchmark code
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
6c8fae82dd run format
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
16ed827378 add benchmark shell
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:08 +08:00
8fa9df7987 run format.sh
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
27c1afe88b fix ThreadProxy
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
ee6607332e create proxy sockets in the proxy function for thread safety
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
7fbf70db57 1. replace tpc:// with ipc:// \n 2. fix json response
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
2c31e4c3ea Run yapf and ruff
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
187f112ccd 1. fix mypy issue
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
897db7b93d Replace zmq.asyncio.Context().term() with zmq.asyncio.Context().destroy(linger=0) for immediate termination
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
b7ffb43792 update disagg_connect test_request.py
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
6e1fba8a73 1. connect_parser set --prefill-addr and --decode-addr are required
2.To more accurately reflect its purpose, we will rename connect.py to disagg_connector.py.

Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
bfde1688e7 add /v1/completions stream support
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
905424ed65 add identity url headers
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:15:42 +08:00
5d20f389d6 add vllm connect cmd
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:15:42 +08:00
2a0cb78016 add test py
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:15:42 +08:00
2b22290ce0 [V1] Add flag to disable cascade attention (#15243)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-20 15:24:16 -07:00
d8e82bc06d [Bugfix] fix V1 Engine crash while handling requests with duplicate request id (#15043)
Signed-off-by: Jiahui Sun <jhsun2020@gmail.com>
2025-03-20 10:01:02 -07:00
086b56824c [ci] feat: make the test_torchrun_example run with tp=2, external_dp=2 (#15172)
Signed-off-by: Chi Zhang <zhangchi.usc1992@bytedance.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-03-21 00:30:04 +08:00
5a0905ba2a Replace misc issues with link to forum (#15226)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-20 23:18:20 +08:00
a8f12a63fd Fix env vars for running Ray distributed backend on GKE (#15166)
Signed-off-by: Richard Liu <ricliu@google.com>
2025-03-20 14:59:33 +00:00
69ae2380c6 Add user forum to README (#15220)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-20 22:39:51 +08:00
27261e40a6 [Bugfix] Multi-video inference on LLaVA-Onevision (#15082)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-03-20 14:10:45 +00:00
e3f813c33b [macOS] Upgrade pytorch to 2.6.0 (#15129) 2025-03-20 01:22:40 -07:00
c607a2652b Fixing Imprecise Type Annotations (#15192) 2025-03-20 01:19:55 -07:00
3d45e3d749 [release] Tag vllm-cpu with latest upon new version released (#15193) 2025-03-20 01:19:10 -07:00
742369d35a [Frontend][Bugfix] support prefill decode disaggregation on deepseek (#14824)
Signed-off-by: billishyahao <bill.he@amd.com>
Co-authored-by: Zhai Feiyue <80079571+ZhaiFeiyue@users.noreply.github.com>
2025-03-20 00:00:33 -07:00
bfe2fe0af4 typo: Update config.py (#15189) 2025-03-19 23:31:21 -07:00
a8652f4f0f Enable CUDA graph support for llama 3.2 vision (#14917)
Signed-off-by: Matt Ritter <100659061+mritterfigma@users.noreply.github.com>
2025-03-19 23:29:16 -07:00
2f726b241e [Doc] Update README.md (#15187)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-20 13:25:58 +08:00
a597a57595 [Attention] Flash Attention 3 - fp8 (#14570)
Signed-off-by: Mickael Seznec <mickael@mistral.ai>
2025-03-20 01:14:20 -04:00
ae65f3e237 [Misc]fixed disable these http request logs (#14754)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-03-19 21:53:40 -07:00
34868b106a [Doc] Update Mistral Small 3.1/Pixtral example (#15184)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-03-20 04:46:06 +00:00
1f16b7fe74 [Core][V0] Add guidance backend for structured output (#14589)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Loc Huynh <lohuynh@microsoft.com>
Co-authored-by: Michal Moskal <michal@moskal.me>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
2025-03-19 21:33:51 -07:00
b88be22165 [Benchmark] Allow oversample request in benchmark dataset (#15170)
Signed-off-by: Jennifer Zhao <ai.jenniferzhao@gmail.com>
2025-03-20 12:32:58 +08:00
d8c6d7d6b5 [V1][TPU] Support V1 Sampler for ragged attention (#14227)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-03-19 21:00:39 -07:00
40828ce5fe fix "Total generated tokens:" is 0 if using --backend tgi and --endpo… (#14673)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
2025-03-19 20:56:16 -07:00
ffa443afed [Bugfix] Fix embedding assignment for InternVL-based models (#15086)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-20 03:40:13 +00:00
70e500cad9 Fix broken tests (#14713)
Signed-off-by: JovanSardinha <jovan.sardinha@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-03-20 02:06:49 +00:00
4cb1c05c9e [Doc] Clarify run vllm only on one node in distributed inference (#15148)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-03-20 09:55:59 +08:00
c47aafa37c [BugFix] Lazily import XgrammarBackend to avoid early cuda init (#15171)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-20 01:30:43 +00:00
cfbca8a2f2 [V1] TPU - Tensor parallel MP support (#15059) 2025-03-20 00:55:18 +00:00
0fe5609874 [Docs] Announce Ollama and Singapore Meetups (#15161)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-03-19 16:18:04 -07:00
22d33baca2 [FrontEnd][Perf] merge_async_iterators fast-path for single-prompt requests (#15150)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-19 21:04:41 +00:00
b0e96aaebb [V1][TPU] Change kv cache shape. (#15145)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-03-19 12:16:42 -07:00
8310e0b59b simple bugfix: Update stats.py (#15139) 2025-03-19 18:26:27 +00:00
26dd972adb [FEAT]Support reset prefix cache by specified device (#15003) 2025-03-19 10:54:41 -07:00
61c7a1b856 [V1] Minor V1 async engine test refactor (#15075)
Signed-off-by: andoorve <murali.andoorveedu@mail.utoronto.ca>
Co-authored-by: andoorve <murali.andoorveedu@mail.utoronto.ca>
2025-03-19 10:37:17 -07:00
374ee287d8 [Frontend] Remove custom_cache_manager (#13791)
Signed-off-by: fulvius31 <asangior@redhat.com>
2025-03-20 00:13:50 +08:00
a4d83661d7 [Misc] Update the "the first vLLM China Meetup" slides link to point to the first page (#15134)
Signed-off-by: imkero <kerorek@outlook.com>
2025-03-19 15:07:39 +00:00
8363cd093d [Bugfix] Adjust mllama to regional compilation (#15112)
Signed-off-by: Jan Kaniecki <jkaniecki@habana.ai>
2025-03-19 07:57:25 -07:00
6c5a3195db [Misc][Benchmark] Add support for different tokenizer_mode (#15040)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-03-19 14:56:50 +00:00
073d1ed354 [Doc] Update tip info on using latest transformers when creating a custom Dockerfile (#15070) 2025-03-19 13:33:40 +00:00
3d446433ec [Bugfix] Fix size calculation of processing cache (#15114)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-19 05:53:19 -07:00
1fe0fd12d3 [Misc] Avoid unnecessary HF do_rescale warning when passing dummy data (#15107)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-19 03:42:31 -07:00
dafb4e504a [V1][Bugfix] Fix oracle for device checking (#15104)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-03-19 18:35:32 +08:00
68cf1601d3 [CI][Intel GPU] update XPU dockerfile and CI script (#15109)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-03-19 01:29:25 -07:00
61f412187d [Bugfix] Re-enable Gemma3 for V1 (#14980)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-18 23:58:22 -07:00
05ccd0aa35 [V1] Ensure using int64 for sampled token ids (#15065)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-18 23:52:19 -07:00
f690372b68 [Core] Update dtype detection and defaults (#14858)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-19 13:49:33 +08:00
8b3e94a357 [Model] Remove duplicated message check in Mistral chat completion request (#15069)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-03-19 05:09:32 +00:00
437f9162d0 [Model] Pixtral: Remove layer instantiation duplication (#15053)
Signed-off-by: Julien Denize <julien.denize@mistral.ai>
2025-03-19 10:34:03 +08:00
4f065f12f5 [Misc][V1] Skip device checking if not available (#15061)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
2025-03-18 19:33:43 -07:00
228b768db6 [Doc] Minor v1_user_guide update (#15064)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2025-03-18 16:10:45 -07:00
027827cc1d fix long dtype in topk sampling (#15049) 2025-03-18 15:57:31 -07:00
72a8639b68 [V1] TPU - CI/CD use smaller model (#15054)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-03-18 21:39:21 +00:00
99abb8b650 [V1][Spec Decode] Optimize Rejection Sampler with Triton Kernels (#14930)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-18 14:31:54 -07:00
3a1e648158 [V1] Refactor Structured Output for multiple backends (#14694)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-18 19:49:15 +00:00
46c759c165 [Bugfix] Fix LoRA extra vocab size (#15047)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-18 09:40:29 -07:00
179a619c21 [Bugfix] Fix broken CPU quantization due to triton import (#15038)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-18 08:57:39 -07:00
452e8fd968 [MODEL] Add support for Zamba2 models (#13185)
Signed-off-by: Yury Tokpanov <yury@zyphra.com>
Signed-off-by: Quentin Anthony <qganthony@yahoo.com>
Co-authored-by: Quentin Anthony <qganthony@yahoo.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-03-18 08:56:21 -07:00
8b793f7ec6 MI325 configs, fused_moe_kernel bugfix (#14987)
Signed-off-by: Eugene Kuznetsov <eugene.kuznetsov@amd.com>
2025-03-18 08:05:18 -07:00
af35d3a3cc [TPU][V1][Bugfix] Fix chunked prefill with padding (#15037)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-03-18 07:34:45 -07:00
3b457143d2 [Bugfix] Register serializers for V0 MQ Engine (#15009)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-03-18 09:14:47 -04:00
ab656f2c2f [Bugfix] Loosen type check to avoid errors in V1 (#15021)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-18 12:54:40 +00:00
64fc2193dc [Misc][Docs] fix the comments of KV_T and CACHE_T in CALL_RESHAPE_AND_CACHE_XX macros (#14347) 2025-03-18 05:50:19 -07:00
dd732028f5 [Bugfix][Frontend] Fix validation of logprobs in ChatCompletionRequest (#14352)
Signed-off-by: Sebastian Schönnenbeck <sebastian.schoennenbeck@comma-soft.com>
2025-03-18 05:50:05 -07:00
414919138b [Bugfix] torchrun compatibility (#14899)
Signed-off-by: hiyouga <hiyouga@buaa.edu.cn>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-03-18 05:49:27 -07:00
db7c8ca910 [Misc] Embedding model support LoRA (#14935)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-18 12:07:00 +00:00
f863ffc965 [Mistral-Small 3.1] Update docs and tests (#14977)
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-03-18 03:29:42 -07:00
400d483e87 [Kernels] LoRA - Retire SGMV and BGMV Kernels (#14685)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-03-18 09:47:53 +00:00
d1695758b2 [Doc][V1] Fix V1 APC doc (#14920) 2025-03-18 08:15:46 +00:00
53a0cf8b95 [Neuron] trim attention kernel tests to fit trn1.2x instance (#14988)
Signed-off-by: Liangfu Chen <liangfc@amazon.com>
2025-03-18 15:05:52 +08:00
5eeabc2a44 [Bugfix] Fix bnb quantization for models with both HF-format and Mistral-format weights (#14950) 2025-03-17 23:27:26 +00:00
18551e820c [V1] TPU - Fix CI/CD runner (#14974) 2025-03-17 21:07:07 +00:00
e41e160263 [V1] Guard Against Main Thread Usage (#14972)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-17 13:23:02 -07:00
b89fb2a4a1 [CI/Build] Use AutoModelForImageTextToText to load VLMs in tests (#14945)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-17 18:35:17 +00:00
5340b0e221 [Bugfix] Fix interface for Olmo2 on V1 (#14976)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-03-17 11:26:38 -07:00
321 changed files with 16637 additions and 6407 deletions

View File

@ -361,7 +361,7 @@ main() {
# get the current IP address, required by benchmark_serving.py
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# turn off the reporting of the status of each request, to clean up the terminal output
export VLLM_LOG_LEVEL="WARNING"
export VLLM_LOGGING_LEVEL="WARNING"
# prepare for benchmarking
cd benchmarks || exit 1
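
For context, a minimal Python sketch of the behaviour the rename above relies on: vLLM reads its logging level from `VLLM_LOGGING_LEVEL`, so the variable must be exported under that exact name (the old `VLLM_LOG_LEVEL` spelling is simply ignored). The snippet only demonstrates the environment-variable side and makes no other assumptions about the benchmark script.

```python
import os

# Set the corrected variable name before vLLM is imported/started so the
# quieter "WARNING" level actually takes effect.
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

# A typo'd name such as VLLM_LOG_LEVEL would sit unused in the environment
# and per-request status logging would stay enabled.
assert os.environ.get("VLLM_LOGGING_LEVEL") == "WARNING"
```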

View File

@ -82,7 +82,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"

View File

@ -1,16 +0,0 @@
#!/bin/bash
# This script builds the OpenVINO docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t openvino-test -f Dockerfile.openvino .
# Setup cleanup
remove_docker_container() { docker rm -f openvino-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m

View File

@ -1,25 +0,0 @@
#!/bin/bash
set -e
# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"

View File

@ -15,13 +15,24 @@ remove_docker_container
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" -e "VLLM_USE_V1=1" --name tpu-test \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
&& export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo TEST_1 \
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
&& echo TEST_2 \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
&& echo TEST_3 \
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
&& echo TEST_4 \
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& echo TEST_5 \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py" \
# TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@ -12,10 +12,11 @@ docker build -t ${image_name} -f Dockerfile.xpu .
# Setup cleanup
remove_docker_container() {
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true;
docker rm -f "${container_name}" || true;
docker image rm -f "${image_name}" || true;
docker system prune -f || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Run the image and test offline inference/tensor parallel
docker run \
@ -25,6 +26,6 @@ docker run \
--name "${container_name}" \
"${image_name}" \
sh -c '
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
'

View File

@ -136,6 +136,10 @@ steps:
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
commands:
# test with tp=2 and external_dp=2
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
@ -295,6 +299,7 @@ steps:
# these tests need to be separated, cannot combine
- pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py
- pytest -v -s compile/test_pass_manager.py
- label: PyTorch Fullgraph Test # 18min
source_file_dependencies:
@ -511,8 +516,6 @@ steps:
- entrypoints/llm/test_collective_rpc.py
commands:
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
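
As a reference for the torchrun CI commands above, a hedged sketch of what a torchrun-launched vLLM script of this kind typically looks like; the model name, the prompt, and the use of the `external_launcher` executor backend are illustrative assumptions, not the exact contents of `distributed/test_torchrun_example.py`.

```python
# Launch with: torchrun --nproc-per-node=4 torchrun_example.py
# With tensor_parallel_size=2, the four ranks form two TP groups, giving the
# "external" data parallelism of 2 that torchrun itself manages.
from vllm import LLM, SamplingParams

llm = LLM(
    model="facebook/opt-125m",  # assumed small test model
    tensor_parallel_size=2,
    distributed_executor_backend="external_launcher",  # torchrun owns the processes
)

outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
for output in outputs:
    print(output.outputs[0].text)
```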

View File

@ -1,28 +0,0 @@
name: 🎲 Misc/random discussions that do not fit into the above categories.
description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
title: "[Misc]: "
labels: ["misc"]
body:
- type: markdown
attributes:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
attributes:
label: Anything you want to discuss about vllm.
description: >
Anything you want to discuss about vllm.
validations:
required: true
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -1 +1,5 @@
blank_issues_enabled: false
contact_links:
- name: Questions
url: https://discuss.vllm.ai
about: Ask questions and discuss with other vLLM community members

View File

@ -1,29 +0,0 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
FROM ubuntu:22.04 AS dev
RUN apt-get update -y && \
apt-get install -y \
git python3-pip \
ffmpeg libsm6 libxext6 libgl1
WORKDIR /workspace
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
RUN python3 -m pip install -U pip
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements/build.txt
# build vLLM with OpenVINO backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
COPY examples/ /workspace/examples
COPY benchmarks/ /workspace/benchmarks
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
CMD ["/bin/bash"]

View File

@ -12,6 +12,8 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="b7d29fb"
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
ARG AITER_BRANCH="21d47a9"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base
@ -129,8 +131,18 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
ARG AITER_REPO
ARG AITER_BRANCH
RUN git clone --recursive ${AITER_REPO}
RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt \
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter
ARG BASE_IMAGE
ARG HIPBLASLT_BRANCH
ARG HIPBLAS_COMMON_BRANCH
ARG LEGACY_HIPBLASLT_OPTION
ARG RCCL_BRANCH
ARG RCCL_REPO
@ -155,4 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt

View File

@ -1,11 +1,7 @@
FROM intel/deep-learning-essentials:2025.0.1-0-devel-ubuntu22.04 AS vllm-base
# The oneAPI 2025.0.2 docker base image uses the rolling 2448 package (https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04), so we don't need to install the driver manually.
FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN rm /etc/apt/sources.list.d/intel-graphics.list
RUN apt-get update -y && \
apt-get install -y --no-install-recommends --fix-missing \
@ -21,8 +17,6 @@ RUN apt-get update -y && \
python3 \
python3-dev \
python3-pip \
libze-intel-gpu-dev \
libze-intel-gpu1 \
wget
WORKDIR /workspace/vllm

View File

@ -10,29 +10,25 @@ Easy, fast, and cheap LLM serving for everyone
</h3>
<p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
</p>
---
[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
---
*Latest News* 🔥
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
- [2024/12] vLLM joins [PyTorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
---
@ -143,10 +139,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us
- For technical questions and feature requests, please use GitHub issues or discussions.
- For discussing with fellow users and coordinating contributions and development, please use Slack.
- For security disclosures, please use GitHub's security advisory feature.
- For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
## Media Kit

View File

@ -42,7 +42,7 @@ become available.
</tr>
<tr>
<td><strong>HuggingFace</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;">🟡</td>
<td style="text-align: center;">🟡</td>
<td>Specify your dataset path on HuggingFace</td>
</tr>
@ -60,8 +60,8 @@ become available.
🚧: to be supported
🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
similar to `lmms-lab/LLaVA-OneVision-Data`. If you need support for other dataset
formats, please consider contributing.
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
If you need support for other dataset formats, please consider contributing.
**Note**: VisionArena's `dataset-name` should be set to `hf`
@ -139,6 +139,57 @@ python3 vllm/benchmarks/benchmark_serving.py \
--num-prompts "${NUM_PROMPTS}"
```
### HuggingFaceDataset Examples
Currently, HuggingFaceDataset only supports dataset formats
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
formats, please consider contributing.
```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
DATASET_SPLIT='train'
DATASET_SUBSET='chart2text(cauldron)'
python3 vllm/benchmarks/benchmark_serving.py \
--backend "${BACKEND}" \
--model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--hf-split "${DATASET_SPLIT}" \
--num-prompts "${NUM_PROMPTS}" \
--hf-subset "${DATASET_SUBSET}"
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \
--backend "${BACKEND}" \
--model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--hf-split "${DATASET_SPLIT}" \
--num-prompts "${NUM_PROMPTS}" \
```
---
## Example - Offline Throughput Benchmark

View File

@ -63,7 +63,7 @@ async def async_request_tgi(
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
"truncate": request_func_input.prompt_len,
# TGI does not accept ignore_eos flag.
"ignore_eos_token": request_func_input.ignore_eos,
}
payload = {
"inputs": request_func_input.prompt,
@ -71,6 +71,10 @@ async def async_request_tgi(
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
if request_func_input.ignore_eos:
output.output_tokens = request_func_input.output_len
else:
output.output_tokens = None
ttft = 0.0
st = time.perf_counter()
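
To summarise the TGI change above, a small hedged sketch of the request parameters and the pre-computed output-token count; the dataclass and field names below are stand-ins for illustration, not the exact `RequestFuncInput` from `backend_request_func.py`.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeRequestInput:
    """Stand-in for the benchmark's request-input object (illustrative only)."""
    prompt: str
    prompt_len: int
    output_len: int
    ignore_eos: bool


def build_tgi_payload(req: FakeRequestInput) -> tuple[dict, Optional[int]]:
    params = {
        "max_new_tokens": req.output_len,
        "temperature": 0.01,   # TGI does not accept 0.0 temperature.
        "top_p": 0.99,         # TGI does not accept 1.0 top_p.
        "truncate": req.prompt_len,
        "ignore_eos_token": req.ignore_eos,  # the flag TGI actually understands
    }
    # When EOS is ignored, the generation length is known up front.
    expected_output_tokens = req.output_len if req.ignore_eos else None
    return {"inputs": req.prompt, "parameters": params}, expected_output_tokens


payload, n_tokens = build_tgi_payload(
    FakeRequestInput("Hello", prompt_len=1, output_len=32, ignore_eos=True))
assert n_tokens == 32
```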

View File

@ -17,6 +17,7 @@ SampleRequest instances, similar to the approach used in ShareGPT.
import base64
import io
import json
import logging
import random
from abc import ABC, abstractmethod
from collections.abc import Mapping
@ -35,6 +36,8 @@ from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
logger = logging.getLogger(__name__)
# -----------------------------------------------------------------------------
# Data Classes
# -----------------------------------------------------------------------------
@ -61,9 +64,6 @@ class SampleRequest:
class BenchmarkDataset(ABC):
DEFAULT_SEED = 0
# num_requests has default 1000 in both the benchmark_serving.py and
# benchmark_throughput.py
def __init__(
self,
dataset_path: Optional[str] = None,
@ -90,8 +90,8 @@ class BenchmarkDataset(ABC):
mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
"""
Transform a prompt and optional multimodal content into a chat format.
This method is used for chat models that expect a specific
conversation format.
This method is used for chat models that expect a specific conversation
format.
"""
content = [{"text": prompt, "type": "text"}]
if mm_content is not None:
@ -175,6 +175,24 @@ class BenchmarkDataset(ABC):
"""
raise NotImplementedError("sample must be implemented in subclasses.")
def maybe_oversample_requests(self, requests: list[SampleRequest],
num_requests: int) -> None:
"""
Oversamples the list of requests if its size is less than the desired
number.
Args:
requests (List[SampleRequest]): The current list of sampled requests.
num_requests (int): The target number of requests.
"""
if len(requests) < num_requests:
random.seed(self.random_seed)
additional = random.choices(requests,
k=num_requests - len(requests))
requests.extend(additional)
logger.info("Oversampled requests to reach %d total samples.",
num_requests)
# -----------------------------------------------------------------------------
# Utility Functions and Global Caches
@ -276,15 +294,16 @@ class RandomDataset(BenchmarkDataset):
) -> None:
super().__init__(**kwargs)
def sample(self,
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
prefix_len: int = DEFAULT_PREFIX_LEN,
range_ratio: float = DEFAULT_RANGE_RATIO,
input_len: int = DEFAULT_INPUT_LEN,
output_len: int = DEFAULT_OUTPUT_LEN,
**kwargs) -> list[SampleRequest]:
**kwargs,
) -> list[SampleRequest]:
vocab_size = tokenizer.vocab_size
prefix_token_ids = (np.random.randint(
@ -346,20 +365,24 @@ class ShareGPTDataset(BenchmarkDataset):
random.seed(self.random_seed)
random.shuffle(self.data)
def sample(self,
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
**kwargs,
) -> list:
samples: list = []
for entry in self.data:
if len(samples) >= num_requests:
break
prompt, completion = entry["conversations"][0]["value"],\
entry["conversations"][1]["value"]
prompt, completion = (
entry["conversations"][0]["value"],
entry["conversations"][1]["value"],
)
lora_request, tokenizer = self.get_random_lora_request(
tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
@ -383,6 +406,7 @@ class ShareGPTDataset(BenchmarkDataset):
expected_output_len=new_output_len,
lora_request=lora_request,
))
self.maybe_oversample_requests(samples, num_requests)
return samples
@ -415,19 +439,20 @@ class SonnetDataset(BenchmarkDataset):
with open(self.dataset_path, encoding="utf-8") as f:
self.data = f.readlines()
def sample(self,
def sample(
self,
tokenizer,
num_requests: int,
prefix_len: int = DEFAULT_PREFIX_LEN,
input_len: int = DEFAULT_INPUT_LEN,
output_len: int = DEFAULT_OUTPUT_LEN,
return_prompt_formatted: bool = False,
**kwargs) -> list:
**kwargs,
) -> list:
# Calculate average token length for a poem line.
tokenized_lines = [tokenizer(line).input_ids for line in self.data]
avg_len = sum(len(tokens)
for tokens in \
tokenized_lines) / len(tokenized_lines)
for tokens in tokenized_lines) / len(tokenized_lines)
# Build the base prompt.
base_prompt = "Pick as many lines as you can from these poem lines:\n"
@ -506,12 +531,14 @@ class BurstGPTDataset(BenchmarkDataset):
# Convert the dataframe to a list of lists.
return data.values.tolist()
def sample(self,
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
max_loras: Optional[int] = None,
lora_path: Optional[str] = None,
**kwargs) -> list[SampleRequest]:
**kwargs,
) -> list[SampleRequest]:
samples = []
data = self._sample_loaded_data(num_requests=num_requests)
for i in range(num_requests):
@ -544,7 +571,6 @@ class HuggingFaceDataset(BenchmarkDataset):
Dataset class for processing a HuggingFace dataset with conversation data
and optional images.
"""
DEFAULT_NUM_REQUESTS = 1000
def __init__(
self,
@ -618,6 +644,7 @@ class HuggingFaceDataset(BenchmarkDataset):
expected_output_len=output_len,
multi_modal_data=mm_content,
))
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
@ -632,7 +659,6 @@ class VisionArenaDataset(HuggingFaceDataset):
"""
DEFAULT_OUTPUT_LEN = 128
DEFAULT_NUM_REQUESTS = 1000
VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"
def __init__(
@ -657,12 +683,14 @@ class VisionArenaDataset(HuggingFaceDataset):
)
self.data = dataset.shuffle(seed=self.random_seed)
def sample(self,
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
**kwargs,
) -> list:
output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
sampled_requests = []
@ -685,4 +713,5 @@ class VisionArenaDataset(HuggingFaceDataset):
expected_output_len=output_len,
multi_modal_data=mm_content,
))
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
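
A compact, self-contained sketch of the oversampling behaviour introduced above in `maybe_oversample_requests`: when fewer samples were drawn than requested, the list is padded by sampling with replacement from the existing entries. Plain strings stand in for `SampleRequest` objects here.

```python
import logging
import random

logger = logging.getLogger(__name__)


def maybe_oversample(requests: list, num_requests: int, seed: int = 0) -> None:
    """Pad `requests` in place up to `num_requests` by re-sampling existing entries."""
    if len(requests) < num_requests:
        random.seed(seed)
        requests.extend(random.choices(requests, k=num_requests - len(requests)))
        logger.info("Oversampled requests to reach %d total samples.", num_requests)


samples = ["prompt-a", "prompt-b", "prompt-c"]  # stand-ins for SampleRequest objects
maybe_oversample(samples, 5)
assert len(samples) == 5
```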

View File

@ -732,8 +732,11 @@ def main(args: argparse.Namespace):
api_url = f"http://{args.host}:{args.port}{args.endpoint}"
base_url = f"http://{args.host}:{args.port}"
tokenizer = get_tokenizer(tokenizer_id,
trust_remote_code=args.trust_remote_code)
tokenizer = get_tokenizer(
tokenizer_id,
trust_remote_code=args.trust_remote_code,
tokenizer_mode=args.tokenizer_mode,
)
if args.dataset == 'grammar':
args.structure_type = 'guided_grammar'
@ -876,6 +879,13 @@ if __name__ == "__main__":
help=
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
)
parser.add_argument(
"--tokenizer-mode",
type=str,
default="auto",
help="Tokenizer mode to use when loading the tokenizer.",
)
parser.add_argument(
"--num-prompts",
type=int,
@ -989,9 +999,10 @@ if __name__ == "__main__":
type=float,
default=1.0,
help="Ratio of Structured Outputs requests")
parser.add_argument("--structured-output-backend",
parser.add_argument(
"--structured-output-backend",
type=str,
choices=["outlines", "lm-format-enforcer", "xgrammar"],
choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
default="xgrammar",
help="Backend to use for structured outputs")

View File

@ -17,13 +17,8 @@ from torch.utils.benchmark import Measurement as TMeasurement
from utils import ArgPool, Bench, CudaGraphBenchParams
from weight_shapes import WEIGHT_SHAPES
from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand
from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice
from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand
from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink
from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.lora.ops.triton_ops.v1 import V1KernelMeta, v1_expand, v1_shrink
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@ -167,69 +162,25 @@ class OpType(Enum):
"""
LoRA Ops to benchmark and its properties.
"""
SGMV_SHRINK = auto()
BGMV_SHRINK = auto()
SGMV_EXPAND = auto()
BGMV_EXPAND = auto()
BGMV_EXPAND_SLICE = auto()
V1_SHRINK = auto()
V1_EXPAND = auto()
LORA_SHRINK = auto()
LORA_EXPAND = auto()
@staticmethod
def from_str(s: str) -> "OpType":
if s.lower() == 'sgmv_shrink':
return OpType.SGMV_SHRINK
if s.lower() == 'sgmv_expand':
return OpType.SGMV_EXPAND
if s.lower() == 'bgmv_shrink':
return OpType.BGMV_SHRINK
if s.lower() == 'bgmv_expand':
return OpType.BGMV_EXPAND
if s.lower() == "bgmv_expand_slice":
return OpType.BGMV_EXPAND_SLICE
if s.lower() == "v1_shrink":
return OpType.V1_SHRINK
if s.lower() == "v1_expand":
return OpType.V1_EXPAND
if s.lower() == "lora_shrink":
return OpType.LORA_SHRINK
if s.lower() == "lora_expand":
return OpType.LORA_EXPAND
raise ValueError(f"Unrecognized str {s} to convert to OpType")
def is_shrink_fn(self) -> bool:
return self in [
OpType.SGMV_SHRINK, OpType.BGMV_SHRINK, OpType.V1_SHRINK
]
return self in [OpType.LORA_SHRINK]
def is_expand_fn(self) -> bool:
return self in [
OpType.SGMV_EXPAND, OpType.BGMV_EXPAND, OpType.V1_EXPAND
]
def is_prefill_op(self) -> bool:
return self in [
OpType.SGMV_SHRINK, OpType.SGMV_EXPAND, OpType.V1_SHRINK,
OpType.V1_EXPAND
]
def is_decode_op(self) -> bool:
return self in [
OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE,
OpType.V1_SHRINK, OpType.V1_EXPAND
]
def is_expand_slice_fn(self) -> bool:
return self in [OpType.BGMV_EXPAND_SLICE]
return self in [OpType.LORA_EXPAND]
def num_slices(self) -> list[int]:
if self in [
OpType.SGMV_EXPAND, OpType.SGMV_SHRINK, OpType.V1_SHRINK,
OpType.V1_EXPAND
]:
# SGMV kernels and v1 kernels supports slices
return [1, 2, 3]
if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]:
return [1]
if self in [OpType.BGMV_EXPAND_SLICE]:
return [2, 3]
raise ValueError(f"Unrecognized OpType {self}")
def mkn(self, batch_size: int, seq_length: int, hidden_size: int,
lora_rank: int) -> tuple[int, int, int]:
@ -239,7 +190,7 @@ class OpType(Enum):
k = hidden_size
n = lora_rank
else:
assert self.is_expand_fn() or self.is_expand_slice_fn()
assert self.is_expand_fn()
m = num_tokens
k = lora_rank
n = hidden_size
@ -254,7 +205,7 @@ class OpType(Enum):
if self.is_shrink_fn():
return op_dtype, op_dtype, torch.float32
else:
assert self.is_expand_fn() or self.is_expand_slice_fn()
assert self.is_expand_fn()
return torch.float32, op_dtype, op_dtype
def matmul_shapes(
@ -268,43 +219,19 @@ class OpType(Enum):
m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank)
b_shape = (num_loras, n, k) # col-major
if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
# SGMV shrink and V1 shrink kernels support num_slices inherently
# in the kernel.
if self in [OpType.LORA_SHRINK]:
# LoRA shrink kernels support num_slices inherently in the kernel.
return ((m, k), b_shape, (num_slices, m, n))
if self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
# SGMV expand and V1 expand kernels support num_slices inherently
# in the kernel
if self in [OpType.LORA_EXPAND]:
# LoRA expand kernels support num_slices inherently in the kernel
return ((num_slices, m, k), b_shape, (m, n * num_slices))
if self == OpType.BGMV_SHRINK:
return ((m, k), b_shape, (m, n))
if self == OpType.BGMV_EXPAND:
return ((m, k), b_shape, (m, n))
if self == OpType.BGMV_EXPAND_SLICE:
return ((num_slices, m, k), b_shape, (m, n * num_slices))
raise ValueError(f"Unrecognized op_type {self}")
def bench_fn(self) -> Callable:
def emulate_bgmv_expand_slice(kwargs_list: list[dict[str, Any]]):
for x in kwargs_list:
bgmv_expand_slice(**x)
if self == OpType.SGMV_SHRINK:
return sgmv_shrink
if self == OpType.SGMV_EXPAND:
return sgmv_expand
if self == OpType.BGMV_SHRINK:
return bgmv_shrink
if self == OpType.BGMV_EXPAND:
return bgmv_expand
if self == OpType.BGMV_EXPAND_SLICE:
return emulate_bgmv_expand_slice
if self == OpType.V1_SHRINK:
return v1_shrink
if self == OpType.V1_EXPAND:
return v1_expand
if self == OpType.LORA_SHRINK:
return lora_shrink
if self == OpType.LORA_EXPAND:
return lora_expand
raise ValueError(f"Unrecognized optype {self}")
@ -318,34 +245,13 @@ class OpType(Enum):
"""
w_dtype = lora_weights[0].dtype
num_slices = len(lora_weights)
if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
if self in [OpType.LORA_SHRINK]:
for slice_idx in range(num_slices):
ref_group_gemm(ref_out=output[slice_idx, :],
input=input,
lora_weights=lora_weights[slice_idx],
**kwargs)
elif self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
hidden_size = lora_weights[0].shape[1]
for slice_idx in range(num_slices):
slice_offset = slice_idx * hidden_size
ref_group_gemm(
ref_out=output[:, slice_offset:slice_offset + hidden_size],
input=input[slice_idx].clone().to(dtype=w_dtype),
lora_weights=lora_weights[slice_idx],
**kwargs)
elif self == OpType.BGMV_SHRINK:
assert num_slices == 1
ref_group_gemm(ref_out=output,
input=input,
lora_weights=lora_weights[0],
**kwargs)
elif self == OpType.BGMV_EXPAND:
assert num_slices == 1
ref_group_gemm(ref_out=output,
input=input.clone().to(dtype=w_dtype),
lora_weights=lora_weights[0],
**kwargs)
elif self == OpType.BGMV_EXPAND_SLICE:
elif self in [OpType.LORA_EXPAND]:
hidden_size = lora_weights[0].shape[1]
for slice_idx in range(num_slices):
slice_offset = slice_idx * hidden_size
@ -411,13 +317,11 @@ class BenchmarkTensors:
input: torch.Tensor
lora_weights_lst: list[torch.Tensor]
output: torch.Tensor
# metadata tensors
# LoRA kernel metadata
lora_kernel_meta: LoRAKernelMeta
# Metadata tensors used in testing correctness
seq_lens: torch.Tensor
seq_start_loc: torch.Tensor
prompt_lora_mapping: torch.Tensor
token_lora_mapping: torch.Tensor
# v1 kernel metadata
v1_kernel_meta: Optional[V1KernelMeta] = None
def io_types(self) -> str:
return (f"{dtype_to_str(self.input.dtype)}x"
@ -444,35 +348,29 @@ class BenchmarkTensors:
assert ctx.num_active_loras <= ctx.num_loras
total_tokens = ctx.batch_size * ctx.seq_length
# Make metadata tensors involved in correctness testing.
# Prepare seq lens tensor
seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1,
(ctx.batch_size, ))
# Prepare seq_start_loc tensor
seq_start_loc_tensor = torch.cumsum(torch.tensor(
[0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
dim=0)
assert total_tokens == seq_len_tensor.sum()
# Prepare prompt lora indices tensor
prompt_lora_indices_tensor = make_prompt_lora_mapping(
ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu")
# Prepare token lora indices tensor
# Make LoRAKernelMeta
token_lora_indices_tensor = make_token_lora_mapping(
total_tokens, ctx.batch_size, prompt_lora_indices_tensor,
seq_len_tensor, "cpu")
v1_kernel_meta = None
if op_type in [OpType.V1_SHRINK, OpType.V1_EXPAND]:
v1_kernel_meta = V1KernelMeta.make(
lora_kernel_meta = LoRAKernelMeta.make(
max_loras=ctx.num_loras,
max_num_tokens=token_lora_indices_tensor.size(0),
device="cpu")
v1_kernel_meta.prepare_tensors(
lora_kernel_meta.prepare_tensors(
token_lora_mapping=token_lora_indices_tensor)
return BenchmarkTensors(input_tensor, lora_weights, output_tensor,
seq_len_tensor, seq_start_loc_tensor,
prompt_lora_indices_tensor,
token_lora_indices_tensor, v1_kernel_meta)
lora_kernel_meta, seq_len_tensor,
prompt_lora_indices_tensor)
def sanity_check(self) -> None:
"""
@ -482,9 +380,9 @@ class BenchmarkTensors:
# check metadata tensors
assert torch.sum(self.seq_lens) == num_tokens
num_seqs = self.seq_lens.shape[0]
assert self.seq_start_loc.shape[0] == num_seqs
#assert self.seq_start_loc.shape[0] == num_seqs
assert self.prompt_lora_mapping.shape[0] == num_seqs
assert self.token_lora_mapping.shape[0] == num_tokens
assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens
def to_device(self, device: str):
"""
@ -499,220 +397,27 @@ class BenchmarkTensors:
self.input = to_device(self.input)
self.output = to_device(self.output)
self.seq_lens = to_device(self.seq_lens)
self.seq_start_loc = to_device(self.seq_start_loc)
self.prompt_lora_mapping = to_device(self.prompt_lora_mapping)
self.token_lora_mapping = to_device(self.token_lora_mapping)
for i in range(len(self.lora_weights_lst)):
self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i])
# v1 meta
if self.v1_kernel_meta:
for field_name in V1KernelMeta.__dataclass_fields__:
field = getattr(self.v1_kernel_meta, field_name)
# LoRA meta
for field_name in LoRAKernelMeta.__dataclass_fields__:
field = getattr(self.lora_kernel_meta, field_name)
assert isinstance(field, torch.Tensor)
setattr(self.v1_kernel_meta, field_name, to_device(field))
setattr(self.lora_kernel_meta, field_name, to_device(field))
def metadata(self) -> tuple[int, int, int, int]:
"""
Return num_seqs, num_tokens, max_seq_len and num_slices
"""
num_seqs = self.seq_lens.shape[0]
num_tokens = self.token_lora_mapping.shape[0]
num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0]
max_seq_len = torch.max(self.seq_lens).item()
num_slices = len(self.lora_weights_lst)
return num_seqs, num_tokens, max_seq_len, num_slices
def convert_to_sgmv_benchmark_tensors(self):
"""
For sgmv punica kernels, when consecutive sequences have the
same LoRA ID, we just merge them together.
This happens in punica.py::compute_metadata
"""
# Collapse seq_lens and seq_start_loc
_, seq_lens = torch.unique_consecutive(self.token_lora_mapping,
return_counts=True)
cum_result = torch.cumsum(seq_lens, dim=0)
seq_start_loc = torch.zeros_like(seq_lens)
seq_start_loc[1:].copy_(cum_result[:-1])
# Collapse prompt mapping
prompt_lora_mapping = torch.unique_consecutive(
self.prompt_lora_mapping)
assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \
f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}"
self.prompt_lora_mapping = prompt_lora_mapping.to(
dtype=self.prompt_lora_mapping.dtype)
self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype)
self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype)
def as_sgmv_shrink_kwargs(self) -> dict[str, Any]:
self.convert_to_sgmv_benchmark_tensors()
self.sanity_check()
self.to_device(self.input.device)
num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
# Sanity check matrix shapes.
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape [num_tokens, hidden_size]
assert len(i_shape) == 2
assert i_shape[0] == num_tokens
hidden_size = i_shape[1]
# Expected lora weight shape [num_loras, lora_rank, hidden_size]
assert len(lw_shape) == 3
assert lw_shape[2] == hidden_size
lora_rank = lw_shape[1]
# Expected output shape [num_slices, num_tokens, lora_rank]
assert len(o_shape) == 3
assert o_shape == (num_slices, num_tokens, lora_rank)
return {
'inputs': self.input,
'lora_a_weights': self.lora_weights_lst,
'output_tensor': self.output,
'b_seq_start_loc': self.seq_start_loc,
'seq_len_tensor': self.seq_lens,
'lora_indices_tensor': self.prompt_lora_mapping,
'batches': num_seqs,
'max_seq_length': max_seq_len,
'token_nums': num_tokens,
'scaling': 1.0,
}
def as_sgmv_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
self.convert_to_sgmv_benchmark_tensors()
self.sanity_check()
self.to_device(self.input.device)
num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
# Sanity check matrix shapes.
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape : [num_slices, num_tokens, lora_rank]
assert len(i_shape) == 3
assert i_shape[0] == num_slices
assert i_shape[1] == num_tokens
lora_rank = i_shape[2]
# Expected lora weight shape : [num_lora, hidden_size, lora_rank]
assert len(lw_shape) == 3
assert lw_shape[2] == lora_rank
hidden_size = lw_shape[1]
# Expected output shape : [num_tokens, hidden_size * num_slices]
assert len(o_shape) == 2
assert o_shape == (num_tokens, hidden_size * num_slices)
return {
'inputs': self.input,
'lora_b_weights': self.lora_weights_lst,
'output_tensor': self.output,
'b_seq_start_loc': self.seq_start_loc,
'seq_len_tensor': self.seq_lens,
'lora_indices_tensor': self.prompt_lora_mapping,
'batches': num_seqs,
'max_seq_length': max_seq_len,
'token_nums': num_tokens,
'offset_start': 0,
'add_inputs': add_inputs,
}
def as_bgmv_shrink_kwargs(self) -> dict[str, Any]:
assert len(self.lora_weights_lst) == 1
self.to_device(self.input.device)
_, num_tokens, _, _ = self.metadata()
# Sanity check shapes
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape [num_tokens, hidden_size]
assert len(i_shape) == 2
assert i_shape[0] == num_tokens
hidden_size = i_shape[1]
# Expected lora weight shape [num_loras, lora_rank, hidden_size]
assert len(lw_shape) == 3
assert lw_shape[2] == hidden_size
lora_rank = lw_shape[1]
# Expected output shape [num_tokens, lora_rank]
assert len(o_shape) == 2
assert o_shape == (num_tokens, lora_rank)
return {
'inputs': self.input,
'lora_a_weights': self.lora_weights_lst[0],
'output_tensor': self.output,
'lora_indices_tensor': self.token_lora_mapping,
'scaling': 1.0
}
def as_bgmv_expand_kwargs(self, add_inputs: bool):
assert len(self.lora_weights_lst) == 1
self.to_device(self.input.device)
_, num_tokens, _, _ = self.metadata()
# Sanity check shapes
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape [num_tokens, lora_rank]
assert len(i_shape) == 2
assert i_shape[0] == num_tokens
lora_rank = i_shape[1]
# Expected lora weight shape [num_loras, hidden_size, lora_rank]
assert len(lw_shape) == 3
assert lw_shape[2] == lora_rank
hidden_size = lw_shape[1]
# Expected output shape [num_tokens, hidden_size]
assert len(o_shape) == 2
assert o_shape == (num_tokens, hidden_size)
return {
'inputs': self.input,
'lora_b_weights': self.lora_weights_lst[0],
'output_tensor': self.output,
'lora_indices_tensor': self.token_lora_mapping,
'add_inputs': add_inputs
}
def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> dict[str, Any]:
_, num_tokens, _, num_slices = self.metadata()
# Sanity check shapes
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape [num_slices, num_tokens, lora_rank]
assert len(i_shape) == 3
assert i_shape[0] == num_slices
assert i_shape[1] == num_tokens
lora_rank = i_shape[2]
# Expected lora weight shape [num_loras, hidden_size, lora_rank]
assert len(lw_shape) == 3
assert lw_shape[2] == lora_rank
hidden_size = lw_shape[1]
# Expected output shape [num_tokens, hidden_size * num_slices]
assert len(o_shape) == 2
assert o_shape == (num_tokens, hidden_size * num_slices)
self.to_device(self.input.device)
kwargs_list = []
for i in range(num_slices):
kwargs_list.append({
'inputs': self.input[i],
'lora_b_weights': self.lora_weights_lst[i],
'output_tensor': self.output,
'lora_indices_tensor': self.token_lora_mapping,
'slice_offset': i * hidden_size,
'slice_size': hidden_size,
'add_inputs': add_inputs,
})
return {'kwargs_list': kwargs_list}
def as_v1_shrink_kwargs(self) -> dict[str, Any]:
assert self.v1_kernel_meta is not None
def as_lora_shrink_kwargs(self) -> dict[str, Any]:
self.sanity_check()
self.to_device(self.input.device)
@ -737,17 +442,16 @@ class BenchmarkTensors:
'inputs': self.input,
'lora_a_weights': self.lora_weights_lst,
'output_tensor': self.output,
'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping,
'token_indices_sorted_by_lora_ids':
self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
'lora_ids': self.v1_kernel_meta.active_lora_ids,
self.lora_kernel_meta.token_indices_sorted_by_lora_ids,
'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora,
'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc,
'lora_ids': self.lora_kernel_meta.active_lora_ids,
'scaling': 1.0,
}
def as_v1_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
assert self.v1_kernel_meta is not None
def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
self.sanity_check()
self.to_device(self.input.device)
@ -773,12 +477,12 @@ class BenchmarkTensors:
'inputs': self.input,
'lora_b_weights': self.lora_weights_lst,
'output_tensor': self.output,
'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping,
'token_indices_sorted_by_lora_ids':
self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
'lora_ids': self.v1_kernel_meta.active_lora_ids,
self.lora_kernel_meta.token_indices_sorted_by_lora_ids,
'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora,
'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc,
'lora_ids': self.lora_kernel_meta.active_lora_ids,
'offset_start': 0,
'add_inputs': add_inputs,
}
@ -791,20 +495,10 @@ class BenchmarkTensors:
else:
assert add_inputs is not None
if op_type == OpType.SGMV_SHRINK:
return self.as_sgmv_shrink_kwargs()
if op_type == OpType.SGMV_EXPAND:
return self.as_sgmv_expand_kwargs(add_inputs)
if op_type == OpType.BGMV_SHRINK:
return self.as_bgmv_shrink_kwargs()
if op_type == OpType.BGMV_EXPAND:
return self.as_bgmv_expand_kwargs(add_inputs)
if op_type == OpType.BGMV_EXPAND_SLICE:
return self.as_bgmv_expand_slice_kwargs(add_inputs)
if op_type == OpType.V1_SHRINK:
return self.as_v1_shrink_kwargs()
if op_type == OpType.V1_EXPAND:
return self.as_v1_expand_kwargs(add_inputs)
if op_type == OpType.LORA_SHRINK:
return self.as_lora_shrink_kwargs()
if op_type == OpType.LORA_EXPAND:
return self.as_lora_expand_kwargs(add_inputs)
raise ValueError(f"Unrecognized optype {self}")
def test_correctness(self, op_type: OpType,
@ -993,10 +687,6 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
for bench_ctx in bench_ctxs:
for seq_len in args.seq_lengths:
bench_ops: list[OpType] = args.op_types
if seq_len > 1:
# bench only prefill ops
bench_ops = [op for op in args.op_types if op.is_prefill_op()]
seq_len_timers = []
for bench_op in bench_ops:
for num_slices in bench_op.num_slices():
@ -1206,13 +896,13 @@ Benchmark LoRA kernels:
{use_cuda_graph_recommendation()}
list_bench example:
python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
model_bench example:
python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
range_bench example:
python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8
python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8
""", # noqa: E501
formatter_class=argparse.RawTextHelpFormatter)

View File

@ -54,6 +54,7 @@ for qps in "${QPS_VALUES[@]}"; do
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
--request-rate $qps \
--result-filename "$FILENAME" \
--tokenizer-mode ${TOKENIZER_MODE:-"auto"} \
--port ${PORT:-8000}
echo "Completed benchmark with QPS: $qps"

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 9bfa9869829d8c593527eb34c5271d0090f7ccc9
GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -350,8 +350,8 @@ __global__ void concat_and_cache_mla_kernel(
} // namespace vllm
// KV_T is the stored data type of kv-cache.
// CACHE_T is the data type of key and value tensors.
// KV_T is the data type of key and value tensors.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
vllm::reshape_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
@ -393,8 +393,8 @@ void reshape_and_cache(
CALL_RESHAPE_AND_CACHE)
}
// KV_T is the stored data type of kv-cache.
// CACHE_T is the data type of key and value tensors.
// KV_T is the data type of key and value tensors.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \
vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE> \
@ -446,8 +446,8 @@ void reshape_and_cache_flash(
CALL_RESHAPE_AND_CACHE_FLASH);
}
// KV_T is the stored data type of kv-cache.
// CACHE_T is the data type of key and value tensors.
// KV_T is the data type of key and value tensors.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \

View File

@ -24,7 +24,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
// sum of squares
float ss = 0.0f;
for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
float x = static_cast<float>(input[token_offset + i]);
if constexpr (has_residual) {
x += static_cast<float>(residual[token_offset + i]);
@ -58,7 +58,7 @@ __device__ void compute_dynamic_per_token_scales(
constexpr scalar_out_t qmax{std::numeric_limits<scalar_out_t>::max()};
float block_absmax_val_maybe = 0.0f;
for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
float x = static_cast<float>(input[token_offset + i]);
if constexpr (has_residual) {
x += static_cast<float>(residual[token_offset + i]);
@ -103,7 +103,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
;
for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
float x = static_cast<float>(input[token_offset + i]);
if constexpr (has_residual) {
x += static_cast<float>(residual[token_offset + i]);
@ -142,7 +142,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
int32_t const num_vec_elems = hidden_size >> 2;
#pragma unroll 4
for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
vec4_t<scalar_t> in = vec_input[i];
vec4_t<float> x;
@ -206,7 +206,7 @@ __device__ void compute_dynamic_per_token_scales(
float block_absmax_val_maybe = 0.0f;
#pragma unroll 4
for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
vec4_t<scalar_t> in = vec_input[i];
vec4_t<scalar_t> const w = vec_weight[i];
@ -286,7 +286,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
// TODO(luka/varun) extract into type-agnostic vectorized quant function to
// replace scaled_fp8_conversion_vec
#pragma unroll 4
for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
vec4_t<scalar_t> const in = vec_input[i];
vec4_t<scalar_t> const w = vec_weight[i];

View File

@ -101,10 +101,10 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
template<typename dst_t>
static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x;
const auto i = blockIdx.x;
const block_q2_K * x = (const block_q2_K *) vx;
const int tid = threadIdx.x;
const auto tid = threadIdx.x;
const int n = tid/32;
const int l = tid - 32*n;
const int is = 8*n + l/16;
@ -123,10 +123,10 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
template<typename dst_t>
static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x;
const auto i = blockIdx.x;
const block_q3_K * x = (const block_q3_K *) vx;
const int r = threadIdx.x/4;
const auto r = threadIdx.x/4;
const int tid = r/2;
const int is0 = r%2;
const int l0 = 16*is0 + 4*(threadIdx.x%4);
@ -164,10 +164,10 @@ template<typename dst_t>
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const block_q4_K * x = (const block_q4_K *) vx;
const int i = blockIdx.x;
const auto i = blockIdx.x;
// assume 32 threads
const int tid = threadIdx.x;
const auto tid = threadIdx.x;
const int il = tid/8;
const int ir = tid%8;
const int is = 2*il;
@ -197,10 +197,10 @@ template<typename dst_t>
static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const block_q5_K * x = (const block_q5_K *) vx;
const int i = blockIdx.x;
const auto i = blockIdx.x;
// assume 64 threads - this is very slightly better than the one below
const int tid = threadIdx.x;
const auto tid = threadIdx.x;
const int il = tid/16; // il is in 0...3
const int ir = tid%16; // ir is in 0...15
const int is = 2*il; // is is in 0...6
@ -231,10 +231,10 @@ template<typename dst_t>
static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const block_q6_K * x = (const block_q6_K *) vx;
const int i = blockIdx.x;
const auto i = blockIdx.x;
// assume 64 threads - this is very slightly better than the one below
const int tid = threadIdx.x;
const auto tid = threadIdx.x;
const int ip = tid/32; // ip is 0 or 1
const int il = tid - 32*ip; // 0...32
const int is = 8*ip + il/16;
@ -256,10 +256,10 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
template<typename dst_t>
static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x;
const auto i = blockIdx.x;
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
const int tid = threadIdx.x;
const auto tid = threadIdx.x;
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@ -275,10 +275,10 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
template<typename dst_t>
static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x;
const auto i = blockIdx.x;
const block_iq2_xs * x = (const block_iq2_xs *) vx;
const int tid = threadIdx.x;
const auto tid = threadIdx.x;
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@ -293,10 +293,10 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
template<typename dst_t>
static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x;
const auto i = blockIdx.x;
const block_iq2_s * x = (const block_iq2_s *) vx;
const int tid = threadIdx.x;
const auto tid = threadIdx.x;
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@ -309,10 +309,10 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
template<typename dst_t>
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x;
const auto i = blockIdx.x;
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
const int tid = threadIdx.x;
const auto tid = threadIdx.x;
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@ -332,10 +332,10 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
template<typename dst_t>
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x;
const auto i = blockIdx.x;
const block_iq3_s * x = (const block_iq3_s *) vx;
const int tid = threadIdx.x;
const auto tid = threadIdx.x;
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@ -399,10 +399,10 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
template<typename dst_t>
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x;
const auto i = blockIdx.x;
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
const int tid = threadIdx.x;
const auto tid = threadIdx.x;
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
@ -417,10 +417,10 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
template<typename dst_t>
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x;
const auto i = blockIdx.x;
const block_iq4_xs * x = (const block_iq4_xs *)vx;
const int tid = threadIdx.x;
const auto tid = threadIdx.x;
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 4*il;

View File

@ -19,11 +19,11 @@ template <typename scalar_t>
static __global__ void quantize_q8_1(const scalar_t* __restrict__ x,
void* __restrict__ vy, const int kx,
const int kx_padded) {
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const auto ix = blockDim.x * blockIdx.x + threadIdx.x;
if (ix >= kx_padded) {
return;
}
const int iy = blockDim.y * blockIdx.y + threadIdx.y;
const auto iy = blockDim.y * blockIdx.y + threadIdx.y;
const int i_padded = iy * kx_padded + ix;
block_q8_1* y = (block_q8_1*)vy;

View File

@ -14,10 +14,10 @@ static __device__ __forceinline__ void mul_mat_q(
const int & ncols_dst = ncols_y;
const int row_dst_0 = blockIdx.x*mmq_y;
const auto row_dst_0 = blockIdx.x*mmq_y;
const int & row_x_0 = row_dst_0;
const int col_dst_0 = blockIdx.y*mmq_x;
const auto col_dst_0 = blockIdx.y*mmq_x;
const int & col_y_0 = col_dst_0;
int * tile_x_ql = nullptr;
@ -39,7 +39,7 @@ static __device__ __forceinline__ void mul_mat_q(
#pragma unroll
for (int ir = 0; ir < qr && ib0 + ir * blocks_per_warp/qr < blocks_per_row_x; ++ir) {
const int kqs = ir*WARP_SIZE_GGUF + threadIdx.x;
const auto kqs = ir*WARP_SIZE_GGUF + threadIdx.x;
const int kbxd = kqs / QI8_1;
#pragma unroll
@ -53,7 +53,7 @@ static __device__ __forceinline__ void mul_mat_q(
#pragma unroll
for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE_GGUF/QI8_1)) % mmq_x;
const int kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1);
const auto kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1);
const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
// if the sum is not needed it's faster to transform the scale to f32 ahead of time
@ -87,14 +87,14 @@ static __device__ __forceinline__ void mul_mat_q(
#pragma unroll
for (int j = 0; j < mmq_x; j += nwarps) {
const int col_dst = col_dst_0 + j + threadIdx.y;
const auto col_dst = col_dst_0 + j + threadIdx.y;
if (col_dst >= ncols_dst) {
return;
}
#pragma unroll
for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
const int row_dst = row_dst_0 + threadIdx.x + i;
const auto row_dst = row_dst_0 + threadIdx.x + i;
if (row_dst >= nrows_dst) {
continue;
}

View File

@ -1,7 +1,7 @@
// copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu
template <typename scalar_t, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, const int ncols, const int nrows) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const auto row = blockIdx.x*blockDim.y + threadIdx.y;
if (row >= nrows) {
return;
@ -16,7 +16,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
const block_q_t * x = (const block_q_t *) vx;
const block_q8_1 * y = (const block_q8_1 *) vy;
for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
for (auto i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
const int ibx = row*blocks_per_row + i; // x block index
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx

View File

@ -19,10 +19,10 @@ static __device__ __forceinline__ void moe_q(
const int ncols_dst = ncols_y * top_k;
const int row_dst_0 = blockIdx.x * mmq_y;
const auto row_dst_0 = blockIdx.x * mmq_y;
const int& row_x_0 = row_dst_0;
const int col_dst_0 = blockIdx.y * mmq_x;
const auto col_dst_0 = blockIdx.y * mmq_x;
int token_offs[mmq_x / nwarps];
for (int i = 0; i < mmq_x; i += nwarps) {
@ -56,7 +56,7 @@ static __device__ __forceinline__ void moe_q(
const int n_per_r = ((qk * blocks_per_warp) / qr);
#pragma unroll
for (int ir = 0; ir < qr && ib0 * qk + ir * n_per_r < ncols_x; ++ir) {
const int kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
const auto kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
const int kbxd = kqs / QI8_1;
#pragma unroll
@ -73,7 +73,7 @@ static __device__ __forceinline__ void moe_q(
}
if (threadIdx.x < n_per_r / QK8_1) {
const int kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
const auto kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
const int col_y_eff = token_offs[threadIdx.y] / top_k;
const int block_x =
ib0 * (qk / QK8_1) + ir * (WARP_SIZE_GGUF / QI8_1) + kby;
@ -119,7 +119,7 @@ static __device__ __forceinline__ void moe_q(
#pragma unroll
for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
const int row_dst = row_dst_0 + threadIdx.x + i;
const auto row_dst = row_dst_0 + threadIdx.x + i;
if (row_dst >= nrows_dst) {
continue;
}

View File

@ -199,12 +199,12 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int t = threadIdx.x;
auto t = threadIdx.x;
// Block
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
auto offset_m = blockIdx.y * m_count;
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
@ -337,12 +337,12 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int t = threadIdx.x;
auto t = threadIdx.x;
// Block
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
auto offset_m = blockIdx.y * m_count;
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
@ -458,12 +458,12 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int t = threadIdx.x;
auto t = threadIdx.x;
// Block
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
auto offset_m = blockIdx.y * m_count;
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
@ -586,12 +586,12 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int t = threadIdx.x;
auto t = threadIdx.x;
// Block
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
auto offset_m = blockIdx.y * m_count;
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
@ -765,14 +765,14 @@ __global__ void reconstruct_exllama_8bit_kernel(
MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int offset_k = BLOCK_KN_SIZE * blockIdx.y;
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
// Preload remapping table
__shared__ int perm[BLOCK_KN_SIZE];
int t = threadIdx.x;
auto t = threadIdx.x;
if (b_q_perm) {
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
@ -862,14 +862,14 @@ __global__ void reconstruct_exllama_4bit_kernel(
MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int offset_k = BLOCK_KN_SIZE * blockIdx.y;
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
// Preload remapping table
__shared__ int perm[BLOCK_KN_SIZE];
int t = threadIdx.x;
auto t = threadIdx.x;
if (b_q_perm) {
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
@ -967,14 +967,14 @@ __global__ void reconstruct_exllama_3bit_kernel(
MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int offset_k = BLOCK_KN_SIZE * blockIdx.y;
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
// Preload remapping table
__shared__ int perm[BLOCK_KN_SIZE];
int t = threadIdx.x;
auto t = threadIdx.x;
if (b_q_perm) {
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
@ -1065,14 +1065,14 @@ __global__ void reconstruct_exllama_2bit_kernel(
MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int offset_k = BLOCK_KN_SIZE * blockIdx.y;
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
// Preload remapping table
__shared__ int perm[BLOCK_KN_SIZE];
int t = threadIdx.x;
auto t = threadIdx.x;
if (b_q_perm) {
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
@ -1181,11 +1181,11 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
int zero_width = width / 8;
int vec_height = height * 4;
const int blockwidth2 = BLOCK_KN_SIZE / 2;
int b = blockIdx.y * BLOCK_M_SIZE_MAX;
auto b = blockIdx.y * BLOCK_M_SIZE_MAX;
int b_end = min(BLOCK_M_SIZE_MAX, batch - b);
int h = BLOCK_KN_SIZE * blockIdx.z / 8;
auto h = BLOCK_KN_SIZE * blockIdx.z / 8;
int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4;
int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
__shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
if (threadIdx.x < h_end) {
@ -1197,8 +1197,8 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
}
__shared__ half2 deq2[256][8];
int val = threadIdx.x / 8;
int off = threadIdx.x % 8;
auto val = threadIdx.x / 8;
auto off = threadIdx.x % 8;
for (; val < 256; val += BLOCK_KN_SIZE / 8) {
deq2[val][off] =
__halves2half2(__int2half_rn(val & 0xF), __int2half_rn(val >> 4));
@ -1280,11 +1280,11 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
int zero_width = width / 4;
int vec_height = height * 2;
const int blockwidth2 = BLOCK_KN_SIZE / 2;
int b = blockIdx.y * BLOCK_M_SIZE_MAX;
auto b = blockIdx.y * BLOCK_M_SIZE_MAX;
int b_end = min(BLOCK_M_SIZE_MAX, batch - b);
int h = BLOCK_KN_SIZE * blockIdx.z / 4;
auto h = BLOCK_KN_SIZE * blockIdx.z / 4;
int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2;
int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
__shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
if (threadIdx.x < h_end) {
@ -1393,8 +1393,8 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
half* __restrict__ out) {
// Start of block
int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
int row = blockIdx.y * 32 / bit;
auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
auto row = blockIdx.y * 32 / bit;
if (column >= width) return;
// Views
@ -1425,8 +1425,8 @@ __global__ void reconstruct_gptq_3bit_kernel(
const int height, const int width, const int group,
half* __restrict__ out) {
// Start of block
int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
int row = blockIdx.y * 32;
auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
auto row = blockIdx.y * 32;
if (column >= width) return;
// Views
@ -1542,7 +1542,7 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
__global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight,
const int size_k, const int size_n) {
int n = blockIdx.x * THREADS_X + threadIdx.x;
auto n = blockIdx.x * THREADS_X + threadIdx.x;
if (n >= size_n) return;
int k = 0;
uint32_t* b_ptr = b_q_weight + n;
@ -1555,7 +1555,7 @@ __global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight,
__global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight,
const int size_k, const int size_n) {
int n = blockIdx.x * THREADS_X + threadIdx.x;
auto n = blockIdx.x * THREADS_X + threadIdx.x;
if (n >= size_n) return;
int k = 0;
uint32_t* b_ptr = b_q_weight + n;
@ -1568,7 +1568,7 @@ __global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight,
__global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight,
const int size_k, const int size_n) {
int n = blockIdx.x * THREADS_X + threadIdx.x;
auto n = blockIdx.x * THREADS_X + threadIdx.x;
if (n >= size_n) return;
int k = 0;
uint32_t* b_ptr = b_q_weight + n;
@ -1581,7 +1581,7 @@ __global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight,
__global__ void shuffle_3bit_kernel(uint32_t* __restrict__ b_q_weight,
const int size_k, const int size_n) {
int n = blockIdx.x * THREADS_X + threadIdx.x;
auto n = blockIdx.x * THREADS_X + threadIdx.x;
if (n >= size_n) return;
int k = 0;
uint32_t* b_ptr = b_q_weight + n;
@ -1599,9 +1599,9 @@ __global__ void make_sequential_4bit_kernel(const uint32_t* __restrict__ w,
const uint64_t* w2 = (uint64_t*)w;
uint64_t* w_new2 = (uint64_t*)w_new;
int w2_stride = w_width >> 1;
int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
if (w2_column >= w2_stride) return;
int w_new2_row = blockIdx.y;
auto w_new2_row = blockIdx.y;
int q_perm_idx = w_new2_row << 3;
uint64_t dst = 0;
@ -1630,9 +1630,9 @@ __global__ void make_sequential_2bit_kernel(const uint32_t* __restrict__ w,
const uint64_t* w2 = (uint64_t*)w;
uint64_t* w_new2 = (uint64_t*)w_new;
int w2_stride = w_width >> 1;
int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
if (w2_column >= w2_stride) return;
int w_new2_row = blockIdx.y;
auto w_new2_row = blockIdx.y;
int q_perm_idx = w_new2_row << 4;
uint64_t dst = 0;
@ -1658,10 +1658,10 @@ __global__ void make_sequential_3bit_kernel(const uint32_t* __restrict__ w,
uint32_t* __restrict__ w_new,
const int* __restrict__ q_perm,
const int w_width) {
int w_column = THREADS_X * blockIdx.x + threadIdx.x;
auto w_column = THREADS_X * blockIdx.x + threadIdx.x;
if (w_column >= w_width) return;
int w_new_row = blockIdx.y * 3;
int q_perm_idx = blockIdx.y << 5;
auto w_new_row = blockIdx.y * 3;
auto q_perm_idx = blockIdx.y << 5;
uint32_t dst[3] = {0, 0, 0};
#pragma unroll
@ -1744,9 +1744,9 @@ __global__ void make_sequential_8bit_kernel(const uint32_t* __restrict__ w,
const uint64_t* w2 = (uint64_t*)w;
uint64_t* w_new2 = (uint64_t*)w_new;
int w2_stride = w_width >> 1;
int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
if (w2_column >= w2_stride) return;
int w_new2_row = blockIdx.y;
auto w_new2_row = blockIdx.y;
int q_perm_idx = w_new2_row << 2;
uint64_t dst = 0;

View File

@ -55,11 +55,11 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
this_block_B_base_ptr = params.B_ptr + blockIdx.y * Ntile * params.K +
blockIdx.z * params.SplitK * 4;
const int lane_id = threadIdx.x % WARP_SIZE;
const auto lane_id = threadIdx.x % WARP_SIZE;
// For matrix A, a block load/store Mtile(row) x 32(col) elements in
// multiple iters, 8x4 warp load/store 8(row) x 32(col) elements per iter
const int Aldg_row_base_idx = threadIdx.x / 4;
const auto Aldg_row_base_idx = threadIdx.x / 4;
Aldg_col_idx = (threadIdx.x % 4) * LDG_ELEMENT_CNT_A;
const int Aldg_base_offset = Aldg_row_base_idx * params.K + Aldg_col_idx;
@ -67,7 +67,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
// elements of N32K16 packing in multiple iters, 4x8 warp load/store 4(row)
// * 128(col) per iter
Bldg_col_idx = (threadIdx.x % 8) * LDG_ELEMENT_CNT_B;
const int Bldg_row_base_idx = threadIdx.x / 8;
const auto Bldg_row_base_idx = threadIdx.x / 8;
const int Bldg_base_offset =
Bldg_row_base_idx * params.K * 4 + Bldg_col_idx;
@ -89,7 +89,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
B_ldg_guard = 0;
#pragma unroll
for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; ++i) {
int m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD;
auto m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD;
if (m_idx < params.M) {
A_ldg_guard |= (1u << i);
}
@ -98,7 +98,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
const int N_padded = (params.N + 31) / 32 * 32;
#pragma unroll
for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; ++i) {
int n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 +
auto n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 +
i * N_SIZE_ONE_LOAD;
if (n_idx < N_padded) {
B_ldg_guard |= (1u << i);
@ -355,7 +355,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
__device__ void fused_splitk_reduce() {
// need splitk-reduce if enable splitk
if (gridDim.z > 1) {
int blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y;
auto blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y;
// Wait for all previous blocks in the splitk direction to accumulate the
// results into C_tmp
if (threadIdx.x == 0) {
@ -371,7 +371,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
}
__syncthreads();
int C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4;
auto C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4;
if (blockIdx.z != 0) {
// expecting that temporary register here reuses the previous A&B frag
// register
@ -456,7 +456,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
FType* C_base_ptr = this_block_C_base_ptr + store_c_base_offset;
// C_tile lds and stg
int m_base_idx = store_c_row_base_idx + blockIdx.x * Mtile;
auto m_base_idx = store_c_row_base_idx + blockIdx.x * Mtile;
bool n_guard = (store_c_col_idx + blockIdx.y * Ntile) < params.N;
if (WARP_NTILE == 32) {
int lds_c_base_offset = warp_id * Mtile * WARP_NTILE +
@ -580,7 +580,7 @@ __global__ void __launch_bounds__(BLOCK)
int sts_stage_idx = 0;
int lds_stage_idx = 0;
int tb_k_slice = blockIdx.z * params.SplitK + params.SplitK <= params.K
auto tb_k_slice = blockIdx.z * params.SplitK + params.SplitK <= params.K
? params.SplitK
: params.K - blockIdx.z * params.SplitK;
int k_tiles = (tb_k_slice + 31) / 32;
@ -777,13 +777,13 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
const QT* qdata, const FT* scales, const FT* zeros, FT* fdata,
const int N_32align, const int N, const int K) {
__shared__ FT smem[64 * 32];
int warp_id = threadIdx.x / 32;
int lane_id = threadIdx.x % 32;
const int src_row_idx = blockIdx.x * 8 + lane_id / 4;
auto warp_id = threadIdx.x / 32;
auto lane_id = threadIdx.x % 32;
const auto src_row_idx = blockIdx.x * 8 + lane_id / 4;
const int src_col_idx =
blockIdx.y * 64 * 4 + warp_id * 16 * 4 + (lane_id % 4) * 16;
const int src_offset = src_row_idx * K * 4 + src_col_idx;
int params_nidx = blockIdx.x * 32 + (lane_id / 4) * 4;
auto params_nidx = blockIdx.x * 32 + (lane_id / 4) * 4;
QT qval_reg[16];
const QT* pdata = qdata + src_offset;
@ -829,8 +829,8 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
*reinterpret_cast<uint4*>(smem + lds_base_offset + i * 32 * 32);
}
const int dst_row_base_kidx = blockIdx.y * 64 + threadIdx.x / 4;
const int dst_col_nidx = blockIdx.x * 32 + (threadIdx.x % 4) * 8;
const auto dst_row_base_kidx = blockIdx.y * 64 + threadIdx.x / 4;
const auto dst_col_nidx = blockIdx.x * 32 + (threadIdx.x % 4) * 8;
#pragma unroll
for (int i = 0; i < 2; ++i) {
int dst_row_kidx = dst_row_base_kidx + i * 32;

View File

@ -13,8 +13,8 @@ __global__ void __launch_bounds__(128)
const uint8_t* B, const FType* B_scale, const FType* B_zero,
uint8_t* B_result, FType* B_scale_result, FType* B_zero_result,
const int K, const int N, const int N_32align) {
const int lane_id = threadIdx.x % 32;
const int warp_id = threadIdx.x / 32;
const auto lane_id = threadIdx.x % 32;
const auto warp_id = threadIdx.x / 32;
if (blockIdx.x != gridDim.x - 1) {
// Load B
@ -50,7 +50,7 @@ __global__ void __launch_bounds__(128)
}
// Store B
const int dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8;
const auto dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8;
const int dst_col_idx =
blockIdx.x * (64 * 4) + warp_id * 64 + (lane_id % 8) * 8;
for (int i = 0; i < 8; ++i) {
@ -65,7 +65,7 @@ __global__ void __launch_bounds__(128)
} else {
// Load B_scale and B_zero
FType b_scale_reg, b_zero_reg;
int src_offset = blockIdx.y * 128 + threadIdx.x;
auto src_offset = blockIdx.y * 128 + threadIdx.x;
ldg16_cg_0(b_scale_reg, B_scale + src_offset, src_offset < N);
if (B_zero != nullptr)
ldg16_cg_0(b_zero_reg, B_zero + src_offset, src_offset < N);

View File

@ -62,7 +62,7 @@ template <typename FType, int BLOCK, int N_MATRIX>
__global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C,
uint32_t n, uint32_t n_matrix,
uint32_t matrix_size) {
int idx = blockIdx.x * BLOCK + threadIdx.x;
auto idx = blockIdx.x * BLOCK + threadIdx.x;
if (idx >= matrix_size) {
return;

View File

@ -124,3 +124,52 @@ nsys stats report1.nsys-rep
GUI example:
<img width="1799" alt="Screenshot 2025-03-05 at 11 48 42AM" src="https://github.com/user-attachments/assets/c7cff1ae-6d6f-477d-a342-bd13c4fc424c" />
## Profiling vLLM Python Code
The Python standard library includes
[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
code. vLLM provides two helpers that make it easy to apply cProfile to a specific section of the codebase.
Both the `vllm.utils.cprofile` and `vllm.utils.cprofile_context` functions can be
used to profile a section of code.
### Example usage - decorator
The first helper is a Python decorator that can be used to profile a function.
If a filename is specified, the profile will be saved to that file. If no filename is
specified, profile data will be printed to stdout.
```python
import vllm.utils
@vllm.utils.cprofile("expensive_function.prof")
def expensive_function():
# some expensive code
pass
```
### Example Usage - context manager
The second helper is a context manager that can be used to profile a block of
code. Similar to the decorator, the filename is optional.
```python
import vllm.utils
def another_function():
# more expensive code
pass
with vllm.utils.cprofile_context("another_function.prof"):
another_function()
```
### Analyzing Profile Results
There are multiple tools available that can help analyze the profile results.
One example is [snakeviz](https://jiffyclub.github.io/snakeviz/).
```bash
pip install snakeviz
snakeviz expensive_function.prof
```
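If you prefer to stay in the terminal, the same `.prof` files can also be inspected with the standard library's `pstats` module. A minimal sketch, reusing the filename from the decorator example above:

```python
import pstats

# Load the profile written by the decorator example and print the ten
# entries with the highest cumulative time.
stats = pstats.Stats("expensive_function.prof")
stats.strip_dirs().sort_stats("cumulative").print_stats(10)
```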

View File

@ -34,11 +34,11 @@ If you need to use those dependencies (having accepted the license terms),
create a custom Dockerfile on top of the base image with an extra layer that installs them:
```Dockerfile
FROM vllm/vllm-openai:v0.7.3
FROM vllm/vllm-openai:v0.8.0
# e.g. install the `audio` and `video` optional dependencies
# NOTE: Make sure the version of vLLM matches the base image!
RUN uv pip install --system vllm[audio,video]==0.7.3
RUN uv pip install vllm[audio,video]==0.8.0
```
:::
@ -52,7 +52,7 @@ with an extra layer that installs their code from source:
```Dockerfile
FROM vllm/vllm-openai:latest
RUN uv pip install --system git+https://github.com/huggingface/transformers.git
RUN uv pip install git+https://github.com/huggingface/transformers.git
```
:::

View File

@ -7,5 +7,192 @@ A major use case is for multi-host/multi-node distributed inference.
vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kubernetes for distributed model serving.
Please see [this guide](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm) for more details on
deploying vLLM on Kubernetes using LWS.
## Prerequisites
* At least two Kubernetes nodes, each with 8 GPUs, are required.
* Install LWS by following the instructions found [here](https://lws.sigs.k8s.io/docs/installation/).
## Deploy and Serve
Save the following YAML manifest as `lws.yaml`, then deploy it:
```yaml
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
name: vllm
spec:
replicas: 2
leaderWorkerTemplate:
size: 2
restartPolicy: RecreateGroupOnPodRestart
leaderTemplate:
metadata:
labels:
role: leader
spec:
containers:
- name: vllm-leader
image: docker.io/vllm/vllm-openai:latest
env:
- name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token>
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2"
resources:
limits:
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
ephemeral-storage: 800Gi
cpu: 125
ports:
- containerPort: 8080
readinessProbe:
tcpSocket:
port: 8080
initialDelaySeconds: 15
periodSeconds: 10
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
workerTemplate:
spec:
containers:
- name: vllm-worker
image: docker.io/vllm/vllm-openai:latest
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
resources:
limits:
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
ephemeral-storage: 800Gi
cpu: 125
env:
- name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token>
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
---
apiVersion: v1
kind: Service
metadata:
name: vllm-leader
spec:
ports:
- name: http
port: 8080
protocol: TCP
targetPort: 8080
selector:
leaderworkerset.sigs.k8s.io/name: vllm
role: leader
type: ClusterIP
```
```bash
kubectl apply -f lws.yaml
```
Verify the status of the pods:
```bash
kubectl get pods
```
You should see output similar to this:
```bash
NAME READY STATUS RESTARTS AGE
vllm-0 1/1 Running 0 2s
vllm-0-1 1/1 Running 0 2s
vllm-1 1/1 Running 0 2s
vllm-1-1 1/1 Running 0 2s
```
Verify that the distributed tensor-parallel inference works:
```bash
kubectl logs vllm-0 | grep -i "Loading model weights took"
```
You should see something similar to this:
```text
INFO 05-08 03:20:24 model_runner.py:173] Loading model weights took 0.1189 GB
(RayWorkerWrapper pid=169, ip=10.20.0.197) INFO 05-08 03:20:28 model_runner.py:173] Loading model weights took 0.1189 GB
```
## Access ClusterIP service
```bash
# Listen on port 8080 locally, forwarding to the targetPort of the service's port 8080 in a pod selected by the service
kubectl port-forward svc/vllm-leader 8080:8080
```
The output should be similar to the following:
```text
Forwarding from 127.0.0.1:8080 -> 8080
Forwarding from [::1]:8080 -> 8080
```
## Serve the model
Open another terminal and send a request:
```bash
curl http://localhost:8080/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
"prompt": "San Francisco is a",
"max_tokens": 7,
"temperature": 0
}'
```
The output should be similar to the following:
```text
{
"id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
"object": "text_completion",
"created": 1715138766,
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
"choices": [
{
"index": 0,
"text": " top destination for foodies, with",
"logprobs": null,
"finish_reason": "length",
"stop_reason": null
}
],
"usage": {
"prompt_tokens": 5,
"total_tokens": 12,
"completion_tokens": 7
}
}
```
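The same endpoint can also be queried from Python. A minimal sketch using the `openai` client package (assumes `pip install openai` and the port-forward from the previous step; the API key is a placeholder, since vLLM does not check it by default):

```python
from openai import OpenAI

# Point the client at the port-forwarded vllm-leader service.
client = OpenAI(base_url="http://localhost:8080/v1", api_key="EMPTY")

completion = client.completions.create(
    model="meta-llama/Meta-Llama-3.1-405B-Instruct",
    prompt="San Francisco is a",
    max_tokens=7,
    temperature=0,
)
print(completion.choices[0].text)
```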

View File

@ -191,7 +191,7 @@ When the head block (least recently used block) of the free queue is cached, we
In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total.
**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 2 of 4 tokens.
**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens.
:::{image} /assets/design/v1/prefix_caching/example-time-1.png
:alt: Example Time 1
@ -203,7 +203,7 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
:alt: Example Time 3
:::
**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 11 tokens are the same as request 0.** We can see that only 2 blocks (11 tokens) hit the cache, because the 3rd block only matches 3 of 4 tokens.
**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
:::{image} /assets/design/v1/prefix_caching/example-time-4.png
:alt: Example Time 4

View File

@ -25,7 +25,7 @@ import torch
# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
model_id = "unsloth/tinyllama-bnb-4bit"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
quantization="bitsandbytes", load_format="bitsandbytes")
quantization="bitsandbytes")
```
## Inflight quantization: load as 4bit quantization
@ -35,7 +35,7 @@ from vllm import LLM
import torch
model_id = "huggyllama/llama-7b"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
quantization="bitsandbytes", load_format="bitsandbytes")
quantization="bitsandbytes")
```
## OpenAI Compatible Server
@ -43,5 +43,5 @@ quantization="bitsandbytes", load_format="bitsandbytes")
Append the following to your 4bit model arguments:
```console
--quantization bitsandbytes --load-format bitsandbytes
--quantization bitsandbytes
```

View File

@ -26,4 +26,3 @@ installation/ai_accelerator
- Google TPU
- Intel Gaudi
- AWS Neuron
- OpenVINO

View File

@ -36,16 +36,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} ai_accelerator/openvino.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
:::
::::
:::::
## Requirements
@ -83,16 +73,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} ai_accelerator/openvino.inc.md
:start-after: "## Requirements"
:end-before: "## Set up using Python"
:::
::::
:::::
## Configure a new environment
@ -130,14 +110,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} python_env_setup.inc.md
:::
::::
:::::
## Set up using Python
@ -177,16 +149,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} ai_accelerator/openvino.inc.md
:start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source"
:::
::::
:::::
### Build wheel from source
@ -224,16 +186,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} ai_accelerator/openvino.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
:::
::::
:::::
## Set up using Docker
@ -273,16 +225,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} ai_accelerator/openvino.inc.md
:start-after: "### Pre-built images"
:end-before: "### Build image from source"
:::
::::
:::::
### Build image from source
@ -320,16 +262,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} ai_accelerator/openvino.inc.md
:start-after: "### Build image from source"
:end-before: "## Extra information"
:::
::::
:::::
## Extra information
@ -364,13 +296,4 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} ai_accelerator/openvino.inc.md
:start-after: "## Extra information"
:::
::::
:::::

View File

@ -1,110 +0,0 @@
# Installation
vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with at least AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)).
:::{attention}
There are no pre-built wheels or images for this device, so you must build vLLM from source.
:::
## Requirements
- OS: Linux
- Instruction set architecture (ISA) requirement: at least AVX2.
## Set up using Python
### Pre-built wheels
Currently, there are no pre-built OpenVINO wheels.
### Build wheel from source
First, install Python and ensure you have the latest pip. For example, on Ubuntu 22.04, you can run:
```console
sudo apt-get update -y
sudo apt-get install python3
pip install --upgrade pip
```
Second, clone vLLM and install prerequisites for the vLLM OpenVINO backend installation:
```console
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu
```
Finally, install vLLM with OpenVINO backend:
```console
PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
```
:::{tip}
To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
:::
## Set up using Docker
### Pre-built images
Currently, there are no pre-built OpenVINO images.
### Build image from source
```console
docker build -f Dockerfile.openvino -t vllm-openvino-env .
docker run -it --rm vllm-openvino-env
```
## Extra information
## Supported features
OpenVINO vLLM backend supports the following advanced vLLM features:
- Prefix caching (`--enable-prefix-caching`)
- Chunked prefill (`--enable-chunked-prefill`)
## Performance tips
### vLLM OpenVINO backend environment variables
- `VLLM_OPENVINO_DEVICE` to specify which device to use for inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g., `VLLM_OPENVINO_DEVICE=GPU.1`). If the value is not specified, the CPU device is used by default.
- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weight compression during the model loading stage. By default, compression is turned off. You can also export the model with different compression techniques using `optimum-cli` and pass the exported folder as `<model_id>`.
### CPU performance tips
The CPU backend uses the following environment variables to control its behavior:
- `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV cache size (e.g., `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB of space for the KV cache); a larger setting allows vLLM to run more requests in parallel. Set this parameter based on your hardware configuration and memory management pattern.
- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` to control the KV cache precision. By default, FP16 / BF16 is used, depending on the platform.
To improve TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on the experiments, the recommended batch size is `256` (`--max-num-batched-tokens`).
OpenVINO best known configuration for CPU is:
```console
$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
```
### GPU performance tips
The GPU device implements automatic detection of the available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking the `gpu_memory_utilization` option into account). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using the `VLLM_OPENVINO_KVCACHE_SPACE` environment variable (e.g., `VLLM_OPENVINO_KVCACHE_SPACE=8` means 8 GB of space for the KV cache).
Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`.
OpenVINO best known configuration for GPU is:
```console
$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
```
## Limitations
- LoRA serving is not supported.
- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in the vLLM OpenVINO integration.
- Tensor and pipeline parallelism are not currently enabled in the vLLM OpenVINO integration.

View File

@ -2,6 +2,8 @@
V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
To disable V1, set the environment variable `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
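For example, the variable can be set from Python before vLLM is imported (a minimal sketch; the model below is just a placeholder):
```python
# Minimal sketch: opt out of V1 before vLLM reads the environment.
import os

os.environ["VLLM_USE_V1"] = "0"

from vllm import LLM  # noqa: E402  # import after setting the variable

llm = LLM(model="facebook/opt-125m")  # now runs on the V0 engine
```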
## Why vLLM V1?
vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.

View File

@ -472,11 +472,21 @@ See [this page](#generative-models) for more information on how to use generativ
* `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc.
* ✅︎
* ✅︎
- * `TeleFLMForCausalLM`
* TeleFLM
* `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc.
* ✅︎
* ✅︎
- * `XverseForCausalLM`
* XVERSE
* `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
* ✅︎
* ✅︎
- * `Zamba2ForCausalLM`
* Zamba2
* `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc.
*
*
:::
:::{note}
@ -763,7 +773,7 @@ See [this page](#generative-models) for more information on how to use generativ
* `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
* ✅︎
* ✅︎
*
* ⚠️
- * `GLM4VForCausalLM`<sup>^</sup>
* GLM-4V
* T + I
@ -879,7 +889,7 @@ See [this page](#generative-models) for more information on how to use generativ
- * `PixtralForConditionalGeneration`
* Pixtral
* T + I<sup>+</sup>
* `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b`, etc.
* `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc.
*
* ✅︎
* ✅︎
@ -946,13 +956,10 @@ V0 correctly implements the model's attention pattern:
V1 currently uses a simplified attention pattern:
- Uses causal attention for all tokens, including image tokens
- Generates reasonable outputs but does not match the original model's attention for text + image inputs
- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": True}`
- Will be updated in the future to support the correct behavior
- Does not support `"do_pan_and_scan": True`
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
For these reasons, `Gemma3ForConditionalGeneration` is supported only on V0 at the moment.
:::
:::{note}

View File

@ -83,7 +83,7 @@ Since this is a ray cluster of **containers**, all the following commands should
Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` and `ray list nodes` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as if you had all the GPUs on one node: vLLM can leverage the GPU resources of all nodes in the Ray cluster, so you only need to run the `vllm` command on this node, not on the other nodes. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
```console
vllm serve /path/to/the/model/in/the/container \

View File

@ -29,6 +29,11 @@ completion = client.chat.completions.create(
print(completion.choices[0].message)
```
:::{tip}
vLLM supports some parameters that the OpenAI API does not, such as `top_k`.
You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, e.g. `extra_body={"top_k": 50}` for `top_k`.
:::
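For instance, forwarding the vLLM-specific `top_k` option in a chat request looks roughly like this (a sketch; the base URL and model name are placeholders for whatever your server is running):
```python
# Sketch: pass a vLLM-only sampling parameter through the OpenAI client.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",  # placeholder: use the served model name
    messages=[{"role": "user", "content": "Write a haiku about GPUs."}],
    extra_body={"top_k": 50},  # not in the OpenAI API; handled by vLLM
)
print(completion.choices[0].message.content)
```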
## Supported APIs
We currently support the following OpenAI APIs:

View File

@ -93,7 +93,6 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
max_num_seqs=2,
enable_lora=True,
max_lora_rank=320,
lora_extra_vocab_size=0,
limit_mm_per_prompt={"audio": audio_count},
)

View File

@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str,
engine_args = EngineArgs(model=model,
quantization=quantization,
qlora_adapter_name_or_path=lora_repo,
load_format="bitsandbytes",
enable_lora=True,
max_lora_rank=64)
else:

View File

@ -6,14 +6,16 @@ import argparse
from vllm import LLM
from vllm.sampling_params import SamplingParams
# This script is an offline demo for running Pixtral.
# This script is an offline demo for running Mistral-Small-3.1
#
# If you want to run a server/client setup, please follow this code:
#
# - Server:
#
# ```bash
# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
# --tokenizer-mode mistral --config-format mistral --load-format mistral \
# --limit-mm-per-prompt 'image=4' --max-model-len 16384
# ```
#
# - Client:
@ -23,7 +25,7 @@ from vllm.sampling_params import SamplingParams
# --header 'Content-Type: application/json' \
# --header 'Authorization: Bearer token' \
# --data '{
# "model": "mistralai/Pixtral-12B-2409",
# "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
# "messages": [
# {
# "role": "user",
@ -44,13 +46,15 @@ from vllm.sampling_params import SamplingParams
def run_simple_demo(args: argparse.Namespace):
model_name = "mistralai/Pixtral-12B-2409"
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
sampling_params = SamplingParams(max_tokens=8192)
# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
llm = LLM(
model=model_name,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
max_model_len=4096,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
@ -83,7 +87,7 @@ def run_simple_demo(args: argparse.Namespace):
def run_advanced_demo(args: argparse.Namespace):
model_name = "mistralai/Pixtral-12B-2409"
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
max_img_per_msg = 5
max_tokens_per_img = 4096
@ -91,6 +95,8 @@ def run_advanced_demo(args: argparse.Namespace):
llm = LLM(
model=model_name,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
limit_mm_per_prompt={"image": max_img_per_msg},
max_model_len=max_img_per_msg * max_tokens_per_img,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,

View File

@ -0,0 +1,36 @@
# SPDX-License-Identifier: Apache-2.0
import os
from vllm import LLM, SamplingParams
# vLLM does not guarantee the reproducibility of the results by default,
# for the sake of performance. You need to do the following to achieve
# reproducible results:
# 1. Turn off multiprocessing to make the scheduling deterministic.
# NOTE(woosuk): This is not needed and will be ignored for V0.
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
# 2. Fix the global seed for reproducibility. The default seed is None, which is
# not reproducible.
SEED = 42
# NOTE(woosuk): Even with the above two settings, vLLM only provides
# reproducibility when it runs on the same hardware and the same vLLM version.
# Also, the online serving API (`vllm serve`) does not support reproducibility
# because it is almost impossible to make the scheduling deterministic in the
# online serving setting.
llm = LLM(model="facebook/opt-125m", seed=SEED)
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@ -169,7 +169,6 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
max_model_len=2048,
max_num_seqs=2,
# Default is False; setting it to True is not supported in V1 yet
mm_processor_kwargs={"do_pan_and_scan": True},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
@ -682,7 +681,6 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2,
enable_lora=True,
max_lora_rank=320,
lora_extra_vocab_size=0,
)
return ModelRequestData(

View File

@ -91,8 +91,6 @@ def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
model=model_name,
max_model_len=8192,
max_num_seqs=2,
# Default is False; setting it to True is not supported in V1 yet
mm_processor_kwargs={"do_pan_and_scan": True},
limit_mm_per_prompt={"image": len(image_urls)},
)
@ -342,7 +340,6 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
limit_mm_per_prompt={"image": len(image_urls)},
enable_lora=True,
max_lora_rank=320,
lora_extra_vocab_size=0,
)
placeholders = "".join(f"<|image_{i}|>"

View File

@ -8,6 +8,9 @@ set -xe
echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
sleep 1
# meta-llama/Meta-Llama-3.1-8B-Instruct or deepseek-ai/DeepSeek-V2-Lite
MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}
# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'cleanup' INT
@ -44,18 +47,20 @@ wait_for_server() {
# You can also adjust --kv-ip and --kv-port for distributed inference.
# prefilling instance, which is the KV producer
CUDA_VISIBLE_DEVICES=0 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \
CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
--port 8100 \
--max-model-len 100 \
--gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &
# decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \
CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
--port 8200 \
--max-model-len 100 \
--gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &
@ -78,7 +83,7 @@ sleep 1
output1=$(curl -X POST -s http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "'"$MODEL_NAME"'",
"prompt": "San Francisco is a",
"max_tokens": 10,
"temperature": 0
@ -87,7 +92,7 @@ output1=$(curl -X POST -s http://localhost:8000/v1/completions \
output2=$(curl -X POST -s http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "'"$MODEL_NAME"'",
"prompt": "Santa Clara is a",
"max_tokens": 10,
"temperature": 0

View File

@ -0,0 +1,123 @@
#!/bin/bash
# This file demonstrates the example usage of disaggregated prefilling with ZMQ
# We will launch 2 vllm instances (1 for prefill and 1 for decode),
# and then transfer the KV cache between them.
set -xe
echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
sleep 1
# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'cleanup' INT
# Cleanup function
cleanup() {
echo "Caught Ctrl+C, cleaning up..."
# Cleanup commands
pgrep python | xargs kill -9
pkill -f python
echo "Cleanup complete. Exiting."
exit 0
}
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# a function that waits for the vLLM server to start
wait_for_server() {
local port=$1
timeout 1200 bash -c "
until curl -s localhost:${port}/v1/completions > /dev/null; do
sleep 1
done" && return 0 || return 1
}
# a function that waits for the vLLM disaggregated worker to start
wait_for_disagg_server() {
local log_file=$1
timeout 1200 bash -c "
until grep -q 'PDWorker is ready' $log_file; do
sleep 1
done" && return 0 || return 1
}
# You can also adjust --kv-ip and --kv-port for distributed inference.
MODEL=meta-llama/Llama-3.1-8B-Instruct
CONTROLLER_ADDR=controller.ipc
PREFILL_WORKER_ADDR=prefill.ipc
DECODE_WORKER_ADDR=decode.ipc
PORT=8001
# prefilling instance, which is the KV producer
CUDA_VISIBLE_DEVICES=0 python3 ../../vllm/entrypoints/disaggregated/worker.py \
--model $MODEL \
--controller-addr $CONTROLLER_ADDR \
--worker-addr $PREFILL_WORKER_ADDR \
--max-model-len 100 \
--gpu-memory-utilization 0.8 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' > vllm_disagg_prefill.log 2>&1 &
# decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 python3 ../../vllm/entrypoints/disaggregated/worker.py \
--model $MODEL \
--controller-addr $CONTROLLER_ADDR \
--worker-addr $DECODE_WORKER_ADDR \
--max-model-len 100 \
--gpu-memory-utilization 0.8 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' > vllm_disagg_decode.log 2>&1 &
# launch a proxy server that opens the service at port 8000
# the workflow of this proxy:
# - Send req to prefill instance, wait until complete.
# - Send req to decode instance, streaming tokens.
python3 ../../vllm/entrypoints/disaggregated/api_server.py \
--port $PORT \
--model $MODEL \
--controller-addr $CONTROLLER_ADDR \
--prefill-addr $PREFILL_WORKER_ADDR \
--decode-addr $DECODE_WORKER_ADDR &
# wait until prefill, decode instances and proxy are ready
wait_for_server $PORT
wait_for_disagg_server vllm_disagg_prefill.log
wait_for_disagg_server vllm_disagg_decode.log
# serve two example requests
output1=$(curl -X POST -s http://localhost:8001/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Llama-3.1-8B-Instruct",
"prompt": "San Francisco is a",
"max_tokens": 10,
"temperature": 0
}')
output2=$(curl -X POST -s http://localhost:8001/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Llama-3.1-8B-Instruct",
"prompt": "Santa Clara is a",
"max_tokens": 10,
"temperature": 0
}')
# Cleanup commands
pgrep python | xargs kill -9
pkill -f python
echo ""
sleep 1
# Print the outputs of the curl requests
echo ""
echo "Output of first request: $output1"
echo "Output of second request: $output2"
echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
echo ""

View File

@ -0,0 +1,12 @@
{%- for message in messages %}
{%- if message['role'] == 'user' %}
{{- '<_user>' + message['content']|trim }}
{%- elif message['role'] == 'system' %}
{{- '<_system>' + message['content']|trim }}
{%- elif message['role'] == 'assistant' %}
{{- '<_bot>' + message['content'] }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<_bot>' }}
{%- endif %}
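To preview the prompts this template produces, it can be rendered with plain Jinja2 outside of vLLM. This is a sketch; the file path is a placeholder for wherever the template above is saved:
```python
# Sketch: render the chat template above with plain Jinja2 to inspect the prompt.
from pathlib import Path

from jinja2 import Template

# Placeholder path: save the template above to this file first.
template = Template(Path("chat_template.jinja").read_text())

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is vLLM?"},
]
print(template.render(messages=messages, add_generation_prompt=True))
# Expected shape: "<_system>You are a helpful assistant.<_user>What is vLLM?<_bot>"
```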

View File

@ -18,6 +18,7 @@ pillow # Required for image processing
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.11, < 0.11
llguidance >= 0.7.2, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
outlines == 0.1.11
lark == 1.2.2
xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64"

View File

@ -3,7 +3,8 @@
# Dependencies for CPUs
torch==2.6.0+cpu; platform_machine == "x86_64"
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
torch==2.6.0; platform_system == "Darwin"
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64"
torch==2.7.0.dev20250304; platform_machine == "s390x"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch

View File

@ -1,8 +0,0 @@
# Common dependencies
-r common.txt
torch == 2.5.1 # should be aligned with "common" vLLM torch version
openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
optimum @ git+https://github.com/huggingface/optimum.git # latest optimum is used to support latest transformers version
optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git # latest optimum-intel is used to support latest transformers version

View File

@ -1,10 +1,10 @@
# Common dependencies
-r common.txt
--extra-index-url https://download.pytorch.org/whl/rocm6.2
torch==2.5.1
torchvision==0.20.1
torchaudio==2.5.1
--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
torch==2.6.0
torchvision==0.21.0
torchaudio==2.6.0
cmake>=3.26
packaging

View File

@ -17,9 +17,9 @@ ray[data]
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"

View File

@ -449,10 +449,6 @@ def _is_cpu() -> bool:
return VLLM_TARGET_DEVICE == "cpu"
def _is_openvino() -> bool:
return VLLM_TARGET_DEVICE == "openvino"
def _is_xpu() -> bool:
return VLLM_TARGET_DEVICE == "xpu"
@ -572,8 +568,6 @@ def get_vllm_version() -> str:
if gaudi_sw_version != MAIN_CUDA_VERSION:
gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
version += f"{sep}gaudi{gaudi_sw_version}"
elif _is_openvino():
version += f"{sep}openvino"
elif _is_tpu():
version += f"{sep}tpu"
elif _is_cpu():
@ -623,8 +617,6 @@ def get_requirements() -> list[str]:
requirements = _read_requirements("neuron.txt")
elif _is_hpu():
requirements = _read_requirements("hpu.txt")
elif _is_openvino():
requirements = _read_requirements("openvino.txt")
elif _is_tpu():
requirements = _read_requirements("tpu.txt")
elif _is_cpu():
@ -634,7 +626,7 @@ def get_requirements() -> list[str]:
else:
raise ValueError(
"Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
"OpenVINO, or CPU.")
"or CPU.")
return requirements

View File

@ -60,7 +60,7 @@ class TestSetting:
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=["--task", "embed"],
model_args=["--task", "embed", "--dtype", "bfloat16"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",

View File

@ -4,34 +4,38 @@ import pickle
import pytest
import torch
from torch._inductor.codecache import BypassFxGraphCache
from vllm.compilation.config import CompilationConfig
from vllm.compilation.inductor_pass import (CallableInductorPass,
as_inductor_pass)
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
from vllm.compilation.pass_manager import PostGradPassManager
from vllm.config import CompilationConfig
def simple_callable(graph: torch.fx.Graph):
pass
@as_inductor_pass(files=(__file__, ))
def callable_decorated(graph: torch.fx.Graph):
pass
callable_uuid = CallableInductorPass(simple_callable,
InductorPass.hash_source(__file__))
@pytest.mark.parametrize(
"works, callable",
[(False, simple_callable), (True, callable_decorated),
(True, CallableInductorPass(simple_callable, "simple_callable"))])
[
(False, simple_callable),
(True, callable_uuid),
(True, CallableInductorPass(simple_callable)),
],
)
def test_pass_manager(works: bool, callable):
config = CompilationConfig().pass_config
pass_manager = PostGradPassManager([callable])
pass_manager.configure(config) # Adds default passes
pass_manager = PostGradPassManager()
pass_manager.configure(config)
# Try to add the callable to the pass manager
if works:
pass_manager.add(callable)
pickle.dumps(pass_manager)
else:
with pytest.raises(BypassFxGraphCache):
pickle.dumps(pass_manager)
with pytest.raises(AssertionError):
pass_manager.add(callable)

View File

@ -14,8 +14,8 @@ import torch.nn as nn
import torch.nn.functional as F
from huggingface_hub import snapshot_download
from PIL import Image
from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
BatchFeature)
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
BatchEncoding, BatchFeature)
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from tests.models.utils import (TokensTextLogprobs,
@ -23,7 +23,7 @@ from tests.models.utils import (TokensTextLogprobs,
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import TaskOption, TokenizerPoolConfig
from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype
from vllm.connections import global_http_connection
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
@ -34,8 +34,7 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity, is_list_of)
from vllm.utils import cuda_device_count_stateless, is_list_of
logger = init_logger(__name__)
@ -271,14 +270,17 @@ _R = TypeVar("_R")
class HfRunner:
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
def get_default_device(self):
from vllm.platforms import current_platform
return ("cpu" if current_platform.is_cpu() else "cuda")
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
if x is None or isinstance(x, (bool, )):
return x
if device is None:
device = "cpu" if current_platform.is_cpu(
) or current_platform.is_openvino() else "cuda"
device = self.device
if isinstance(x, dict):
return {k: self.wrap_device(v, device) for k, v in x.items()}
@ -291,45 +293,59 @@ class HfRunner:
def __init__(
self,
model_name: str,
dtype: str = "half",
dtype: str = "auto",
*,
model_kwargs: Optional[dict[str, Any]] = None,
is_sentence_transformer: bool = False,
is_cross_encoder: bool = False,
skip_tokenizer_init: bool = False,
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
postprocess_inputs: Callable[..., BatchEncoding] = identity,
) -> None:
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
self.model_name = model_name
self.config = AutoConfig.from_pretrained(
model_name,
trust_remote_code=True,
)
self.device = self.get_default_device()
self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
model_kwargs = model_kwargs if model_kwargs is not None else {}
model_kwargs.setdefault("torch_dtype", torch_dtype)
if is_sentence_transformer:
# Lazy init required for AMD CI
from sentence_transformers import SentenceTransformer
self.model = self.wrap_device(
SentenceTransformer(
self.model = SentenceTransformer(
model_name,
device="cpu",
device=self.device,
model_kwargs=model_kwargs,
trust_remote_code=True,
).to(dtype=torch_dtype))
)
elif is_cross_encoder:
# Lazy init required for AMD CI
from sentence_transformers import CrossEncoder
self.model = CrossEncoder(model_name,
device="cpu",
trust_remote_code=True)
self.model.model = self.wrap_device(self.model.model)\
.to(dtype=torch_dtype)
else:
model_kwargs = model_kwargs if model_kwargs is not None else {}
self.model = self.wrap_device(
auto_cls.from_pretrained(
self.model = CrossEncoder(
model_name,
device=self.device,
automodel_args=model_kwargs,
trust_remote_code=True,
)
else:
model = auto_cls.from_pretrained(
model_name,
torch_dtype=torch_dtype,
trust_remote_code=True,
**model_kwargs,
))
)
if (getattr(model, "quantization_method", None) != "bitsandbytes"
and len({p.device
for p in model.parameters()}) < 2):
model = model.to(self.device)
self.model = model
if not skip_tokenizer_init:
self.tokenizer = AutoTokenizer.from_pretrained(
@ -349,16 +365,13 @@ class HfRunner:
if skip_tokenizer_init:
self.tokenizer = self.processor.tokenizer
self.dtype = dtype
self.postprocess_inputs = postprocess_inputs
def get_inputs(
self,
prompts: list[str],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
) -> list[BatchEncoding]:
) -> list[Union[BatchFeature, BatchEncoding]]:
if images is not None:
assert len(prompts) == len(images)
@ -368,7 +381,7 @@ class HfRunner:
if audios is not None:
assert len(prompts) == len(audios)
all_inputs: list[BatchEncoding] = []
all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
for i, prompt in enumerate(prompts):
processor_kwargs: dict[str, Any] = {
"text": prompt,
@ -384,7 +397,8 @@ class HfRunner:
processor_kwargs["sampling_rate"] = sr
inputs = self.processor(**processor_kwargs)
inputs = self.postprocess_inputs(inputs, dtype=self.dtype)
if isinstance(inputs, BatchFeature):
inputs = inputs.to(dtype=self.dtype)
all_inputs.append(inputs)
@ -417,7 +431,7 @@ class HfRunner:
outputs: list[tuple[list[list[int]], list[str]]] = []
for inputs in all_inputs:
output_ids = self.model.generate(
**self.wrap_device(inputs, device=self.model.device.type),
**self.wrap_device(inputs),
use_cache=True,
**kwargs,
)
@ -488,7 +502,7 @@ class HfRunner:
all_logprobs: list[list[torch.Tensor]] = []
for inputs in all_inputs:
output = self.model.generate(
**self.wrap_device(inputs, device=self.model.device.type),
**self.wrap_device(inputs),
use_cache=True,
do_sample=False,
max_new_tokens=max_tokens,
@ -569,7 +583,7 @@ class HfRunner:
for inputs in all_inputs:
output = self.model.generate(
**self.wrap_device(inputs, device=self.model.device.type),
**self.wrap_device(inputs),
use_cache=True,
do_sample=False,
max_new_tokens=max_tokens,
@ -620,19 +634,15 @@ class HfRunner:
if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i]
encoder_inputs = self.wrap_device(
self.processor(**processor_kwargs),
device=self.model.device.type,
)
encoder_inputs = self.processor(**processor_kwargs)
encoder_inputs = self.wrap_device(encoder_inputs)
if decoder_prompt is None:
decoder_input_ids = None
else:
decoder_input_ids = self.wrap_device(
self.tokenizer(decoder_prompt,
return_tensors="pt").input_ids,
device=self.model.device.type,
)
decoder_inputs = self.tokenizer(decoder_prompt,
return_tensors="pt")
decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)
output = self.model.generate(
decoder_input_ids=decoder_input_ids,
@ -684,6 +694,7 @@ class VllmRunner:
"""
The default values of some arguments have been modified from
:class:`~vllm.LLM` as follows:
- `trust_remote_code`: Set to `True` instead of `False` for convenience.
- `seed`: Set to `0` instead of `None` for test reproducibility.
- `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
@ -701,10 +712,8 @@ class VllmRunner:
tokenizer_mode: str = "auto",
trust_remote_code: bool = True,
seed: Optional[int] = 0,
# Use smaller max model length, otherwise bigger model cannot run due
# to kv cache size limit.
max_model_len: int = 1024,
dtype: str = "half",
dtype: str = "auto",
disable_log_stats: bool = True,
tensor_parallel_size: int = 1,
block_size: int = 16,

View File

@ -9,6 +9,8 @@ import torch.distributed as dist
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import get_world_group
dist.init_process_group(backend="gloo")
# Create prompts
prompts = [
"Hello, my name is",

View File

@ -64,7 +64,6 @@ def test_multi_chat():
def test_chat_multi_image(image_urls: list[str]):
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct",
dtype="bfloat16",
max_model_len=4096,
max_num_seqs=5,
enforce_eager=True,

View File

@ -14,7 +14,9 @@ from vllm.outputs import RequestOutput
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
GUIDED_DECODING_BACKENDS = [
"outlines", "lm-format-enforcer", "xgrammar", "guidance"
]
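Outside the test suite, selecting one of these backends for a request looks roughly like this (a sketch; the schema and prompt are illustrative, and the `"guidance"` backend choice assumes the newly added backend from the list above):
```python
# Sketch: request JSON-constrained output with an explicit guided-decoding backend.
from vllm import LLM
from vllm.sampling_params import GuidedDecodingParams, SamplingParams

json_schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
    "required": ["name", "age"],
}

guided = GuidedDecodingParams(json=json_schema, backend="guidance")
params = SamplingParams(temperature=0, max_tokens=64, guided_decoding=guided)

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")
outputs = llm.generate(["Describe a fictional person as JSON."], params)
print(outputs[0].outputs[0].text)
```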
@pytest.fixture(scope="module")

View File

@ -18,8 +18,6 @@ TEST_AUDIO_URLS = [
@pytest.fixture(scope="module")
def server():
args = [
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",

View File

@ -24,8 +24,6 @@ def server():
args = [
"--task",
"generate",
"--dtype",
"bfloat16",
"--max-model-len",
"32768",
"--max-num-seqs",

View File

@ -25,8 +25,6 @@ def server():
args = [
"--task",
"generate",
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",

View File

@ -28,8 +28,6 @@ def server():
args = [
"--task",
"embed",
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",

View File

@ -34,7 +34,7 @@ def phi3v_model_config():
tokenizer=PHI3V_MODEL_ID,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16",
dtype="auto",
seed=0,
limit_mm_per_prompt={
"image": 2,
@ -58,7 +58,7 @@ def mllama_model_config():
tokenizer=MLLAMA_MODEL_ID,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16",
dtype="auto",
seed=0,
limit_mm_per_prompt={
"image": 2,
@ -669,7 +669,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
tokenizer=MLLAMA_MODEL_ID,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16",
dtype="auto",
seed=0,
limit_mm_per_prompt={
"image": 2,

View File

@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
from unittest.mock import Mock, patch
from unittest.mock import patch
import pytest
import torch
@ -8,7 +8,6 @@ import torch
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform
from vllm.platforms.openvino import OpenVinoPlatform
from vllm.platforms.rocm import RocmPlatform
from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL
@ -21,9 +20,9 @@ def clear_cache():
@pytest.mark.parametrize(
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("use_v1", [True, False])
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
def test_env(
name: str,
use_v1: bool,
@ -49,15 +48,8 @@ def test_env(
RocmPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16,
16, False)
EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
EXPECTED = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
assert backend.get_name() == EXPECTED
elif device == "openvino":
with patch("vllm.attention.selector.current_platform",
OpenVinoPlatform()), patch.dict('sys.modules',
{'openvino': Mock()}):
backend = get_attn_backend(16, torch.float16, torch.float16,
16, False)
assert backend.get_name() == "OPENVINO"
else:
if name in ["XFORMERS", "FLASHINFER"]:
with patch("vllm.attention.selector.current_platform",

View File

@ -15,6 +15,7 @@ NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
HEAD_SIZES = [128, 256]
BLOCK_SIZES = [16, 32]
DTYPES = [torch.float16, torch.bfloat16]
QDTYPES = [None, torch.float8_e4m3fn]
# one value large enough to test overflow in index calculation.
# one value small enough to test the schema op check
NUM_BLOCKS = [32768, 2048]
@ -85,6 +86,7 @@ def ref_paged_attn(
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("sliding_window", [None, 256])
@pytest.mark.parametrize("fa_version", [2, 3])
@pytest.mark.parametrize("q_dtype", QDTYPES)
@torch.inference_mode()
def test_flash_attn_with_paged_kv(
use_out: bool,
@ -97,11 +99,15 @@ def test_flash_attn_with_paged_kv(
num_blocks: int,
sliding_window: Optional[int],
fa_version: int,
q_dtype: Optional[torch.dtype],
) -> None:
torch.set_default_device("cuda")
if not is_fa_version_supported(fa_version):
pytest.skip(f"Flash attention version {fa_version} not supported due "
f"to: \"{fa_version_unsupported_reason(fa_version)}\"")
if q_dtype is not None and (dtype != torch.bfloat16 or fa_version == 2):
pytest.skip("Flash attention with quantized inputs is only "
"supported on version 3 with bfloat16 base type")
current_platform.seed_everything(0)
num_seqs = len(kv_lens)
@ -130,10 +136,28 @@ def test_flash_attn_with_paged_kv(
q = query.unsqueeze(1)
out = torch.empty_like(q) if use_out else None
maybe_quantized_query = q
maybe_quantized_key_cache = key_cache
maybe_quantized_value_cache = value_cache
q_descale = None
k_descale = None
v_descale = None
if q_dtype is not None:
# QKV are drawn from N(0, 1): no need for a fp8 scaling factor
maybe_quantized_query = query.to(q_dtype)
maybe_quantized_key_cache = key_cache.to(q_dtype)
maybe_quantized_value_cache = value_cache.to(q_dtype)
scale_shape = (num_seqs, num_kv_heads)
q_descale = torch.ones(scale_shape, dtype=torch.float32)
k_descale = torch.ones(scale_shape, dtype=torch.float32)
v_descale = torch.ones(scale_shape, dtype=torch.float32)
output = flash_attn_with_kvcache(
q=q,
k_cache=key_cache,
v_cache=value_cache,
q=maybe_quantized_query,
k_cache=maybe_quantized_key_cache,
v_cache=maybe_quantized_value_cache,
out=out,
softmax_scale=scale,
causal=True,
@ -142,10 +166,17 @@ def test_flash_attn_with_paged_kv(
softcap=soft_cap if soft_cap is not None else 0,
window_size=window_size,
fa_version=fa_version,
q_descale=q_descale,
k_descale=k_descale,
v_descale=v_descale,
)
output = output if not use_out else out
output = output.squeeze(1)
atol, rtol = 1.5e-2, 1e-2
if q_dtype is not None:
atol, rtol = 1.5e-1, 1.5e-1
ref_output = ref_paged_attn(query=query,
key_cache=key_cache,
value_cache=value_cache,
@ -155,7 +186,7 @@ def test_flash_attn_with_paged_kv(
scale=scale,
soft_cap=soft_cap,
sliding_window=sliding_window)
torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \
f"{torch.max(torch.abs(output - ref_output))}"
@ -171,6 +202,7 @@ def test_flash_attn_with_paged_kv(
@pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0])
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("fa_version", [2, 3])
@pytest.mark.parametrize("q_dtype", QDTYPES)
@torch.inference_mode()
def test_varlen_with_paged_kv(
use_out: bool,
@ -183,11 +215,15 @@ def test_varlen_with_paged_kv(
soft_cap: Optional[float],
num_blocks: int,
fa_version: int,
q_dtype: Optional[torch.dtype],
) -> None:
torch.set_default_device("cuda")
if not is_fa_version_supported(fa_version):
pytest.skip(f"Flash attention version {fa_version} not supported due "
f"to: \"{fa_version_unsupported_reason(fa_version)}\"")
if q_dtype is not None and (dtype != torch.bfloat16 or fa_version == 2):
pytest.skip("Flash attention with quantized inputs is only "
"supported on version 3 with bfloat16 base type")
current_platform.seed_everything(0)
num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens]
@ -223,10 +259,28 @@ def test_varlen_with_paged_kv(
dtype=torch.int32)
out = torch.empty_like(query) if use_out else None
maybe_quantized_query = query
maybe_quantized_key_cache = key_cache
maybe_quantized_value_cache = value_cache
q_descale = None
k_descale = None
v_descale = None
if q_dtype is not None:
# QKV are drawn from N(0, 1): no need for a fp8 scaling factor
maybe_quantized_query = query.to(q_dtype)
maybe_quantized_key_cache = key_cache.to(q_dtype)
maybe_quantized_value_cache = value_cache.to(q_dtype)
scale_shape = (num_seqs, num_kv_heads)
q_descale = torch.ones(scale_shape, dtype=torch.float32)
k_descale = torch.ones(scale_shape, dtype=torch.float32)
v_descale = torch.ones(scale_shape, dtype=torch.float32)
output = flash_attn_varlen_func(
q=query,
k=key_cache,
v=value_cache,
q=maybe_quantized_query,
k=maybe_quantized_key_cache,
v=maybe_quantized_value_cache,
out=out,
cu_seqlens_q=cu_query_lens,
seqused_k=kv_lens,
@ -238,6 +292,9 @@ def test_varlen_with_paged_kv(
block_table=block_tables,
softcap=soft_cap if soft_cap is not None else 0,
fa_version=fa_version,
q_descale=q_descale,
k_descale=k_descale,
v_descale=v_descale,
)
output = output if not use_out else out
@ -252,5 +309,8 @@ def test_varlen_with_paged_kv(
sliding_window=sliding_window,
soft_cap=soft_cap,
)
torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \
atol, rtol = 1.5e-2, 1e-2
if q_dtype is not None:
atol, rtol = 1.5e-1, 1.5e-1
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \
f"{torch.max(torch.abs(output - ref_output))}"

View File

@ -26,7 +26,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
# Test standard ROCm attention
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert (backend.get_name() == "ROCM_FLASH"
or backend.get_name() == "ROCM_ATTN_VLLM_V1")
or backend.get_name() == "TRITON_ATTN_VLLM_V1")
# mla test for deepseek related
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,

View File

@ -1,10 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
import asyncio
import time
from pathlib import Path
import pytest
from huggingface_hub import snapshot_download
import vllm.envs as env
from vllm.engine.arg_utils import AsyncEngineArgs
@ -13,35 +11,9 @@ from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
from vllm.utils import merge_async_iterators
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
LORA_MODULE_DOWNLOAD_PATH = None # Populated by download_and_prepare_lora_module() #noqa
LORA_RANK = 8
DEFAULT_MAX_LORAS = 16 * 3
def download_and_prepare_lora_module():
"""
Request submission is expensive when the LoRA adapters have their own
tokenizers. This is because, for each request with a new LoRA adapter ID,
the front-end loads the tokenizer from disk.
In this test, as we are comparing request processing times, we want to
minimize any extra activity. To this effect, we download the LoRA
adapter and remove all the tokenizer files, so the engine will default
to the base model tokenizer.
"""
global LORA_MODULE_DOWNLOAD_PATH
LORA_MODULE_HF_PATH = "yard1/llama-2-7b-sql-lora-test"
LORA_MODULE_DOWNLOAD_PATH = snapshot_download(repo_id=LORA_MODULE_HF_PATH)
tokenizer_files = [
'added_tokens.json', 'tokenizer_config.json', 'tokenizer.json',
'tokenizer.model'
]
for tokenizer_file in tokenizer_files:
del_path = Path(LORA_MODULE_DOWNLOAD_PATH) / tokenizer_file
del_path.unlink(missing_ok=True)
MODEL_PATH = "THUDM/chatglm3-6b"
LORA_RANK = 64
DEFAULT_MAX_LORAS = 4 * 3
@pytest.fixture(autouse=True)
@ -52,11 +24,9 @@ def v1(run_with_both_engines_lora):
pass
def get_lora_requests() -> list[LoRARequest]:
def get_lora_requests(lora_path) -> list[LoRARequest]:
lora_requests: list[LoRARequest] = [
LoRARequest(lora_name=f"{i}",
lora_int_id=i,
lora_path=LORA_MODULE_DOWNLOAD_PATH)
LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path)
for i in range(1, DEFAULT_MAX_LORAS + 1)
]
return lora_requests
@ -93,7 +63,7 @@ async def requests_processing_time(llm,
@pytest.mark.asyncio
async def test_add_lora():
async def test_add_lora(chatglm3_lora_files):
"""
The add_lora function is used to pre-load some LoRA adapters into the
engine in anticipation of future requests using these adapters. To test
@ -103,10 +73,7 @@ async def test_add_lora():
We measure the request processing time in both cases and expect the time
to be lesser in the case with add_lora() calls.
"""
download_and_prepare_lora_module()
lora_requests: list[LoRARequest] = get_lora_requests()
lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)
max_loras = len(set([lr.lora_int_id for lr in lora_requests]))
# Create engine in eager-mode. Due to high max_loras, the CI can
@ -118,6 +85,7 @@ async def test_add_lora():
max_lora_rank=LORA_RANK,
max_model_len=128,
gpu_memory_utilization=0.8, #avoid OOM
trust_remote_code=True,
enforce_eager=True)
# The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`

View File

@ -84,9 +84,11 @@ def v1(run_with_both_engines_lora):
@create_new_process_for_each_test()
def test_llama_lora(sql_lora_files):
llm = vllm.LLM(MODEL_PATH,
llm = vllm.LLM(
MODEL_PATH,
enable_lora=True,
max_num_seqs=16,
# also test odd max_num_seqs
max_num_seqs=13,
max_loras=4,
tensor_parallel_size=1,
enable_chunked_prefill=True)

View File

@ -4,18 +4,13 @@ from threading import Lock
import pytest
import torch
import vllm.lora.ops.triton_ops # noqa: F401
import vllm.lora.ops.triton_ops.v1 # noqa: F401
from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
bgmv_shrink, sgmv_expand,
sgmv_expand_slice, sgmv_shrink)
import vllm.lora.ops.torch_ops as torch_ops
import vllm.lora.ops.triton_ops as triton_ops
from vllm.lora.ops.triton_ops import LoRAKernelMeta
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.lora.ops.triton_ops.v1 import V1KernelMeta
from vllm.platforms import current_platform
from .utils import (PunicaTensors, assert_close, generate_data,
generate_data_for_expand_nslices,
generate_data_for_nslices)
from .utils import PunicaTensors, assert_close, generate_data_for_nslices
# Utility shrink and expand operations used as reference implementations.
@ -26,10 +21,10 @@ def sgmv_shrink_for_nslices(
prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int,
num_tokens: int, scaling: float):
"""
Wrapper around sgmv_shrink that handles any nslices.
Wrapper around torch_ops.sgmv_shrink that handles any nslices.
"""
for index in range(nslices):
sgmv_shrink(
torch_ops.sgmv_shrink(
inputs_tensor,
lora_weights_lst[index],
out_tensor[index],
@ -53,11 +48,11 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
max_seq_length: int, num_tokens: int,
add_inputs: bool) -> None:
"""
Wrapper around sgmv_expand that handles any nslices.
Wrapper around torch_ops.sgmv_expand that handles any nslices.
"""
if nslices == 1:
# Verify the torch's sgmv_expand op
sgmv_expand(
torch_ops.sgmv_expand(
inputs_tensor[0],
lora_weights_lst[0],
out_tensor,
@ -73,7 +68,7 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
slice_offset = 0
for index in range(nslices):
lora_weights = lora_weights_lst[index]
sgmv_expand_slice(
torch_ops.sgmv_expand_slice(
inputs_tensor[index],
lora_weights,
out_tensor,
@ -93,12 +88,13 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
_dict_lock = Lock()
def check_shrink_kernels(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int, dtype: torch.dtype,
device: str, seq_length: int, scaling: float):
def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int,
dtype: torch.dtype, device: str, seq_length: int,
scaling: float):
"""
Compare outputs of vllm.sgmv_shrink and vllm.v1_shrink kernel against a
reference implementation.
Compare outputs of torch_ops.sgmv_shrink and triton_ops.lora_shrink
kernels.
"""
data: PunicaTensors = generate_data_for_nslices(
batches,
@ -118,35 +114,24 @@ def check_shrink_kernels(batches: int, num_loras: int, rank: int,
data.prompt_lora_mapping, batches, max_seq_length,
token_nums)
# Setup metadata information for the V1 kernel.
v1_meta = V1KernelMeta.make(max_loras=num_loras,
# Setup metadata information for the LoRA kernel.
lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
max_num_tokens=token_nums,
device='cuda')
v1_meta.prepare_tensors(data.token_lora_mapping)
lora_meta.prepare_tensors(data.token_lora_mapping)
ref_out_tensor = data.ref_out_tensor
sgmv_out_tensor = data.our_out_tensor
v1_out_tensor = data.our_out_tensor.clone()
out_tensor = data.our_out_tensor.clone()
# Preventing cache error pointer.
with _dict_lock:
# SGMV shrink kernel
# lora_shrink kernel
_LORA_A_PTR_DICT.clear()
torch.ops.vllm.sgmv_shrink(
triton_ops.lora_shrink(
data.inputs_tensor,
data.lora_weights,
sgmv_out_tensor,
*sgmv_meta_args,
scaling,
)
# V1 shrink kernel
_LORA_A_PTR_DICT.clear()
torch.ops.vllm.v1_shrink(
data.inputs_tensor,
data.lora_weights,
v1_out_tensor,
*v1_meta.meta_args(token_nums=token_nums),
out_tensor,
*lora_meta.meta_args(token_nums=token_nums),
scaling,
)
@ -160,16 +145,16 @@ def check_shrink_kernels(batches: int, num_loras: int, rank: int,
scaling,
)
assert_close(sgmv_out_tensor, ref_out_tensor)
assert_close(v1_out_tensor, ref_out_tensor)
assert_close(out_tensor, ref_out_tensor)
def check_expand_kernels(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int, dtype: torch.dtype,
device: str, seq_length: int, add_inputs: bool):
def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int,
dtype: torch.dtype, device: str, seq_length: int,
add_inputs: bool):
"""
Compare outputs of vllm.sgmv_expand and vllm.v1_expand kernels against a
reference implementation.
Compare outputs of torch_ops.sgmv_expand and triton_ops.lora_expand
kernels.
"""
data: PunicaTensors = generate_data_for_nslices(
batches,
@ -190,35 +175,23 @@ def check_expand_kernels(batches: int, num_loras: int, rank: int,
data.prompt_lora_mapping, batches, max_seq_length,
token_nums)
# Setup metadata information for the V1 kernel.
v1_meta = V1KernelMeta.make(max_loras=num_loras,
# Setup metadata information for the LoRA kernel.
lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
max_num_tokens=token_nums,
device='cuda')
v1_meta.prepare_tensors(data.token_lora_mapping)
lora_meta.prepare_tensors(data.token_lora_mapping)
# Setup output tensors
ref_out_tensor = data.ref_out_tensor
sgmv_out_tensor = data.our_out_tensor
v1_out_tensor = data.our_out_tensor.clone()
out_tensor = data.our_out_tensor.clone()
with _dict_lock:
# SGMV expand kernel
# lora_expand kernel
_LORA_B_PTR_DICT.clear()
torch.ops.vllm.sgmv_expand(
data.inputs_tensor,
triton_ops.lora_expand(data.inputs_tensor,
data.lora_weights,
sgmv_out_tensor,
*sgmv_meta_args,
offset_start=0,
add_inputs=add_inputs,
)
# V1 expand kernel
_LORA_B_PTR_DICT.clear()
torch.ops.vllm.v1_expand(data.inputs_tensor,
data.lora_weights,
v1_out_tensor,
*v1_meta.meta_args(token_nums=token_nums),
out_tensor,
*lora_meta.meta_args(token_nums=token_nums),
offset_start=0,
add_inputs=add_inputs)
@ -231,124 +204,7 @@ def check_expand_kernels(batches: int, num_loras: int, rank: int,
*sgmv_meta_args,
add_inputs=add_inputs)
assert_close(sgmv_out_tensor, ref_out_tensor)
assert_close(v1_out_tensor, ref_out_tensor)
def check_bgmv_shrink(batches: int, num_loras: int, rank: int,
hidden_size: int, dtype: torch.dtype, device: str,
scaling: float):
"""
Compare vllm.bgmv_shrink against a reference implementation.
"""
seq_length = 1
data: PunicaTensors = generate_data(
batches,
hidden_size,
num_loras,
rank,
seq_length,
dtype,
"shrink",
device,
)
torch.ops.vllm.bgmv_shrink(
data.inputs_tensor,
data.lora_weights,
data.our_out_tensor,
data.token_lora_mapping,
scaling,
)
bgmv_shrink(
data.inputs_tensor,
data.lora_weights,
data.ref_out_tensor,
data.token_lora_mapping,
scaling,
)
data.ref_out_tensor = data.ref_out_tensor.to(torch.float32)
assert_close(data.our_out_tensor, data.ref_out_tensor)
def check_bgmv_expand(batches: int, num_loras: int, rank: int,
hidden_size: int, dtype: torch.dtype, device: str,
add_inputs: bool):
"""
Compare vllm.bgmv_expand against a reference implementation.
"""
seq_length = 1
data: PunicaTensors = generate_data(
batches,
hidden_size,
num_loras,
rank,
seq_length,
dtype,
"expand",
device,
)
torch.ops.vllm.bgmv_expand(
data.inputs_tensor,
data.lora_weights,
data.our_out_tensor,
data.token_lora_mapping,
add_inputs=add_inputs,
)
bgmv_expand(
data.inputs_tensor,
data.lora_weights,
data.ref_out_tensor,
data.token_lora_mapping,
add_inputs=add_inputs,
)
assert_close(data.our_out_tensor, data.ref_out_tensor)
def check_bgmv_expand_slice(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int, dtype: torch.dtype,
device: str, add_inputs: bool):
"""
Compare vllm.bgmv_expand_slice against a reference implementation.
"""
seq_length = 1
data: PunicaTensors = generate_data_for_expand_nslices(
batches,
hidden_size,
num_loras,
rank,
seq_length,
dtype,
nslices,
device,
)
slice_offset = 0
for index in range(nslices):
torch.ops.vllm.bgmv_expand_slice(
data.inputs_tensor,
data.lora_weights[index],
data.our_out_tensor,
data.token_lora_mapping,
slice_offset,
slice_size=hidden_size,
add_inputs=add_inputs,
)
bgmv_expand_slice(
data.inputs_tensor,
data.lora_weights[index],
data.ref_out_tensor,
data.token_lora_mapping,
slice_offset,
slice_size=hidden_size,
add_inputs=add_inputs,
)
slice_offset += hidden_size
assert_close(data.our_out_tensor, data.ref_out_tensor)
assert_close(out_tensor, ref_out_tensor)
# Tests
@ -490,13 +346,13 @@ def test_kernels(
op_type: str,
):
"""
Tests SGMV and V1 kernels.
Tests LoRA kernels.
"""
torch.set_default_device(device)
current_platform.seed_everything(seed)
if op_type == "shrink":
check_shrink_kernels(batches=batches,
check_lora_shrink_kernel(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
@ -506,7 +362,7 @@ def test_kernels(
seq_length=128,
scaling=0.5)
else:
check_expand_kernels(batches=batches,
check_lora_expand_kernel(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
@ -538,13 +394,13 @@ def test_kernels_hidden_size(
op_type: str,
):
"""
Tests SGMV and V1 kernels.
Tests SGMV and LoRA kernels.
"""
torch.set_default_device(device)
current_platform.seed_everything(seed)
if op_type == "shrink":
check_shrink_kernels(batches=batches,
check_lora_shrink_kernel(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
@ -554,7 +410,7 @@ def test_kernels_hidden_size(
seq_length=128,
scaling=0.5)
else:
check_expand_kernels(batches=batches,
check_lora_expand_kernel(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
@ -563,134 +419,3 @@ def test_kernels_hidden_size(
device=device,
seq_length=128,
add_inputs=True)
@pytest.mark.parametrize("batches", test_params['batches'])
@pytest.mark.parametrize("num_loras", test_params['num_loras'])
@pytest.mark.parametrize("rank", test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
def test_punica_bgmv(
batches: int,
num_loras: int,
rank: int,
hidden_size: int,
dtype: torch.dtype,
device: str,
seed: int,
op_type: str,
):
torch.set_default_device(device)
current_platform.seed_everything(seed)
if op_type == "shrink":
check_bgmv_shrink(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
dtype=dtype,
device=device,
scaling=0.5)
else:
check_bgmv_expand(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
dtype=dtype,
device=device,
add_inputs=True)
@pytest.mark.parametrize("batches", hs_test_params['batches'])
@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
def test_punica_bgmv_hidden_size(
batches: int,
num_loras: int,
rank: int,
hidden_size: int,
dtype: torch.dtype,
device: str,
seed: int,
op_type: str,
):
torch.set_default_device(device)
current_platform.seed_everything(seed)
if op_type == "shrink":
check_bgmv_shrink(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
dtype=dtype,
device=device,
scaling=0.5)
else:
check_bgmv_expand(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
dtype=dtype,
device=device,
add_inputs=True)
@pytest.mark.parametrize("batches", test_params['batches'])
@pytest.mark.parametrize("num_loras", test_params['num_loras'])
@pytest.mark.parametrize("rank", test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
@pytest.mark.parametrize("nslices", [2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
def test_punica_bgmv_expand_nslices(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int,
dtype: torch.dtype, device: str,
seed: int):
torch.set_default_device(device)
current_platform.seed_everything(seed)
check_bgmv_expand_slice(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
add_inputs=True)
@pytest.mark.parametrize("batches", hs_test_params['batches'])
@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
@pytest.mark.parametrize("nslices", [2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
def test_punica_bgmv_expand_nslices_hidden_size(batches: int, num_loras: int,
rank: int, hidden_size: int,
nslices: int,
dtype: torch.dtype,
device: str, seed: int):
torch.set_default_device(device)
current_platform.seed_everything(seed)
check_bgmv_expand_slice(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
add_inputs=True)
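
A minimal, self-contained sketch of the compare-against-reference pattern these kernel tests rely on: compute a reference result with plain PyTorch and assert closeness against the optimized kernel. The helper names below are illustrative only; the real tests call torch_ops.sgmv_expand / triton_ops.lora_expand and need a CUDA device.

    import torch

    def reference_lora_expand(x: torch.Tensor, lora_b: torch.Tensor,
                              out: torch.Tensor, add_inputs: bool) -> None:
        # out: (num_tokens, hidden_size), x: (num_tokens, rank),
        # lora_b: (hidden_size, rank)
        delta = x @ lora_b.t()
        if add_inputs:
            out += delta
        else:
            out.copy_(delta)

    def check_expand(num_tokens: int = 8, rank: int = 16,
                     hidden_size: int = 32, add_inputs: bool = True) -> None:
        x = torch.randn(num_tokens, rank)
        lora_b = torch.randn(hidden_size, rank)
        ref_out = torch.randn(num_tokens, hidden_size)
        out = ref_out.clone()
        reference_lora_expand(x, lora_b, ref_out, add_inputs)
        # A production Triton/CUDA kernel would be called here instead; the
        # sketch reuses the reference so it stays runnable on CPU.
        reference_lora_expand(x, lora_b, out, add_inputs)
        torch.testing.assert_close(out, ref_out)

    check_expand()
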

View File

@ -24,12 +24,10 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
)
lora_request = LoRARequest("1", 1, sql_lora_files)
assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
request_id="request_id", prompt="prompt", lora_request=lora_request)
prompt="prompt", lora_request=lora_request)
assert reference_tokenizer.encode(
"prompt") == await tokenizer_group.encode_async(
request_id="request_id",
prompt="prompt",
lora_request=lora_request)
prompt="prompt", lora_request=lora_request)
assert isinstance(tokenizer_group.get_lora_tokenizer(None),
PreTrainedTokenizerBase)
assert tokenizer_group.get_lora_tokenizer(

View File

@ -7,7 +7,10 @@ from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.activation import (GeluAndMul,
ReLUSquaredActivation,
SiluAndMul)
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.layernorm import (
RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm,
rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm)
from vllm.platforms import current_platform
# Registered subclass for test
@ -87,3 +90,27 @@ def test_enabled_ops_invalid(env: str):
custom_ops=env.split(",")))
with set_current_vllm_config(vllm_config):
RMSNorm(1024).enabled()
@pytest.mark.parametrize("add_residual", [True, False])
@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
@pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"])
@pytest.mark.skipif(not current_platform.is_rocm(),
reason="AITER is a feature exclusive for ROCm")
def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str,
use_rocm_aiter_norm: str, monkeypatch):
monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
monkeypatch.setenv("VLLM_ROCM_USE_AITER_RMSNORM", use_rocm_aiter_norm)
rms_norm_func = dispatch_cuda_rmsnorm_func(add_residual)
if not add_residual:
if current_platform.is_rocm() and int(use_rocm_aiter) and int(
use_rocm_aiter_norm):
assert rms_norm_func == rocm_aiter_rms_norm
else:
assert rms_norm_func == rms_norm
elif current_platform.is_rocm() and int(use_rocm_aiter) and int(
use_rocm_aiter_norm):
assert rms_norm_func == rocm_aiter_fused_add_rms_norm
else:
assert rms_norm_func == fused_add_rms_norm
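
An illustrative sketch (not vLLM's actual implementation) of the environment-flag dispatch pattern the test above exercises: the fused ROCm AITER path is selected only when both flags are enabled, otherwise the default kernels are used.

    import os
    from typing import Callable

    def rms_norm_default(): ...
    def fused_add_rms_norm_default(): ...
    def rms_norm_aiter(): ...
    def fused_add_rms_norm_aiter(): ...

    def dispatch_rmsnorm(add_residual: bool, is_rocm: bool) -> Callable:
        use_aiter = (is_rocm
                     and os.environ.get("VLLM_ROCM_USE_AITER", "0") == "1"
                     and os.environ.get("VLLM_ROCM_USE_AITER_RMSNORM", "0") == "1")
        if add_residual:
            return fused_add_rms_norm_aiter if use_aiter else fused_add_rms_norm_default
        return rms_norm_aiter if use_aiter else rms_norm_default

    # With both flags set on a ROCm platform, the fused AITER path is chosen.
    os.environ["VLLM_ROCM_USE_AITER"] = "1"
    os.environ["VLLM_ROCM_USE_AITER_RMSNORM"] = "1"
    assert dispatch_rmsnorm(add_residual=True, is_rocm=True) is fused_add_rms_norm_aiter
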

View File

@ -16,7 +16,9 @@ from vllm.model_executor.guided_decoding.outlines_logits_processors import (
from vllm.sampling_params import GuidedDecodingParams
MODEL_NAME = 'HuggingFaceH4/zephyr-7b-beta'
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
GUIDED_DECODING_BACKENDS = [
"outlines", "lm-format-enforcer", "xgrammar", "guidance"
]
GUIDED_DECODING_BACKENDS_WITH_REASONING_SUPPORT = ["outlines", "xgrammar"]
REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

View File

@ -5,11 +5,10 @@ from typing import Optional
import numpy as np
import pytest
import pytest_asyncio
from transformers import AutoModel, AutoTokenizer, BatchEncoding
from transformers import AutoModel, AutoTokenizer
from vllm.multimodal.audio import resample_audio
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from ....conftest import HfRunner, VllmRunner
from ....utils import RemoteOpenAIServer
@ -107,8 +106,6 @@ def run_test(
**kwargs,
):
"""Inference result should be the same between hf and vllm."""
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
@ -124,15 +121,7 @@ def run_test(
for vllm_prompt, _, audio in prompts_and_audios
]
def process(hf_inputs: BatchEncoding, **kwargs):
hf_inputs["audio_values"] = hf_inputs["audio_values"] \
.to(torch_dtype) # type: ignore
return hf_inputs
with hf_runner(model,
dtype=dtype,
postprocess_inputs=process,
auto_cls=AutoModel) as hf_model:
with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
hf_outputs_per_audio = [
hf_model.generate_greedy_logprobs_limit(
[hf_prompt],

View File

@ -9,7 +9,7 @@ from vllm.sampling_params import SamplingParams
from ...utils import check_outputs_equal
# This test is for the hybrid models
MODELS = ["ai21labs/Jamba-tiny-dev"]
MODELS = ["ai21labs/Jamba-tiny-dev", "Zyphra/Zamba2-1.2B-instruct"]
# Bamba at Fp32 is too big for the CI (L4 GPU).
# MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"]
@ -27,17 +27,19 @@ def test_models(
) -> None:
# numeric error produces different generation
if 'Bamba' in model:
if "Bamba" in model:
example_prompts.pop(3)
with hf_runner(
model,
dtype=dtype,
model_kwargs = {
"use_mamba_kernels":
False, # mamba kernels are not installed so HF
"use_mamba_kernels": False, # mamba kernels are not installed so HF
# don't use them
}) as hf_model:
}
if "Zamba2" in model:
# Zamba2 HF implementation automatically checks if mamba kernels are
# installed
model_kwargs = {}
with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner(model, dtype=dtype) as vllm_model:
@ -112,26 +114,31 @@ def test_mamba_prefill_chunking_with_parallel_sampling(
def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
model: str, dtype: str,
max_tokens: int) -> None:
# numeric error during prefill chucking produces different generation
# numeric error during prefill chunking produces different generation
# compared to w/o prefill chunking for those examples, removed them for now
if 'Jamba' in model:
if "Jamba" in model:
example_prompts.pop(7)
example_prompts.pop(2)
example_prompts.pop(1)
elif 'Bamba' in model:
elif "Bamba" in model:
example_prompts.pop(6)
example_prompts.pop(3)
example_prompts.pop(2)
dtype = "half" # use a different dtype for Bamba
elif "Zamba2" in model:
example_prompts.pop(7)
dtype = "half"
with hf_runner(
model,
dtype=dtype,
model_kwargs = {
"use_mamba_kernels":
False, # mamba kernels are not installed so HF
"use_mamba_kernels": False, # mamba kernels are not installed so HF
# don't use them
}) as hf_model:
}
if "Zamba2" in model:
# Zamba2 HF implementation automatically checks if mamba kernels are
# installed
model_kwargs = {}
with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model:
non_chunked = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner(model,
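
A hedged sketch of the per-model kwargs selection used in the hunks above: Jamba/Bamba are run with use_mamba_kernels=False because the optional mamba kernels are not installed in CI, while Zamba2's HF implementation detects their absence on its own.

    def hf_model_kwargs(model: str) -> dict:
        if "Zamba2" in model:
            # Zamba2 HF implementation checks for mamba kernels itself.
            return {}
        return {"use_mamba_kernels": False}

    assert hf_model_kwargs("Zyphra/Zamba2-1.2B-instruct") == {}
    assert hf_model_kwargs("ai21labs/Jamba-tiny-dev") == {"use_mamba_kernels": False}
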

View File

@ -3,7 +3,11 @@
Run `pytest tests/models/test_models.py`.
"""
import pytest
import torch
from vllm.platforms import current_platform
from ...utils import check_logprobs_close
@ -13,7 +17,21 @@ from ...utils import check_logprobs_close
# https://github.com/vllm-project/vllm/issues/14524
REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
# This list contains the model that are using AITER kernel.
# Skip model that are not using AITER tests.
# When more AITER kernels are added, this list will not be
# needed as all the models will be calling AITER kernels
# in parts of the operators
AITER_MODEL_LIST = [
"meta-llama/Llama-3.2-1B-Instruct",
"openbmb/MiniCPM3-4B",
"Qwen/Qwen-7B",
"Qwen/Qwen2.5-0.5B-Instruct",
"ehristoforu/Falcon3-MoE-2x7B-Insruct",
]
# @maybe_test_rocm_aiter
@pytest.mark.parametrize(
"model",
[
@ -69,19 +87,24 @@ REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
monkeypatch,
) -> None:
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
dtype: str, max_tokens: int, num_logprobs: int,
use_rocm_aiter: bool, monkeypatch) -> None:
if model in REQUIRES_V0:
monkeypatch.setenv("VLLM_USE_V1", "0")
if use_rocm_aiter and (model in AITER_MODEL_LIST):
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
elif use_rocm_aiter and model not in AITER_MODEL_LIST:
# Skip model that are not using AITER tests.
# When more AITER kernels are added, this list will not be
# needed as all the models will be calling AITER kernels
# in parts of the operators
pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
with hf_runner(model, dtype=dtype) as hf_model:
if model.startswith("THUDM/chatglm3"):
hf_model.model.get_output_embeddings = lambda: \
@ -100,3 +123,10 @@ def test_models(
name_0="hf",
name_1="vllm",
)
if use_rocm_aiter:
# this is to ensure that vllm engine
# has deallocated the memory before running the next
# unit tests. On ROCm, when using AITER
# the memory might not be deallocated completely
# before running the next test case
torch.cuda.synchronize()
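
A standalone restatement of the gating logic above, with a hypothetical helper and model set (names are illustrative): a model is only opted into the AITER path when ROCm AITER is requested and the model is known to route through AITER kernels; otherwise the test is skipped.

    from typing import Optional

    HYPOTHETICAL_AITER_MODELS = {"meta-llama/Llama-3.2-1B-Instruct"}

    def aiter_env_for(model: str, use_rocm_aiter: bool) -> Optional[dict]:
        """Return env overrides to apply, or None to signal pytest.skip()."""
        if not use_rocm_aiter:
            return {}
        if model in HYPOTHETICAL_AITER_MODELS:
            return {"VLLM_ROCM_USE_AITER": "1"}
        return None  # caller would invoke pytest.skip(...)
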

View File

@ -9,7 +9,7 @@ from pathlib import PosixPath
import pytest
from packaging.version import Version
from transformers import AutoModelForPreTraining, AutoModelForVision2Seq
from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.platforms import current_platform
@ -101,7 +101,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
convert_assets_to_embeddings=model_utils.get_llava_embeddings,
max_model_len=4096,
auto_cls=AutoModelForVision2Seq,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@ -121,10 +121,7 @@ VLM_TEST_SETTINGS = {
"stop_sign": "caption es",
"cherry_blossom": "What is in the picture?",
}),
auto_cls=AutoModelForVision2Seq,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype="bfloat16",
marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")], # noqa: E501
@ -179,7 +176,6 @@ VLM_TEST_SETTINGS = {
# "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
# }),
# multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
# postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"), # noqa: E501
# stop_str=["<|im_end|>"],
# image_size_factors=[(0.10, 0.15)],
# max_tokens=64,
@ -190,7 +186,7 @@ VLM_TEST_SETTINGS = {
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
img_idx_to_prompt=lambda idx: "",
auto_cls=AutoModelForVision2Seq,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
),
"chameleon": VLMTestInfo(
@ -199,10 +195,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
auto_cls=AutoModelForImageTextToText,
# For chameleon, we only compare the sequences
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
@ -222,7 +215,6 @@ VLM_TEST_SETTINGS = {
}),
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
stop_str=["<end▁of▁sentence>", "<begin▁of▁sentence>"], # noqa: E501
image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
@ -240,6 +232,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "",
max_model_len=2048,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
use_tokenizer_eos=True,
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
num_logprobs=10,
@ -256,9 +249,7 @@ VLM_TEST_SETTINGS = {
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
# TODO: Use AutoModelForVision2Seq once transformers supports this
auto_cls=AutoModelForPreTraining,
dtype="bfloat16",
auto_cls=AutoModelForImageTextToText,
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
patch_hf_runner=model_utils.gemma3_patch_hf_runner,
),
@ -272,7 +263,6 @@ VLM_TEST_SETTINGS = {
}),
max_model_len=2048,
max_num_seqs=2,
dtype="bfloat16",
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
patch_hf_runner=model_utils.glm4v_patch_hf_runner,
# The image embeddings match with HF but the outputs of the language
@ -295,7 +285,6 @@ VLM_TEST_SETTINGS = {
}),
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
max_model_len=8192,
dtype="bfloat16",
use_tokenizer_eos=True,
num_logprobs=10,
patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
@ -307,7 +296,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "<image>",
max_model_len=8192,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
auto_cls=AutoModelForImageTextToText,
hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
),
"intern_vl": VLMTestInfo(
@ -324,10 +313,6 @@ VLM_TEST_SETTINGS = {
}),
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
max_model_len=4096,
# NOTE: Mono-InternVL-2B doesn't work with fp16,
# it will result NaN during inference.
# See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
dtype="bfloat16",
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
),
@ -336,7 +321,7 @@ VLM_TEST_SETTINGS = {
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
max_model_len=10240,
auto_cls=AutoModelForVision2Seq,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@ -351,9 +336,6 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
num_video_frames=16,
max_model_len=16384,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values_videos"
),
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
@ -378,11 +360,8 @@ VLM_TEST_SETTINGS = {
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
max_model_len=4096,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
get_stop_token_ids=lambda tok: [128009],
auto_cls=AutoModelForVision2Seq,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
patch_hf_runner=model_utils.mantis_patch_hf_runner,
marks=[
@ -400,8 +379,8 @@ VLM_TEST_SETTINGS = {
max_model_len=4096,
max_num_seqs=2,
get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
postprocess_inputs=model_utils.wrap_inputs_post_processor,
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
),
"minicpmo_26": VLMTestInfo(
models=["openbmb/MiniCPM-o-2_6"],
@ -411,11 +390,8 @@ VLM_TEST_SETTINGS = {
max_model_len=4096,
max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
postprocess_inputs=model_utils.ignore_inputs_post_processor(
"image_sizes"
),
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_patch_hf_runner
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
),
"minicpmv_26": VLMTestInfo(
models=["openbmb/MiniCPM-V-2_6"],
@ -425,10 +401,8 @@ VLM_TEST_SETTINGS = {
max_model_len=4096,
max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
postprocess_inputs=model_utils.ignore_inputs_post_processor(
"image_sizes"
),
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
),
"molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"],
@ -437,7 +411,6 @@ VLM_TEST_SETTINGS = {
max_model_len=4096,
max_num_seqs=2,
patch_hf_runner=model_utils.molmo_patch_hf_runner,
postprocess_inputs=model_utils.molmo_post_processor,
),
# Tests for phi3v currently live in another file because of a bug in
# transformers. Once this issue is fixed, we can enable them here instead.
@ -463,7 +436,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "[IMG]",
max_model_len=8192,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=48)],
),
"qwen_vl": VLMTestInfo(
@ -481,10 +454,7 @@ VLM_TEST_SETTINGS = {
models=["facebook/chameleon-7b"],
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096,
auto_cls=AutoModelForVision2Seq,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
comparator=check_outputs_equal,
@ -495,7 +465,7 @@ VLM_TEST_SETTINGS = {
models=["llava-hf/llava-1.5-7b-hf"],
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096,
auto_cls=AutoModelForVision2Seq,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
marks=multi_gpu_marks(num_gpus=2),
**COMMON_BROADCAST_SETTINGS # type: ignore
@ -504,7 +474,7 @@ VLM_TEST_SETTINGS = {
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
max_model_len=10240,
auto_cls=AutoModelForVision2Seq,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
marks=multi_gpu_marks(num_gpus=2),
**COMMON_BROADCAST_SETTINGS # type: ignore
@ -529,9 +499,6 @@ VLM_TEST_SETTINGS = {
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=16384,
max_num_seqs=2,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
@ -541,6 +508,19 @@ VLM_TEST_SETTINGS = {
limit_mm_per_prompt={"image": 4},
)],
),
# regression test for https://github.com/vllm-project/vllm/issues/15122
"qwen2_5_vl-windows-attention": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
limit_mm_per_prompt={"image": 1},
)],
),
}
# yapf: enable
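
On the HF side, the auto_cls switch above amounts to loading vision-language checkpoints through AutoModelForImageTextToText instead of AutoModelForVision2Seq. A hedged sketch (model id illustrative; requires a transformers release that provides AutoModelForImageTextToText):

    from transformers import AutoModelForImageTextToText, AutoProcessor

    def load_vlm(model_id: str = "llava-hf/llava-1.5-7b-hf"):
        processor = AutoProcessor.from_pretrained(model_id)
        model = AutoModelForImageTextToText.from_pretrained(model_id,
                                                            torch_dtype="auto")
        return processor, model
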

View File

@ -100,7 +100,6 @@ def run_test(
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
lora_extra_vocab_size=0,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
) as vllm_model:

View File

@ -4,7 +4,6 @@
Run `pytest tests/models/test_mistral.py`.
"""
import json
import uuid
from dataclasses import asdict
from typing import TYPE_CHECKING, Any, Optional
@ -16,8 +15,7 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
from transformers import AutoProcessor
from vllm import (EngineArgs, LLMEngine, RequestOutput, SamplingParams,
TextPrompt, TokensPrompt)
from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt
from vllm.multimodal import MultiModalDataBuiltins
from vllm.multimodal.inputs import PlaceholderRange
from vllm.sequence import Logprob, SampleLogprobs
@ -28,7 +26,11 @@ from ...utils import check_logprobs_close
if TYPE_CHECKING:
from _typeshed import StrPath
MODELS = ["mistralai/Pixtral-12B-2409"]
PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]
IMG_URLS = [
"https://picsum.photos/id/237/400/300",
"https://picsum.photos/id/231/200/300",
@ -125,8 +127,10 @@ MAX_MODEL_LEN = [8192, 65536]
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
assert FIXTURES_PATH.exists()
FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
FIXTURE_LOGPROBS_CHAT = {
PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
}
OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
@ -166,12 +170,12 @@ def test_chat(
model: str,
dtype: str,
) -> None:
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT)
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
FIXTURE_LOGPROBS_CHAT[model])
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
enable_chunked_prefill=False,
max_model_len=max_model_len,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model:
@ -183,70 +187,40 @@ def test_chat(
outputs.extend(output)
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
# Remove last `None` prompt_logprobs to compare with fixture
for i in range(len(logprobs)):
assert logprobs[i][-1] is None
logprobs[i] = logprobs[i][:-1]
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output")
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
EXPECTED_ENGINE_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_ENGINE)
args = EngineArgs(
model=model,
tokenizer_mode="mistral",
enable_chunked_prefill=False,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
dtype=dtype,
)
engine = LLMEngine.from_engine_args(args)
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[0], SAMPLING_PARAMS)
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[1], SAMPLING_PARAMS)
outputs = []
count = 0
while True:
out = engine.step()
count += 1
for request_output in out:
if request_output.finished:
outputs.append(request_output)
if count == 2:
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[2],
SAMPLING_PARAMS)
if not engine.has_unfinished_requests():
break
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
check_logprobs_close(outputs_0_lst=EXPECTED_ENGINE_LOGPROBS,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output")
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize(
"prompt,expected_ranges",
[(_create_engine_inputs_hf(IMG_URLS[:1]), [{
"offset": 10,
"offset": 11,
"length": 494
}]),
(_create_engine_inputs_hf(IMG_URLS[1:4]), [{
"offset": 10,
"offset": 11,
"length": 266
}, {
"offset": 276,
"offset": 277,
"length": 1056
}, {
"offset": 1332,
"offset": 1333,
"length": 418
}])])
def test_multi_modal_placeholders(
vllm_runner, prompt, expected_ranges: list[PlaceholderRange]) -> None:
def test_multi_modal_placeholders(vllm_runner, prompt,
expected_ranges: list[PlaceholderRange],
monkeypatch) -> None:
# This placeholder checking test only works with V0 engine
# where `multi_modal_placeholders` is returned with `RequestOutput`
monkeypatch.setenv("VLLM_USE_V1", "0")
with vllm_runner(
"mistral-community/pixtral-12b",
max_model_len=8192,

View File

@ -4,7 +4,6 @@ from typing import Any, Callable, Optional, Union
import torch
from PIL.Image import Image
from transformers import BatchEncoding
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption
@ -31,7 +30,6 @@ def run_test(
vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
auto_cls: type[_BaseAutoModelClass],
use_tokenizer_eos: bool,
postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
comparator: Callable[..., None],
get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
stop_str: Optional[list[str]],
@ -101,7 +99,6 @@ def run_test(
hf_model = hf_runner(model,
dtype=dtype,
auto_cls=auto_cls,
postprocess_inputs=postprocess_inputs,
model_kwargs=hf_model_kwargs)
# Some models need to patch things like the model processor, e.g., internvl

View File

@ -1,7 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
"""Custom input builders for edge-cases in different models."""
from io import BytesIO
from typing import Callable
import requests
from PIL import Image
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (rescale_video_size, resize_video,
sample_frames_from_video)
@ -102,3 +106,17 @@ def different_patch_input_cases_internvl():
build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
]
def windows_attention_image_qwen2_5_vl():
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122
image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"
image = Image.open(BytesIO(requests.get(image_url).content))
question = "Describe the image."
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
return build_single_image_inputs([image], [prompt], wrapped_sf)
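
A self-contained sketch of the image-fetch step used by the new custom input (the URL is the one from the diff above; any reachable image URL behaves the same way):

    from io import BytesIO

    import requests
    from PIL import Image

    def fetch_image(url: str) -> Image.Image:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        return Image.open(BytesIO(resp.content)).convert("RGB")

    # e.g. fetch_image("https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg")
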

View File

@ -6,16 +6,15 @@ typically specific to a small subset of models.
import re
import types
from pathlib import PosixPath
from typing import Callable, Optional, Union
from typing import Optional, Union
import torch
from PIL.Image import Image
from transformers import (AutoConfig, AutoTokenizer, BatchEncoding,
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
GenerationConfig)
from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import patch_padding_side
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from .....conftest import HfRunner, ImageAsset, _ImageAssets
from .types import RunnerOutput
@ -211,40 +210,6 @@ def get_llava_embeddings(image_assets: _ImageAssets):
return [asset.image_embeds for asset in image_assets]
####### postprocessors to run on HF BatchEncoding
def cast_dtype_post_processor(
hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
"""Gets a handle to a post processor which converts a given key into a
target data type."""
def process(hf_inputs: BatchEncoding, dtype: str):
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
hf_inputs[hf_inp_key] = hf_inputs[hf_inp_key].to(torch_dtype)
return hf_inputs
return process
def ignore_inputs_post_processor(
hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
"""Gets a handle to a post processor which ignores a given key."""
def process(hf_inputs: BatchEncoding, dtype: str):
del hf_inputs[hf_inp_key]
return hf_inputs
return process
def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
return {"model_inputs": hf_inputs}
def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str):
hf_inputs = cast_dtype_post_processor("images")(hf_inputs, dtype)
return {k: v.unsqueeze(0) for k, v in hf_inputs.items()}
####### Prompt path encoders for models that need models on disk
def qwen_prompt_path_encoder(
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset],
@ -295,8 +260,7 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
for k in inputs.keys() # noqa
if k not in ("seq_lens", "sft_format")
}
inputs = BatchEncoding(data=inputs, tensor_type="pt")
return inputs
return BatchFeature(data=inputs, tensor_type="pt")
hf_model.processor = processor
hf_model.model.get_output_embeddings = lambda: \
@ -529,10 +493,52 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return hf_model
def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def minicpmv_25_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
orig_generate = hf_model.model.generate
def _generate(self, *args, **kwargs):
def _generate(
self,
*args,
input_ids=None,
pixel_values=None,
image_sizes=None,
image_bound=None,
tgt_sizes=None,
**kwargs,
):
model_inputs = {
"input_ids": input_ids,
"pixel_values": pixel_values,
"image_sizes": image_sizes,
"image_bound": image_bound,
"tgt_sizes": tgt_sizes,
}
for k in list(model_inputs.keys()):
if model_inputs[k] is None:
model_inputs.pop(k)
return orig_generate(model_inputs, *args, decode_text=False, **kwargs)
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
return hf_model
def minicpmo_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
orig_generate = hf_model.model.generate
def _generate(self, *args, image_sizes=None, **kwargs):
return orig_generate(*args, decode_text=False, **kwargs)
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
return hf_model
def minicpmv_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
orig_generate = hf_model.model.generate
def _generate(self, *args, image_sizes=None, **kwargs):
return orig_generate(*args, decode_text=False, **kwargs)
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
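
A generic sketch of the types.MethodType patching trick the *_patch_hf_runner helpers use: rebind generate on a single instance so extra kwargs coming from the shared runner are repackaged or dropped. The FakeModel class is a stand-in for the HF model object.

    import types

    class FakeModel:
        def generate(self, *args, **kwargs):
            return ("generate", args, kwargs)

    model = FakeModel()
    orig_generate = model.generate

    def _generate(self, *args, image_sizes=None, **kwargs):
        # Drop image_sizes, which this model's generate() does not accept.
        return orig_generate(*args, **kwargs)

    model.generate = types.MethodType(_generate, model)
    assert model.generate(1, image_sizes=[(2, 3)]) == ("generate", (1,), {})
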
@ -551,10 +557,11 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
batch = {
k: kwargs.pop(k)
k: kwargs.pop(k).unsqueeze(0)
for k in ("input_ids", "images", "image_input_idx", "image_masks")
if k in kwargs
}
batch = BatchFeature(batch).to(dtype=self.dtype)
return self.generate_from_batch(
batch,

View File

@ -8,13 +8,12 @@ from typing import Any, Callable, NamedTuple, Optional, Union
import torch
from PIL.Image import Image
from pytest import MarkDecorator
from transformers import AutoModelForCausalLM, BatchEncoding
from transformers import AutoModelForCausalLM
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption
from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import identity
from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
from ....utils import check_logprobs_close
@ -110,11 +109,6 @@ class VLMTestInfo(NamedTuple):
# Indicates we should explicitly pass the EOS from the tokenizer
use_tokenizer_eos: bool = False
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
# Callable to pass to the HF runner to run on inputs; for now, we also pass
# the data type to input post processing, because almost all of the uses of
# postprocess_inputs are to fix the data types of BatchEncoding values.
postprocess_inputs: Callable[[BatchEncoding, str],
BatchEncoding] = identity
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None
# Post processors that if defined, will run oun the outputs of the
@ -130,7 +124,7 @@ class VLMTestInfo(NamedTuple):
# is all combinations of .models + all fields below
max_tokens: Union[int, tuple[int]] = 128
num_logprobs: Union[int, tuple[int]] = 5
dtype: Union[str, Iterable[str]] = "half"
dtype: Union[str, Union[list[str], tuple[str, ...]]] = "auto"
distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
# Only expanded in video tests
num_video_frames: Union[int, tuple[int]] = 16
@ -171,7 +165,6 @@ class VLMTestInfo(NamedTuple):
"vllm_output_post_proc": self.vllm_output_post_proc,
"auto_cls": self.auto_cls,
"use_tokenizer_eos": self.use_tokenizer_eos,
"postprocess_inputs": self.postprocess_inputs,
"comparator": self.comparator,
"get_stop_token_ids": self.get_stop_token_ids,
"hf_model_kwargs": self.hf_model_kwargs,

View File

@ -1,12 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
from functools import partial
from typing import Callable
import pytest
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import BatchEncoding, Qwen2VLForConditionalGeneration
from transformers import Qwen2VLForConditionalGeneration
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
@ -75,10 +75,6 @@ def apply_chat_template_and_add_eos(
return prompt
def postprocess_inputs(hf_model: HfRunner, inputs: BatchEncoding, **kwargs):
return hf_model.model.prepare_inputs_for_generation(**inputs, **kwargs)
def _run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
@ -118,14 +114,8 @@ def _run_test(
with hf_runner(model,
dtype=dtype,
auto_cls=Qwen2VLForConditionalGeneration) as hf_model:
hf_model.postprocess_inputs = partial(
postprocess_inputs,
hf_model,
cache_position=torch.arange(
0,
1, # 1 for batch size
requires_grad=False),
use_cache=False)
prompts = []
for text, image, embed_text in zip(input_texts, input_images,
embed_texts):
# dse requires non-standard input processing
@ -133,20 +123,34 @@ def _run_test(
messages = get_messages(image, text, embed_text)
prompt = apply_chat_template_and_add_eos(
messages, hf_model.processor.apply_chat_template)
inputs = hf_model.get_inputs(
prompts=[[prompt]],
images=[[image]],
prompts.append(prompt)
all_inputs = hf_model.get_inputs(
prompts=prompts,
images=input_images,
)
with torch.no_grad():
all_outputs = []
for inputs in all_inputs:
inputs = hf_model.model.prepare_inputs_for_generation(
**inputs,
cache_position=torch.arange(1), # 1 for batch size
use_cache=False,
)
outputs = hf_model.model(
**hf_model.wrap_device(inputs[0],
device=hf_model.model.device.type),
**hf_model.wrap_device(inputs),
return_dict=True,
output_hidden_states=True,
)
pooled_output = torch.nn.functional.normalize(
outputs.hidden_states[-1][0, -1], p=2, dim=-1)
hf_outputs.append(pooled_output.tolist())
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1],
p=2,
dim=-1)
all_outputs.append(pooled_output.tolist())
hf_outputs = all_outputs
check_embeddings_close(
embeddings_0_lst=hf_outputs,
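
A minimal sketch of the last-token embedding pooling shown above: take the final hidden state of the last token and L2-normalize it with F.normalize (tensor shapes are illustrative).

    import torch
    import torch.nn.functional as F

    hidden_states = torch.randn(1, 7, 16)          # (batch, seq_len, hidden)
    pooled = F.normalize(hidden_states[0, -1], p=2, dim=-1)
    assert torch.isclose(pooled.norm(), torch.tensor(1.0))
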

View File

@ -2,7 +2,7 @@
import pytest
import torch.nn.functional as F
from transformers import AutoModelForVision2Seq
from transformers import AutoModelForImageTextToText
from vllm.platforms import current_platform
@ -70,7 +70,7 @@ def _run_test(
vllm_outputs = vllm_model.encode(input_texts, images=input_images)
with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForVision2Seq) as hf_model:
auto_cls=AutoModelForImageTextToText) as hf_model:
# Patch the issue where generation_config.json is missing
hf_model.processor.patch_size = \
hf_model.model.config.vision_config.patch_size
@ -86,8 +86,7 @@ def _run_test(
for inputs in all_inputs:
# Based on: https://huggingface.co/royokong/e5-v
outputs = hf_model.model(
**hf_model.wrap_device(inputs,
device=hf_model.model.device.type),
**hf_model.wrap_device(inputs),
return_dict=True,
output_hidden_states=True,
)

View File

@ -53,8 +53,7 @@ def _run_test(
for inputs in all_inputs:
# Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
outputs = hf_model.model(
**hf_model.wrap_device(inputs,
device=hf_model.model.device.type),
**hf_model.wrap_device(inputs),
return_dict=True,
output_hidden_states=True,
)

View File

@ -4,8 +4,7 @@ from typing import Optional, overload
import pytest
import torch
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
BatchEncoding)
from transformers import AutoConfig, AutoModelForImageTextToText, AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.attention.backends.flash_attn import FlashAttentionMetadata
@ -216,7 +215,6 @@ def _run_test(
max_num_seqs=2,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
}) as vllm_model:
vllm_outputs_per_image = [
@ -227,14 +225,10 @@ def _run_test(
for prompts, images in inputs
]
def process(hf_inputs: BatchEncoding, **kwargs):
return hf_inputs
with hf_runner(model,
dtype=dtype,
model_kwargs={"device_map": "auto"},
postprocess_inputs=process,
auto_cls=AutoModelForVision2Seq) as hf_model:
auto_cls=AutoModelForImageTextToText) as hf_model:
hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
@ -430,7 +424,6 @@ def test_bnb_regression(
dtype=dtype,
max_model_len=4096,
max_num_seqs=2,
enforce_eager=True,
quantization="bitsandbytes",
load_format="bitsandbytes",
)
@ -486,7 +479,6 @@ def test_explicit_implicit_prompt(
max_model_len=4096,
max_num_seqs=2,
tensor_parallel_size=1,
enforce_eager=True,
)
sampling_params = SamplingParams(
temperature=0,
@ -518,7 +510,6 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
max_model_len=4096,
max_num_seqs=2,
tensor_parallel_size=1,
enforce_eager=True,
limit_mm_per_prompt={"image":
_LIMIT_IMAGE_PER_PROMPT}) as vllm_model:

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -192,9 +192,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
"TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
trust_remote_code=True),
"TeleFLMForCausalLM": _HfExamplesInfo("CofeAI/FLM-2-52B-Instruct-2407",
trust_remote_code=True),
"XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat",
is_available_online=False,
trust_remote_code=True),
"Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct",
min_transformers_version="4.49"),
# [Encoder-decoder]
"BartModel": _HfExamplesInfo("facebook/bart-base"),
"BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),

View File

@ -2,7 +2,7 @@
import warnings
from collections.abc import Sequence
from typing import Optional, Union
from typing import Any, Optional, Union
import torch
@ -254,9 +254,9 @@ def check_logprobs_close(
def build_model_context(
model_id: str,
task: TaskOption = "auto",
dtype: Optional[Union[str, torch.dtype]] = None,
mm_processor_kwargs: Optional[dict] = None,
limit_mm_per_prompt: Optional[dict] = None,
dtype: Union[str, torch.dtype] = "auto",
mm_processor_kwargs: Optional[dict[str, Any]] = None,
limit_mm_per_prompt: Optional[dict[str, int]] = None,
disable_mm_preprocessor_cache: bool = True,
):
"""Creates an InputContext for a given model.
@ -274,9 +274,6 @@ def build_model_context(
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
if dtype is None:
dtype = "half"
model_config = ModelConfig(
model_id,
task=task,

View File

@ -7,19 +7,25 @@ from unittest.mock import MagicMock
import numpy as np
import pytest
import torch
from transformers import ProcessorMixin
from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
MultiModalKwargsItem,
MultiModalSharedField)
# yapf conflicts with isort for this block
# yapf: disable
from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
PromptIndexTargets, PromptInsertion,
PromptReplacement, apply_text_matches,
ProcessingCache, PromptIndexTargets,
PromptInsertion, PromptReplacement,
apply_text_matches,
apply_token_matches,
find_mm_placeholders,
find_text_matches, find_token_matches,
iter_token_matches)
iter_token_matches,
replace_token_matches)
# yapf: enable
from vllm.multimodal.profiling import MultiModalProfiler
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
@ -89,6 +95,58 @@ def test_iter_token_matches(token_ids, match_ids, expected):
assert all(match_len == len(match_ids) for match_len in match_lens)
# yapf: disable
@pytest.mark.parametrize(
("token_ids", "match_ids", "new_ids", "expected"),
[
([], [], [-1], []),
([], [32000], [-1], []),
(
[32000, 32000, 32000],
[32000],
[-1],
[-1, -1, -1],
),
(
[32000, 32000, 32000],
[32000, 32000],
[-1],
[-1, 32000],
),
(
[32000, 32000, 32000],
[32000, 32000, 32000],
[-1],
[-1],
),
(
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
[28747, 32000],
[-1],
[9833, -1, 32000, 32000, 9833, -1, 32000, 918],
),
(
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
[28747, 32000, 32000, 32000],
[-1],
[9833, -1, 9833, 28747, 32000, 32000, 918],
),
(
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
[28747, 0, 32000],
[-1],
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
),
],
)
# yapf: enable
def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
result = replace_token_matches(token_ids, match_ids, new_ids)
# Manually constructed results
assert result == expected
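
A plausible reference implementation consistent with the parametrized cases above (greedy, left-to-right, non-overlapping replacement); the real replace_token_matches lives in vllm.multimodal.processing and may differ in details.

    def replace_token_matches_ref(token_ids: list, match_ids: list,
                                  new_ids: list) -> list:
        if not match_ids:
            return list(token_ids)
        out, i = [], 0
        n, m = len(token_ids), len(match_ids)
        while i < n:
            if token_ids[i:i + m] == match_ids:
                out.extend(new_ids)
                i += m
            else:
                out.append(token_ids[i])
                i += 1
        return out

    assert replace_token_matches_ref(
        [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
        [28747, 32000], [-1],
    ) == [9833, -1, 32000, 32000, 9833, -1, 32000, 918]
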
# yapf: disable
@pytest.mark.parametrize(
("prompt", "target_by_key", "expected_by_key"),
@ -837,6 +895,45 @@ def test_find_mm_placeholders(
assert result == expected
def _dummy_elem(modality: str, key: str, size: int):
return MultiModalFieldElem(
modality=modality,
key=key,
data=torch.empty((size, ), dtype=torch.int8),
field=MultiModalSharedField(1),
)
def _dummy_item(modality: str, size_by_key: dict[str, int]):
return MultiModalKwargsItem.from_elems([
_dummy_elem(modality, key, size) for key, size in size_by_key.items()
])
def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
return MultiModalKwargs.from_items([
_dummy_item(modality, size_by_key)
for modality, size_by_key in size_by_key_modality.items()
])
# yapf: disable
@pytest.mark.parametrize(
("item", "expected_size"),
[
(_dummy_item("a", {"a1": 100}), 100),
(_dummy_item("a", {"a1": 100, "a2": 110}), 210),
(_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501
],
)
# yapf: enable
def test_cache_item_size(item, expected_size):
cache = ProcessingCache.get_lru_cache(2048, type(item))
cache[""] = item
assert cache.currsize == expected_size
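
A hedged sketch of a size-aware LRU cache like the one this test exercises, built on the cachetools package rather than vLLM's ProcessingCache: currsize tracks the summed tensor byte sizes reported by getsizeof.

    import torch
    from cachetools import LRUCache

    def tensor_nbytes(item: dict) -> int:
        return sum(t.numel() * t.element_size() for t in item.values())

    cache: LRUCache = LRUCache(maxsize=2048, getsizeof=tensor_nbytes)
    cache["a"] = {"a1": torch.empty(100, dtype=torch.int8),
                  "a2": torch.empty(110, dtype=torch.int8)}
    assert cache.currsize == 210
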
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize(
("limit", "num_supported", "is_valid"),
@ -853,7 +950,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="half",
dtype="auto",
revision=None,
limit_mm_per_prompt=limit_mm_per_prompt,
)
@ -892,7 +989,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="half",
dtype="auto",
revision=None,
limit_mm_per_prompt=limit_mm_per_prompt,
)
@ -965,7 +1062,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="half",
dtype="auto",
revision=None,
)

View File

@ -314,7 +314,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
# Test edge cases
(1, 128, 16, 1024, 4, 2, 16, False), # large decode batch
(16, 4, 8, 8192, 48, 1, 128, True), # large prefill batch
(16, 4, 8, 1024, 4, 2, 128, True), # large prefill batch
(4, 12, 32, 2048, 16, 1, 32, True), # multi-head attention (MHA)
(4, 12, 32, 2048, 16, 16, 32, True), # multi-query attention (MQA)
])

View File

@ -6,7 +6,7 @@ from vllm.core.scheduler import Scheduler
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams
from vllm.v1.core.scheduler import Scheduler as V1Scheduler
from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine

View File

@ -15,6 +15,8 @@ from ..utils import compare_two_settings, create_new_process_for_each_test
models_4bit_to_test = [
("facebook/opt-125m", "quantize opt model inflight"),
("mistralai/Mistral-7B-Instruct-v0.3",
"quantize inflight model with both HF and Mistral format weights")
]
models_pre_qaunt_4bit_to_test = [

Some files were not shown because too many files have changed in this diff.