merge

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
[CI/Build] Add tests for the V1 tpu_model_runner. (#14843 )
2025-10-21 07:13:52 +08:00 · 2025-03-25 10:26:20 -07:00 · 2025-03-25 12:27:16 -04:00 · 2025-03-25 07:05:39 -07:00 · 2025-03-25 13:50:49 +00:00 · 2025-03-25 11:01:58 +00:00
404 changed files with 21589 additions and 6595 deletions
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -361,7 +361,7 @@ main() {
  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn of the reporting of the status of each request, to clean up the terminal output
-  export VLLM_LOG_LEVEL="WARNING"
+  export VLLM_LOGGING_LEVEL="WARNING"

  # prepare for benchmarking
  cd benchmarks || exit 1
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -82,7 +82,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@ -38,6 +38,8 @@ function cpu_tests() {
    set -e
    pip install -r vllm/requirements/test.txt
    pip install -r vllm/requirements/cpu.txt
+    pytest -v -s tests/kernels/test_cache.py -m cpu_model
+    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
    pytest -v -s tests/models/decoder_only/language -m cpu_model
    pytest -v -s tests/models/embedding/language -m cpu_model
    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@ -14,6 +14,7 @@ DOCKER_BUILDKIT=1 docker build . \
  -t gh200-test \
  --build-arg max_jobs=66 \
  --build-arg nvcc_threads=2 \
+  --build-arg RUN_WHEEL_CHECK=false \
  --build-arg torch_cuda_arch_list="9.0+PTX" \
  --build-arg vllm_fa_cmake_gpu_arches="90-real"

@ -23,6 +24,6 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and test offline inference
-docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
 '
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@ -1,16 +0,0 @@
-#!/bin/bash
-
-# This script build the OpenVINO docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Try building the docker image
-docker build -t openvino-test -f Dockerfile.openvino .
-
-# Setup cleanup
-remove_docker_container() { docker rm -f openvino-test || true; }
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@ -1,25 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# Build the docker image.
-docker build -f Dockerfile.tpu -t vllm-tpu .
-
-# Set up cleanup.
-remove_docker_container() { docker rm -f tpu-test || true; }
-trap remove_docker_container EXIT
-# Remove the container that might not be cleaned up in the previous run.
-remove_docker_container
-
-# For HF_TOKEN.
-source /etc/environment
-# Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it \
-    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
-    && python3 -m pip install pytest \
-    && python3 -m pip install lm_eval[api]==0.4.4 \
-    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
-    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
-    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
-    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
--- a/.buildkite/run-tpu-v1-test.sh
+++ b/.buildkite/run-tpu-v1-test.sh
@ -15,13 +15,26 @@ remove_docker_container
 source /etc/environment
 # Run a simple end-to-end example.
 docker run --privileged --net host --shm-size=16G -it \
-    -e "HF_TOKEN=$HF_TOKEN" -e "VLLM_USE_V1=1" --name tpu-test \
+    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
    && python3 -m pip install pytest \
    && python3 -m pip install lm_eval[api]==0.4.4 \
-    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+    && export VLLM_USE_V1=1 \
+    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
+    && echo TEST_1 \
+    && pytest /workspace/vllm/tests/tpu/test_compilation.py \
+    && echo TEST_2 \
    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
+    && echo TEST_3 \
    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
-    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
-    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
-    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
+    && echo TEST_4 \
+    && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+    && echo TEST_5 \
+    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
+    && echo TEST_6 \
+    && pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py" \
+
+
+# TODO: This test fails because it uses RANDOM_SEED sampling
+# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@ -12,10 +12,11 @@ docker build -t ${image_name} -f Dockerfile.xpu .

 # Setup cleanup
 remove_docker_container() { 
-  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true;
+  docker rm -f "${container_name}" || true; 
+  docker image rm -f "${image_name}" || true;
+  docker system prune -f || true;
 }
 trap remove_docker_container EXIT
-remove_docker_container

 # Run the image and test offline inference/tensor parallel
 docker run \
@ -25,6 +26,6 @@ docker run \
    --name "${container_name}" \
    "${image_name}" \
    sh -c '
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
+    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -118,7 +118,7 @@ steps:
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py  --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
  - pytest -v -s entrypoints/test_chat_utils.py
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

@ -136,6 +136,10 @@ steps:
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  commands:
+  # test with tp=2 and external_dp=2
+  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
@ -144,8 +148,8 @@ steps:
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
-  - python3 rlhf.py
-  - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+  - VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
+  - VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd

 - label: Metrics, Tracing Test # 10min
@ -295,6 +299,7 @@ steps:
  # these tests need to be separated, cannot combine
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py
+  - pytest -v -s compile/test_pass_manager.py

 - label: PyTorch Fullgraph Test # 18min
  source_file_dependencies:
@ -510,9 +515,7 @@ steps:
  - vllm/worker/model_runner.py
  - entrypoints/llm/test_collective_rpc.py
  commands:
-  - pytest -v -s entrypoints/llm/test_collective_rpc.py
-  - VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
-  - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
+  - VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
--- a/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
+++ b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
@ -1,28 +0,0 @@
-name: 🎲 Misc/random discussions that do not fit into the above categories.
-description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
-title: "[Misc]: "
-labels: ["misc"]
-
-body:
- type: markdown
-  attributes:
-    value: >
-      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
-  attributes:
-    label: Anything you want to discuss about vllm.
-    description: >
-      Anything you want to discuss about vllm.
-  validations:
-    required: true
- type: markdown
-  attributes:
-    value: >
-      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@ -1 +1,5 @@
 blank_issues_enabled: false
+contact_links:
+  - name: Questions
+    url: https://discuss.vllm.ai
+    about: Ask questions and discuss with other vLLM community members
--- a/.gitignore
+++ b/.gitignore
@ -2,7 +2,8 @@
 /vllm/_version.py

 # vllm-flash-attn built from source
-vllm/vllm_flash_attn/
+vllm/vllm_flash_attn/*
+!vllm/vllm_flash_attn/fa_utils.py

 # Byte-compiled / optimized / DLL files
 __pycache__/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -228,6 +228,7 @@ endif()

 set(VLLM_EXT_SRC
  "csrc/cache_kernels.cu"
+  "csrc/block_table.cu"
  "csrc/attention/paged_attention_v1.cu"
  "csrc/attention/paged_attention_v2.cu"
  "csrc/pos_encoding_kernels.cu"
--- a/87
+++ b/87
@ -14,17 +14,22 @@ ARG PYTHON_VERSION=3.12
 ARG TARGETPLATFORM
 ENV DEBIAN_FRONTEND=noninteractive

-# Install minimal dependencies and uv
-RUN apt-get update -y \
-    && apt-get install -y ccache git curl wget sudo \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh
-
-# Add uv to PATH
-ENV PATH="/root/.local/bin:$PATH"
-# Create venv with specified Python and activate by placing at the front of path
-ENV VIRTUAL_ENV="/opt/venv"
-RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+# Install Python and other dependencies
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get update -y \
+    && apt-get install -y ccache software-properties-common git curl sudo \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update -y \
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
+    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
+    && python3 --version && python3 -m pip --version
+# Install uv for faster pip installs
+RUN --mount=type=cache,target=/root/.cache/uv \
+    python3 -m pip install uv

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@ -46,19 +51,22 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

 WORKDIR /workspace

+# install build and runtime dependencies
+
 # arm64 (GH200) build follows the practice of "use existing pytorch" build,
 # we need to install torch and torchvision from the nightly builds first,
 # pytorch will not appear as a vLLM dependency in all of the following steps
 # after this step
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121";  \
+        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
+        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
    fi

 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -r requirements/cuda.txt
+    uv pip install --system -r requirements/cuda.txt

 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
@ -83,7 +91,7 @@ COPY requirements/build.txt requirements/build.txt
 ENV UV_HTTP_TIMEOUT=500

 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -r requirements/build.txt
+    uv pip install --system -r requirements/build.txt

 COPY . .
 ARG GIT_REPO_CHECK=0
@ -155,7 +163,7 @@ COPY requirements/lint.txt requirements/lint.txt
 COPY requirements/test.txt requirements/test.txt
 COPY requirements/dev.txt requirements/dev.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -r requirements/dev.txt
+    uv pip install --system -r requirements/dev.txt
 #################### DEV IMAGE ####################

 #################### vLLM installation IMAGE ####################
@ -171,18 +179,23 @@ ARG TARGETPLATFORM
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

-# Install minimal dependencies and uv
-RUN apt-get update -y \
-    && apt-get install -y ccache git curl wget sudo vim \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 libibverbs-dev \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh
-
-# Add uv to PATH
-ENV PATH="/root/.local/bin:$PATH"
-# Create venv with specified Python and activate by placing at the front of path
-ENV VIRTUAL_ENV="/opt/venv"
-RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+# Install Python and other dependencies
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get update -y \
+    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update -y \
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
+    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
+    && python3 --version && python3 -m pip --version
+# Install uv for faster pip installs
+RUN --mount=type=cache,target=/root/.cache/uv \
+    python3 -m pip install uv

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@ -200,13 +213,14 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 # after this step
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \
+        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
+        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
    fi

 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install dist/*.whl --verbose
+    uv pip install --system dist/*.whl --verbose

 # If we need to build FlashInfer wheel before its release:
 # $ export FLASHINFER_ENABLE_AOT=1
@ -221,8 +235,9 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl

 RUN --mount=type=cache,target=/root/.cache/uv \
+. /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-    uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
+    uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
 fi
 COPY examples examples

@ -232,7 +247,7 @@ COPY examples examples
 # TODO: Remove this once FlashInfer AOT wheel is fixed
 COPY requirements/build.txt requirements/build.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -r requirements/build.txt
+    uv pip install --system -r requirements/build.txt

 #################### vLLM installation IMAGE ####################

@ -249,15 +264,15 @@ ENV UV_HTTP_TIMEOUT=500

 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -r requirements/dev.txt
+    uv pip install --system -r requirements/dev.txt

 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -e tests/vllm_test_utils
+    uv pip install --system -e tests/vllm_test_utils

 # enable fast downloads from hf (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install hf_transfer
+    uv pip install --system hf_transfer
 ENV HF_HUB_ENABLE_HF_TRANSFER 1

 # Copy in the v1 package for testing (it isn't distributed yet)
@ -282,9 +297,9 @@ ENV UV_HTTP_TIMEOUT=500
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    else \
-        uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    fi

 ENV VLLM_USAGE_SOURCE production-docker-image
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@ -1,29 +0,0 @@
-# The vLLM Dockerfile is used to construct vLLM image that can be directly used
-# to run the OpenAI compatible server.
-
-FROM ubuntu:22.04 AS dev
-
-RUN apt-get update -y && \
-    apt-get install -y \
-        git python3-pip \
-        ffmpeg libsm6 libxext6 libgl1
-WORKDIR /workspace
-
-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
-
-RUN python3 -m pip install -U pip
-# install build requirements
-RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements/build.txt
-# build vLLM with OpenVINO backend
-RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
-
-COPY examples/ /workspace/examples
-COPY benchmarks/ /workspace/benchmarks
-
-# install development dependencies (for testing)
-RUN python3 -m pip install -e tests/vllm_test_utils
-
-CMD ["/bin/bash"]
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@ -1,37 +1,267 @@
-FROM mambaorg/micromamba
-ARG MAMBA_DOCKERFILE_ACTIVATE=1
-USER root
+ARG BASE_UBI_IMAGE_TAG=9.5-1741850109

-ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
+###############################################################
+# base stage with basic dependencies
+###############################################################

-RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev 
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base-builder

-# Some packages in requirements/cpu are installed here
-# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
-# Currently these may not be available for venv or pip directly
-RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
+ARG PYTHON_VERSION=3.12
+ARG OPENBLAS_VERSION=0.3.29
+
+# Set Environment Variables for venv, cargo & openblas
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH=${VIRTUAL_ENV}/bin:/root/.cargo/bin:$PATH
+ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig/
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64:/usr/local/lib:/usr/lib64:/usr/lib
+ENV UV_LINK_MODE=copy
+
+# install gcc-13, python, rust, openblas
+# Note: A symlink for libatomic.so is created for gcc-13 (linker fails to find libatomic otherwise - reqd. for sentencepiece)
+# Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel
+#       when `--jobs=<N>` is passed with podman build command
+RUN microdnf install -y openssl-devel dnf \
+    && dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
+        https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
+        https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
+    && dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os \
+    && dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/AppStream/`arch`/os \
+    && dnf config-manager --set-enabled crb \
+    && dnf install -y \
+       git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \
+       pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
+       libtiff-devel libjpeg-devel openjpeg2-devel zlib-devel \
+       freetype-devel lcms2-devel libwebp-devel tcl-devel tk-devel \
+       harfbuzz-devel fribidi-devel libraqm-devel libimagequant-devel libxcb-devel \
+       python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
+    && dnf clean all \
+    && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
+    && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
+    && python -m pip install -U pip uv \
+    && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python cmake ninja cython scikit_build_core scikit_build \
+    && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
+    && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
+    && cd /tmp && touch control
+
+###############################################################
+# Stage to build torch family
+###############################################################
+
+FROM base-builder AS torch-builder
+
+ARG MAX_JOBS
+ARG TORCH_VERSION=2.6.0
+ARG _GLIBCXX_USE_CXX11_ABI=1
+RUN --mount=type=cache,target=/root/.cache/uv \
+    source /opt/rh/gcc-toolset-13/enable &&  \
+    git clone --recursive https://github.com/pytorch/pytorch.git -b v${TORCH_VERSION} && \
+    cd pytorch && \
+    uv pip install -r requirements.txt && \
+    python setup.py develop && \
+    rm -f dist/torch*+git*whl && \
+    MAX_JOBS=${MAX_JOBS:-$(nproc)} \
+    PYTORCH_BUILD_VERSION=${TORCH_VERSION} PYTORCH_BUILD_NUMBER=1 uv build --wheel --out-dir /torchwheels/
+
+ARG TORCHVISION_VERSION=0.21.0
+ARG TORCHVISION_USE_NVJPEG=0
+ARG TORCHVISION_USE_FFMPEG=0
+RUN --mount=type=cache,target=/root/.cache/uv \
+    source /opt/rh/gcc-toolset-13/enable && \
+    git clone --recursive https://github.com/pytorch/vision.git -b v${TORCHVISION_VERSION} && \
+    cd vision && \
+    MAX_JOBS=${MAX_JOBS:-$(nproc)} \
+    BUILD_VERSION=${TORCHVISION_VERSION} \
+    uv build --wheel --out-dir /torchwheels/ --no-build-isolation
+
+ARG TORCHAUDIO_VERSION=2.6.0
+ARG BUILD_SOX=1
+ARG BUILD_KALDI=1
+ARG BUILD_RNNT=1
+ARG USE_FFMPEG=0
+ARG USE_ROCM=0
+ARG USE_CUDA=0
+ARG TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_FFMPEG=1
+RUN --mount=type=cache,target=/root/.cache/uv \
+    source /opt/rh/gcc-toolset-13/enable && \
+    git clone --recursive https://github.com/pytorch/audio.git -b v${TORCHAUDIO_VERSION} && \
+    cd audio && \
+    MAX_JOBS=${MAX_JOBS:-$(nproc)} \
+    BUILD_VERSION=${TORCHAUDIO_VERSION} \
+    uv build --wheel --out-dir /torchwheels/ --no-build-isolation
+
+###############################################################
+# Stage to build pyarrow
+###############################################################
+
+FROM base-builder AS arrow-builder
+
+ARG MAX_JOBS
+ARG PYARROW_PARALLEL
+ARG PYARROW_VERSION=19.0.1
+RUN --mount=type=cache,target=/root/.cache/uv \
+    source /opt/rh/gcc-toolset-13/enable && \
+    git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \
+    cd arrow/cpp && \
+    mkdir build && cd build && \
+    cmake -DCMAKE_BUILD_TYPE=release \
+        -DCMAKE_INSTALL_PREFIX=/usr/local \
+        -DARROW_PYTHON=ON \
+        -DARROW_BUILD_TESTS=OFF \
+        -DARROW_JEMALLOC=ON \
+        -DARROW_BUILD_STATIC="OFF" \
+        -DARROW_PARQUET=ON \
+        .. && \
+    make install -j ${MAX_JOBS:-$(nproc)} && \
+    cd ../../python/ && \
+    uv pip install -v -r requirements-wheel-build.txt && \
+    PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
+    python setup.py build_ext \
+    --build-type=release --bundle-arrow-cpp \
+    bdist_wheel --dist-dir /arrowwheels/
+
+###############################################################
+# Stage to build opencv
+###############################################################
+
+FROM base-builder AS cv-builder
+
+ARG MAX_JOBS
+ARG OPENCV_VERSION=84
+ARG ENABLE_HEADLESS=1
+RUN --mount=type=cache,target=/root/.cache/uv \
+    source /opt/rh/gcc-toolset-13/enable && \
+    git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \
+    cd opencv-python && \
+    sed -i 's/"setuptools==59.2.0",/"setuptools<70.0",/g' pyproject.toml && \
+    python -m build --wheel --installer=uv --outdir /opencvwheels/
+
+###############################################################
+# Stage to build vllm - this stage builds and installs
+# vllm, tensorizer and vllm-tgis-adapter and builds uv cache
+# for transitive dependencies - eg. grpcio
+###############################################################
+
+FROM base-builder AS vllmcache-builder
+
+COPY --from=torch-builder /tmp/control /dev/null
+COPY --from=arrow-builder /tmp/control /dev/null
+COPY --from=cv-builder /tmp/control /dev/null
+
+ARG VLLM_TARGET_DEVICE=cpu
+
+# this step installs vllm and populates uv cache
+# with all the transitive dependencies
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
+    --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
+    --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
+    --mount=type=bind,src=.,dst=/src/,rw \
+    source /opt/rh/gcc-toolset-13/enable && \
+    uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
+    sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
+    uv pip install pandas pythran pybind11 && \
+    # sentencepiece.pc is in some pkgconfig inside uv cache
+    export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
+    uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
+    cd /src/ && \
+    uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \
+    uv pip install /vllmwheel/*.whl
+
+
+###############################################################
+# Stage to build numactl
+###############################################################
+
+FROM base-builder AS numa-builder
+
+# Note: Building numactl with gcc-11. Compiling with gcc-13 in this builder stage will
+# trigger recompilation with gcc-11 (and require libtool) in the final stage where we do not have gcc-13
+ARG MAX_JOBS
+ARG NUMACTL_VERSION=2.0.19
+RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_VERSION} \
+    && cd numactl \
+    && autoreconf -i && ./configure \
+    && make -j ${MAX_JOBS:-$(nproc)}
+
+###############################################################
+# Stage to build lapack
+###############################################################
+
+FROM base-builder AS lapack-builder
+
+ARG MAX_JOBS
+ARG LAPACK_VERSION=3.12.1
+RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${LAPACK_VERSION} \
+    && cd lapack && source /opt/rh/gcc-toolset-13/enable \
+    && cmake -B build -S . \
+    && cmake --build build -j ${MAX_JOBS:-$(nproc)}
+
+
+###############################################################
+#                   FINAL VLLM IMAGE STAGE                    #
+###############################################################
+
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS vllm-openai
+
+ARG PYTHON_VERSION=3.12
+ARG OPENBLAS_VERSION=0.3.29
+
+# Set Environment Variables for venv & openblas
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH=${VIRTUAL_ENV}/bin:$PATH
+ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig/
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64:/usr/local/lib:/usr/lib64:/usr/lib
+ENV UV_LINK_MODE=copy
+
+# create artificial dependencies between stages for independent stages to build in parallel
+COPY --from=torch-builder /tmp/control /dev/null
+COPY --from=arrow-builder /tmp/control /dev/null
+COPY --from=cv-builder /tmp/control /dev/null
+COPY --from=vllmcache-builder /tmp/control /dev/null
+COPY --from=numa-builder /tmp/control /dev/null
+COPY --from=lapack-builder /tmp/control /dev/null
+
+# install gcc-11, python, openblas, numactl, lapack
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
+    --mount=type=bind,from=lapack-builder,source=/lapack/,target=/lapack/,rw \
+    rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
+    microdnf install --nodocs -y \
+    tar findutils openssl \
+    pkgconfig xsimd g++ gcc-fortran libsndfile \
+    libtiff libjpeg openjpeg2 zlib zeromq \
+    freetype lcms2 libwebp tcl tk utf8proc \
+    harfbuzz fribidi libraqm libimagequant libxcb \
+    python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
+    && microdnf clean all \
+    && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
+    && python -m pip install -U pip uv --no-cache \
+    && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
+    && make -C /numactl install \
+    && uv pip install cmake \
+    && cmake --install /lapack/build \
+    && uv pip uninstall cmake
+
+# consume previously built wheels (including vllm)
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
+    --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
+    --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
+    --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
+    HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /vllmwheel/*.whl

 COPY ./ /workspace/vllm
-
 WORKDIR /workspace/vllm
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi

-RUN --mount=type=cache,target=/root/.cache/pip  \
-    RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
-        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-        -r requirements/cpu.txt \
-        xformers uvloop==0.20.0
-
-RUN --mount=type=bind,source=.git,target=.git \
-    VLLM_TARGET_DEVICE=cpu python3 setup.py install
-
 # install development dependencies (for testing)
-RUN python3 -m pip install -e tests/vllm_test_utils
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install -e tests/vllm_test_utils

 WORKDIR /workspace/

 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

-ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@ -40,7 +40,7 @@ ARG USE_CYTHON
 RUN cd vllm \
    && python3 -m pip install -r requirements/rocm.txt \
    && python3 setup.py clean --all  \
-    && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
+    && if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \
    && python3 setup.py bdist_wheel --dist-dir=dist
 FROM scratch AS export_vllm
 ARG COMMON_WORKDIR
--- a/Dockerfile.rocm_base
+++ b/Dockerfile.rocm_base
@ -12,6 +12,8 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="b7d29fb"
 ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
+ARG AITER_BRANCH="21d47a9"
+ARG AITER_REPO="https://github.com/ROCm/aiter.git"

 FROM ${BASE_IMAGE} AS base

@ -129,8 +131,18 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl

+ARG AITER_REPO
+ARG AITER_BRANCH
+RUN git clone --recursive ${AITER_REPO}
+RUN cd aiter \
+    && git checkout ${AITER_BRANCH} \
+    && git submodule update --init --recursive \
+    && pip install -r requirements.txt \
+    && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter 
+
 ARG BASE_IMAGE
 ARG HIPBLASLT_BRANCH
+ARG HIPBLAS_COMMON_BRANCH
 ARG LEGACY_HIPBLASLT_OPTION
 ARG RCCL_BRANCH
 ARG RCCL_REPO
@ -155,4 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
    && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
    && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
-    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt
+    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
+    && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
+    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@ -1,11 +1,7 @@
-FROM intel/deep-learning-essentials:2025.0.1-0-devel-ubuntu22.04 AS vllm-base
+# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually.
+FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base

-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
-    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
-    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
-    chmod 644 /usr/share/keyrings/intel-graphics.gpg
+RUN rm /etc/apt/sources.list.d/intel-graphics.list

 RUN apt-get update -y && \
    apt-get install -y --no-install-recommends --fix-missing \
@ -21,8 +17,6 @@ RUN apt-get update -y && \
    python3 \
    python3-dev \
    python3-pip \
-    libze-intel-gpu-dev \
-    libze-intel-gpu1 \
    wget

 WORKDIR /workspace/vllm
--- a/README.md
+++ b/README.md
@ -10,17 +10,29 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>

 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>

+---
+
+[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
+
+[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
+
+---
+
 *Latest News* 🔥

- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29).
+- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
 - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
+
+<details>
+<summary>Previous News</summary>
+
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
 - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
 - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
@ -34,8 +46,9 @@ Easy, fast, and cheap LLM serving for everyone
 - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
 - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).

---
+</details>

+---
 ## About

 vLLM is a fast and easy-to-use library for LLM inference and serving.
@ -143,10 +156,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

 ## Contact Us

- For technical questions and feature requests, please use GitHub issues or discussions.
- For discussing with fellow users and coordinating contributions and development, please use Slack.
- For security disclosures, please use GitHub's security advisory feature.
- For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
+- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
+- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
+- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
+- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
+- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)

 ## Media Kit

--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -42,7 +42,7 @@ become available.
    </tr>
    <tr>
      <td><strong>HuggingFace</strong></td>
-      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">🟡</td>
      <td style="text-align: center;">🟡</td>
      <td>Specify your dataset path on HuggingFace</td>
    </tr>
@ -60,8 +60,8 @@ become available.
 🚧: to be supported

 🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
-similar to `lmms-lab/LLaVA-OneVision-Data`. If you need support for other dataset
-formats, please consider contributing.
+similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
+If you need support for other dataset formats, please consider contributing.

 **Note**: VisionArena’s `dataset-name` should be set to `hf`

@ -139,6 +139,57 @@ python3 vllm/benchmarks/benchmark_serving.py \
  --num-prompts "${NUM_PROMPTS}"
 ```

+### HuggingFaceDataset Examples
+
+Currently, HuggingFaceDataset only supports dataset formats
+similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
+formats, please consider contributing.
+
+```bash
+# need a model with vision capability here
+vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
+```
+
+**`lmms-lab/LLaVA-OneVision-Data`**
+
+```bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+BACKEND="openai-chat"
+DATASET_NAME="hf"
+DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
+DATASET_SPLIT='train'
+DATASET_SUBSET='chart2text(cauldron)'
+python3 vllm/benchmarks/benchmark_serving.py \
+  --backend "${BACKEND}" \
+  --model "${MODEL_NAME}" \
+  --endpoint "/v1/chat/completions" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --hf-split "${DATASET_SPLIT}" \
+  --num-prompts "${NUM_PROMPTS}" \
+  --hf-subset "${DATASET_SUBSET}"
+```
+
+**`Aeala/ShareGPT_Vicuna_unfiltered`**
+
+```bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+BACKEND="openai-chat"
+DATASET_NAME="hf"
+DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
+DATASET_SPLIT='train'
+python3 vllm/benchmarks/benchmark_serving.py \
+  --backend "${BACKEND}" \
+  --model "${MODEL_NAME}" \
+  --endpoint "/v1/chat/completions" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --hf-split "${DATASET_SPLIT}" \
+  --num-prompts "${NUM_PROMPTS}" \
+```
+
 ---
 ## Example - Offline Throughput Benchmark

--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -63,7 +63,7 @@ async def async_request_tgi(
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
            "truncate": request_func_input.prompt_len,
-            # TGI does not accept ignore_eos flag.
+            "ignore_eos_token": request_func_input.ignore_eos,
        }
        payload = {
            "inputs": request_func_input.prompt,
@ -71,6 +71,10 @@ async def async_request_tgi(
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
+        if request_func_input.ignore_eos:
+            output.output_tokens = request_func_input.output_len
+        else:
+            output.output_tokens = None

        ttft = 0.0
        st = time.perf_counter()
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -17,6 +17,7 @@ SampleRequest instances, similar to the approach used in ShareGPT.
 import base64
 import io
 import json
+import logging
 import random
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
@ -35,6 +36,8 @@ from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer

+logger = logging.getLogger(__name__)
+
 # -----------------------------------------------------------------------------
 # Data Classes
 # -----------------------------------------------------------------------------
@ -61,9 +64,6 @@ class SampleRequest:
 class BenchmarkDataset(ABC):
    DEFAULT_SEED = 0

-    # num_requests has default 1000 in both the benchmark_serving.py and
-    # benchmark_throughput.py
-
    def __init__(
        self,
        dataset_path: Optional[str] = None,
@ -90,8 +90,8 @@ class BenchmarkDataset(ABC):
            mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
        """
        Transform a prompt and optional multimodal content into a chat format.
-        This method is used for chat models that expect a specific 
-        conversation format.
+        This method is used for chat models that expect a specific conversation
+        format.
        """
        content = [{"text": prompt, "type": "text"}]
        if mm_content is not None:
@ -101,10 +101,10 @@ class BenchmarkDataset(ABC):
    def load_data(self) -> None:
        """
        Load data from the dataset path into self.data.
-        
+
        This method must be overridden by subclasses since the method to load
        data will vary depending on the dataset format and source.
-        
+
        Raises:
            NotImplementedError: If a subclass does not implement this method.
        """
@ -121,18 +121,18 @@ class BenchmarkDataset(ABC):
        """
        Optionally select a random LoRA request and return its associated
        tokenizer.
-        
+
        This method is used when LoRA parameters are provided.  It randomly
        selects a LoRA based on max_loras and retrieves a cached tokenizer for
        that LoRA if available. Otherwise, it returns the base tokenizer.
-        
+
        Args:
            tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
            LoRA is selected.  max_loras (Optional[int]): The maximum number of
            LoRAs available. If None, LoRA is not used.  lora_path
            (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA
            is not used.
-        
+
        Returns:
            tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
            element is a LoRARequest (or None if not applicable) and the second
@ -160,21 +160,39 @@ class BenchmarkDataset(ABC):
               num_requests: int) -> list[SampleRequest]:
        """
        Abstract method to generate sample requests from the dataset.
-        
+
        Subclasses must override this method to implement dataset-specific logic
        for generating a list of SampleRequest objects.
-        
+
        Args:
            tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
             for processing the dataset's text.
            num_requests (int): The number of sample requests to generate.
-        
+
        Returns:
            list[SampleRequest]: A list of sample requests generated from the
            dataset.
        """
        raise NotImplementedError("sample must be implemented in subclasses.")

+    def maybe_oversample_requests(self, requests: list[SampleRequest],
+                                  num_requests: int) -> None:
+        """
+        Oversamples the list of requests if its size is less than the desired
+        number.
+
+        Args:
+            requests (List[SampleRequest]): The current list of sampled
+            requests.  num_requests (int): The target number of requests.
+        """
+        if len(requests) < num_requests:
+            random.seed(self.random_seed)
+            additional = random.choices(requests,
+                                        k=num_requests - len(requests))
+            requests.extend(additional)
+            logger.info("Oversampled requests to reach %d total samples.",
+                        num_requests)
+

 # -----------------------------------------------------------------------------
 # Utility Functions and Global Caches
@ -276,15 +294,16 @@ class RandomDataset(BenchmarkDataset):
    ) -> None:
        super().__init__(**kwargs)

-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               prefix_len: int = DEFAULT_PREFIX_LEN,
-               range_ratio: float = DEFAULT_RANGE_RATIO,
-               input_len: int = DEFAULT_INPUT_LEN,
-               output_len: int = DEFAULT_OUTPUT_LEN,
-               **kwargs) -> list[SampleRequest]:
-
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        prefix_len: int = DEFAULT_PREFIX_LEN,
+        range_ratio: float = DEFAULT_RANGE_RATIO,
+        input_len: int = DEFAULT_INPUT_LEN,
+        output_len: int = DEFAULT_OUTPUT_LEN,
+        **kwargs,
+    ) -> list[SampleRequest]:
        vocab_size = tokenizer.vocab_size

        prefix_token_ids = (np.random.randint(
@ -346,20 +365,24 @@ class ShareGPTDataset(BenchmarkDataset):
        random.seed(self.random_seed)
        random.shuffle(self.data)

-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               lora_path: Optional[str] = None,
-               max_loras: Optional[int] = None,
-               output_len: Optional[int] = None,
-               enable_multimodal_chat: bool = False,
-               **kwargs) -> list:
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        lora_path: Optional[str] = None,
+        max_loras: Optional[int] = None,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
        samples: list = []
        for entry in self.data:
            if len(samples) >= num_requests:
                break
-            prompt, completion = entry["conversations"][0]["value"],\
-                entry["conversations"][1]["value"]
+            prompt, completion = (
+                entry["conversations"][0]["value"],
+                entry["conversations"][1]["value"],
+            )

            lora_request, tokenizer = self.get_random_lora_request(
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
@ -383,6 +406,7 @@ class ShareGPTDataset(BenchmarkDataset):
                    expected_output_len=new_output_len,
                    lora_request=lora_request,
                ))
+        self.maybe_oversample_requests(samples, num_requests)
        return samples


@ -415,19 +439,20 @@ class SonnetDataset(BenchmarkDataset):
        with open(self.dataset_path, encoding="utf-8") as f:
            self.data = f.readlines()

-    def sample(self,
-               tokenizer,
-               num_requests: int,
-               prefix_len: int = DEFAULT_PREFIX_LEN,
-               input_len: int = DEFAULT_INPUT_LEN,
-               output_len: int = DEFAULT_OUTPUT_LEN,
-               return_prompt_formatted: bool = False,
-               **kwargs) -> list:
+    def sample(
+        self,
+        tokenizer,
+        num_requests: int,
+        prefix_len: int = DEFAULT_PREFIX_LEN,
+        input_len: int = DEFAULT_INPUT_LEN,
+        output_len: int = DEFAULT_OUTPUT_LEN,
+        return_prompt_formatted: bool = False,
+        **kwargs,
+    ) -> list:
        # Calculate average token length for a poem line.
        tokenized_lines = [tokenizer(line).input_ids for line in self.data]
        avg_len = sum(len(tokens)
-                      for tokens in \
-                        tokenized_lines) / len(tokenized_lines)
+                      for tokens in tokenized_lines) / len(tokenized_lines)

        # Build the base prompt.
        base_prompt = "Pick as many lines as you can from these poem lines:\n"
@ -506,12 +531,14 @@ class BurstGPTDataset(BenchmarkDataset):
        # Convert the dataframe to a list of lists.
        return data.values.tolist()

-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               max_loras: Optional[int] = None,
-               lora_path: Optional[str] = None,
-               **kwargs) -> list[SampleRequest]:
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        max_loras: Optional[int] = None,
+        lora_path: Optional[str] = None,
+        **kwargs,
+    ) -> list[SampleRequest]:
        samples = []
        data = self._sample_loaded_data(num_requests=num_requests)
        for i in range(num_requests):
@ -544,7 +571,6 @@ class HuggingFaceDataset(BenchmarkDataset):
    Dataset class for processing a HuggingFace dataset with conversation data
    and optional images.
    """
-    DEFAULT_NUM_REQUESTS = 1000

    def __init__(
        self,
@ -618,6 +644,7 @@ class HuggingFaceDataset(BenchmarkDataset):
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                ))
+        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests


@ -632,7 +659,6 @@ class VisionArenaDataset(HuggingFaceDataset):
    """

    DEFAULT_OUTPUT_LEN = 128
-    DEFAULT_NUM_REQUESTS = 1000
    VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"

    def __init__(
@ -657,12 +683,14 @@ class VisionArenaDataset(HuggingFaceDataset):
        )
        self.data = dataset.shuffle(seed=self.random_seed)

-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               output_len: Optional[int] = None,
-               enable_multimodal_chat: bool = False,
-               **kwargs) -> list:
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        sampled_requests = []
@ -685,4 +713,5 @@ class VisionArenaDataset(HuggingFaceDataset):
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                ))
+        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -732,8 +732,11 @@ def main(args: argparse.Namespace):
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
        base_url = f"http://{args.host}:{args.port}"

-    tokenizer = get_tokenizer(tokenizer_id,
-                              trust_remote_code=args.trust_remote_code)
+    tokenizer = get_tokenizer(
+        tokenizer_id,
+        trust_remote_code=args.trust_remote_code,
+        tokenizer_mode=args.tokenizer_mode,
+    )

    if args.dataset == 'grammar':
        args.structure_type = 'guided_grammar'
@ -876,6 +879,13 @@ if __name__ == "__main__":
        help=
        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
+    parser.add_argument(
+        "--tokenizer-mode",
+        type=str,
+        default="auto",
+        help=
+        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+    )
    parser.add_argument(
        "--num-prompts",
        type=int,
@ -989,11 +999,12 @@ if __name__ == "__main__":
                        type=float,
                        default=1.0,
                        help="Ratio of Structured Outputs requests")
-    parser.add_argument("--structured-output-backend",
-                        type=str,
-                        choices=["outlines", "lm-format-enforcer", "xgrammar"],
-                        default="xgrammar",
-                        help="Backend to use for structured outputs")
+    parser.add_argument(
+        "--structured-output-backend",
+        type=str,
+        choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
+        default="xgrammar",
+        help="Backend to use for structured outputs")

    args = parser.parse_args()
    main(args)
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@ -0,0 +1,420 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from sglang quantization/tuning_block_wise_kernel.py
+
+import argparse
+import json
+import multiprocessing as mp
+import os
+import time
+from datetime import datetime
+from typing import Any
+
+import torch
+import tqdm
+import triton
+
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    _w8a8_block_fp8_matmul)
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
+
+mp.set_start_method("spawn", force=True)
+
+assert current_platform.is_cuda(
+), "Only support tune w8a8 block fp8 kernel on CUDA device."
+
+DTYPE_MAP = {
+    "float32": torch.float32,
+    "float16": torch.float16,
+    "half": torch.half,
+    "bfloat16": torch.bfloat16,
+}
+
+
+def w8a8_block_matmul(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: list[int],
+    config: dict[str, Any],
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    """This function performs matrix multiplication with 
+    block-wise quantization.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+
+    Args:
+        A: The input tensor, e.g., activation.
+        B: The input tensor, e.g., weight.
+        As: The per-token-group quantization scale for `A`.
+        Bs: The per-block quantization scale for `B`.
+        block_size: The block size for per-block quantization. 
+                    It should be 2-dim, e.g., [128, 128].
+        output_dytpe: The dtype of the returned tensor.
+
+    Returns:
+        torch.Tensor: The result of matmul.
+    """
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+
+    assert A.shape[-1] == B.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
+    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
+    M = A.numel() // A.shape[-1]
+
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    N, K = B.shape
+    assert triton.cdiv(N, block_n) == Bs.shape[0]
+    assert triton.cdiv(K, block_k) == Bs.shape[1]
+
+    C_shape = A.shape[:-1] + (N, )
+    C = A.new_empty(C_shape, dtype=output_dtype)
+
+    def grid(META):
+        return (triton.cdiv(M, META["BLOCK_SIZE_M"]) *
+                triton.cdiv(N, META["BLOCK_SIZE_N"]), )
+
+    if A.dtype == torch.float8_e4m3fn:
+        kernel = _w8a8_block_fp8_matmul
+    else:
+        raise RuntimeError(
+            "Currently, only support tune w8a8 block fp8 kernel.")
+
+    kernel[grid](
+        A,
+        B,
+        C,
+        As,
+        Bs,
+        M,
+        N,
+        K,
+        block_n,
+        block_k,
+        A.stride(-2),
+        A.stride(-1),
+        B.stride(1),
+        B.stride(0),
+        C.stride(-2),
+        C.stride(-1),
+        As.stride(-2),
+        As.stride(-1),
+        Bs.stride(1),
+        Bs.stride(0),
+        **config,
+    )
+
+    return C
+
+
+def get_configs_compute_bound():
+    configs = []
+    for num_stages in [2, 3, 4, 5]:
+        for block_m in [16, 32, 64, 128, 256]:
+            for block_k in [64, 128]:
+                for block_n in [32, 64, 128, 256]:
+                    for num_warps in [4, 8]:
+                        for group_size in [1, 16, 32, 64]:
+                            configs.append({
+                                "BLOCK_SIZE_M": block_m,
+                                "BLOCK_SIZE_N": block_n,
+                                "BLOCK_SIZE_K": block_k,
+                                "GROUP_SIZE_M": group_size,
+                                "num_warps": num_warps,
+                                "num_stages": num_stages,
+                            })
+    return configs
+
+
+def get_weight_shapes(tp_size):
+    # NOTE(HandH1998): The weight shapes only works for DeepSeek-V3.
+    # Modify them, if you tune for another different model.
+    # cannot TP
+    total = [
+        (512 + 64, 7168),
+        ((128 + 64) * 128, 7168),
+        (128 * (128 + 128), 512),
+        (7168, 16384),
+        (7168, 18432),
+    ]
+    # N can TP
+    n_tp = [
+        (18432 * 2, 7168),
+        ((128 + 64) * 128, 7168),
+        (128 * (128 + 128), 512),
+        (24576, 1536),
+        (12288, 7168),
+        (4096, 7168),
+    ]
+    # K can TP
+    k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)]
+
+    weight_shapes = []
+    for t in total:
+        weight_shapes.append(t)
+    for n_t in n_tp:
+        new_t = (n_t[0] // tp_size, n_t[1])
+        weight_shapes.append(new_t)
+    for k_t in k_tp:
+        new_t = (k_t[0], k_t[1] // tp_size)
+        weight_shapes.append(new_t)
+    return weight_shapes
+
+
+def benchmark_config(A,
+                     B,
+                     As,
+                     Bs,
+                     block_size,
+                     config,
+                     out_dtype=torch.float16,
+                     num_iters=10):
+
+    def run():
+        w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
+
+    torch.cuda.synchronize()
+    # JIT complication & warmup
+    for _ in range(5):
+        run()
+    torch.cuda.synchronize()
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    latencies: list[float] = []
+    for i in range(num_iters):
+        torch.cuda.synchronize()
+        start_event.record()
+        run()
+        end_event.record()
+        end_event.synchronize()
+        latencies.append(start_event.elapsed_time(end_event))
+    avg = sum(latencies) / (num_iters * 10) * 1000  # us
+    return avg
+
+
+def tune(M, N, K, block_size, out_dtype, search_space, input_type):
+    factor_for_scale = 1e-2
+
+    if input_type == "fp8":
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        A_fp32 = (
+            (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
+            fp8_max)
+        A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        B_fp32 = (
+            (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
+            fp8_max)
+        B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    else:
+        raise RuntimeError(
+            "Currently, only support tune w8a8 block fp8 kernel.")
+
+    block_n, block_k = block_size[0], block_size[1]
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+
+    As = torch.rand(M, k_tiles, dtype=torch.float32,
+                    device="cuda") * factor_for_scale
+    Bs = (torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda") *
+          factor_for_scale)
+
+    best_config = None
+    best_time = float("inf")
+    for config in tqdm(search_space):
+        try:
+            kernel_time = benchmark_config(
+                A,
+                B,
+                As,
+                Bs,
+                block_size,
+                config,
+                out_dtype,
+                num_iters=10,
+            )
+        except triton.runtime.autotuner.OutOfResources:
+            # Some configurations may be invalid and fail to compile.
+            continue
+
+        if kernel_time < best_time:
+            best_time = kernel_time
+            best_config = config
+    now = datetime.now()
+    print(f"{now.ctime()}] Completed tuning for batch_size={M}")
+    assert best_config is not None
+    return best_config
+
+
+def save_configs(
+    N,
+    K,
+    block_n,
+    block_k,
+    configs,
+    save_path,
+    input_type="fp8",
+) -> None:
+    os.makedirs(save_path, exist_ok=True)
+    device_name = current_platform.get_device_name().replace(" ", "_")
+    json_file_name = (
+        f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8,"
+        f"block_shape=[{block_n},{block_k}].json")
+
+    config_file_path = os.path.join(save_path, json_file_name)
+    print(f"Writing best config to {config_file_path}...")
+
+    with open(config_file_path, "w") as f:
+        json.dump(configs, f, indent=4)
+        f.write("\n")
+
+
+def tune_on_gpu(args_dict):
+    """Run tuning on a specific GPU."""
+    gpu_id = args_dict["gpu_id"]
+    batch_sizes = args_dict["batch_sizes"]
+    weight_shapes = args_dict["weight_shapes"]
+    args = args_dict["args"]
+
+    torch.cuda.set_device(gpu_id)
+    print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
+
+    block_n = args.block_n
+    block_k = args.block_k
+    out_dtype = DTYPE_MAP[args.out_dtype]
+    save_path = args.save_path
+    input_type = args.input_type
+
+    search_space = get_configs_compute_bound()
+    search_space = [
+        config for config in search_space
+        if block_k % config["BLOCK_SIZE_K"] == 0
+    ]
+
+    start = time.time()
+    for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
+        N, K = shape[0], shape[1]
+        print(f"[GPU {gpu_id}] Tune for weight shape of `N: {N}, K: {K}`")
+        benchmark_results = [
+            tune(
+                batch_size,
+                N,
+                K,
+                [block_n, block_k],
+                out_dtype,
+                search_space,
+                input_type,
+            ) for batch_size in tqdm(batch_sizes,
+                                     desc=f"GPU {gpu_id} - Batch sizes")
+        ]
+        best_configs = {
+            M: config
+            for M, config in zip(batch_sizes, benchmark_results)
+        }
+        save_configs(N, K, block_n, block_k, best_configs, save_path,
+                     input_type)
+
+    end = time.time()
+    print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
+
+
+def distribute_batch_sizes(batch_sizes, num_gpus):
+    """Distribute batch sizes across available GPUs."""
+    batches_per_gpu = []
+    for i in range(num_gpus):
+        start_idx = i * len(batch_sizes) // num_gpus
+        end_idx = (i + 1) * len(batch_sizes) // num_gpus
+        batches_per_gpu.append(batch_sizes[start_idx:end_idx])
+    return batches_per_gpu
+
+
+def main(args):
+    print(args)
+    num_gpus = torch.cuda.device_count()
+    if num_gpus == 0:
+        raise RuntimeError("No GPU available for tuning")
+    print(f"Found {num_gpus} GPUs for parallel tuning")
+
+    torch.cuda.init()
+
+    if args.batch_size is None:
+        batch_sizes = [
+            1,
+            2,
+            4,
+            8,
+            16,
+            24,
+            32,
+            48,
+            64,
+            96,
+            128,
+            256,
+            512,
+            1024,
+            1536,
+            2048,
+            3072,
+            4096,
+        ]
+    else:
+        batch_sizes = [args.batch_size]
+        num_gpus = 1  # If only one batch size, use only one GPU
+
+    weight_shapes = get_weight_shapes(args.tp_size)
+
+    batches_per_gpu = distribute_batch_sizes(batch_sizes, num_gpus)
+
+    process_args = []
+    for gpu_id in range(num_gpus):
+        process_args.append({
+            "gpu_id": gpu_id,
+            "batch_sizes": batches_per_gpu[gpu_id],
+            "weight_shapes":
+            weight_shapes,  # Each GPU processes all weight shapes
+            "args": args,
+        })
+
+    ctx = mp.get_context("spawn")
+    with ctx.Pool(num_gpus) as pool:
+        pool.map(tune_on_gpu, process_args)
+
+    print("Multi-GPU tuning completed")
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description="""
+Tune triton w8a8 block fp8 for DeepSeek-V3/DeepSeek-R1:
+    python3 benchmark_w8a8_block_fp8.py --tp-size 8 --input-type fp8
+Then copy to model_executor/layers/quantization/utils/configs
+        """,
+        formatter_class=argparse.RawTextHelpFormatter)
+
+    parser.add_argument("--tp-size", "-tp", type=int, default=8)
+    parser.add_argument("--input-type",
+                        type=str,
+                        choices=["fp8"],
+                        default="fp8")
+    parser.add_argument(
+        "--out-dtype",
+        type=str,
+        choices=["float32", "float16", "bfloat16", "half"],
+        default="float16",
+    )
+    parser.add_argument("--block-n", type=int, default=128)
+    parser.add_argument("--block-k", type=int, default=128)
+    parser.add_argument("--batch-size", type=int, required=False)
+    parser.add_argument("--save-path", type=str, default="./")
+    args = parser.parse_args()
+
+    main(args)
--- a/benchmarks/run_structured_output_benchmark.sh
+++ b/benchmarks/run_structured_output_benchmark.sh
@ -54,6 +54,7 @@ for qps in "${QPS_VALUES[@]}"; do
  python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
    --request-rate $qps \
    --result-filename "$FILENAME" \
+    --tokenizer-mode ${TOKENIZER_MODE:-"auto"} \
    --port ${PORT:-8000}

  echo "Completed benchmark with QPS: $qps"
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -190,6 +190,7 @@ set(VLLM_EXT_SRC
    "csrc/cpu/cache.cpp"
    "csrc/cpu/utils.cpp"
    "csrc/cpu/layernorm.cpp"
+    "csrc/cpu/mla_decode.cpp"
    "csrc/cpu/pos_encoding.cpp"
    "csrc/cpu/torch_bindings.cpp")

--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 9bfa9869829d8c593527eb34c5271d0090f7ccc9 
+          GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/csrc/block_table.cu
+++ b/csrc/block_table.cu
@ -0,0 +1,92 @@
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+namespace vllm {
+__global__ void append_kernel(const int* __restrict__ row_indices,
+                              const int* __restrict__ cu_num_appends,
+                              const int* __restrict__ block_ids,
+                              int* __restrict__ block_table,
+                              int max_num_blocks_per_row) {
+  int bid = blockIdx.x;
+  int tgt_row = row_indices[2 * bid];
+  int tgt_offset = row_indices[2 * bid + 1];
+
+  int start = cu_num_appends[bid];
+  int end = cu_num_appends[bid + 1];
+  int length = end - start;
+  int tid = threadIdx.x;
+  int64_t offset = tgt_row * max_num_blocks_per_row + tgt_offset;
+  for (int i = tid; i < length; i += blockDim.x) {
+    block_table[offset + i] = block_ids[start + i];
+  }
+}
+
+__global__ void move_kernel(const int* __restrict__ src_dst_n,
+                            int* __restrict__ block_table,
+                            int max_num_blocks_per_row) {
+  int bid = blockIdx.x;
+  int src_row = src_dst_n[3 * bid];
+  int tgt_row = src_dst_n[3 * bid + 1];
+  int num_blocks = src_dst_n[3 * bid + 2];
+
+  int tid = threadIdx.x;
+  for (int i = tid; i < num_blocks; i += blockDim.x) {
+    block_table[tgt_row * max_num_blocks_per_row + i] =
+        block_table[src_row * max_num_blocks_per_row + i];
+  }
+}
+}  // namespace vllm
+
+void block_table_appends(
+    torch::Tensor& append_row_indices,
+    torch::Tensor& append_row_indices_cpu,
+    torch::Tensor& append_cumsums,
+    torch::Tensor& append_cumsums_cpu,
+    torch::Tensor& append_block_ids,
+    torch::Tensor& append_block_ids_cpu,
+    torch::Tensor& block_table,
+    int64_t num_appends,
+    int64_t total_num_append_blocks) {
+  int* append_row_indices_ptr = append_row_indices.data_ptr<int>();
+  const int* append_row_indices_cpu_ptr = append_row_indices_cpu.data_ptr<int>();
+  int* append_cumsums_ptr = append_cumsums.data_ptr<int>();
+  const int* append_cumsums_cpu_ptr = append_cumsums_cpu.data_ptr<int>();
+  int* append_block_ids_ptr = append_block_ids.data_ptr<int>();
+  const int* append_block_ids_cpu_ptr = append_block_ids_cpu.data_ptr<int>();
+  int* block_table_ptr = block_table.data_ptr<int>();
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(block_table));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  cudaMemcpyAsync(append_row_indices_ptr, append_row_indices_cpu_ptr,
+                  num_appends * 2 * sizeof(int), cudaMemcpyHostToDevice, stream);
+  cudaMemcpyAsync(append_cumsums_ptr, append_cumsums_cpu_ptr,
+                  (num_appends + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+  cudaMemcpyAsync(append_block_ids_ptr, append_block_ids_cpu_ptr,
+                  total_num_append_blocks * sizeof(int), cudaMemcpyHostToDevice, stream);
+
+  int64_t max_num_blocks_per_row = block_table.size(1);
+  vllm::append_kernel<<<num_appends, 1024, 0, stream>>>(
+      append_row_indices_ptr, append_cumsums_ptr, append_block_ids_ptr,
+      block_table_ptr, max_num_blocks_per_row);
+}
+
+void block_table_moves(
+    torch::Tensor& src_dst_n,
+    torch::Tensor& src_dst_n_cpu,
+    torch::Tensor& block_table,
+    int64_t num_moves) {
+  int* src_dst_n_ptr = src_dst_n.data_ptr<int>();
+  const int* src_dst_n_cpu_ptr = src_dst_n_cpu.data_ptr<int>();
+  int* block_table_ptr = block_table.data_ptr<int>();
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(block_table));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  cudaMemcpyAsync(src_dst_n_ptr, src_dst_n_cpu_ptr,
+                  num_moves * 3 * sizeof(int), cudaMemcpyHostToDevice, stream);
+
+  int64_t max_num_blocks_per_row = block_table.size(1);
+  vllm::move_kernel<<<num_moves, 1024, 0, stream>>>(
+      src_dst_n_ptr, block_table_ptr, max_num_blocks_per_row);
+}
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@ -350,8 +350,8 @@ __global__ void concat_and_cache_mla_kernel(

 }  // namespace vllm

-// KV_T is the stored data type of kv-cache.
-// CACHE_T is the data type of key and value tensors.
+// KV_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
 #define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE)               \
  vllm::reshape_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE>             \
@ -393,8 +393,8 @@ void reshape_and_cache(
                             CALL_RESHAPE_AND_CACHE)
 }

-// KV_T is the stored data type of kv-cache.
-// CACHE_T is the data type of key and value tensors.
+// KV_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
 #define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE)         \
  vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE>       \
@ -446,8 +446,8 @@ void reshape_and_cache_flash(
                             CALL_RESHAPE_AND_CACHE_FLASH);
 }

-// KV_T is the stored data type of kv-cache.
-// CACHE_T is the data type of key and value tensors.
+// KV_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
 #define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE)              \
  vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE>            \
--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
@ -88,6 +88,48 @@ void reshape_and_cache_cpu_impl(
 }
 };  // namespace

+template <typename scalar_t>
+void concat_and_cache_mla_cpu_impl(
+    const scalar_t* __restrict__ kv_c,  // [num_tokens, kv_lora_rank]
+    const scalar_t* __restrict__ k_pe,  // [num_tokens, pe_dim]
+    scalar_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank
+                                      // + pe_dim)]
+    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
+    const int num_tokens,                      //
+    const int block_stride,                    //
+    const int entry_stride,                    //
+    const int kv_c_stride,                     //
+    const int k_pe_stride,                     //
+    const int kv_lora_rank,                    //
+    const int pe_dim,                          //
+    const int block_size                       //
+) {
+#pragma omp parallel for
+  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
+    const int64_t slot_idx = slot_mapping[token_idx];
+    // NOTE: slot_idx can be -1 if the token is padded
+    if (slot_idx < 0) {
+      continue;
+    }
+    const int64_t block_idx = slot_idx / block_size;
+    const int64_t block_offset = slot_idx % block_size;
+
+    auto copy = [&](const scalar_t* __restrict__ src,
+                    scalar_t* __restrict__ dst, int src_stride, int dst_stride,
+                    int size, int offset) {
+      for (int i = 0; i < size; i++) {
+        const int64_t src_idx = token_idx * src_stride + i;
+        const int64_t dst_idx =
+            block_idx * block_stride + block_offset * entry_stride + i + offset;
+        dst[dst_idx] = src[src_idx];
+      }
+    };
+
+    copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
+    copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
+  }
+}
+
 // Note: the key_caches and value_caches vectors are constant but
 // not the Tensors they contain. The vectors need to be const refs
 // in order to satisfy pytorch's C++ operator registration code.
@ -134,6 +176,38 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
  });
 }

+void concat_and_cache_mla(
+    torch::Tensor& kv_c,          // [num_tokens, kv_lora_rank]
+    torch::Tensor& k_pe,          // [num_tokens, pe_dim]
+    torch::Tensor& kv_cache,      // [num_blocks, block_size, (kv_lora_rank +
+                                  // pe_dim)]
+    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
+    const std::string& kv_cache_dtype, torch::Tensor& scale) {
+  int num_tokens = slot_mapping.size(0);
+  int kv_lora_rank = kv_c.size(1);
+  int pe_dim = k_pe.size(1);
+  int block_size = kv_cache.size(1);
+
+  TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
+  TORCH_CHECK(kv_cache_dtype != "fp8");
+
+  int kv_c_stride = kv_c.stride(0);
+  int k_pe_stride = k_pe.stride(0);
+  int block_stride = kv_cache.stride(0);
+  int entry_stride = kv_cache.stride(1);
+
+  VLLM_DISPATCH_FLOATING_TYPES(
+      kv_c.scalar_type(), "concat_and_cache_mla_cpu_impl", [&] {
+        CPU_KERNEL_GUARD_IN(concat_and_cache_mla_cpu_impl)
+        concat_and_cache_mla_cpu_impl<scalar_t>(
+            kv_c.data_ptr<scalar_t>(), k_pe.data_ptr<scalar_t>(),
+            kv_cache.data_ptr<scalar_t>(), slot_mapping.data_ptr<int64_t>(),
+            num_tokens, block_stride, entry_stride, kv_c_stride, k_pe_stride,
+            kv_lora_rank, pe_dim, block_size);
+        CPU_KERNEL_GUARD_OUT(concat_and_cache_mla_cpu_impl)
+      });
+}
+
 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                 const torch::Tensor& block_mapping) {
  TORCH_CHECK(false, "swap_blocks is unsupported on CPU.")
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@ -130,6 +130,8 @@ struct BF16Vec32 : public Vec<BF16Vec32> {

  __m512i reg;

+  explicit BF16Vec32() : reg(_mm512_setzero_si512()) {}
+
  explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}

  explicit BF16Vec32(__m512i data) : reg(data) {}
--- a/csrc/cpu/mla_decode.cpp
+++ b/csrc/cpu/mla_decode.cpp
@ -0,0 +1,393 @@
+#include "cpu_types.hpp"
+#include <float.h>
+
+namespace {
+template <typename scalar_t>
+struct KernelVecType {
+  using qk_load_vec_type = void;
+  using qk_vec_type = void;
+  using v_load_vec_type = void;
+};
+
+template <>
+struct KernelVecType<float> {
+  using qk_load_vec_type = vec_op::FP32Vec16;
+  using qk_vec_type = vec_op::FP32Vec16;
+  using v_load_vec_type = vec_op::FP32Vec16;
+};
+
+template <>
+struct KernelVecType<c10::Half> {
+#if defined(__powerpc64__) || defined(__s390x__)
+  // Power and s390x architecture-specific vector types
+  using qk_load_vec_type = vec_op::FP32Vec16;
+  using qk_vec_type = vec_op::FP32Vec16;
+  using v_load_vec_type = vec_op::FP32Vec16;
+#else
+  // Fallback for other architectures, including x86
+  using qk_load_vec_type = vec_op::FP16Vec16;
+  using qk_vec_type = vec_op::FP32Vec16;
+  using v_load_vec_type = vec_op::FP16Vec16;
+#endif
+};
+
+#ifdef __AVX512BF16__
+template <>
+struct KernelVecType<c10::BFloat16> {
+  using qk_load_vec_type = vec_op::BF16Vec32;
+  using qk_vec_type = vec_op::BF16Vec32;
+  using v_load_vec_type = vec_op::BF16Vec16;
+};
+#elif defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
+// pass
+#else
+template <>
+struct KernelVecType<c10::BFloat16> {
+  using qk_load_vec_type = vec_op::BF16Vec16;
+  using qk_vec_type = vec_op::FP32Vec16;
+  using v_load_vec_type = vec_op::BF16Vec16;
+};
+#endif
+
+template <int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE, int HEAD_UNROLL,
+          typename qk_vec_type>
+void mla_decode_block_head(
+    const qk_vec_type* __restrict__ q_vecs,          // [HEAD_UNROLL, head_dim]
+    const qk_vec_type* __restrict__ k_vecs,          // [block_size, head_dim]
+    const vec_op::FP32Vec16* __restrict v_vecs_f32,  // [block_size, v_head_dim]
+    float* __restrict__ acc_out,  // [HEAD_UNROLL, v_head_dim]
+    float* __restrict__ acc_lse,  // [HEAD_UNROLL]
+    const float scale, const int num_tokens) {
+  using f32_vec_type = vec_op::FP32Vec16;
+  constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
+  constexpr int V_NUM_ELEM = f32_vec_type::VEC_ELEM_NUM;
+
+  float logits[BLOCK_SIZE][HEAD_UNROLL] = {};  // initialize to zeros
+  float max_val[HEAD_UNROLL];
+  std::fill(max_val, max_val + HEAD_UNROLL, -FLT_MAX);
+
+  f32_vec_type acc_vec[BLOCK_SIZE][HEAD_UNROLL];
+  for (int i = 0; i < HEAD_DIM; i += QK_NUM_ELEM) {
+    // load to registers
+    qk_vec_type q_vec[HEAD_UNROLL];
+
+#pragma unroll
+    for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
+      q_vec[unroll] =
+          qk_vec_type{q_vecs[(i + unroll * HEAD_DIM) / QK_NUM_ELEM]};
+
+    for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
+      qk_vec_type k_vec(k_vecs[(block_offset * HEAD_DIM + i) / QK_NUM_ELEM]);
+
+#pragma unroll
+      for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
+        vec_op::fma(acc_vec[block_offset][unroll], q_vec[unroll], k_vec);
+    }
+  }
+
+  for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
+#pragma unroll
+    for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
+      const float acc = acc_vec[block_offset][unroll].reduce_sum() * scale;
+      logits[block_offset][unroll] = acc;
+      max_val[unroll] = std::max(max_val[unroll], acc);
+    }
+  }
+
+  float sum_exp[HEAD_UNROLL] = {};
+  for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
+#pragma unroll
+    for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
+      const float val =
+          std::exp(logits[block_offset][unroll] - max_val[unroll]);
+      logits[block_offset][unroll] = val;
+      sum_exp[unroll] += val;
+    }
+  }
+
+  f32_vec_type this_out[V_HEAD_DIM / V_NUM_ELEM][HEAD_UNROLL];
+
+  for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
+    // load to registers
+    f32_vec_type scale_[HEAD_UNROLL];
+
+#pragma unroll
+    for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
+      scale_[unroll] =
+          f32_vec_type{logits[block_offset][unroll] / sum_exp[unroll]};
+
+    for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) {
+      f32_vec_type v_vec(
+          v_vecs_f32[(block_offset * HEAD_DIM + i) / V_NUM_ELEM]);
+
+#pragma unroll
+      for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
+        vec_op::fma(this_out[i / V_NUM_ELEM][unroll], v_vec, scale_[unroll]);
+    }
+  }
+
+  // merge attention state
+  // section 2.2 in https://arxiv.org/pdf/2501.01005
+  f32_vec_type prev_scale[HEAD_UNROLL];
+  f32_vec_type curr_scale[HEAD_UNROLL];
+
+#pragma unroll
+  for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
+    const float prev_lse = acc_lse[unroll];
+    const float curr_lse = std::log(sum_exp[unroll]) +
+                           max_val[unroll];  // add back max_val to get true lse
+    // softmax trick
+    const float max_lse = std::max(prev_lse, curr_lse);
+    const float prev_sum_exp = std::exp(prev_lse - max_lse);
+    const float curr_sum_exp = std::exp(curr_lse - max_lse);
+
+    const float new_sum_exp = prev_sum_exp + curr_sum_exp;
+    acc_lse[unroll] = std::log(new_sum_exp) + max_lse;
+
+    prev_scale[unroll] = f32_vec_type{prev_sum_exp / new_sum_exp};
+    curr_scale[unroll] = f32_vec_type{curr_sum_exp / new_sum_exp};
+  }
+
+  for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) {
+#pragma unroll
+    for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
+      f32_vec_type o_vec(acc_out + i + V_HEAD_DIM * unroll);
+      o_vec = o_vec * prev_scale[unroll] +
+              this_out[i / V_NUM_ELEM][unroll] * curr_scale[unroll];
+      o_vec.save(acc_out + i + V_HEAD_DIM * unroll);
+    }
+  }
+
+  q_vecs += HEAD_DIM / QK_NUM_ELEM * HEAD_UNROLL;
+  acc_out += V_HEAD_DIM * HEAD_UNROLL;
+}
+
+template <typename scalar_t, int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE,
+          typename qk_vec_type>
+void mla_decode_block(
+    const qk_vec_type* __restrict__ q_vecs,  // [num_heads, head_dim]
+    const scalar_t* __restrict__ kv_cache,   // [block_size, head_dim]
+    float* __restrict__ acc_out,             // [num_heads, v_head_dim]
+    float* __restrict__ acc_lse,             // [num_heads]
+    const int num_heads, const float scale, const int num_tokens) {
+  using qk_load_vec_type = typename KernelVecType<scalar_t>::qk_load_vec_type;
+  static_assert(
+      std::is_same<qk_vec_type,
+                   typename KernelVecType<scalar_t>::qk_vec_type>::value);
+  using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
+  using f32_vec_type = vec_op::FP32Vec16;
+  static_assert(qk_load_vec_type::VEC_ELEM_NUM == qk_vec_type::VEC_ELEM_NUM);
+  static_assert(v_load_vec_type::VEC_ELEM_NUM == f32_vec_type::VEC_ELEM_NUM);
+  constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
+  constexpr int V_NUM_ELEM = v_load_vec_type::VEC_ELEM_NUM;
+
+  const qk_vec_type* k_vecs;
+  const f32_vec_type* v_vecs_f32;
+  float* kv_cache_f32 = nullptr;
+
+  if constexpr (!std::is_same<scalar_t, float>::value) {
+    // convert KV cache block to FP32 to reuse it across query heads and
+    // attn @ V computation, since FP16/BF16->FP32 is expensive.
+    // TODO: move malloc outside of this fn to reuse across iterations.
+    const int nbytes = BLOCK_SIZE * HEAD_DIM * sizeof(float);
+    kv_cache_f32 = static_cast<float*>(std::aligned_alloc(64, nbytes));
+
+    for (int block_offset = 0; block_offset < num_tokens; ++block_offset)
+      for (int i = 0; i < HEAD_DIM; i += V_NUM_ELEM) {
+        v_load_vec_type kv_load_vec(kv_cache + block_offset * HEAD_DIM + i);
+        f32_vec_type kv_vec_f32(kv_load_vec);
+        kv_vec_f32.save(kv_cache_f32 + block_offset * HEAD_DIM + i);
+      }
+
+    if constexpr (std::is_same<qk_load_vec_type, qk_vec_type>::value) {
+      // for AVX512_BF16, Q @ K.T uses BF16 for K (no conversion)
+      // NOTE: in this case, we only need to convert the V section to FP32.
+      // But for simplicity, we will convert the whole KV block to FP32.
+      k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache);
+    } else {
+      k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache_f32);
+    }
+
+    // attn @ V always use FP32 for V, since attn is FP32.
+    v_vecs_f32 = reinterpret_cast<const f32_vec_type*>(kv_cache_f32);
+
+  } else {
+    // KV cache is FP32. don't need to do anything.
+    k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache);
+    v_vecs_f32 = reinterpret_cast<const f32_vec_type*>(kv_cache);
+  }
+
+  // compute 2 heads at the same time to improve ILP and
+  // take advantage of register cache for K and V.
+  constexpr int HEAD_UNROLL = 2;
+  for (int iter = 0; iter < num_heads / HEAD_UNROLL; ++iter) {
+    mla_decode_block_head<HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE, HEAD_UNROLL>(
+        q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens);
+
+    q_vecs += HEAD_UNROLL * HEAD_DIM / QK_NUM_ELEM;
+    acc_out += HEAD_UNROLL * V_HEAD_DIM;
+    acc_lse += HEAD_UNROLL;
+  }
+
+  // take care of the remaining heads
+  for (int iter = 0; iter < num_heads % HEAD_UNROLL; ++iter) {
+    mla_decode_block_head<HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE, 1>(
+        q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens);
+
+    q_vecs += HEAD_DIM / QK_NUM_ELEM;
+    acc_out += V_HEAD_DIM;
+    acc_lse += 1;
+  }
+
+  if (kv_cache_f32 != nullptr) {
+    std::free(kv_cache_f32);
+  }
+}
+}  // namespace
+
+template <typename scalar_t, int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE>
+void mla_decode_kvcache_cpu_impl(
+    scalar_t* __restrict__ out,             // [num_seqs, num_heads, v_head_dim]
+    const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_dim]
+    const scalar_t* __restrict__ kv_cache,  // [num_blocks, block_size,
+                                            // head_dim]
+    const int num_heads, const float scale,
+    const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
+    const int* __restrict__ seq_lens,      // [num_seqs]
+    const int max_num_blocks_per_seq, const int o_stride, const int q_stride,
+    const int kv_stride, const int num_seqs) {
+  using qk_load_vec_type = typename KernelVecType<scalar_t>::qk_load_vec_type;
+  using qk_vec_type = typename KernelVecType<scalar_t>::qk_vec_type;
+  constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
+
+  // shared across threads
+  const int max_threads = omp_get_max_threads();
+  const int acc_out_nbytes =
+      max_threads * num_heads * V_HEAD_DIM * sizeof(float);
+  float* acc_out = static_cast<float*>(std::aligned_alloc(64, acc_out_nbytes));
+  std::vector<float> acc_lse(max_threads * num_heads);
+
+  // allocate memory to pre-convert query to FP32 later
+  float* q_f32;
+  constexpr bool PRE_CONVERT_QUERY =
+      !std::is_same<scalar_t, float>::value &&
+      std::is_same<qk_vec_type, vec_op::FP32Vec16>::value;
+  if constexpr (PRE_CONVERT_QUERY) {
+    const int q_f32_nbytes = num_heads * HEAD_DIM * sizeof(float);
+    q_f32 = static_cast<float*>(std::aligned_alloc(64, q_f32_nbytes));
+  }
+
+#pragma omp parallel
+  {
+    const int num_threads = omp_get_num_threads();
+    const int thread_id = omp_get_thread_num();
+    float* __restrict__ acc_out_thread =
+        acc_out + thread_id * num_heads * V_HEAD_DIM;
+    float* __restrict__ acc_lse_thread = acc_lse.data() + thread_id * num_heads;
+
+    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
+      // reset accumulator
+      std::fill(acc_out_thread, acc_out_thread + num_heads * V_HEAD_DIM, 0.0f);
+      std::fill(acc_lse_thread, acc_lse_thread + num_heads, -FLT_MAX);
+
+      const int seq_len = seq_lens[seq_idx];
+      const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
+      const int last_block_size = seq_len - (block_num - 1) * BLOCK_SIZE;
+
+      const qk_vec_type* q_vecs;
+      if constexpr (PRE_CONVERT_QUERY) {
+// pre-convert query to FP32 since FP16/BF16->FP32 is slow.
+#pragma omp for
+        for (int i = 0; i < num_heads * HEAD_DIM; i += QK_NUM_ELEM) {
+          qk_load_vec_type q_load_vec(q + seq_idx * q_stride + i);
+          qk_vec_type q_vec(q_load_vec);
+          q_vec.save(q_f32 + i);
+        }
+        q_vecs = reinterpret_cast<const qk_vec_type*>(q_f32);
+      } else {
+        q_vecs = reinterpret_cast<const qk_vec_type*>(q + seq_idx * q_stride);
+      }
+
+#pragma omp for
+      for (int block_idx = 0; block_idx < block_num; ++block_idx) {
+        const int physical_block_idx =
+            block_tables[seq_idx * max_num_blocks_per_seq + block_idx];
+        const int num_tokens =
+            block_idx < block_num - 1 ? BLOCK_SIZE : last_block_size;
+
+        mla_decode_block<scalar_t, HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE>(
+            q_vecs, kv_cache + physical_block_idx * kv_stride, acc_out_thread,
+            acc_lse_thread, num_heads, scale, num_tokens);
+      }
+
+// merge attention states across threads
+// section 2.2 in https://arxiv.org/pdf/2501.01005
+// each thread is responsible for 1 head
+#pragma omp for
+      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
+        float* acc_lse_head = acc_lse.data() + head_idx;
+        float* acc_out_head = acc_out + head_idx * V_HEAD_DIM;
+
+        float max_val = -FLT_MAX;
+        for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
+          max_val = std::max(max_val, acc_lse_head[thread_id_ * num_heads]);
+        }
+
+        float sum_exp = 0.0f;
+        for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
+          float val = std::exp(acc_lse_head[thread_id_ * num_heads] - max_val);
+          acc_lse_head[thread_id_ * num_heads] = val;
+          sum_exp += val;
+        }
+
+        float inv_sum = 1.0f / sum_exp;
+        float out_head[V_HEAD_DIM] = {};
+        for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
+          float scale_ = acc_lse_head[thread_id_ * num_heads] * inv_sum;
+          for (int i = 0; i < V_HEAD_DIM; ++i) {
+            out_head[i] +=
+                acc_out_head[thread_id_ * num_heads * V_HEAD_DIM + i] * scale_;
+          }
+        }
+
+        for (int i = 0; i < V_HEAD_DIM; ++i) {
+          vec_op::storeFP32(out_head[i], out + seq_idx * o_stride +
+                                             head_idx * V_HEAD_DIM + i);
+        }
+      }
+    }
+  }
+  if (PRE_CONVERT_QUERY) {
+    std::free(q_f32);
+  }
+  std::free(acc_out);
+}
+
+void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
+                        torch::Tensor& kv_cache, double scale,
+                        torch::Tensor& block_tables, torch::Tensor& seq_lens) {
+  const int num_seqs = query.size(0);
+  const int num_heads = query.size(1);
+  const int head_dim = query.size(2);
+  const int block_size = kv_cache.size(1);
+  const int v_head_dim = out.size(2);
+
+  const int max_num_blocks_per_seq = block_tables.size(1);
+  const int o_stride = out.stride(0);
+  const int q_stride = query.stride(0);
+  const int kv_stride = kv_cache.stride(0);
+
+  VLLM_DISPATCH_FLOATING_TYPES(
+      query.scalar_type(), "mla_decode_kvcache_cpu_impl", [&] {
+        CPU_KERNEL_GUARD_IN(mla_decode_kvcache_cpu_impl)
+        if (head_dim == 576 && v_head_dim == 512 && block_size == 16)
+          mla_decode_kvcache_cpu_impl<scalar_t, 576, 512, 16>(
+              out.data_ptr<scalar_t>(), query.data_ptr<scalar_t>(),
+              kv_cache.data_ptr<scalar_t>(), num_heads, scale,
+              block_tables.data_ptr<int>(), seq_lens.data_ptr<int>(),
+              max_num_blocks_per_seq, o_stride, q_stride, kv_stride, num_seqs);
+        else
+          TORCH_CHECK(false, "Unsupported block size: ", block_size);
+        CPU_KERNEL_GUARD_OUT(mla_decode_kvcache_cpu_impl)
+      });
+}
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@ -18,6 +18,10 @@ void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
                        const std::optional<torch::Tensor>& azp,
                        const std::optional<torch::Tensor>& bias);

+void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
+                        torch::Tensor& kv_cache, double scale,
+                        torch::Tensor& block_tables, torch::Tensor& seq_lens);
+
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops

@ -150,6 +154,14 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
      "                  str kv_cache_dtype,"
      "                  Tensor k_scale, Tensor v_scale) -> ()");
  cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
+
+  cache_ops.def(
+      "concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
+      "                     Tensor! kv_cache,"
+      "                     Tensor slot_mapping,"
+      "                     str kv_cache_dtype,"
+      "                     Tensor scale) -> ()");
+  cache_ops.impl("concat_and_cache_mla", torch::kCPU, &concat_and_cache_mla);
 }

 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
@ -157,4 +169,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
  utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
 }

+TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cpu), cpu_ops) {
+  cpu_ops.def(
+      "mla_decode_kvcache("
+      "   Tensor! out, Tensor query, Tensor kv_cache,"
+      "   float scale, Tensor block_tables, Tensor seq_lens) -> ()");
+  cpu_ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache);
+}
+
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -119,6 +119,18 @@ void advance_step_flashinfer(
    torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
    torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);

+void block_table_appends(torch::Tensor& append_row_indices,
+                         torch::Tensor& append_row_indices_cpu,
+                         torch::Tensor& append_cumsums,
+                         torch::Tensor& append_cumsums_cpu,
+                         torch::Tensor& append_block_ids,
+                         torch::Tensor& append_block_ids_cpu,
+                         torch::Tensor& block_table, int64_t num_appends,
+                         int64_t total_num_append_blocks);
+
+void block_table_moves(torch::Tensor& src_dst_n, torch::Tensor& src_dst_n_cpu,
+                       torch::Tensor& block_table, int64_t num_moves);
+
 #ifndef USE_ROCM
 torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
                        const torch::Tensor& codebooks,
--- a/csrc/quantization/fused_kernels/layernorm_utils.cuh
+++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh
@ -24,7 +24,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
  // sum of squares
  float ss = 0.0f;

-  for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+  for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
    float x = static_cast<float>(input[token_offset + i]);
    if constexpr (has_residual) {
      x += static_cast<float>(residual[token_offset + i]);
@ -58,7 +58,7 @@ __device__ void compute_dynamic_per_token_scales(
  constexpr scalar_out_t qmax{std::numeric_limits<scalar_out_t>::max()};

  float block_absmax_val_maybe = 0.0f;
-  for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+  for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
    float x = static_cast<float>(input[token_offset + i]);
    if constexpr (has_residual) {
      x += static_cast<float>(residual[token_offset + i]);
@ -103,7 +103,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
  ;

-  for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+  for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
    float x = static_cast<float>(input[token_offset + i]);
    if constexpr (has_residual) {
      x += static_cast<float>(residual[token_offset + i]);
@ -142,7 +142,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
  int32_t const num_vec_elems = hidden_size >> 2;

 #pragma unroll 4
-  for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
+  for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
    vec4_t<scalar_t> in = vec_input[i];

    vec4_t<float> x;
@ -206,7 +206,7 @@ __device__ void compute_dynamic_per_token_scales(
  float block_absmax_val_maybe = 0.0f;

 #pragma unroll 4
-  for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
+  for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
    vec4_t<scalar_t> in = vec_input[i];
    vec4_t<scalar_t> const w = vec_weight[i];

@ -286,7 +286,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
 // TODO(luka/varun) extract into type-agnostic vectorized quant function to
 //  replace scaled_fp8_conversion_vec
 #pragma unroll 4
-  for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
+  for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
    vec4_t<scalar_t> const in = vec_input[i];
    vec4_t<scalar_t> const w = vec_weight[i];

--- a/csrc/quantization/gguf/dequantize.cuh
+++ b/csrc/quantization/gguf/dequantize.cuh
@ -101,10 +101,10 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
 template<typename dst_t>
 static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_q2_K * x = (const block_q2_K *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int n   = tid/32;
    const int l   = tid - 32*n;
    const int is  = 8*n + l/16;
@ -123,10 +123,10 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
 template<typename dst_t>
 static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i = blockIdx.x;
+    const auto i = blockIdx.x;
    const block_q3_K * x = (const block_q3_K *) vx;

-    const int r = threadIdx.x/4;
+    const auto r = threadIdx.x/4;
    const int tid = r/2;
    const int is0 = r%2;
    const int l0 = 16*is0 + 4*(threadIdx.x%4);
@ -164,10 +164,10 @@ template<typename dst_t>
 static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q4_K * x = (const block_q4_K *) vx;

-    const int i = blockIdx.x;
+    const auto i = blockIdx.x;

    // assume 32 threads
-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il  = tid/8;
    const int ir  = tid%8;
    const int is  = 2*il;
@ -197,10 +197,10 @@ template<typename dst_t>
 static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q5_K * x = (const block_q5_K *) vx;

-    const int i = blockIdx.x;
+    const auto i = blockIdx.x;

    // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il  = tid/16;   // il is in 0...3
    const int ir  = tid%16;   // ir is in 0...15
    const int is  = 2*il;     // is is in 0...6
@ -231,10 +231,10 @@ template<typename dst_t>
 static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q6_K * x = (const block_q6_K *) vx;

-    const int i = blockIdx.x;
+    const auto i = blockIdx.x;

    // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int ip  = tid/32;   // ip is 0 or 1
    const int il  = tid - 32*ip; // 0...32
    const int is  = 8*ip + il/16;
@ -256,10 +256,10 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
 template<typename dst_t>
 static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@ -275,10 +275,10 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
 template<typename dst_t>
 static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq2_xs * x = (const block_iq2_xs *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@ -293,10 +293,10 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
 template<typename dst_t>
 static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq2_s * x = (const block_iq2_s *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@ -309,10 +309,10 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
 template<typename dst_t>
 static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@ -332,10 +332,10 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
 template<typename dst_t>
 static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq3_s * x = (const block_iq3_s *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@ -399,10 +399,10 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
@ -417,10 +417,10 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst

 template<typename dst_t>
 static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq4_xs * x = (const block_iq4_xs *)vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
@ -565,4 +565,4 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) {
        default:
            return nullptr;
    }
-}
+}
--- a/csrc/quantization/gguf/gguf_kernel.cu
+++ b/csrc/quantization/gguf/gguf_kernel.cu
@ -19,11 +19,11 @@ template <typename scalar_t>
 static __global__ void quantize_q8_1(const scalar_t* __restrict__ x,
                                     void* __restrict__ vy, const int kx,
                                     const int kx_padded) {
-  const int ix = blockDim.x * blockIdx.x + threadIdx.x;
+  const auto ix = blockDim.x * blockIdx.x + threadIdx.x;
  if (ix >= kx_padded) {
    return;
  }
-  const int iy = blockDim.y * blockIdx.y + threadIdx.y;
+  const auto iy = blockDim.y * blockIdx.y + threadIdx.y;
  const int i_padded = iy * kx_padded + ix;

  block_q8_1* y = (block_q8_1*)vy;
@ -375,25 +375,25 @@ torch::Tensor ggml_moe_a8(torch::Tensor X,  // input
 int64_t ggml_moe_get_block_size(int64_t type) {
  switch (type) {
    case 2:
-      return MMQ_X_Q4_0;
+      return MOE_X_Q4_0;
    case 3:
-      return MMQ_X_Q4_1;
+      return MOE_X_Q4_1;
    case 6:
-      return MMQ_X_Q5_0;
+      return MOE_X_Q5_0;
    case 7:
-      return MMQ_X_Q5_1;
+      return MOE_X_Q5_1;
    case 8:
-      return MMQ_X_Q8_0;
+      return MOE_X_Q8_0;
    case 10:
-      return MMQ_X_Q2_K;
+      return MOE_X_Q2_K;
    case 11:
-      return MMQ_X_Q3_K;
+      return MOE_X_Q3_K;
    case 12:
-      return MMQ_X_Q4_K;
+      return MOE_X_Q4_K;
    case 13:
-      return MMQ_X_Q5_K;
+      return MOE_X_Q5_K;
    case 14:
-      return MMQ_X_Q6_K;
+      return MOE_X_Q6_K;
  }
  return 0;
 }
--- a/csrc/quantization/gguf/mmq.cuh
+++ b/csrc/quantization/gguf/mmq.cuh
@ -14,10 +14,10 @@ static __device__ __forceinline__ void mul_mat_q(

    const int & ncols_dst = ncols_y;

-    const int row_dst_0 = blockIdx.x*mmq_y;
+    const auto row_dst_0 = blockIdx.x*mmq_y;
    const int & row_x_0 = row_dst_0;

-    const int col_dst_0 = blockIdx.y*mmq_x;
+    const auto col_dst_0 = blockIdx.y*mmq_x;
    const int & col_y_0 = col_dst_0;

    int   * tile_x_ql = nullptr;
@ -39,7 +39,7 @@ static __device__ __forceinline__ void mul_mat_q(

 #pragma unroll
        for (int ir = 0; ir < qr && ib0 + ir * blocks_per_warp/qr < blocks_per_row_x; ++ir) {
-            const int kqs = ir*WARP_SIZE_GGUF + threadIdx.x;
+            const auto kqs = ir*WARP_SIZE_GGUF + threadIdx.x;
            const int kbxd = kqs / QI8_1;

 #pragma unroll
@ -53,7 +53,7 @@ static __device__ __forceinline__ void mul_mat_q(
 #pragma unroll
            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE_GGUF/QI8_1)) % mmq_x;
-                const int kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1);
+                const auto kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1);
                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);

                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
@ -87,14 +87,14 @@ static __device__ __forceinline__ void mul_mat_q(

 #pragma unroll
    for (int j = 0; j < mmq_x; j += nwarps) {
-        const int col_dst = col_dst_0 + j + threadIdx.y;
+        const auto col_dst = col_dst_0 + j + threadIdx.y;
        if (col_dst >= ncols_dst) {
            return;
        }

 #pragma unroll
        for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
-            const int row_dst = row_dst_0 + threadIdx.x + i;
+            const auto row_dst = row_dst_0 + threadIdx.x + i;
            if (row_dst >= nrows_dst) {
                continue;
            }
--- a/csrc/quantization/gguf/mmvq.cuh
+++ b/csrc/quantization/gguf/mmvq.cuh
@ -1,7 +1,7 @@
 // copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu
 template <typename scalar_t, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, const int ncols, const int nrows) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const auto row = blockIdx.x*blockDim.y + threadIdx.y;

    if (row >= nrows) {
        return;
@ -16,7 +16,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

-    for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
+    for (auto i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row*blocks_per_row + i; // x block index

        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
--- a/csrc/quantization/gguf/moe.cuh
+++ b/csrc/quantization/gguf/moe.cuh
@ -19,10 +19,10 @@ static __device__ __forceinline__ void moe_q(

  const int ncols_dst = ncols_y * top_k;

-  const int row_dst_0 = blockIdx.x * mmq_y;
+  const auto row_dst_0 = blockIdx.x * mmq_y;
  const int& row_x_0 = row_dst_0;

-  const int col_dst_0 = blockIdx.y * mmq_x;
+  const auto col_dst_0 = blockIdx.y * mmq_x;

  int token_offs[mmq_x / nwarps];
  for (int i = 0; i < mmq_x; i += nwarps) {
@ -56,7 +56,7 @@ static __device__ __forceinline__ void moe_q(
    const int n_per_r = ((qk * blocks_per_warp) / qr);
 #pragma unroll
    for (int ir = 0; ir < qr && ib0 * qk + ir * n_per_r < ncols_x; ++ir) {
-      const int kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
+      const auto kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
      const int kbxd = kqs / QI8_1;

 #pragma unroll
@ -73,7 +73,7 @@ static __device__ __forceinline__ void moe_q(
      }

      if (threadIdx.x < n_per_r / QK8_1) {
-        const int kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
+        const auto kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
        const int col_y_eff = token_offs[threadIdx.y] / top_k;
        const int block_x =
            ib0 * (qk / QK8_1) + ir * (WARP_SIZE_GGUF / QI8_1) + kby;
@ -119,7 +119,7 @@ static __device__ __forceinline__ void moe_q(

 #pragma unroll
    for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
-      const int row_dst = row_dst_0 + threadIdx.x + i;
+      const auto row_dst = row_dst_0 + threadIdx.x + i;
      if (row_dst >= nrows_dst) {
        continue;
      }
@ -129,12 +129,12 @@ static __device__ __forceinline__ void moe_q(
 }

 #if defined(USE_ROCM)
-  #define MMQ_X_Q4_0 64
-  #define MMQ_Y_Q4_0 128
+  #define MOE_X_Q4_0 64
+  #define MOE_Y_Q4_0 128
  #define NWARPS_Q4_0 8
 #else
-  #define MMQ_X_Q4_0 4
-  #define MMQ_Y_Q4_0 32
+  #define MOE_X_Q4_0 4
+  #define MOE_Y_Q4_0 32
  #define NWARPS_Q4_0 4
 #endif

@ -149,8 +149,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_0, 2)
             const int exp_stride, const int ncols_x, const int nrows_x,
             const int ncols_y, const int nrows_y, const int nrows_dst,
             const int top_k) {
-  const int mmq_x = MMQ_X_Q4_0;
-  const int mmq_y = MMQ_Y_Q4_0;
+  const int mmq_x = MOE_X_Q4_0;
+  const int mmq_y = MOE_Y_Q4_0;
  const int nwarps = NWARPS_Q4_0;

  moe_q<scalar_t, QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
@ -167,8 +167,8 @@ static void ggml_moe_q4_0_q8_1_cuda(
    const int exp_stride, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
    const int tokens_post_padded, cudaStream_t stream) {
-  int mmq_x = MMQ_X_Q4_0;
-  int mmq_y = MMQ_Y_Q4_0;
+  int mmq_x = MOE_X_Q4_0;
+  int mmq_y = MOE_Y_Q4_0;
  int nwarps = NWARPS_Q4_0;

  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -190,12 +190,12 @@ static void ggml_moe_q4_0_q8_1_cuda(
 }

 #if defined(USE_ROCM)
-  #define MMQ_X_Q4_1 64
-  #define MMQ_Y_Q4_1 128
+  #define MOE_X_Q4_1 64
+  #define MOE_Y_Q4_1 128
  #define NWARPS_Q4_1 8
 #else
-  #define MMQ_X_Q4_1 4
-  #define MMQ_Y_Q4_1 32
+  #define MOE_X_Q4_1 4
+  #define MOE_Y_Q4_1 32
  #define NWARPS_Q4_1 4
 #endif

@ -210,8 +210,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_1, 2)
             const int exp_stride, const int ncols_x, const int nrows_x,
             const int ncols_y, const int nrows_y, const int nrows_dst,
             const int top_k) {
-  const int mmq_x = MMQ_X_Q4_1;
-  const int mmq_y = MMQ_Y_Q4_1;
+  const int mmq_x = MOE_X_Q4_1;
+  const int mmq_y = MOE_Y_Q4_1;
  const int nwarps = NWARPS_Q4_1;

  moe_q<scalar_t, QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
@ -228,8 +228,8 @@ static void ggml_moe_q4_1_q8_1_cuda(
    const int exp_stride, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
    const int tokens_post_padded, cudaStream_t stream) {
-  int mmq_x = MMQ_X_Q4_1;
-  int mmq_y = MMQ_Y_Q4_1;
+  int mmq_x = MOE_X_Q4_1;
+  int mmq_y = MOE_Y_Q4_1;
  int nwarps = NWARPS_Q4_1;

  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -251,12 +251,12 @@ static void ggml_moe_q4_1_q8_1_cuda(
 }

 #if defined(USE_ROCM)
-  #define MMQ_X_Q5_0 64
-  #define MMQ_Y_Q5_0 128
+  #define MOE_X_Q5_0 64
+  #define MOE_Y_Q5_0 128
  #define NWARPS_Q5_0 8
 #else
-  #define MMQ_X_Q5_0 4
-  #define MMQ_Y_Q5_0 32
+  #define MOE_X_Q5_0 4
+  #define MOE_Y_Q5_0 32
  #define NWARPS_Q5_0 4
 #endif

@ -271,8 +271,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_0, 2)
             const int exp_stride, const int ncols_x, const int nrows_x,
             const int ncols_y, const int nrows_y, const int nrows_dst,
             const int top_k) {
-  const int mmq_x = MMQ_X_Q5_0;
-  const int mmq_y = MMQ_Y_Q5_0;
+  const int mmq_x = MOE_X_Q5_0;
+  const int mmq_y = MOE_Y_Q5_0;
  const int nwarps = NWARPS_Q5_0;

  moe_q<scalar_t, QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
@ -289,8 +289,8 @@ static void ggml_moe_q5_0_q8_1_cuda(
    const int exp_stride, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MMQ_X_Q5_0;
-  const int mmq_y = MMQ_Y_Q5_0;
+  const int mmq_x = MOE_X_Q5_0;
+  const int mmq_y = MOE_Y_Q5_0;
  const int nwarps = NWARPS_Q5_0;

  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -312,12 +312,12 @@ static void ggml_moe_q5_0_q8_1_cuda(
 }

 #if defined(USE_ROCM)
-  #define MMQ_X_Q5_1 64
-  #define MMQ_Y_Q5_1 128
+  #define MOE_X_Q5_1 64
+  #define MOE_Y_Q5_1 128
  #define NWARPS_Q5_1 8
 #else
-  #define MMQ_X_Q5_1 4
-  #define MMQ_Y_Q5_1 32
+  #define MOE_X_Q5_1 4
+  #define MOE_Y_Q5_1 32
  #define NWARPS_Q5_1 4
 #endif

@ -332,8 +332,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_1, 2)
             const int exp_stride, const int ncols_x, const int nrows_x,
             const int ncols_y, const int nrows_y, const int nrows_dst,
             const int top_k) {
-  const int mmq_x = MMQ_X_Q5_1;
-  const int mmq_y = MMQ_Y_Q5_1;
+  const int mmq_x = MOE_X_Q5_1;
+  const int mmq_y = MOE_Y_Q5_1;
  const int nwarps = NWARPS_Q5_1;

  moe_q<scalar_t, QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
@ -350,8 +350,8 @@ static void ggml_moe_q5_1_q8_1_cuda(
    const int exp_stride, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MMQ_X_Q5_1;
-  const int mmq_y = MMQ_Y_Q5_1;
+  const int mmq_x = MOE_X_Q5_1;
+  const int mmq_y = MOE_Y_Q5_1;
  const int nwarps = NWARPS_Q5_1;

  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -373,12 +373,12 @@ static void ggml_moe_q5_1_q8_1_cuda(
 }

 #if defined(USE_ROCM)
-  #define MMQ_X_Q8_0 64
-  #define MMQ_Y_Q8_0 128
+  #define MOE_X_Q8_0 64
+  #define MOE_Y_Q8_0 128
  #define NWARPS_Q8_0 8
 #else
-  #define MMQ_X_Q8_0 4
-  #define MMQ_Y_Q8_0 32
+  #define MOE_X_Q8_0 4
+  #define MOE_Y_Q8_0 32
  #define NWARPS_Q8_0 4
 #endif

@ -393,8 +393,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q8_0, 2)
             const int exp_stride, const int ncols_x, const int nrows_x,
             const int ncols_y, const int nrows_y, const int nrows_dst,
             const int top_k) {
-  const int mmq_x = MMQ_X_Q8_0;
-  const int mmq_y = MMQ_Y_Q8_0;
+  const int mmq_x = MOE_X_Q8_0;
+  const int mmq_y = MOE_Y_Q8_0;
  const int nwarps = NWARPS_Q8_0;

  moe_q<scalar_t, QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
@ -411,8 +411,8 @@ static void ggml_moe_q8_0_q8_1_cuda(
    const int exp_stride, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MMQ_X_Q8_0;
-  const int mmq_y = MMQ_Y_Q8_0;
+  const int mmq_x = MOE_X_Q8_0;
+  const int mmq_y = MOE_Y_Q8_0;
  const int nwarps = NWARPS_Q8_0;

  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -434,12 +434,12 @@ static void ggml_moe_q8_0_q8_1_cuda(
 }

 #if defined(USE_ROCM)
-  #define MMQ_X_Q2_K 64
-  #define MMQ_Y_Q2_K 128
+  #define MOE_X_Q2_K 64
+  #define MOE_Y_Q2_K 128
  #define NWARPS_Q2_K 8
 #else
-  #define MMQ_X_Q2_K 4
-  #define MMQ_Y_Q2_K 32
+  #define MOE_X_Q2_K 4
+  #define MOE_Y_Q2_K 32
  #define NWARPS_Q2_K 4
 #endif

@ -454,8 +454,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q2_K, 2)
             const int exp_stride, const int ncols_x, const int nrows_x,
             const int ncols_y, const int nrows_y, const int nrows_dst,
             const int top_k) {
-  const int mmq_x = MMQ_X_Q2_K;
-  const int mmq_y = MMQ_Y_Q2_K;
+  const int mmq_x = MOE_X_Q2_K;
+  const int mmq_y = MOE_Y_Q2_K;
  const int nwarps = NWARPS_Q2_K;

  moe_q<scalar_t, QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
@ -472,8 +472,8 @@ static void ggml_moe_q2_K_q8_1_cuda(
    const int exp_stride, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MMQ_X_Q2_K;
-  const int mmq_y = MMQ_Y_Q2_K;
+  const int mmq_x = MOE_X_Q2_K;
+  const int mmq_y = MOE_Y_Q2_K;
  const int nwarps = NWARPS_Q2_K;

  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -495,12 +495,12 @@ static void ggml_moe_q2_K_q8_1_cuda(
 }

 #if defined(USE_ROCM)
-  #define MMQ_X_Q3_K 64
-  #define MMQ_Y_Q3_K 128
+  #define MOE_X_Q3_K 64
+  #define MOE_Y_Q3_K 128
  #define NWARPS_Q3_K 8
 #else
-  #define MMQ_X_Q3_K 4
-  #define MMQ_Y_Q3_K 32
+  #define MOE_X_Q3_K 4
+  #define MOE_Y_Q3_K 32
  #define NWARPS_Q3_K 4
 #endif

@ -516,8 +516,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q3_K, 2)
             const int ncols_y, const int nrows_y, const int nrows_dst,
             const int top_k) {

-  const int mmq_x = MMQ_X_Q3_K;
-  const int mmq_y = MMQ_Y_Q3_K;
+  const int mmq_x = MOE_X_Q3_K;
+  const int mmq_y = MOE_Y_Q3_K;
  const int nwarps = NWARPS_Q3_K;

  moe_q<scalar_t, QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
@ -533,8 +533,8 @@ static void ggml_moe_q3_K_q8_1_cuda(
    const int exp_stride, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MMQ_X_Q3_K;
-  const int mmq_y = MMQ_Y_Q3_K;
+  const int mmq_x = MOE_X_Q3_K;
+  const int mmq_y = MOE_Y_Q3_K;
  const int nwarps = NWARPS_Q3_K;

  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -556,12 +556,12 @@ static void ggml_moe_q3_K_q8_1_cuda(
 }

 #if defined(USE_ROCM)
-  #define MMQ_X_Q4_K 64
-  #define MMQ_Y_Q4_K 128
+  #define MOE_X_Q4_K 64
+  #define MOE_Y_Q4_K 128
  #define NWARPS_Q4_K 8
 #else
-  #define MMQ_X_Q4_K 4
-  #define MMQ_Y_Q4_K 32
+  #define MOE_X_Q4_K 4
+  #define MOE_Y_Q4_K 32
  #define NWARPS_Q4_K 4
 #endif

@ -576,8 +576,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_K, 2)
             const int exp_stride, const int ncols_x, const int nrows_x,
             const int ncols_y, const int nrows_y, const int nrows_dst,
             const int top_k) {
-  const int mmq_x = MMQ_X_Q4_K;
-  const int mmq_y = MMQ_Y_Q4_K;
+  const int mmq_x = MOE_X_Q4_K;
+  const int mmq_y = MOE_Y_Q4_K;
  const int nwarps = NWARPS_Q4_K;

  moe_q<scalar_t, QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
@ -594,8 +594,8 @@ static void ggml_moe_q4_K_q8_1_cuda(
    const int exp_stride, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MMQ_X_Q4_K;
-  const int mmq_y = MMQ_Y_Q4_K;
+  const int mmq_x = MOE_X_Q4_K;
+  const int mmq_y = MOE_Y_Q4_K;
  const int nwarps = NWARPS_Q4_K;

  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -617,12 +617,12 @@ static void ggml_moe_q4_K_q8_1_cuda(
 }

 #if defined(USE_ROCM)
-  #define MMQ_X_Q5_K 64
-  #define MMQ_Y_Q5_K 128
+  #define MOE_X_Q5_K 64
+  #define MOE_Y_Q5_K 128
  #define NWARPS_Q5_K 8
 #else
-  #define MMQ_X_Q5_K 4
-  #define MMQ_Y_Q5_K 32
+  #define MOE_X_Q5_K 4
+  #define MOE_Y_Q5_K 32
  #define NWARPS_Q5_K 4
 #endif

@ -637,8 +637,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_K, 2)
             const int exp_stride, const int ncols_x, const int nrows_x,
             const int ncols_y, const int nrows_y, const int nrows_dst,
             const int top_k) {
-  const int mmq_x = MMQ_X_Q5_K;
-  const int mmq_y = MMQ_Y_Q5_K;
+  const int mmq_x = MOE_X_Q5_K;
+  const int mmq_y = MOE_Y_Q5_K;
  const int nwarps = NWARPS_Q5_K;

  moe_q<scalar_t, QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
@ -655,8 +655,8 @@ static void ggml_moe_q5_K_q8_1_cuda(
    const int exp_stride, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MMQ_X_Q5_K;
-  const int mmq_y = MMQ_Y_Q5_K;
+  const int mmq_x = MOE_X_Q5_K;
+  const int mmq_y = MOE_Y_Q5_K;
  const int nwarps = NWARPS_Q5_K;

  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -678,12 +678,12 @@ static void ggml_moe_q5_K_q8_1_cuda(
 }

 #if defined(USE_ROCM)
-  #define MMQ_X_Q6_K 64
-  #define MMQ_Y_Q6_K 128
+  #define MOE_X_Q6_K 64
+  #define MOE_Y_Q6_K 128
  #define NWARPS_Q6_K 8
 #else
-  #define MMQ_X_Q6_K 4
-  #define MMQ_Y_Q6_K 32
+  #define MOE_X_Q6_K 4
+  #define MOE_Y_Q6_K 32
  #define NWARPS_Q6_K 4
 #endif

@ -698,8 +698,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q6_K, 2)
             const int exp_stride, const int ncols_x, const int nrows_x,
             const int ncols_y, const int nrows_y, const int nrows_dst,
             const int top_k) {
-  const int mmq_x = MMQ_X_Q6_K;
-  const int mmq_y = MMQ_Y_Q6_K;
+  const int mmq_x = MOE_X_Q6_K;
+  const int mmq_y = MOE_Y_Q6_K;
  const int nwarps = NWARPS_Q6_K;

  moe_q<scalar_t, QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
@ -716,8 +716,8 @@ static void ggml_moe_q6_K_q8_1_cuda(
    const int exp_stride, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MMQ_X_Q6_K;
-  const int mmq_y = MMQ_Y_Q6_K;
+  const int mmq_x = MOE_X_Q6_K;
+  const int mmq_y = MOE_Y_Q6_K;
  const int nwarps = NWARPS_Q6_K;

  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@ -199,12 +199,12 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
  MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  // Block
-  int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
-  int offset_m = blockIdx.y * m_count;
-  int offset_k = blockIdx.z * BLOCK_KN_SIZE;
+  auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
+  auto offset_m = blockIdx.y * m_count;
+  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
@ -337,12 +337,12 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
  MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  // Block
-  int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
-  int offset_m = blockIdx.y * m_count;
-  int offset_k = blockIdx.z * BLOCK_KN_SIZE;
+  auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
+  auto offset_m = blockIdx.y * m_count;
+  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
@ -458,12 +458,12 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
  MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  // Block
-  int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
-  int offset_m = blockIdx.y * m_count;
-  int offset_k = blockIdx.z * BLOCK_KN_SIZE;
+  auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
+  auto offset_m = blockIdx.y * m_count;
+  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
@ -586,12 +586,12 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
  MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  // Block
-  int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
-  int offset_m = blockIdx.y * m_count;
-  int offset_k = blockIdx.z * BLOCK_KN_SIZE;
+  auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
+  auto offset_m = blockIdx.y * m_count;
+  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
@ -765,14 +765,14 @@ __global__ void reconstruct_exllama_8bit_kernel(
  MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int offset_k = BLOCK_KN_SIZE * blockIdx.y;
-  int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
+  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
+  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;

  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  // Preload remapping table
  __shared__ int perm[BLOCK_KN_SIZE];
-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  if (b_q_perm) {
    if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
@ -862,14 +862,14 @@ __global__ void reconstruct_exllama_4bit_kernel(
  MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int offset_k = BLOCK_KN_SIZE * blockIdx.y;
-  int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
+  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
+  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;

  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  // Preload remapping table
  __shared__ int perm[BLOCK_KN_SIZE];
-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  if (b_q_perm) {
    if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
@ -967,14 +967,14 @@ __global__ void reconstruct_exllama_3bit_kernel(
  MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int offset_k = BLOCK_KN_SIZE * blockIdx.y;
-  int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
+  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
+  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;

  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  // Preload remapping table
  __shared__ int perm[BLOCK_KN_SIZE];
-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  if (b_q_perm) {
    if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
@ -1065,14 +1065,14 @@ __global__ void reconstruct_exllama_2bit_kernel(
  MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

-  int offset_k = BLOCK_KN_SIZE * blockIdx.y;
-  int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
+  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
+  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;

  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  // Preload remapping table
  __shared__ int perm[BLOCK_KN_SIZE];
-  int t = threadIdx.x;
+  auto t = threadIdx.x;

  if (b_q_perm) {
    if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
@ -1181,11 +1181,11 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
  int zero_width = width / 8;
  int vec_height = height * 4;
  const int blockwidth2 = BLOCK_KN_SIZE / 2;
-  int b = blockIdx.y * BLOCK_M_SIZE_MAX;
+  auto b = blockIdx.y * BLOCK_M_SIZE_MAX;
  int b_end = min(BLOCK_M_SIZE_MAX, batch - b);
-  int h = BLOCK_KN_SIZE * blockIdx.z / 8;
+  auto h = BLOCK_KN_SIZE * blockIdx.z / 8;
  int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4;
-  int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
+  auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;

  __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
  if (threadIdx.x < h_end) {
@ -1197,8 +1197,8 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
  }

  __shared__ half2 deq2[256][8];
-  int val = threadIdx.x / 8;
-  int off = threadIdx.x % 8;
+  auto val = threadIdx.x / 8;
+  auto off = threadIdx.x % 8;
  for (; val < 256; val += BLOCK_KN_SIZE / 8) {
    deq2[val][off] =
        __halves2half2(__int2half_rn(val & 0xF), __int2half_rn(val >> 4));
@ -1280,11 +1280,11 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
  int zero_width = width / 4;
  int vec_height = height * 2;
  const int blockwidth2 = BLOCK_KN_SIZE / 2;
-  int b = blockIdx.y * BLOCK_M_SIZE_MAX;
+  auto b = blockIdx.y * BLOCK_M_SIZE_MAX;
  int b_end = min(BLOCK_M_SIZE_MAX, batch - b);
-  int h = BLOCK_KN_SIZE * blockIdx.z / 4;
+  auto h = BLOCK_KN_SIZE * blockIdx.z / 4;
  int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2;
-  int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
+  auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;

  __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
  if (threadIdx.x < h_end) {
@ -1393,8 +1393,8 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
                                        half* __restrict__ out) {
  // Start of block

-  int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
-  int row = blockIdx.y * 32 / bit;
+  auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
+  auto row = blockIdx.y * 32 / bit;
  if (column >= width) return;

  // Views
@ -1425,8 +1425,8 @@ __global__ void reconstruct_gptq_3bit_kernel(
    const int height, const int width, const int group,
    half* __restrict__ out) {
  // Start of block
-  int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
-  int row = blockIdx.y * 32;
+  auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
+  auto row = blockIdx.y * 32;
  if (column >= width) return;

  // Views
@ -1542,7 +1542,7 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,

 __global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight,
                                    const int size_k, const int size_n) {
-  int n = blockIdx.x * THREADS_X + threadIdx.x;
+  auto n = blockIdx.x * THREADS_X + threadIdx.x;
  if (n >= size_n) return;
  int k = 0;
  uint32_t* b_ptr = b_q_weight + n;
@ -1555,7 +1555,7 @@ __global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight,

 __global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight,
                                    const int size_k, const int size_n) {
-  int n = blockIdx.x * THREADS_X + threadIdx.x;
+  auto n = blockIdx.x * THREADS_X + threadIdx.x;
  if (n >= size_n) return;
  int k = 0;
  uint32_t* b_ptr = b_q_weight + n;
@ -1568,7 +1568,7 @@ __global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight,

 __global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight,
                                    const int size_k, const int size_n) {
-  int n = blockIdx.x * THREADS_X + threadIdx.x;
+  auto n = blockIdx.x * THREADS_X + threadIdx.x;
  if (n >= size_n) return;
  int k = 0;
  uint32_t* b_ptr = b_q_weight + n;
@ -1581,7 +1581,7 @@ __global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight,

 __global__ void shuffle_3bit_kernel(uint32_t* __restrict__ b_q_weight,
                                    const int size_k, const int size_n) {
-  int n = blockIdx.x * THREADS_X + threadIdx.x;
+  auto n = blockIdx.x * THREADS_X + threadIdx.x;
  if (n >= size_n) return;
  int k = 0;
  uint32_t* b_ptr = b_q_weight + n;
@ -1599,9 +1599,9 @@ __global__ void make_sequential_4bit_kernel(const uint32_t* __restrict__ w,
  const uint64_t* w2 = (uint64_t*)w;
  uint64_t* w_new2 = (uint64_t*)w_new;
  int w2_stride = w_width >> 1;
-  int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
+  auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
  if (w2_column >= w2_stride) return;
-  int w_new2_row = blockIdx.y;
+  auto w_new2_row = blockIdx.y;
  int q_perm_idx = w_new2_row << 3;
  uint64_t dst = 0;

@ -1630,9 +1630,9 @@ __global__ void make_sequential_2bit_kernel(const uint32_t* __restrict__ w,
  const uint64_t* w2 = (uint64_t*)w;
  uint64_t* w_new2 = (uint64_t*)w_new;
  int w2_stride = w_width >> 1;
-  int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
+  auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
  if (w2_column >= w2_stride) return;
-  int w_new2_row = blockIdx.y;
+  auto w_new2_row = blockIdx.y;
  int q_perm_idx = w_new2_row << 4;
  uint64_t dst = 0;

@ -1658,10 +1658,10 @@ __global__ void make_sequential_3bit_kernel(const uint32_t* __restrict__ w,
                                            uint32_t* __restrict__ w_new,
                                            const int* __restrict__ q_perm,
                                            const int w_width) {
-  int w_column = THREADS_X * blockIdx.x + threadIdx.x;
+  auto w_column = THREADS_X * blockIdx.x + threadIdx.x;
  if (w_column >= w_width) return;
-  int w_new_row = blockIdx.y * 3;
-  int q_perm_idx = blockIdx.y << 5;
+  auto w_new_row = blockIdx.y * 3;
+  auto q_perm_idx = blockIdx.y << 5;
  uint32_t dst[3] = {0, 0, 0};

 #pragma unroll
@ -1744,9 +1744,9 @@ __global__ void make_sequential_8bit_kernel(const uint32_t* __restrict__ w,
  const uint64_t* w2 = (uint64_t*)w;
  uint64_t* w_new2 = (uint64_t*)w_new;
  int w2_stride = w_width >> 1;
-  int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
+  auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
  if (w2_column >= w2_stride) return;
-  int w_new2_row = blockIdx.y;
+  auto w_new2_row = blockIdx.y;
  int q_perm_idx = w_new2_row << 2;
  uint64_t dst = 0;

--- a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu
+++ b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu
@ -55,11 +55,11 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
    this_block_B_base_ptr = params.B_ptr + blockIdx.y * Ntile * params.K +
                            blockIdx.z * params.SplitK * 4;

-    const int lane_id = threadIdx.x % WARP_SIZE;
+    const auto lane_id = threadIdx.x % WARP_SIZE;

    // For matrix A, a block load/store Mtile(row) x 32(col) elements in
    // multiple iters, 8x4 warp load/store 8(row) x 32(col) elements per iter
-    const int Aldg_row_base_idx = threadIdx.x / 4;
+    const auto Aldg_row_base_idx = threadIdx.x / 4;
    Aldg_col_idx = (threadIdx.x % 4) * LDG_ELEMENT_CNT_A;
    const int Aldg_base_offset = Aldg_row_base_idx * params.K + Aldg_col_idx;

@ -67,7 +67,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
    // elements of N32K16 packing in multiple iters, 4x8 warp load/store 4(row)
    // * 128(col) per iter
    Bldg_col_idx = (threadIdx.x % 8) * LDG_ELEMENT_CNT_B;
-    const int Bldg_row_base_idx = threadIdx.x / 8;
+    const auto Bldg_row_base_idx = threadIdx.x / 8;
    const int Bldg_base_offset =
        Bldg_row_base_idx * params.K * 4 + Bldg_col_idx;

@ -89,7 +89,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
    B_ldg_guard = 0;
  #pragma unroll
    for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; ++i) {
-      int m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD;
+      auto m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD;
      if (m_idx < params.M) {
        A_ldg_guard |= (1u << i);
      }
@ -98,8 +98,8 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
    const int N_padded = (params.N + 31) / 32 * 32;
  #pragma unroll
    for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; ++i) {
-      int n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 +
-                  i * N_SIZE_ONE_LOAD;
+      auto n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 +
+                   i * N_SIZE_ONE_LOAD;
      if (n_idx < N_padded) {
        B_ldg_guard |= (1u << i);
      }
@ -355,7 +355,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
  __device__ void fused_splitk_reduce() {
    // need splitk-reduce if enable splitk
    if (gridDim.z > 1) {
-      int blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y;
+      auto blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y;
      // Wait for all previous blocks in the splitk direction to accumulate the
      // results into C_tmp
      if (threadIdx.x == 0) {
@ -371,7 +371,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
      }
      __syncthreads();

-      int C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4;
+      auto C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4;
      if (blockIdx.z != 0) {
        // expecting that temporary register here reuses the previous A&B frag
        // register
@ -456,7 +456,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {

    FType* C_base_ptr = this_block_C_base_ptr + store_c_base_offset;
    // C_tile lds and stg
-    int m_base_idx = store_c_row_base_idx + blockIdx.x * Mtile;
+    auto m_base_idx = store_c_row_base_idx + blockIdx.x * Mtile;
    bool n_guard = (store_c_col_idx + blockIdx.y * Ntile) < params.N;
    if (WARP_NTILE == 32) {
      int lds_c_base_offset = warp_id * Mtile * WARP_NTILE +
@ -580,9 +580,9 @@ __global__ void __launch_bounds__(BLOCK)
  int sts_stage_idx = 0;
  int lds_stage_idx = 0;

-  int tb_k_slice = blockIdx.z * params.SplitK + params.SplitK <= params.K
-                       ? params.SplitK
-                       : params.K - blockIdx.z * params.SplitK;
+  auto tb_k_slice = blockIdx.z * params.SplitK + params.SplitK <= params.K
+                        ? params.SplitK
+                        : params.K - blockIdx.z * params.SplitK;
  int k_tiles = (tb_k_slice + 31) / 32;
  int first_k_tile = tb_k_slice - (k_tiles - 1) * 32;

@ -777,13 +777,13 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
    const QT* qdata, const FT* scales, const FT* zeros, FT* fdata,
    const int N_32align, const int N, const int K) {
  __shared__ FT smem[64 * 32];
-  int warp_id = threadIdx.x / 32;
-  int lane_id = threadIdx.x % 32;
-  const int src_row_idx = blockIdx.x * 8 + lane_id / 4;
+  auto warp_id = threadIdx.x / 32;
+  auto lane_id = threadIdx.x % 32;
+  const auto src_row_idx = blockIdx.x * 8 + lane_id / 4;
  const int src_col_idx =
      blockIdx.y * 64 * 4 + warp_id * 16 * 4 + (lane_id % 4) * 16;
  const int src_offset = src_row_idx * K * 4 + src_col_idx;
-  int params_nidx = blockIdx.x * 32 + (lane_id / 4) * 4;
+  auto params_nidx = blockIdx.x * 32 + (lane_id / 4) * 4;

  QT qval_reg[16];
  const QT* pdata = qdata + src_offset;
@ -829,8 +829,8 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
        *reinterpret_cast<uint4*>(smem + lds_base_offset + i * 32 * 32);
  }

-  const int dst_row_base_kidx = blockIdx.y * 64 + threadIdx.x / 4;
-  const int dst_col_nidx = blockIdx.x * 32 + (threadIdx.x % 4) * 8;
+  const auto dst_row_base_kidx = blockIdx.y * 64 + threadIdx.x / 4;
+  const auto dst_col_nidx = blockIdx.x * 32 + (threadIdx.x % 4) * 8;
  #pragma unroll
  for (int i = 0; i < 2; ++i) {
    int dst_row_kidx = dst_row_base_kidx + i * 32;
@ -1008,4 +1008,4 @@ torch::Tensor allspark_w8a16_gemm(

 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
  m.impl("allspark_w8a16_gemm", &allspark_w8a16_gemm);
-}
+}
--- a/csrc/quantization/gptq_allspark/allspark_repack.cu
+++ b/csrc/quantization/gptq_allspark/allspark_repack.cu
@ -13,8 +13,8 @@ __global__ void __launch_bounds__(128)
        const uint8_t* B, const FType* B_scale, const FType* B_zero,
        uint8_t* B_result, FType* B_scale_result, FType* B_zero_result,
        const int K, const int N, const int N_32align) {
-  const int lane_id = threadIdx.x % 32;
-  const int warp_id = threadIdx.x / 32;
+  const auto lane_id = threadIdx.x % 32;
+  const auto warp_id = threadIdx.x / 32;

  if (blockIdx.x != gridDim.x - 1) {
    // Load B
@ -50,7 +50,7 @@ __global__ void __launch_bounds__(128)
    }

    // Store B
-    const int dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8;
+    const auto dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8;
    const int dst_col_idx =
        blockIdx.x * (64 * 4) + warp_id * 64 + (lane_id % 8) * 8;
    for (int i = 0; i < 8; ++i) {
@ -65,7 +65,7 @@ __global__ void __launch_bounds__(128)
  } else {
    // Load B_scale and B_zero
    FType b_scale_reg, b_zero_reg;
-    int src_offset = blockIdx.y * 128 + threadIdx.x;
+    auto src_offset = blockIdx.y * 128 + threadIdx.x;
    ldg16_cg_0(b_scale_reg, B_scale + src_offset, src_offset < N);
    if (B_zero != nullptr)
      ldg16_cg_0(b_zero_reg, B_zero + src_offset, src_offset < N);
--- a/csrc/quantization/gptq_allspark/allspark_utils.cuh
+++ b/csrc/quantization/gptq_allspark/allspark_utils.cuh
@ -62,7 +62,7 @@ template <typename FType, int BLOCK, int N_MATRIX>
 __global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C,
                                              uint32_t n, uint32_t n_matrix,
                                              uint32_t matrix_size) {
-  int idx = blockIdx.x * BLOCK + threadIdx.x;
+  auto idx = blockIdx.x * BLOCK + threadIdx.x;

  if (idx >= matrix_size) {
    return;
@ -407,4 +407,4 @@ static __device__ half2 inline num2num2(const half x) {
  return __half2half2(x);
 }

-}  // namespace allspark
+}  // namespace allspark
--- a/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
@ -14,7 +14,7 @@ __global__ void awq_marlin_repack_kernel(
  int n_tiles = size_n / tile_n_size;
  int block_k_tiles = div_ceil(k_tiles, gridDim.x);

-  int start_k_tile = blockIdx.x * block_k_tiles;
+  auto start_k_tile = blockIdx.x * block_k_tiles;
  if (start_k_tile >= k_tiles) {
    return;
  }
@ -51,8 +51,8 @@ __global__ void awq_marlin_repack_kernel(
    int4* sh_ptr = sh + stage_size * pipe;

    if (threadIdx.x < stage_size) {
-      int k_id = threadIdx.x / stage_n_threads;
-      int n_id = threadIdx.x % stage_n_threads;
+      auto k_id = threadIdx.x / stage_n_threads;
+      auto n_id = threadIdx.x % stage_n_threads;

      int first_k = k_tile_id * tile_k_size;

@ -70,8 +70,8 @@ __global__ void awq_marlin_repack_kernel(
      return;
    }

-    int warp_id = threadIdx.x / 32;
-    int th_id = threadIdx.x % 32;
+    auto warp_id = threadIdx.x / 32;
+    auto th_id = threadIdx.x % 32;

    if (warp_id >= 4) {
      return;
@ -265,4 +265,4 @@ TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {

 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) {
  m.impl("awq_marlin_repack", &awq_marlin_repack_meta);
-}
+}
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@ -42,7 +42,7 @@ namespace marlin {
 __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
                                    int const* __restrict__ perm_int_ptr,
                                    int4* __restrict__ out_int4_ptr, int size_m,
-                                    int size_k, int block_rows) {}
+                                    int size_k, int lda, int block_rows) {}

 template <typename scalar_t,  // compute dtype, half or nv_float16
          const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
@ -459,29 +459,32 @@ __device__ inline void barrier_release(int* lock, bool reset = false) {
 __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
                                    int const* __restrict__ perm_int_ptr,
                                    int4* __restrict__ out_int4_ptr, int size_m,
-                                    int size_k, int block_rows) {
-  int start_row = block_rows * blockIdx.x;
+                                    int size_k, int lda, int block_rows) {
+  auto start_row = block_rows * blockIdx.x;
  int finish_row = start_row + block_rows;
  if (finish_row > size_m) {
    finish_row = size_m;
  }
  int cur_block_rows = finish_row - start_row;

-  int row_stride = size_k * sizeof(half) / 16;
+  int input_row_stride = lda * sizeof(half) / 16;
+  int output_row_stride = size_k * sizeof(half) / 16;

  auto permute_row = [&](int row) {
    int iters = size_k / default_threads;
    int rest = size_k % default_threads;

-    int offset = row * row_stride;
+    int input_offset = row * input_row_stride;
+    int output_offset = row * output_row_stride;

-    half const* a_row_half = reinterpret_cast<half const*>(a_int4_ptr + offset);
-    half* out_half = reinterpret_cast<half*>(out_int4_ptr + offset);
+    half const* a_row_half =
+        reinterpret_cast<half const*>(a_int4_ptr + input_offset);
+    half* out_half = reinterpret_cast<half*>(out_int4_ptr + output_offset);

    int base_k = 0;

    for (int i = 0; i < iters; i++) {
-      int cur_k = base_k + threadIdx.x;
+      auto cur_k = base_k + threadIdx.x;
      int src_pos = perm_int_ptr[cur_k];

      out_half[cur_k] = a_row_half[src_pos];
@ -491,7 +494,7 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,

    if (rest) {
      if (threadIdx.x < rest) {
-        int cur_k = base_k + threadIdx.x;
+        auto cur_k = base_k + threadIdx.x;
        int src_pos = perm_int_ptr[cur_k];

        out_half[cur_k] = a_row_half[src_pos];
@ -537,6 +540,7 @@ __global__ void Marlin(
    int prob_m,           // batch dimension m
    int prob_n,           // output dimension n
    int prob_k,           // reduction dimension k
+    int lda,              // A.stride(0), equal to prob_k is A is contiguous
    int* locks,           // extra global storage for barrier synchronization
    bool use_atomic_add,  // whether to use atomic add to reduce
    bool use_fp32_reduce  // whether to use fp32 global reduce
@ -600,7 +604,7 @@ __global__ void Marlin(
  // We can easily implement parallel problem execution by just remapping
  // indices and advancing global pointers
  if (slice_col_par >= n_tiles) {
-    A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8;
+    A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * lda / 8;
    C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
    locks += (slice_col_par / n_tiles) * n_tiles;
    slice_col = slice_col_par % n_tiles;
@ -631,7 +635,7 @@ __global__ void Marlin(
      }
    }
    if (slice_col == n_tiles) {
-      A += 16 * thread_m_blocks * prob_k / 8;
+      A += 16 * thread_m_blocks * lda / 8;
      C += 16 * thread_m_blocks * prob_n / 8;
      locks += n_tiles;
      slice_col = 0;
@ -643,7 +647,7 @@ __global__ void Marlin(
  // A sizes/strides

  // stride of the A matrix in global memory
-  int a_gl_stride = prob_k / 8;
+  int a_gl_stride = lda / 8;
  // stride of an A matrix tile in shared memory
  constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
  // delta between subsequent A tiles in global memory
@ -719,8 +723,8 @@ __global__ void Marlin(
                (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
  b_gl_rd += b_sh_stride * slice_col;
  b_gl_rd += b_gl_rd_delta_o * slice_row;
-  int b_sh_wr = threadIdx.x * b_thread_vecs;
-  int b_sh_rd = threadIdx.x * b_thread_vecs;
+  auto b_sh_wr = threadIdx.x * b_thread_vecs;
+  auto b_sh_rd = threadIdx.x * b_thread_vecs;

  // For act_order
  constexpr int k_iter_size = tb_k / b_sh_wr_iters;
@ -739,7 +743,7 @@ __global__ void Marlin(
                s_sh_stride * slice_col + threadIdx.x;
    }
  }
-  int s_sh_wr = threadIdx.x;
+  auto s_sh_wr = threadIdx.x;
  bool s_sh_wr_pred = threadIdx.x < s_sh_stride;

  // Zero-points
@ -752,7 +756,7 @@ __global__ void Marlin(
                 zp_sh_stride * slice_col + threadIdx.x;
    }
  }
-  int zp_sh_wr = threadIdx.x;
+  auto zp_sh_wr = threadIdx.x;
  bool zp_sh_wr_pred = threadIdx.x < zp_sh_stride;

  // We use a different scale layout for grouped and column-wise quantization as
@ -1043,7 +1047,7 @@ __global__ void Marlin(
          int4* sh_s_stage = sh_s + s_sh_stage * pipe;
          reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
        } else {
-          int warp_id = threadIdx.x / 32;
+          auto warp_id = threadIdx.x / 32;
          int n_warps = thread_n_blocks / 4;

          int warp_row = warp_id / n_warps;
@ -1081,7 +1085,7 @@ __global__ void Marlin(

    // Determine "position" inside the thread-block (based on warp and
    // thread-id)
-    int warp_id = threadIdx.x / 32;
+    auto warp_id = threadIdx.x / 32;
    int n_warps =
        thread_n_blocks / 4;  // Each warp processes 4 16-size tiles over N

@ -1090,7 +1094,7 @@ __global__ void Marlin(

    cur_k += warp_row * 16;

-    int th_id = threadIdx.x % 32;
+    auto th_id = threadIdx.x % 32;
    cur_k += (th_id % 4) * 2;  // Due to tensor-core layout for fp16 B matrix

    int s_col_shift =
@ -1155,7 +1159,7 @@ __global__ void Marlin(
              (reinterpret_cast<int*>(sh_zp_stage))[zp_sh_rd + i];
        }
      } else {
-        int warp_id = threadIdx.x / 32;
+        auto warp_id = threadIdx.x / 32;
        int n_warps = thread_n_blocks / 4;

        int warp_row = warp_id / n_warps;
@ -1193,7 +1197,7 @@ __global__ void Marlin(
                                     (pipe / (group_blocks / thread_k_blocks)));
          reinterpret_cast<int4*>(&frag_zpf[k % 2])[0] = sh_zp_stage[zp_sh_rd];
        } else {
-          int warp_id = threadIdx.x / 32;
+          auto warp_id = threadIdx.x / 32;
          int n_warps = thread_n_blocks / 4;

          int warp_row = warp_id / n_warps;
@ -1319,7 +1323,7 @@ __global__ void Marlin(
  auto thread_block_reduce = [&]() {
    constexpr int red_off = threads / b_sh_stride_threads / 2;
    if (red_off >= 1) {
-      int red_idx = threadIdx.x / b_sh_stride_threads;
+      auto red_idx = threadIdx.x / b_sh_stride_threads;
      constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
      constexpr int red_sh_delta = b_sh_stride_threads;
      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
@ -1386,7 +1390,7 @@ __global__ void Marlin(
                    4 * (threadIdx.x / 32) + threadIdx.x % 4;
      c_gl_wr += (2 * thread_n_blocks) * slice_col;
      constexpr int c_sh_wr_delta = active_threads;
-      int c_sh_wr = threadIdx.x;
+      auto c_sh_wr = threadIdx.x;

      int row = (threadIdx.x % 32) / 4;

@ -1780,8 +1784,8 @@ __global__ void Marlin(
               HAS_ZP, GROUP_BLOCKS, IS_ZP_FLOAT>                              \
            <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                 \
                A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr,      \
-                num_groups, prob_m, prob_n, prob_k, locks, use_atomic_add,     \
-                use_fp32_reduce);                                              \
+                num_groups, prob_m, prob_n, prob_k, lda, locks,                \
+                use_atomic_add, use_fp32_reduce);                              \
      }                                                                        \
    }

@ -2071,7 +2075,7 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
 template <typename scalar_t>
 void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
               void* zp, void* g_idx, void* perm, void* a_tmp, int prob_m,
-               int prob_n, int prob_k, void* workspace,
+               int prob_n, int prob_k, int lda, void* workspace,
               vllm::ScalarType const& q_type, bool has_act_order,
               bool is_k_full, bool has_zp, int num_groups, int group_size,
               int dev, cudaStream_t stream, int thread_k, int thread_n,
@ -2184,8 +2188,9 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
    // Permute A columns
    int block_rows = div_ceil(prob_m, blocks);
    permute_cols_kernel<<<blocks, default_threads, 0, stream>>>(
-        A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, block_rows);
+        A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, lda, block_rows);
    A_ptr = a_tmp_ptr;
+    lda = prob_k;
  }

  // If we have a full K, then we can run the non-act-order version of Marlin
@ -2244,7 +2249,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
                  ", num_bits = ", num_bits);
    }

-    A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
+    A_ptr += 16 * thread_m_blocks * (lda / 8) * par;
    C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par;
  }
 }
@ -2300,7 +2305,10 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,

  // Verify device and strides
  TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
-  TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
+  TORCH_CHECK(a.stride(1) == 1, "A.stride(1) is not 1");
+  // We use int4 (16 bytes) to load A, so A must aligned to 16 bytes
+  TORCH_CHECK(a.stride(0) % 8 == 0, "A.stride(0) must divisible by 8");
+  TORCH_CHECK(((uint64_t)a.data_ptr()) % 16 == 0, "A must aligned to 16 bytes");

  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
@ -2432,7 +2440,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
        a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
        c_tmp.data_ptr<float>(), b_scales.data_ptr<at::Half>(),
        b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(),
-        a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k,
+        a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k, a.stride(0),
        workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
        num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
        thread_k, thread_n, sms, marlin::max_par, use_atomic_add,
@ -2443,10 +2451,10 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
        c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
        b_scales.data_ptr<at::BFloat16>(), b_zeros.data_ptr(), g_idx.data_ptr(),
        perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(), size_m, size_n, size_k,
-        workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
-        num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
-        thread_k, thread_n, sms, marlin::max_par, use_atomic_add,
-        use_fp32_reduce, is_zp_float);
+        a.stride(0), workspace.data_ptr(), b_q_type, has_act_order, is_k_full,
+        has_zp, num_groups, group_size, dev,
+        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
+        marlin::max_par, use_atomic_add, use_fp32_reduce, is_zp_float);
  } else {
    TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16");
  }
--- a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
@ -15,7 +15,7 @@ __global__ void gptq_marlin_repack_kernel(
  int n_tiles = size_n / tile_n_size;
  int block_k_tiles = div_ceil(k_tiles, gridDim.x);

-  int start_k_tile = blockIdx.x * block_k_tiles;
+  auto start_k_tile = blockIdx.x * block_k_tiles;
  if (start_k_tile >= k_tiles) {
    return;
  }
@ -71,8 +71,8 @@ __global__ void gptq_marlin_repack_kernel(

    if constexpr (has_perm) {
      if (threadIdx.x < stage_size) {
-        int k_id = threadIdx.x / stage_n_threads;
-        int n_id = threadIdx.x % stage_n_threads;
+        auto k_id = threadIdx.x / stage_n_threads;
+        auto n_id = threadIdx.x % stage_n_threads;

        uint32_t const* sh_perm_int_ptr =
            reinterpret_cast<uint32_t const*>(sh_perm_ptr);
@ -88,8 +88,8 @@ __global__ void gptq_marlin_repack_kernel(

    } else {
      if (threadIdx.x < stage_size) {
-        int k_id = threadIdx.x / stage_n_threads;
-        int n_id = threadIdx.x % stage_n_threads;
+        auto k_id = threadIdx.x / stage_n_threads;
+        auto n_id = threadIdx.x % stage_n_threads;

        int first_k = k_tile_id * tile_k_size;
        int first_k_packed = first_k / pack_factor;
@ -109,8 +109,8 @@ __global__ void gptq_marlin_repack_kernel(
      return;
    }

-    int warp_id = threadIdx.x / 32;
-    int th_id = threadIdx.x % 32;
+    auto warp_id = threadIdx.x / 32;
+    auto th_id = threadIdx.x % 32;

    if (warp_id >= 4) {
      return;
@ -339,4 +339,4 @@ TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {

 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) {
  m.impl("gptq_marlin_repack", &gptq_marlin_repack_meta);
-}
+}
--- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
+++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
@ -277,12 +277,12 @@ __global__ void Marlin(
      b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride);
  b_gl_rd += b_sh_stride * slice_col;
  b_gl_rd += b_gl_rd_delta_o * slice_row;
-  int b_sh_wr = threadIdx.x;
-  int b_sh_rd = threadIdx.x;
+  auto b_sh_wr = threadIdx.x;
+  auto b_sh_rd = threadIdx.x;

  int s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
                s_sh_stride * slice_col + threadIdx.x;
-  int s_sh_wr = threadIdx.x;
+  auto s_sh_wr = threadIdx.x;
  int s_sh_rd;
  // We use a different scale layout for grouped and column-wise quantization as
  // we scale a `half2` tile in column-major layout in the former and in
@ -455,7 +455,7 @@ __global__ void Marlin(
  auto thread_block_reduce = [&]() {
    constexpr int red_off = threads / b_sh_stride / 2;
    if (red_off >= 1) {
-      int red_idx = threadIdx.x / b_sh_stride;
+      auto red_idx = threadIdx.x / b_sh_stride;
      constexpr int red_sh_stride = b_sh_stride * 4 * 2;
      constexpr int red_sh_delta = b_sh_stride;
      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) +
@ -522,7 +522,7 @@ __global__ void Marlin(
                    4 * (threadIdx.x / 32) + threadIdx.x % 4;
      c_gl_wr += (2 * thread_n_blocks) * slice_col;
      constexpr int c_sh_wr_delta = active_threads;
-      int c_sh_wr = threadIdx.x;
+      auto c_sh_wr = threadIdx.x;

      int row = (threadIdx.x % 32) / 4;

--- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
+++ b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
@ -353,10 +353,10 @@ __global__ void Marlin(
      b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride);
  b_gl_rd += b_sh_stride * slice_col;
  b_gl_rd += b_gl_rd_delta_o * slice_row;
-  int b_sh_wr = threadIdx.x;
-  int b_sh_rd = threadIdx.x;
+  auto b_sh_wr = threadIdx.x;
+  auto b_sh_rd = threadIdx.x;

-  int s_tok_gl_rd = threadIdx.x;
+  auto s_tok_gl_rd = threadIdx.x;
  // NOTE(HandH1998): activation scale s_tok need shuffle to [0, 8, 1, 9, 2, 10,
  // 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] for example, 0, 8 row scales serve for
  // thread 0, 1, 2, 3. For more details, refer to mma operand A layout as
@ -368,8 +368,8 @@ __global__ void Marlin(
  int s_tok_sh_rd = (threadIdx.x % 32) / 4;
  bool s_tok_sh_wr_pred = threadIdx.x < prob_m;

-  int s_ch_gl_rd = s_ch_sh_stride * slice_col + threadIdx.x;
-  int s_ch_sh_wr = threadIdx.x;
+  auto s_ch_gl_rd = s_ch_sh_stride * slice_col + threadIdx.x;
+  auto s_ch_sh_wr = threadIdx.x;
  int s_ch_sh_rd = 16 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
                   2 * ((threadIdx.x % 32) % 4);
  bool s_ch_sh_wr_pred = threadIdx.x < s_ch_sh_stride;
@ -558,7 +558,7 @@ __global__ void Marlin(
  auto thread_block_reduce = [&]() {
    constexpr int red_off = threads / b_sh_stride / 2;
    if (red_off >= 1) {
-      int red_idx = threadIdx.x / b_sh_stride;
+      auto red_idx = threadIdx.x / b_sh_stride;
      constexpr int red_sh_stride = b_sh_stride * 4 * 2;
      constexpr int red_sh_delta = b_sh_stride;
      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) +
@ -628,7 +628,7 @@ __global__ void Marlin(
                    8 * (threadIdx.x / 32) + (threadIdx.x % 4) * 2;
      c_gl_wr += (4 * thread_n_blocks) * slice_col;
      constexpr int c_sh_wr_delta = active_threads * 2;
-      int c_sh_wr = 2 * threadIdx.x;
+      auto c_sh_wr = 2 * threadIdx.x;

      int row = (threadIdx.x % 32) / 4;

--- a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
+++ b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
@ -273,15 +273,15 @@ __global__ void Marlin_24(
                (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
  b_gl_rd += b_sh_stride * slice_col;
  b_gl_rd += b_gl_rd_delta_o * slice_row;
-  int b_sh_wr = threadIdx.x * b_thread_vecs;
-  int b_sh_rd = threadIdx.x * b_thread_vecs;
+  auto b_sh_wr = threadIdx.x * b_thread_vecs;
+  auto b_sh_rd = threadIdx.x * b_thread_vecs;

  int m_gl_rd = m_gl_stride * (threadIdx.x / (m_sh_stride)) +
                (threadIdx.x % (m_sh_stride));
  m_gl_rd += (m_sh_stride)*slice_col;
  m_gl_rd += m_gl_rd_delta_o * slice_row;
-  int m_sh_wr = threadIdx.x;
-  int m_sh_rd = threadIdx.x % 16 + (threadIdx.x / 32) * 16;
+  auto m_sh_wr = threadIdx.x;
+  auto m_sh_rd = threadIdx.x % 16 + (threadIdx.x / 32) * 16;

  int s_gl_rd;
  if constexpr (group_blocks == -1) {
@ -291,7 +291,7 @@ __global__ void Marlin_24(
              s_sh_stride * slice_col + threadIdx.x;
  }

-  int s_sh_wr = threadIdx.x;
+  auto s_sh_wr = threadIdx.x;
  int s_sh_rd;
  // We use a different scale layout for grouped and column-wise quantization as
  // we scale a `half2` tile in column-major layout in the former and in
@ -516,7 +516,7 @@ __global__ void Marlin_24(
  auto thread_block_reduce = [&]() {
    constexpr int red_off = threads / b_sh_stride_threads / 2;
    if (red_off >= 1) {
-      int red_idx = threadIdx.x / b_sh_stride_threads;
+      auto red_idx = threadIdx.x / b_sh_stride_threads;
      constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
      constexpr int red_sh_delta = b_sh_stride_threads;
      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
@ -583,7 +583,7 @@ __global__ void Marlin_24(
                    8 * (threadIdx.x / 32) + (threadIdx.x % 32) / 4;
      c_gl_wr += (2 * thread_n_blocks) * slice_col;
      constexpr int c_sh_wr_delta = active_threads;
-      int c_sh_wr = threadIdx.x;
+      auto c_sh_wr = threadIdx.x;

      int col = 2 * ((threadIdx.x % 32) % 4);

--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@ -284,18 +284,18 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
    int max_ctx_blocks, const float* k_scale, const float* v_scale) {
  // clang-format on
  constexpr int NWARPS = NUM_THREADS / WARP_SIZE;
-  const int warpid = threadIdx.x / WARP_SIZE;
-  const int laneid = threadIdx.x % WARP_SIZE;
+  const auto warpid = threadIdx.x / WARP_SIZE;
+  const auto laneid = threadIdx.x % WARP_SIZE;
  const int lane4id = laneid % 4;
  const int lane16id = laneid % 16;
  const int rowid = laneid / 16;

-  const int seq_idx = blockIdx.x;
-  const int partition_idx = blockIdx.y;
+  const auto seq_idx = blockIdx.x;
+  const auto partition_idx = blockIdx.y;

  constexpr int T_PAR_SIZE = 256;  // token partition size set to 256

-  const int max_num_partitions = gridDim.y;
+  const auto max_num_partitions = gridDim.y;

  const int context_len = context_lens[seq_idx];

@ -346,9 +346,9 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
  // can be interpreted as B8x16 for 8 bit types
  _B16x8 Klocal[TLOOP][QKHELOOP];

-  const int wg_start_head_idx = blockIdx.z * GQA_RATIO;
-  const int wg_start_kv_head_idx = blockIdx.z;
-  const int total_num_heads = gridDim.z * GQA_RATIO;
+  const auto wg_start_head_idx = blockIdx.z * GQA_RATIO;
+  const auto wg_start_kv_head_idx = blockIdx.z;
+  const auto total_num_heads = gridDim.z * GQA_RATIO;

  // for QK mfma, tokens in multiples of TOKENS_PER_WARP are spread across warps
  // each mfma takes QH16xT16x16HE across warp
@ -789,14 +789,14 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
    int max_ctx_blocks, const float* k_scale, const float* v_scale) {
  // clang-format on
  constexpr int NWARPS = NUM_THREADS / WARP_SIZE;
-  const int warpid = threadIdx.x / WARP_SIZE;
-  const int laneid = threadIdx.x % WARP_SIZE;
+  const auto warpid = threadIdx.x / WARP_SIZE;
+  const auto laneid = threadIdx.x % WARP_SIZE;
  const int lane4id = laneid % 4;

-  const int seq_idx = blockIdx.x;
-  const int partition_idx = blockIdx.y;
-  const int partition_size = blockDim.x;
-  const int max_num_partitions = gridDim.y;
+  const auto seq_idx = blockIdx.x;
+  const auto partition_idx = blockIdx.y;
+  const auto partition_size = blockDim.x;
+  const auto max_num_partitions = gridDim.y;

  const int context_len = context_lens[seq_idx];
  const int partition_start_token_idx = partition_idx * partition_size;
@ -838,8 +838,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
    qk_max[h] = -FLT_MAX;
  }

-  const int wg_start_head_idx = blockIdx.z * GQA_RATIO;
-  const int wg_start_kv_head_idx = blockIdx.z;
+  const auto wg_start_head_idx = blockIdx.z * GQA_RATIO;
+  const auto wg_start_kv_head_idx = blockIdx.z;

  const int warp_start_token_idx =
      partition_start_token_idx + warpid * WARP_SIZE;
@ -857,7 +857,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(

    const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
    // token id within partition
-    const int local_token_idx = threadIdx.x;
+    const auto local_token_idx = threadIdx.x;
    // token id within sequence
    const int global_token_idx = partition_start_token_idx + local_token_idx;

@ -1126,7 +1126,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(

  __syncthreads();

-  const int num_heads = gridDim.z * GQA_RATIO;
+  const auto num_heads = gridDim.z * GQA_RATIO;
  float* max_logits_ptr =
      max_logits + seq_idx * num_heads * max_num_partitions + partition_idx;
  float* exp_sums_ptr =
@ -1268,14 +1268,14 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
                                           // max_num_partitions, head_size]
    const int* __restrict__ context_lens,  // [num_seqs]
    const int max_num_partitions) {
-  const int num_heads = gridDim.x;
-  const int head_idx = blockIdx.x;
-  const int seq_idx = blockIdx.y;
+  const auto num_heads = gridDim.x;
+  const auto head_idx = blockIdx.x;
+  const auto seq_idx = blockIdx.y;
  const int context_len = context_lens[seq_idx];
  const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
-  const int warpid = threadIdx.x / WARP_SIZE;
-  [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;
+  const auto warpid = threadIdx.x / WARP_SIZE;
+  [[maybe_unused]] const auto laneid = threadIdx.x % WARP_SIZE;

  __shared__ float shared_global_exp_sum;
  // max num partitions supported is warp_size * NPAR_LOOPS
@ -1294,7 +1294,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(

  #pragma unroll
    for (int i = 0; i < NPAR_LOOPS; i++) {
-      const int partition_no = i * WARP_SIZE + threadIdx.x;
+      const auto partition_no = i * WARP_SIZE + threadIdx.x;
      valid_partition[i] =
          (partition_no < num_partitions) ? partition_no : last_valid_partition;
    }
@ -1324,7 +1324,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
    }
  #pragma unroll
    for (int i = 0; i < NPAR_LOOPS; i++) {
-      const int partition_no = i * WARP_SIZE + threadIdx.x;
+      const auto partition_no = i * WARP_SIZE + threadIdx.x;
      rescaled_exp_sum[i] *= (partition_no < num_partitions)
                                 ? expf(reg_max_logit[i] - max_logit)
                                 : 0.0f;
@ -1336,7 +1336,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
    }
  #pragma unroll
    for (int i = 0; i < NPAR_LOOPS; i++) {
-      const int partition_no = i * WARP_SIZE + threadIdx.x;
+      const auto partition_no = i * WARP_SIZE + threadIdx.x;
      shared_exp_sums[partition_no] = rescaled_exp_sum[i];
    }

--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -111,6 +111,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      ") -> ()");
  ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);

+  ops.def(
+      "block_table_appends(Tensor append_row_indices, "
+      "Tensor append_row_indices_cpu, Tensor append_cumsums, "
+      "Tensor append_cumsums_cpu, Tensor append_block_ids, "
+      "Tensor append_block_ids_cpu, Tensor! block_table, int num_appends, "
+      "int total_num_append_blocks) -> ()");
+  ops.impl("block_table_appends", torch::kCUDA, &block_table_appends);
+
+  ops.def(
+      "block_table_moves(Tensor src_dst_n, Tensor src_dst_n_cpu, "
+      "Tensor! block_table, int num_moves) -> ()");
+  ops.impl("block_table_moves", torch::kCUDA, &block_table_moves);
+
  // Layernorm
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
  ops.def(
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -85,6 +85,7 @@ html_static_path = ["_static"]
 html_js_files = ["custom.js"]
 html_css_files = ["custom.css"]

+myst_heading_anchors = 2
 myst_url_schemes = {
    'http': None,
    'https': None,
--- a/docs/source/contributing/profiling/profiling_index.md
+++ b/docs/source/contributing/profiling/profiling_index.md
@ -124,3 +124,52 @@ nsys stats report1.nsys-rep
 GUI example:

 <img width="1799" alt="Screenshot 2025-03-05 at 11 48 42 AM" src="https://github.com/user-attachments/assets/c7cff1ae-6d6f-477d-a342-bd13c4fc424c" />
+
+## Profiling vLLM Python Code
+
+The Python standard library includes
+[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
+code. vLLM includes a couple of helpers that make it easy to apply it to a section of vLLM.
+Both the `vllm.utils.cprofile` and `vllm.utils.cprofile_context` functions can be
+used to profile a section of code.
+
+### Example usage - decorator
+
+The first helper is a Python decorator that can be used to profile a function.
+If a filename is specified, the profile will be saved to that file. If no filename is
+specified, profile data will be printed to stdout.
+
+```python
+import vllm.utils
+
+@vllm.utils.cprofile("expensive_function.prof")
+def expensive_function():
+    # some expensive code
+    pass
+```
+
+### Example Usage - context manager
+
+The second helper is a context manager that can be used to profile a block of
+code. Similar to the decorator, the filename is optional.
+
+```python
+import vllm.utils
+
+def another_function():
+    # more expensive code
+    pass
+
+with vllm.utils.cprofile_context("another_function.prof"):
+    another_function()
+```
+
+### Analyzing Profile Results
+
+There are multiple tools available that can help analyze the profile results.
+One example is [snakeviz](https://jiffyclub.github.io/snakeviz/).
+
+```bash
+pip install snakeviz
+snakeviz expensive_function.prof
+```
--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@ -34,11 +34,11 @@ If you need to use those dependencies (having accepted the license terms),
 create a custom Dockerfile on top of the base image with an extra layer that installs them:

 ```Dockerfile
-FROM vllm/vllm-openai:v0.7.3
+FROM vllm/vllm-openai:v0.8.0

 # e.g. install the `audio` and `video` optional dependencies
 # NOTE: Make sure the version of vLLM matches the base image!
-RUN uv pip install --system vllm[audio,video]==0.7.3
+RUN uv pip install vllm[audio,video]==0.8.0
 ```

 :::
@ -52,7 +52,7 @@ with an extra layer that installs their code from source:
 ```Dockerfile
 FROM vllm/vllm-openai:latest

-RUN uv pip install --system git+https://github.com/huggingface/transformers.git
+RUN uv pip install git+https://github.com/huggingface/transformers.git
 ```

 :::
--- a/docs/source/deployment/frameworks/lws.md
+++ b/docs/source/deployment/frameworks/lws.md
@ -7,5 +7,192 @@ A major use case is for multi-host/multi-node distributed inference.

 vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kubernetes for distributed model serving.

-Please see [this guide](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm) for more details on
-deploying vLLM on Kubernetes using LWS.
+## Prerequisites
+
+* At least two Kubernetes nodes, each with 8 GPUs, are required.
+* Install LWS by following the instructions found [here](https://lws.sigs.k8s.io/docs/installation/).
+
+## Deploy and Serve
+
+Deploy the following yaml file `lws.yaml`
+
+```yaml
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: vllm
+spec:
+  replicas: 2
+  leaderWorkerTemplate:
+    size: 2
+    restartPolicy: RecreateGroupOnPodRestart
+    leaderTemplate:
+      metadata:
+        labels:
+          role: leader
+      spec:
+        containers:
+          - name: vllm-leader
+            image: docker.io/vllm/vllm-openai:latest
+            env:
+              - name: HUGGING_FACE_HUB_TOKEN
+                value: <your-hf-token>
+            command:
+              - sh
+              - -c
+              - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); 
+                 python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
+            resources:
+              limits:
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 800Gi
+              requests:
+                ephemeral-storage: 800Gi
+                cpu: 125
+            ports:
+              - containerPort: 8080
+            readinessProbe:
+              tcpSocket:
+                port: 8080
+              initialDelaySeconds: 15
+              periodSeconds: 10
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 15Gi
+    workerTemplate:
+      spec:
+        containers:
+          - name: vllm-worker
+            image: docker.io/vllm/vllm-openai:latest
+            command:
+              - sh
+              - -c
+              - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
+            resources:
+              limits:
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 800Gi
+              requests:
+                ephemeral-storage: 800Gi
+                cpu: 125
+            env:
+              - name: HUGGING_FACE_HUB_TOKEN
+                value: <your-hf-token>
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm   
+        volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 15Gi
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-leader
+spec:
+  ports:
+    - name: http
+      port: 8080
+      protocol: TCP
+      targetPort: 8080
+  selector:
+    leaderworkerset.sigs.k8s.io/name: vllm
+    role: leader
+  type: ClusterIP
+```
+
+```bash
+kubectl apply -f lws.yaml
+```
+
+Verify the status of the pods:
+
+```bash
+kubectl get pods
+```
+
+Should get an output similar to this:
+
+```bash
+NAME       READY   STATUS    RESTARTS   AGE
+vllm-0     1/1     Running   0          2s
+vllm-0-1   1/1     Running   0          2s
+vllm-1     1/1     Running   0          2s
+vllm-1-1   1/1     Running   0          2s
+```
+
+Verify that the distributed tensor-parallel inference works:
+
+```bash
+kubectl logs vllm-0 |grep -i "Loading model weights took" 
+```
+
+Should get something similar to this:
+
+```text
+INFO 05-08 03:20:24 model_runner.py:173] Loading model weights took 0.1189 GB
+(RayWorkerWrapper pid=169, ip=10.20.0.197) INFO 05-08 03:20:28 model_runner.py:173] Loading model weights took 0.1189 GB
+```
+
+## Access ClusterIP service
+
+```bash
+# Listen on port 8080 locally, forwarding to the targetPort of the service's port 8080 in a pod selected by the service
+kubectl port-forward svc/vllm-leader 8080:8080
+```
+
+The output should be similar to the following:
+
+```text
+Forwarding from 127.0.0.1:8080 -> 8080
+Forwarding from [::1]:8080 -> 8080
+```
+
+## Serve the model
+
+Open another terminal and send a request
+
+```text
+curl http://localhost:8080/v1/completions \
+-H "Content-Type: application/json" \
+-d '{
+    "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
+    "prompt": "San Francisco is a",
+    "max_tokens": 7,
+    "temperature": 0
+}'
+```
+
+The output should be similar to the following
+
+```text
+{
+  "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
+  "object": "text_completion",
+  "created": 1715138766,
+  "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
+  "choices": [
+    {
+      "index": 0,
+      "text": " top destination for foodies, with",
+      "logprobs": null,
+      "finish_reason": "length",
+      "stop_reason": null
+    }
+  ],
+  "usage": {
+    "prompt_tokens": 5,
+    "total_tokens": 12,
+    "completion_tokens": 7
+  }
+}
+```
--- a/docs/source/deployment/k8s.md
+++ b/docs/source/deployment/k8s.md
@ -4,6 +4,9 @@

 Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.

+* [Deployment with CPUs](#deployment-with-cpus)
+* [Deployment with GPUs](#deployment-with-gpus)
+
 Alternatively, you can deploy vLLM to Kubernetes using any of the following:
 * [Helm](frameworks/helm.md)
 * [InftyAI/llmaz](integrations/llmaz.md)
@ -14,11 +17,107 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
 * [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
 * [vllm-project/production-stack](integrations/production-stack.md)

-## Pre-requisite
+## Deployment with CPUs

-Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).
+:::{note}
+The use of CPUs here is for demonstration and testing purposes only and its performance will not be on par with GPUs.
+:::

-## Deployment using native K8s
+First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model:
+
+```bash
+cat <<EOF |kubectl apply -f -
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: vllm-models
+spec:
+  accessModes:
+    - ReadWriteOnce
+  volumeMode: Filesystem
+  resources:
+    requests:
+      storage: 50Gi
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: hf-token-secret
+type: Opaque
+data:
+  token: $(HF_TOKEN)
+```
+
+Next, start the vLLM server as a Kubernetes Deployment and Service:
+
+```bash
+cat <<EOF |kubectl apply -f -
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-server
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: vllm
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: vllm
+    spec:
+      containers:
+      - name: vllm
+        image: vllm/vllm-openai:latest
+        command: ["/bin/sh", "-c"]
+        args: [
+          "vllm serve meta-llama/Llama-3.2-1B-Instruct"
+        ]
+        env:
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token-secret
+              key: token
+        ports:
+          - containerPort: 8000
+        volumeMounts:
+          - name: llama-storage
+            mountPath: /root/.cache/huggingface
+      volumes:
+      - name: llama-storage
+        persistentVolumeClaim:
+          claimName: vllm-models
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-server
+spec:
+  selector:
+    app.kubernetes.io/name: vllm
+  ports:
+  - protocol: TCP
+    port: 8000
+    targetPort: 8000
+  type: ClusterIP
+EOF
+```
+
+We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
+
+```console
+kubectl logs -l app.kubernetes.io/name=vllm
+...
+INFO:     Started server process [1]
+INFO:     Waiting for application startup.
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
+```
+
+## Deployment with GPUs
+
+**Pre-requisite**: Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).

 1. Create a PVC, Secret and Deployment for vLLM

--- a/docs/source/design/v1/prefix_caching.md
+++ b/docs/source/design/v1/prefix_caching.md
@ -191,7 +191,7 @@ When the head block (least recently used block) of the free queue is cached, we

 In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total.

-**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 2 of 4 tokens.
+**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens.

 :::{image} /assets/design/v1/prefix_caching/example-time-1.png
 :alt: Example Time 1
@ -203,7 +203,7 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
 :alt: Example Time 3
 :::

-**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 11 tokens are the same as request 0.** We can see that only 2 blocks (11 tokens) hit the cache, because the 3rd block only matches 3 of 4 tokens.
+**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.

 :::{image} /assets/design/v1/prefix_caching/example-time-4.png
 :alt: Example Time 4
--- a/docs/source/features/quantization/bnb.md
+++ b/docs/source/features/quantization/bnb.md
@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal
 Below are the steps to utilize BitsAndBytes with vLLM.

 ```console
-pip install bitsandbytes>=0.45.0
+pip install bitsandbytes>=0.45.3
 ```

 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
@ -25,7 +25,7 @@ import torch
 # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
 model_id = "unsloth/tinyllama-bnb-4bit"
 llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-quantization="bitsandbytes", load_format="bitsandbytes")
+quantization="bitsandbytes")
 ```

 ## Inflight quantization: load as 4bit quantization
@ -35,7 +35,7 @@ from vllm import LLM
 import torch
 model_id = "huggyllama/llama-7b"
 llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-quantization="bitsandbytes", load_format="bitsandbytes")
+quantization="bitsandbytes")
 ```

 ## OpenAI Compatible Server
@ -43,5 +43,5 @@ quantization="bitsandbytes", load_format="bitsandbytes")
 Append the following to your 4bit model arguments:

 ```console
--quantization bitsandbytes --load-format bitsandbytes
+--quantization bitsandbytes
 ```
--- a/docs/source/features/reasoning_outputs.md
+++ b/docs/source/features/reasoning_outputs.md
@ -10,10 +10,10 @@ Reasoning models return a additional `reasoning_content` field in their outputs,

 vLLM currently supports the following reasoning models:

-| Model Series | Parser Name | Structured Output Support |
-|--------------|-------------|------------------|
-| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` |
-| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` |
+| Model Series | Parser Name | Structured Output Support | Tool Calling |
+|--------------|-------------|------------------|-------------|
+| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ |
+| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ |

 ## Quickstart

@ -170,10 +170,51 @@ print("reasoning_content: ", completion.choices[0].message.reasoning_content)
 print("content: ", completion.choices[0].message.content)
 ```

+## Tool Calling
+
+The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
+
+tools = [{
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Get the current weather in a given location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
+                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
+            },
+            "required": ["location", "unit"]
+        }
+    }
+}]
+
+response = client.chat.completions.create(
+    model=client.models.list().data[0].id,
+    messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
+    tools=tools,
+    tool_choice="auto"
+)
+
+print(response)
+tool_call = response.choices[0].message.tool_calls[0].function
+
+print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
+print(f"Function called: {tool_call.name}")
+print(f"Arguments: {tool_call.arguments}")
+```
+
+For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py> .
+
 ## Limitations

 - The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).
- It is not compatible with [`tool_calling`](#tool_calling).

 ## How to support a new reasoning model

--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@ -30,8 +30,10 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 llm = LLM(
    model="facebook/opt-6.7b",
    tensor_parallel_size=1,
-    speculative_model="facebook/opt-125m",
-    num_speculative_tokens=5,
+    speculative_config={
+        "model": "facebook/opt-125m",
+        "num_speculative_tokens": 5,
+    },
 )
 outputs = llm.generate(prompts, sampling_params)

@ -45,10 +47,14 @@ To perform the same with an online mode launch the server:

 ```bash
 python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \
-    --seed 42 -tp 1 --speculative_model facebook/opt-125m \
-    --num_speculative_tokens 5 --gpu_memory_utilization 0.8
+    --seed 42 -tp 1 --gpu_memory_utilization 0.8 \
+    --speculative_config '{"model": "facebook/opt-125m", "num_speculative_tokens": 5}'
 ```

+:::{warning}
+Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately will be deprecated in the next release.
+:::
+
 Then use a client:

 ```python
@ -101,9 +107,11 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 llm = LLM(
    model="facebook/opt-6.7b",
    tensor_parallel_size=1,
-    speculative_model="[ngram]",
-    num_speculative_tokens=5,
-    ngram_prompt_lookup_max=4,
+    speculative_config={
+        "method": "ngram",
+        "num_speculative_tokens": 5,
+        "prompt_lookup_max": 4,
+    },
 )
 outputs = llm.generate(prompts, sampling_params)

@ -131,8 +139,10 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 llm = LLM(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct",
    tensor_parallel_size=4,
-    speculative_model="ibm-ai-platform/llama3-70b-accelerator",
-    speculative_draft_tensor_parallel_size=1,
+    speculative_config={
+        "model": "ibm-ai-platform/llama3-70b-accelerator",
+        "draft_tensor_parallel_size": 1,
+    },
 )
 outputs = llm.generate(prompts, sampling_params)

@ -175,8 +185,10 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    tensor_parallel_size=4,
-    speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B",
-    speculative_draft_tensor_parallel_size=1,
+    speculative_config={
+        "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
+        "draft_tensor_parallel_size": 1,
+    },
 )

 outputs = llm.generate(prompts, sampling_params)
@ -194,11 +206,10 @@ A few important things to consider when using the EAGLE based draft models:
   be able to be loaded and used directly by vLLM after [PR 12304](https://github.com/vllm-project/vllm/pull/12304).
   If you are using vllm version before [PR 12304](https://github.com/vllm-project/vllm/pull/12304), please use the
   [script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
-   and specify `speculative_model="path/to/modified/eagle/model"`. If weight-loading problems still occur when using
-   the latest version of vLLM, please leave a comment or raise an issue.
+   and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.

 2. The EAGLE based draft models need to be run without tensor parallelism
-   (i.e. speculative_draft_tensor_parallel_size is set to 1), although
+   (i.e. draft_tensor_parallel_size is set to 1 in `speculative_config`), although
   it is possible to run the main model using tensor parallelism (see example above).

 3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
--- a/docs/source/getting_started/installation.md
+++ b/docs/source/getting_started/installation.md
@ -26,4 +26,3 @@ installation/ai_accelerator
  - Google TPU
  - Intel Gaudi
  - AWS Neuron
-  - OpenVINO
--- a/docs/source/getting_started/installation/ai_accelerator.md
+++ b/docs/source/getting_started/installation/ai_accelerator.md
@ -36,16 +36,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} ai_accelerator/openvino.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-
-::::
-
 :::::

 ## Requirements
@ -83,16 +73,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} ai_accelerator/openvino.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-:::
-
-::::
-
 :::::

 ## Configure a new environment
@ -130,14 +110,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} python_env_setup.inc.md
-:::
-
-::::
-
 :::::

 ## Set up using Python
@ -177,16 +149,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} ai_accelerator/openvino.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-:::
-
-::::
-
 :::::

 ### Build wheel from source
@ -224,16 +186,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} ai_accelerator/openvino.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-
-::::
-
 :::::

 ## Set up using Docker
@ -273,16 +225,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} ai_accelerator/openvino.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-
-::::
-
 :::::

 ### Build image from source
@ -320,16 +262,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} ai_accelerator/openvino.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Extra information"
-:::
-
-::::
-
 :::::

 ## Extra information
@ -364,13 +296,4 @@ vLLM is a Python library that supports the following AI accelerators. Select you

 ::::

-::::{tab-item} OpenVINO
-:sync: openvino
-
-:::{include} ai_accelerator/openvino.inc.md
-:start-after: "## Extra information"
-:::
-
-::::
-
 :::::
--- a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
@ -1,110 +0,0 @@
-# Installation
-
-vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)).
-
-:::{attention}
-There are no pre-built wheels or images for this device, so you must build vLLM from source.
-:::
-
-## Requirements
-
- OS: Linux
- Instruction set architecture (ISA) requirement: at least AVX2.
-
-## Set up using Python
-
-### Pre-built wheels
-
-Currently, there are no pre-built OpenVINO wheels.
-
-### Build wheel from source
-
-First, install Python and ensure you have the latest pip. For example, on Ubuntu 22.04, you can run:
-
-```console
-sudo apt-get update  -y
-sudo apt-get install python3
-pip install --upgrade pip
-```
-
-Second, clone vLLM and install prerequisites for the vLLM OpenVINO backend installation:
-
-```console
-git clone https://github.com/vllm-project/vllm.git
-cd vllm
-pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu
-```
-
-Finally, install vLLM with OpenVINO backend:
-
-```console
-PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
-```
-
-:::{tip}
-To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
-:::
-
-## Set up using Docker
-
-### Pre-built images
-
-Currently, there are no pre-built OpenVINO images.
-
-### Build image from source
-
-```console
-docker build -f Dockerfile.openvino -t vllm-openvino-env .
-docker run -it --rm vllm-openvino-env
-```
-
-## Extra information
-
-## Supported features
-
-OpenVINO vLLM backend supports the following advanced vLLM features:
-
- Prefix caching (`--enable-prefix-caching`)
- Chunked prefill (`--enable-chunked-prefill`)
-
-## Performance tips
-
-### vLLM OpenVINO backend environment variables
-
- `VLLM_OPENVINO_DEVICE` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, `VLLM_OPENVINO_DEVICE=GPU.1`). If the value is not specified, CPU device is used by default.
- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `<model_id>`
-
-### CPU performance tips
-
-CPU uses the following environment variables to control behavior:
-
- `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV Cache size (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` to control KV cache precision. By default, FP16 / BF16 is used depending on platform.
-
-To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on the experiments, the recommended batch size is `256` (`--max-num-batched-tokens`)
-
-OpenVINO best known configuration for CPU is:
-
-```console
-$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
-    python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
-```
-
-### GPU performance tips
-
-GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account `gpu_memory_utilization` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using `VLLM_OPENVINO_KVCACHE_SPACE` environment variable (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=8` means 8 GB space for KV cache).
-
-Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`.
-
-OpenVINO best known configuration for GPU is:
-
-```console
-$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
-    python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
-```
-
-## Limitations
-
- LoRA serving is not supported.
- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration.
- Tensor and pipeline parallelism are not currently enabled in vLLM integration.
--- a/docs/source/getting_started/installation/cpu.md
+++ b/docs/source/getting_started/installation/cpu.md
@ -193,7 +193,7 @@ vLLM CPU backend supports the following vLLM features:

 ## Related runtime environment variables

- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
+- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
 - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
 - `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).

--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@ -58,6 +58,11 @@ from vllm import LLM, SamplingParams
 ```

 The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params).
+:::{important}
+By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if {class}`~vllm.SamplingParams` is not specified.
+
+However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance.
+:::

 ```python
 prompts = [
@ -76,7 +81,7 @@ llm = LLM(model="facebook/opt-125m")
 ```

 :::{note}
-By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
+By default, vLLM downloads models from [Hugging Face](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
 :::

 Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens.
@ -107,6 +112,11 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct
 By default, the server uses a predefined chat template stored in the tokenizer.
 You can learn about overriding it [here](#chat-template).
 :::
+:::{important}
+By default, the server applies `generation_config.json` from the huggingface model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
+
+To disable this behavior, please pass `--generation-config vllm` when launching the server.
+:::

 This server can be queried in the same format as OpenAI API. For example, to list the models:

--- a/docs/source/getting_started/v1_user_guide.md
+++ b/docs/source/getting_started/v1_user_guide.md
@ -2,6 +2,8 @@

 V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).

+To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
+
 ## Why vLLM V1?

 vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.
@ -127,6 +129,9 @@ in progress.
 - **Spec Decode**: Currently, only ngram-based spec decode is supported in V1. There
  will be follow-up work to support other types of spec decode (e.g., see [PR #13933](https://github.com/vllm-project/vllm/pull/13933)). We will prioritize the support for Eagle, MTP compared to draft model based spec decode.

+- **Multimodal Models**: V1 is almost fully compatible with V0 except that interleaved modality input is not supported yet.
+  See [here](https://github.com/orgs/vllm-project/projects/8) for the status of upcoming features and optimizations.
+
 #### Features to Be Supported

 - **FP8 KV Cache**: While vLLM V1 introduces new FP8 kernels for model weight quantization, support for an FP8 key–value cache is not yet available. Users must continue using FP16 (or other supported precisions) for the KV cache.
@ -154,6 +159,9 @@ vLLM V1 is currently optimized for decoder-only transformers. Models requiring

 For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html).

-## FAQ
+## Frequently Asked Questions

-TODO
+**I'm using vLLM V1 and I'm getting CUDA OOM errors. What should I do?**
+The default `max_num_seqs` has been raised from `256` in V0 to `1024` in V1. If you encounter CUDA OOM only when using V1 engine, try setting a lower value of `max_num_seqs` or `gpu_memory_utilization`.
+
+On the other hand, if you get an error about insufficient memory for the cache blocks, you should increase `gpu_memory_utilization` as this indicates that your GPU has sufficient memory but you're not allocating enough to vLLM for KV cache blocks.
--- a/docs/source/models/extensions/fastsafetensor.md
+++ b/docs/source/models/extensions/fastsafetensor.md
@ -0,0 +1,5 @@
+Loading Model weights with fastsafetensors
+===================================================================
+
+Using fastsafetensor library enables loading model weights to GPU memory by leveraging GPU direct storage. See https://github.com/foundation-model-stack/fastsafetensors for more details.
+For enabling this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``
--- a/docs/source/models/extensions/index.md
+++ b/docs/source/models/extensions/index.md
@ -5,4 +5,5 @@

 runai_model_streamer
 tensorizer
+fastsafetensor
 :::
--- a/docs/source/models/generative_models.md
+++ b/docs/source/models/generative_models.md
@ -46,6 +46,11 @@ for output in outputs:
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```

+:::{important}
+By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if {class}`~vllm.SamplingParams` is not specified.
+
+However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance.
+:::
 A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py>

 ### `LLM.beam_search`
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@ -73,7 +73,7 @@ The Transformers fallback explicitly supports the following features:

 - <project:#quantization-index> (except GGUF)
 - <project:#lora-adapter>
- <project:#distributed-serving> (pipeline parallel coming soon <gh-pr:12832>!)
+- <project:#distributed-serving> (requires `transformers>=4.49.0`)

 #### Remote code

@ -472,6 +472,11 @@ See [this page](#generative-models) for more information on how to use generativ
  * `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc.
  * ✅︎
  * ✅︎
+- * `TeleFLMForCausalLM`
+  * TeleFLM
+  * `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc.
+  * ✅︎
+  * ✅︎
 - * `XverseForCausalLM`
  * XVERSE
  * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
@ -768,7 +773,7 @@ See [this page](#generative-models) for more information on how to use generativ
  * `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
  * ✅︎
  * ✅︎
-  *
+  * ⚠️
 - * `GLM4VForCausalLM`<sup>^</sup>
  * GLM-4V
  * T + I
@ -884,7 +889,7 @@ See [this page](#generative-models) for more information on how to use generativ
 - * `PixtralForConditionalGeneration`
  * Pixtral
  * T + I<sup>+</sup>
-  * `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b`, etc.
+  * `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc.
  *
  * ✅︎
  * ✅︎
@ -951,13 +956,10 @@ V0 correctly implements the model's attention pattern:

 V1 currently uses a simplified attention pattern:
 - Uses causal attention for all tokens, including image tokens
- Generates reasonable outputs but does not match the original model's attention for text + image inputs
+- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": True}`
 - Will be updated in the future to support the correct behavior
- Does not support `"do_pan_and_scan": True`

 This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
-
-For these reasons, `Gemma3ForConditionalGeneration` is supported only on V0 at the moment.
 :::

 :::{note}
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@ -83,7 +83,7 @@ Since this is a ray cluster of **containers**, all the following commands should

 Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` and `ray list nodes` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.

-After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
+After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node: vLLM will be able to leverage GPU resources of all nodes in the Ray cluster, and therefore, only run the `vllm` command on this node but not other nodes. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:

 ```console
 vllm serve /path/to/the/model/in/the/container \
--- a/docs/source/serving/engine_args.md
+++ b/docs/source/serving/engine_args.md
@ -2,7 +2,12 @@

 # Engine Arguments

-Below, you can find an explanation of every engine argument for vLLM:
+Engine arguments control the behavior of the vLLM engine.
+
+- For [offline inference](#offline-inference), they are part of the arguments to `LLM` class.
+- For [online serving](#openai-compatible-server), they are part of the arguments to `vllm serve`.
+
+Below, you can find an explanation of every engine argument:

 <!--- pyml disable-num-lines 7 no-space-in-emphasis -->
 ```{eval-rst}
@ -15,7 +20,7 @@ Below, you can find an explanation of every engine argument for vLLM:

 ## Async Engine Arguments

-Below are the additional arguments related to the asynchronous engine:
+Additional arguments are available to the asynchronous engine which is used for online serving:

 <!--- pyml disable-num-lines 7 no-space-in-emphasis -->
 ```{eval-rst}
--- a/docs/source/serving/offline_inference.md
+++ b/docs/source/serving/offline_inference.md
@ -97,6 +97,13 @@ llm = LLM(model="adept/fuyu-8b",
          max_num_seqs=2)
 ```

+#### Adjust cache size
+
+If you run out of CPU RAM, try the following options:
+
+- (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB).
+- (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB).
+
 ### Performance optimization and tuning

 You can potentially improve the performance of vLLM by finetuning various options.
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@ -29,6 +29,15 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message)
 ```

+:::{tip}
+vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
+You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`.
+:::
+:::{important}
+By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
+
+To disable this behavior, please pass `--generation-config vllm` when launching the server.
+:::
 ## Supported APIs

 We currently support the following OpenAI APIs:
--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str,
        engine_args = EngineArgs(model=model,
                                 quantization=quantization,
                                 qlora_adapter_name_or_path=lora_repo,
-                                 load_format="bitsandbytes",
                                 enable_lora=True,
                                 max_lora_rank=64)
    else:
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@ -6,14 +6,16 @@ import argparse
 from vllm import LLM
 from vllm.sampling_params import SamplingParams

-# This script is an offline demo for running Pixtral.
+# This script is an offline demo for running Mistral-Small-3.1
 #
 # If you want to run a server/client setup, please follow this code:
 #
 # - Server:
 #
 # ```bash
-# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
+# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+#   --tokenizer-mode mistral --config-format mistral --load-format mistral \
+#   --limit-mm-per-prompt 'image=4' --max-model-len 16384
 # ```
 #
 # - Client:
@ -23,7 +25,7 @@ from vllm.sampling_params import SamplingParams
 # --header 'Content-Type: application/json' \
 # --header 'Authorization: Bearer token' \
 # --data '{
-#     "model": "mistralai/Pixtral-12B-2409",
+#     "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
 #     "messages": [
 #       {
 #         "role": "user",
@ -44,13 +46,15 @@ from vllm.sampling_params import SamplingParams


 def run_simple_demo(args: argparse.Namespace):
-    model_name = "mistralai/Pixtral-12B-2409"
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
    sampling_params = SamplingParams(max_tokens=8192)

    # Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
    llm = LLM(
        model=model_name,
        tokenizer_mode="mistral",
+        config_format="mistral",
+        load_format="mistral",
        max_model_len=4096,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
@ -83,7 +87,7 @@ def run_simple_demo(args: argparse.Namespace):


 def run_advanced_demo(args: argparse.Namespace):
-    model_name = "mistralai/Pixtral-12B-2409"
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
    max_img_per_msg = 5
    max_tokens_per_img = 4096

@ -91,6 +95,8 @@ def run_advanced_demo(args: argparse.Namespace):
    llm = LLM(
        model=model_name,
        tokenizer_mode="mistral",
+        config_format="mistral",
+        load_format="mistral",
        limit_mm_per_prompt={"image": max_img_per_msg},
        max_model_len=max_img_per_msg * max_tokens_per_img,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
--- a/examples/offline_inference/mlpspeculator.py
+++ b/examples/offline_inference/mlpspeculator.py
@ -50,7 +50,9 @@ if __name__ == "__main__":
    # Create an LLM with spec decoding
    llm = LLM(
        model="meta-llama/Llama-2-13b-chat-hf",
-        speculative_model="ibm-ai-platform/llama-13b-accelerator",
+        speculative_config={
+            "model": "ibm-ai-platform/llama-13b-accelerator",
+        },
    )

    print("With speculation")
--- a/examples/offline_inference/reproduciblity.py
+++ b/examples/offline_inference/reproduciblity.py
@ -0,0 +1,36 @@
+# SPDX-License-Identifier: Apache-2.0
+import os
+
+from vllm import LLM, SamplingParams
+
+# vLLM does not guarantee the reproducibility of the results by default,
+# for the sake of performance. You need to do the following to achieve
+# reproducible results:
+# 1. Turn off multiprocessing to make the scheduling deterministic.
+#    NOTE(woosuk): This is not needed and will be ignored for V0.
+os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
+# 2. Fix the global seed for reproducibility. The default seed is None, which is
+# not reproducible.
+SEED = 42
+
+# NOTE(woosuk): Even with the above two settings, vLLM only provides
+# reproducibility when it runs on the same hardware and the same vLLM version.
+# Also, the online serving API (`vllm serve`) does not support reproducibility
+# because it is almost impossible to make the scheduling deterministic in the
+# online serving setting.
+
+llm = LLM(model="facebook/opt-125m", seed=SEED)
+
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+outputs = llm.generate(prompts, sampling_params)
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@ -169,7 +169,6 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
-        # Default is False; setting it to True is not supported in V1 yet
        mm_processor_kwargs={"do_pan_and_scan": True},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
@ -362,6 +361,7 @@ def run_llava_next_video(questions: list[str],
    engine_args = EngineArgs(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
+        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@ -91,8 +91,6 @@ def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
-        # Default is False; setting it to True is not supported in V1 yet
-        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={"image": len(image_urls)},
    )

--- a/examples/online_serving/disaggregated_prefill.sh
+++ b/examples/online_serving/disaggregated_prefill.sh
@ -8,6 +8,9 @@ set -xe
 echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
 sleep 1

+# meta-llama/Meta-Llama-3.1-8B-Instruct or deepseek-ai/DeepSeek-V2-Lite
+MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}
+
 # Trap the SIGINT signal (triggered by Ctrl+C)
 trap 'cleanup' INT

@ -44,18 +47,20 @@ wait_for_server() {
 # You can also adjust --kv-ip and --kv-port for distributed inference.

 # prefilling instance, which is the KV producer
-CUDA_VISIBLE_DEVICES=0 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \
+CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
    --port 8100 \
    --max-model-len 100 \
    --gpu-memory-utilization 0.8 \
+    --trust-remote-code \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &

 # decoding instance, which is the KV consumer
-CUDA_VISIBLE_DEVICES=1 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \
+CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
    --port 8200 \
    --max-model-len 100 \
    --gpu-memory-utilization 0.8 \
+    --trust-remote-code \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &

@ -78,7 +83,7 @@ sleep 1
 output1=$(curl -X POST -s http://localhost:8000/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
-"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+"model": "'"$MODEL_NAME"'",
 "prompt": "San Francisco is a",
 "max_tokens": 10,
 "temperature": 0
@ -87,7 +92,7 @@ output1=$(curl -X POST -s http://localhost:8000/v1/completions \
 output2=$(curl -X POST -s http://localhost:8000/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
-"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+"model": "'"$MODEL_NAME"'",
 "prompt": "Santa Clara is a",
 "max_tokens": 10,
 "temperature": 0
--- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
@ -0,0 +1,177 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+An example demonstrates how to use tool calling with reasoning models 
+like QwQ-32B. The reasoning_content will not be parsed by the tool 
+calling process; only the final output will be parsed.
+
+To run this example, you need to start the vLLM server with both 
+the reasoning parser and tool calling enabled.
+
+```bash
+vllm serve Qwen/QwQ-32B \
+     --enable-reasoning --reasoning-parser deepseek_r1 \
+     --enable-auto-tool-choice --tool-call-parser hermes
+     
+```
+
+"""
+
+from openai import OpenAI
+
+
+# Now, simulate a tool call
+def get_current_weather(city: str, state: str, unit: 'str'):
+    return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
+            "partly cloudly, with highs in the 90's.")
+
+
+available_tools = {"get_current_weather": get_current_weather}
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+models = client.models.list()
+model = models.data[0].id
+
+tools = [{
+    "type": "function",
+    "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather in a given location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "city": {
+                    "type":
+                    "string",
+                    "description":
+                    "The city to find the weather for, e.g. 'San Francisco'"
+                },
+                "state": {
+                    "type":
+                    "string",
+                    "description":
+                    "the two-letter abbreviation for the state that the city is"
+                    " in, e.g. 'CA' which would mean 'California'"
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "The unit to fetch the temperature in",
+                    "enum": ["celsius", "fahrenheit"]
+                }
+            },
+            "required": ["city", "state", "unit"]
+        }
+    }
+}]
+messages = [{
+    "role": "user",
+    "content": "Hi! How are you doing today?"
+}, {
+    "role": "assistant",
+    "content": "I'm doing well! How can I help you?"
+}, {
+    "role":
+    "user",
+    "content":
+    "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
+}]
+
+
+def extract_reasoning_and_calls(chunks: list):
+    reasoning_content = ""
+    tool_call_idx = -1
+    arguments = []
+    function_names = []
+    for chunk in chunks:
+        if chunk.choices[0].delta.tool_calls:
+            tool_call = chunk.choices[0].delta.tool_calls[0]
+            if tool_call.index != tool_call_idx:
+                tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
+                arguments.append("")
+                function_names.append("")
+
+            if tool_call.function:
+                if tool_call.function.name:
+                    function_names[tool_call_idx] = tool_call.function.name
+
+                if tool_call.function.arguments:
+                    arguments[tool_call_idx] += tool_call.function.arguments
+        else:
+            if hasattr(chunk.choices[0].delta, "reasoning_content"):
+                reasoning_content += chunk.choices[0].delta.reasoning_content
+    return reasoning_content, arguments, function_names
+
+
+print("---------Full Generate With Automatic Function Calling-------------")
+tool_calls = client.chat.completions.create(messages=messages,
+                                            model=model,
+                                            tools=tools)
+print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
+print(f"function name: "
+      f"{tool_calls.choices[0].message.tool_calls[0].function.name}")
+print(f"function arguments: "
+      f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}")
+
+print("----------Stream Generate With Automatic Function Calling-----------")
+tool_calls_stream = client.chat.completions.create(messages=messages,
+                                                   model=model,
+                                                   tools=tools,
+                                                   stream=True)
+chunks = []
+for chunk in tool_calls_stream:
+    chunks.append(chunk)
+
+reasoning_content, arguments, function_names = extract_reasoning_and_calls(
+    chunks)
+
+print(f"reasoning_content: {reasoning_content}")
+print(f"function name: {function_names[0]}")
+print(f"function arguments: {arguments[0]}")
+
+print("----------Full Generate With Named Function Calling-----------------")
+tool_calls = client.chat.completions.create(messages=messages,
+                                            model=model,
+                                            tools=tools,
+                                            tool_choice={
+                                                "type": "function",
+                                                "function": {
+                                                    "name":
+                                                    "get_current_weather"
+                                                }
+                                            })
+
+tool_call = tool_calls.choices[0].message.tool_calls[0].function
+print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
+print(f"function name: {tool_call.name}")
+print(f"function arguments: {tool_call.arguments}")
+print("----------Stream Generate With Named Function Calling--------------")
+
+tool_calls_stream = client.chat.completions.create(
+    messages=messages,
+    model=model,
+    tools=tools,
+    tool_choice={
+        "type": "function",
+        "function": {
+            "name": "get_current_weather"
+        }
+    },
+    stream=True)
+
+chunks = []
+for chunk in tool_calls_stream:
+    chunks.append(chunk)
+
+reasoning_content, arguments, function_names = extract_reasoning_and_calls(
+    chunks)
+print(f"reasoning_content: {reasoning_content}")
+print(f"function name: {function_names[0]}")
+print(f"function arguments: {arguments[0]}")
+print("\n\n")
--- a/examples/template_teleflm.jinja
+++ b/examples/template_teleflm.jinja
@ -0,0 +1,12 @@
+{%- for message in messages %}
+    {%- if message['role'] == 'user' %}
+        {{- '<_user>' + message['content']|trim }}
+    {%- elif message['role'] == 'system' %}
+        {{- '<_system>' + message['content']|trim }}
+    {%- elif message['role'] == 'assistant' %}
+        {{- '<_bot>' + message['content'] }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<_bot>' }}
+{%- endif %}
--- a/pyproject.toml
+++ b/pyproject.toml
@ -86,6 +86,7 @@ exclude = [
 "vllm/triton_utils/**/*.py" = ["UP006", "UP035"]
 "vllm/vllm_flash_attn/**/*.py" = ["UP006", "UP035"]
 "vllm/worker/**/*.py" = ["UP006", "UP035"]
+"vllm/utils.py" = ["UP006", "UP035"]

 [tool.ruff.lint]
 select = [
--- a/requirements/common.txt
+++ b/requirements/common.txt
@ -18,6 +18,7 @@ pillow  # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.11, < 0.11
+llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
 outlines == 0.1.11
 lark == 1.2.2
 xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64"
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@ -3,14 +3,15 @@

 # Dependencies for CPUs
 torch==2.6.0+cpu; platform_machine == "x86_64"
-torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
+torch==2.6.0; platform_system == "Darwin"
+torch==2.6.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
 torch==2.7.0.dev20250304; platform_machine == "s390x"

 # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
 torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
-torchaudio==2.5.1; platform_machine == "ppc64le"
+torchaudio==2.6.0; platform_machine == "ppc64le"

 # required for the image processor of phi3v, this must be updated alongside torch
 torchvision; platform_machine != "ppc64le"  and platform_machine != "s390x"
-torchvision==0.20.1; platform_machine == "ppc64le"
+torchvision==0.21.0; platform_machine == "ppc64le"
 datasets # for benchmark scripts
--- a/requirements/openvino.txt
+++ b/requirements/openvino.txt
@ -1,8 +0,0 @@
-# Common dependencies
-r common.txt
-
-torch == 2.5.1 #  should be aligned with "common" vLLM torch version
-openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
-
-optimum @ git+https://github.com/huggingface/optimum.git # latest optimum is used to support latest transformers version
-optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git # latest optimum-intel is used to support latest transformers version
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@ -1,10 +1,10 @@
 # Common dependencies
 -r common.txt

--extra-index-url https://download.pytorch.org/whl/rocm6.2
-torch==2.5.1
-torchvision==0.20.1
-torchaudio==2.5.1
+--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
+torch==2.6.0
+torchvision==0.21.0
+torchaudio==2.6.0

 cmake>=3.26
 packaging
--- a/requirements/test.in
+++ b/requirements/test.in
@ -30,7 +30,7 @@ matplotlib # required for qwen-vl test
 mistral_common[opencv] >= 1.5.4 # required for pixtral test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.4 # required for model evaluation test
-transformers==4.48.2 
+transformers==4.48.2
 # quantization
 bitsandbytes>=0.45.3
 buildkite-test-collector==0.1.9
@ -41,3 +41,4 @@ tritonclient==2.51.0
 numpy < 2.0.0
 runai-model-streamer==0.11.0
 runai-model-streamer-s3==0.11.0
+fastsafetensors>=0.1.10
--- a/requirements/test.txt
+++ b/requirements/test.txt
@ -67,6 +67,7 @@ click==8.1.7
    #   jiwer
    #   nltk
    #   ray
+    #   typer
 colorama==0.4.6
    # via
    #   awscli
@ -122,6 +123,8 @@ fastparquet==2024.11.0
    # via genai-perf
 fastrlock==0.8.2
    # via cupy-cuda12x
+fastsafetensors==0.1.10
+    # via -r requirements/test.in
 filelock==3.16.1
    # via
    #   datasets
@ -505,7 +508,9 @@ requests==2.32.3
 responses==0.25.3
    # via genai-perf
 rich==13.9.4
-    # via genai-perf
+    # via
+    #   genai-perf
+    #   typer
 rouge-score==0.1.2
    # via lm-eval
 rpds-py==0.20.1
@ -550,6 +555,8 @@ setuptools==75.8.0
    # via
    #   pytablewriter
    #   torch
+shellingham==1.5.4
+    # via typer
 six==1.16.0
    # via
    #   python-dateutil
@ -600,6 +607,7 @@ torch==2.6.0
    #   accelerate
    #   bitsandbytes
    #   encodec
+    #   fastsafetensors
    #   lm-eval
    #   peft
    #   runai-model-streamer
@ -654,6 +662,8 @@ typepy==1.3.2
    #   dataproperty
    #   pytablewriter
    #   tabledata
+typer==0.15.2
+    # via fastsafetensors
 typing-extensions==4.12.2
    # via
    #   huggingface-hub
@ -663,6 +673,7 @@ typing-extensions==4.12.2
    #   pydantic
    #   pydantic-core
    #   torch
+    #   typer
 tzdata==2024.2
    # via pandas
 urllib3==2.2.3
--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@ -17,9 +17,9 @@ ray[data]
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
--- a/setup.py
+++ b/setup.py
@ -449,10 +449,6 @@ def _is_cpu() -> bool:
    return VLLM_TARGET_DEVICE == "cpu"


-def _is_openvino() -> bool:
-    return VLLM_TARGET_DEVICE == "openvino"
-
-
 def _is_xpu() -> bool:
    return VLLM_TARGET_DEVICE == "xpu"

@ -572,8 +568,6 @@ def get_vllm_version() -> str:
        if gaudi_sw_version != MAIN_CUDA_VERSION:
            gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
            version += f"{sep}gaudi{gaudi_sw_version}"
-    elif _is_openvino():
-        version += f"{sep}openvino"
    elif _is_tpu():
        version += f"{sep}tpu"
    elif _is_cpu():
@ -623,8 +617,6 @@ def get_requirements() -> list[str]:
        requirements = _read_requirements("neuron.txt")
    elif _is_hpu():
        requirements = _read_requirements("hpu.txt")
-    elif _is_openvino():
-        requirements = _read_requirements("openvino.txt")
    elif _is_tpu():
        requirements = _read_requirements("tpu.txt")
    elif _is_cpu():
@ -634,7 +626,7 @@ def get_requirements() -> list[str]:
    else:
        raise ValueError(
            "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
-            "OpenVINO, or CPU.")
+            "or CPU.")
    return requirements


@ -688,6 +680,7 @@ setup(
    install_requires=get_requirements(),
    extras_require={
        "tensorizer": ["tensorizer>=2.9.0"],
+        "fastsafetensors": ["fastsafetensors >= 0.1.10"],
        "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
        "audio": ["librosa", "soundfile"],  # Required for audio processing
        "video": ["decord"]  # Required for video processing
--- a/tests/build_cython.py
+++ b/tests/build_cython.py
@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+import Cython.Compiler.Options
+from Cython.Build import cythonize
+from setuptools import setup
+
+Cython.Compiler.Options.annotate = True
+
+infiles = []
+
+infiles += [
+    "vllm/engine/llm_engine.py",
+    "vllm/transformers_utils/detokenizer.py",
+    "vllm/engine/output_processor/single_step.py",
+    "vllm/outputs.py",
+    "vllm/engine/output_processor/stop_checker.py",
+]
+
+infiles += [
+    "vllm/core/scheduler.py",
+    "vllm/sequence.py",
+    "vllm/core/block_manager.py",
+]
+
+infiles += [
+    "vllm/model_executor/layers/sampler.py",
+    "vllm/sampling_params.py",
+    "vllm/utils.py",
+]
+
+setup(ext_modules=cythonize(infiles,
+                            annotate=False,
+                            force=True,
+                            compiler_directives={
+                                'language_level': "3",
+                                'infer_types': True
+                            }))
+
+# example usage: python3 build_cython.py build_ext --inplace
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@ -60,7 +60,7 @@ class TestSetting:
        # embedding model
        TestSetting(
            model="BAAI/bge-multilingual-gemma2",
-            model_args=["--task", "embed"],
+            model_args=["--task", "embed", "--dtype", "bfloat16"],
            pp_size=1,
            tp_size=1,
            attn_backend="FLASH_ATTN",
--- a/tests/compile/test_pass_manager.py
+++ b/tests/compile/test_pass_manager.py
@ -1,37 +1,71 @@
 # SPDX-License-Identifier: Apache-2.0
-
-import pickle
+import copy

 import pytest
 import torch
-from torch._inductor.codecache import BypassFxGraphCache

-from vllm.compilation.config import CompilationConfig
-from vllm.compilation.inductor_pass import (CallableInductorPass,
-                                            as_inductor_pass)
+from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.compilation.pass_manager import PostGradPassManager
+from vllm.config import CompilationConfig


+# dummy custom pass that doesn't inherit
 def simple_callable(graph: torch.fx.Graph):
    pass


-@as_inductor_pass(files=(__file__, ))
-def callable_decorated(graph: torch.fx.Graph):
-    pass
+# Should fail to add directly to the pass manager
+def test_bad_callable():
+    config = CompilationConfig().pass_config
+
+    pass_manager = PostGradPassManager()
+    pass_manager.configure(config)
+
+    with pytest.raises(AssertionError):
+        pass_manager.add(simple_callable)  # noqa, type wrong on purpose
+
+
+# Pass that inherits from InductorPass
+class ProperPass(InductorPass):
+
+    def __call__(self, graph: torch.fx.graph.Graph) -> None:
+        pass


@pytest.mark.parametrize(
-    "works, callable",
-    [(False, simple_callable), (True, callable_decorated),
-     (True, CallableInductorPass(simple_callable, "simple_callable"))])
-def test_pass_manager(works: bool, callable):
+    "callable",
+    [
+        ProperPass(),
+        # Can also wrap callables in CallableInductorPass for compliance
+        CallableInductorPass(simple_callable),
+        CallableInductorPass(simple_callable,
+                             InductorPass.hash_source(__file__))
+    ],
+)
+def test_pass_manager_uuid(callable):
    config = CompilationConfig().pass_config
-    pass_manager = PostGradPassManager([callable])
-    pass_manager.configure(config)  # Adds default passes

-    if works:
-        pickle.dumps(pass_manager)
-    else:
-        with pytest.raises(BypassFxGraphCache):
-            pickle.dumps(pass_manager)
+    pass_manager = PostGradPassManager()
+    pass_manager.configure(config)
+
+    # Check that UUID is different if the same pass is added 2x
+    pass_manager.add(callable)
+    uuid1 = pass_manager.uuid()
+    pass_manager.add(callable)
+    uuid2 = pass_manager.uuid()
+    assert uuid1 != uuid2
+
+    # UUID should be the same as the original one,
+    # as we constructed in the same way.
+    pass_manager2 = PostGradPassManager()
+    pass_manager2.configure(config)
+    pass_manager2.add(callable)
+    assert uuid1 == pass_manager2.uuid()
+
+    # UUID should be different due to config change
+    config2 = copy.deepcopy(config)
+    config2.enable_fusion = not config2.enable_fusion
+    pass_manager3 = PostGradPassManager()
+    pass_manager3.configure(config2)
+    pass_manager3.add(callable)
+    assert uuid1 != pass_manager3.uuid()
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -14,8 +14,8 @@ import torch.nn as nn
 import torch.nn.functional as F
 from huggingface_hub import snapshot_download
 from PIL import Image
-from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
-                          BatchFeature)
+from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
+                          BatchEncoding, BatchFeature)
 from transformers.models.auto.auto_factory import _BaseAutoModelClass

 from tests.models.utils import (TokensTextLogprobs,
@ -23,7 +23,7 @@ from tests.models.utils import (TokensTextLogprobs,
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import TaskOption, TokenizerPoolConfig
+from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype
 from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
@ -34,8 +34,7 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
-                        identity, is_list_of)
+from vllm.utils import cuda_device_count_stateless, is_list_of

 logger = init_logger(__name__)

@ -271,14 +270,17 @@ _R = TypeVar("_R")

 class HfRunner:

-    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
+    def get_default_device(self):
        from vllm.platforms import current_platform
+
+        return ("cpu" if current_platform.is_cpu() else "cuda")
+
+    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
        if x is None or isinstance(x, (bool, )):
            return x

        if device is None:
-            device = "cpu" if current_platform.is_cpu(
-            ) or current_platform.is_openvino() else "cuda"
+            device = self.device

        if isinstance(x, dict):
            return {k: self.wrap_device(v, device) for k, v in x.items()}
@ -291,45 +293,59 @@ class HfRunner:
    def __init__(
        self,
        model_name: str,
-        dtype: str = "half",
+        dtype: str = "auto",
        *,
        model_kwargs: Optional[dict[str, Any]] = None,
        is_sentence_transformer: bool = False,
        is_cross_encoder: bool = False,
        skip_tokenizer_init: bool = False,
        auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
-        postprocess_inputs: Callable[..., BatchEncoding] = identity,
    ) -> None:
-        torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
-
        self.model_name = model_name

+        self.config = AutoConfig.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+        )
+        self.device = self.get_default_device()
+        self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
+
+        model_kwargs = model_kwargs if model_kwargs is not None else {}
+        model_kwargs.setdefault("torch_dtype", torch_dtype)
+
        if is_sentence_transformer:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer
-            self.model = self.wrap_device(
-                SentenceTransformer(
-                    model_name,
-                    device="cpu",
-                    trust_remote_code=True,
-                ).to(dtype=torch_dtype))
+
+            self.model = SentenceTransformer(
+                model_name,
+                device=self.device,
+                model_kwargs=model_kwargs,
+                trust_remote_code=True,
+            )
        elif is_cross_encoder:
            # Lazy init required for AMD CI
            from sentence_transformers import CrossEncoder
-            self.model = CrossEncoder(model_name,
-                                      device="cpu",
-                                      trust_remote_code=True)
-            self.model.model = self.wrap_device(self.model.model)\
-                .to(dtype=torch_dtype)
+
+            self.model = CrossEncoder(
+                model_name,
+                device=self.device,
+                automodel_args=model_kwargs,
+                trust_remote_code=True,
+            )
        else:
-            model_kwargs = model_kwargs if model_kwargs is not None else {}
-            self.model = self.wrap_device(
-                auto_cls.from_pretrained(
-                    model_name,
-                    torch_dtype=torch_dtype,
-                    trust_remote_code=True,
-                    **model_kwargs,
-                ))
+            model = auto_cls.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                **model_kwargs,
+            )
+
+            if (getattr(model, "quantization_method", None) != "bitsandbytes"
+                    and len({p.device
+                             for p in model.parameters()}) < 2):
+                model = model.to(self.device)
+
+            self.model = model

        if not skip_tokenizer_init:
            self.tokenizer = AutoTokenizer.from_pretrained(
@ -349,16 +365,13 @@ class HfRunner:
        if skip_tokenizer_init:
            self.tokenizer = self.processor.tokenizer

-        self.dtype = dtype
-        self.postprocess_inputs = postprocess_inputs
-
    def get_inputs(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
-    ) -> list[BatchEncoding]:
+    ) -> list[Union[BatchFeature, BatchEncoding]]:
        if images is not None:
            assert len(prompts) == len(images)

@ -368,7 +381,7 @@ class HfRunner:
        if audios is not None:
            assert len(prompts) == len(audios)

-        all_inputs: list[BatchEncoding] = []
+        all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
        for i, prompt in enumerate(prompts):
            processor_kwargs: dict[str, Any] = {
                "text": prompt,
@ -384,7 +397,8 @@ class HfRunner:
                processor_kwargs["sampling_rate"] = sr

            inputs = self.processor(**processor_kwargs)
-            inputs = self.postprocess_inputs(inputs, dtype=self.dtype)
+            if isinstance(inputs, BatchFeature):
+                inputs = inputs.to(dtype=self.dtype)

            all_inputs.append(inputs)

@ -417,7 +431,7 @@ class HfRunner:
        outputs: list[tuple[list[list[int]], list[str]]] = []
        for inputs in all_inputs:
            output_ids = self.model.generate(
-                **self.wrap_device(inputs, device=self.model.device.type),
+                **self.wrap_device(inputs),
                use_cache=True,
                **kwargs,
            )
@ -488,7 +502,7 @@ class HfRunner:
        all_logprobs: list[list[torch.Tensor]] = []
        for inputs in all_inputs:
            output = self.model.generate(
-                **self.wrap_device(inputs, device=self.model.device.type),
+                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
@ -569,7 +583,7 @@ class HfRunner:

        for inputs in all_inputs:
            output = self.model.generate(
-                **self.wrap_device(inputs, device=self.model.device.type),
+                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
@ -620,19 +634,15 @@ class HfRunner:
            if images is not None and images[i] is not None:
                processor_kwargs["images"] = images[i]

-            encoder_inputs = self.wrap_device(
-                self.processor(**processor_kwargs),
-                device=self.model.device.type,
-            )
+            encoder_inputs = self.processor(**processor_kwargs)
+            encoder_inputs = self.wrap_device(encoder_inputs)

            if decoder_prompt is None:
                decoder_input_ids = None
            else:
-                decoder_input_ids = self.wrap_device(
-                    self.tokenizer(decoder_prompt,
-                                   return_tensors="pt").input_ids,
-                    device=self.model.device.type,
-                )
+                decoder_inputs = self.tokenizer(decoder_prompt,
+                                                return_tensors="pt")
+                decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)

            output = self.model.generate(
                decoder_input_ids=decoder_input_ids,
@ -684,6 +694,7 @@ class VllmRunner:
    """
    The default value of some arguments have been modified from
    :class:`~vllm.LLM` as follows:
+
    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
    - `seed`: Set to `0` instead of `None` for test reproducibility.
    - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
@ -701,10 +712,8 @@ class VllmRunner:
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = True,
        seed: Optional[int] = 0,
-        # Use smaller max model length, otherwise bigger model cannot run due
-        # to kv cache size limit.
        max_model_len: int = 1024,
-        dtype: str = "half",
+        dtype: str = "auto",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16,
@ -1110,4 +1119,4 @@ def pytest_collection_modifyitems(config, items):
    skip_optional = pytest.mark.skip(reason="need --optional option to run")
    for item in items:
        if "optional" in item.keywords:
-            item.add_marker(skip_optional)
+            item.add_marker(skip_optional)
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@ -175,6 +175,8 @@ TEXT_GENERATION_MODELS = {
    "inceptionai/jais-13b-chat": PPTestSettings.fast(),
    "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
    "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
+    # Tests TransformersModel
+    "ArthurZ/Ilama-3.2-1B": PPTestSettings.fast(),
    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
    "openbmb/MiniCPM3-4B": PPTestSettings.fast(),
    # Uses Llama
@ -243,6 +245,7 @@ TEST_MODELS = [
    # [LANGUAGE GENERATION]
    "microsoft/Phi-3.5-MoE-instruct",
    "meta-llama/Llama-3.2-1B-Instruct",
+    # "ArthurZ/Ilama-3.2-1B", NOTE: Uncomment after #13905
    "ibm/PowerLM-3b",
    # [LANGUAGE EMBEDDING]
    "intfloat/e5-mistral-7b-instruct",
--- a/tests/distributed/test_torchrun_example.py
+++ b/tests/distributed/test_torchrun_example.py
@ -9,6 +9,8 @@ import torch.distributed as dist
 from vllm import LLM, SamplingParams
 from vllm.distributed.parallel_state import get_world_group

+dist.init_process_group(backend="gloo")
+
 # Create prompts
 prompts = [
    "Hello, my name is",
--- a/Show More
+++ b/Show More