[V1] [Spec Decode] Support random sampling for spec decode (#13933 )

Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
[Benchmark] Do not save detailed info to json by default (#14879 )
2025-10-20 23:03:52 +08:00 · 2025-03-16 22:00:20 -07:00 · 2025-03-16 21:48:11 -07:00 · 2025-03-16 20:35:57 -07:00 · 2025-03-17 03:35:37 +00:00 · 2025-03-16 20:20:15 -07:00
405 changed files with 21008 additions and 6287 deletions
--- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
@ -4,8 +4,8 @@ tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
-    value: 0.233
+    value: 0.231
  - name: "exact_match,flexible-extract"
-    value: 0.236
+    value: 0.22
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -13,6 +13,7 @@ from pathlib import Path

 import lm_eval
 import numpy
+import pytest
 import yaml

 RTOL = 0.05
@ -46,6 +47,10 @@ def test_lm_eval_correctness():
    eval_config = yaml.safe_load(
        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

+    if eval_config[
+            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
+        pytest.skip("FBGEMM is currently failing on main.")
+
    # Launch eval requests.
    results = launch_lm_eval(eval_config)

--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -57,8 +57,6 @@ steps:
    agents:
      queue: tpu_queue_postmerge
    commands:
-      - "rm -f /var/log/syslog"
-      - "rm -f /var/log/kern.log"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@ -101,16 +101,30 @@ if [[ $commands == *" kernels "* ]]; then
  --ignore=kernels/test_permute_cols.py"
 fi

-#ignore certain Entrypoints tests
+#ignore certain Entrypoints/openai tests
 if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
-  --ignore=entrypoints/openai/test_accuracy.py \
  --ignore=entrypoints/openai/test_audio.py \
-  --ignore=entrypoints/openai/test_encoder_decoder.py \
-  --ignore=entrypoints/openai/test_embedding.py \
-  --ignore=entrypoints/openai/test_oot_registration.py "}
+  --ignore=entrypoints/openai/test_chat.py \
+  --ignore=entrypoints/openai/test_shutdown.py \
+  --ignore=entrypoints/openai/test_completion.py \
+  --ignore=entrypoints/openai/test_sleep.py \
+  --ignore=entrypoints/openai/test_models.py \
+  --ignore=entrypoints/openai/test_prompt_validation.py "}
 fi

+#ignore certain Entrypoints/llm tests
+if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
+fi
+
+# --ignore=entrypoints/openai/test_encoder_decoder.py \
+# --ignore=entrypoints/openai/test_embedding.py \
+# --ignore=entrypoints/openai/test_oot_registration.py
+# --ignore=entrypoints/openai/test_accuracy.py \
+# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
+
+
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
 if [[ $commands == *"--shard-id="* ]]; then
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@ -44,11 +44,11 @@ remove_docker_container() {
 trap remove_docker_container EXIT

 # Run the image
-docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
+docker run --rm -it --device=/dev/neuron0 --network bridge \
       -v "${HF_CACHE}:${HF_MOUNT}" \
       -e "HF_HOME=${HF_MOUNT}" \
       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
       --name "${container_name}" \
       ${image_name} \
-       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
--- a/.buildkite/run-tpu-v1-test.sh
+++ b/.buildkite/run-tpu-v1-test.sh
@ -0,0 +1,27 @@
+#!/bin/bash
+
+# CI check for vLLM V1 on TPU: builds the image from Dockerfile.tpu, then runs
+# the tests and the offline-inference example listed below inside a container
+# started with VLLM_USE_V1=1. Any failing step aborts the script (set -e and
+# the && chain).
+
+set -e
+
+# Build the docker image.
+docker build -f Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+remove_docker_container() { docker rm -f tpu-test || true; }
+trap remove_docker_container EXIT
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# For HF_TOKEN.
+source /etc/environment
+# Run a simple end-to-end example.
+docker run --privileged --net host --shm-size=16G -it \
+    -e "HF_TOKEN=$HF_TOKEN" -e "VLLM_USE_V1=1" --name tpu-test \
+    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+    && python3 -m pip install pytest \
+    && python3 -m pip install lm_eval[api]==0.4.4 \
+    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
+    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
+    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
+    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@ -4,16 +4,27 @@
 # It serves a sanity check for compilation and basic model usage.
 set -ex

+image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
+container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
 # Try building the docker image
-docker build -t xpu-test -f Dockerfile.xpu .
+docker build -t ${image_name} -f Dockerfile.xpu .

 # Setup cleanup
-remove_docker_container() { docker rm -f xpu-test || true; }
+remove_docker_container() { 
+  # Cleanup hook: force-removes this run's container, never failing the script.
+  # NOTE(review): because of `||`, the image is removed only when removing the
+  # container fails -- confirm this short-circuit is intended.
+  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true;
+}
 trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and test offline inference/tensor parallel
-docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
+docker run \
+    --device /dev/dri \
+    -v /dev/dri/by-path:/dev/dri/by-path \
+    --entrypoint="" \
+    --name "${container_name}" \
+    "${image_name}" \
+    sh -c '
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -41,7 +41,6 @@ steps:
  - grep \"sig sig-object py\" build/html/api/inference_params.html

 - label: Async Engine, Inputs, Utils, Worker Test # 24min
-  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/mq_llm_engine
@ -118,15 +117,14 @@ steps:
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
-  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
  - pytest -v -s entrypoints/test_chat_utils.py
-  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Distributed Tests (4 GPUs) # 10min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
-  fast_check: true
  source_file_dependencies:
  - vllm/distributed/
  - vllm/core/
@ -138,7 +136,7 @@ steps:
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  commands:
-  - VLLM_USE_V1=1 python3 ../examples/offline_inference/data_parallel.py
+  - python3 ../examples/offline_inference/data_parallel.py
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
@ -152,7 +150,6 @@ steps:

 - label: Metrics, Tracing Test # 10min
  num_gpus: 2
-  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/metrics
@ -200,16 +197,18 @@ steps:
    - tests/v1
  commands:
    # split the test to avoid interference
-    - VLLM_USE_V1=1 pytest -v -s v1/core
-    - VLLM_USE_V1=1 pytest -v -s v1/engine
-    - VLLM_USE_V1=1 pytest -v -s v1/sample
-    - VLLM_USE_V1=1 pytest -v -s v1/worker
-    - VLLM_USE_V1=1 pytest -v -s v1/structured_output
-    - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
-    - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
+    - pytest -v -s v1/core
+    - pytest -v -s v1/entrypoints
+    - pytest -v -s v1/engine
+    - pytest -v -s v1/sample
+    - pytest -v -s v1/worker
+    - pytest -v -s v1/structured_output
+    - pytest -v -s v1/test_stats.py
+    - pytest -v -s v1/test_utils.py
+    - pytest -v -s v1/test_oracle.py
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-    - VLLM_USE_V1=1 pytest -v -s v1/e2e
+    - pytest -v -s v1/e2e
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@ -229,12 +228,12 @@ steps:
    - python3 offline_inference/llm_engine_example.py
    - python3 offline_inference/vision_language.py
    - python3 offline_inference/vision_language_multi_image.py
-    - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder.py
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
-    - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

 - label: Prefix Caching Test # 9min
  mirror_hardwares: [amd]
@ -284,7 +283,6 @@ steps:
  parallelism: 4

 - label: PyTorch Fullgraph Smoke Test # 9min
-  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/compile
@ -379,7 +377,8 @@ steps:
  commands:
    - pytest -v -s models/test_transformers.py
    - pytest -v -s models/test_registry.py
-    - pytest -v -s models/test_initialization.py
+    # V1 Test: https://github.com/vllm-project/vllm/issues/14531
+    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py

 - label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
@ -522,13 +521,12 @@ steps:
  # this test fails consistently.
  # TODO: investigate and fix
  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py

 - label: Plugin Tests (2 GPUs) # 40min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
-  fast_check: true
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -53,7 +53,7 @@ repos:
    entry: tools/mypy.sh 0 "local"
    language: python
    types: [python]
-    additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
    stages: [pre-commit] # Don't run in CI
  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.9
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -46,8 +46,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")

 #
 # Try to find python package with an executable that exactly matches
@ -319,7 +319,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # Only build AllSpark kernels if we are building for at least some compatible archs.
  cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
-  if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND ALLSPARK_ARCHS)
+  if (ALLSPARK_ARCHS)
    set(ALLSPARK_SRCS
       "csrc/quantization/gptq_allspark/allspark_repack.cu"
       "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
@ -330,7 +330,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
  else()
    message(STATUS "Not building AllSpark kernels as no compatible archs found"
-                   " in CUDA target architectures, or CUDA not >= 12.0")
+                   " in CUDA target architectures")
  endif()


--- a/2
+++ b/2
@ -222,7 +222,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist

 RUN --mount=type=cache,target=/root/.cache/uv \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-    uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \
+    uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
 fi
 COPY examples examples

--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@ -61,6 +61,7 @@ RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
    cd /install \
    && pip install -U -r requirements/rocm.txt \
+    && pip install -U -r requirements/rocm-test.txt \
    && pip uninstall -y vllm \
    && pip install *.whl

--- a/README.md
+++ b/README.md
@ -25,6 +25,7 @@ Join us to connect with the **vLLM team** and explore how vLLM is leveraged in *

 *Latest News* 🔥

+- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
 - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -43,20 +43,26 @@ become available.
    <tr>
      <td><strong>HuggingFace</strong></td>
      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">🚧</td>
+      <td style="text-align: center;">🟡</td>
      <td>Specify your dataset path on HuggingFace</td>
    </tr>
    <tr>
      <td><strong>VisionArena</strong></td>
      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">🚧</td>
+      <td style="text-align: center;">✅</td>
      <td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
    </tr>
  </tbody>
 </table>
+
 ✅: supported
+
 🚧: to be supported

+🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
+similar to `lmms-lab/LLaVA-OneVision-Data`. If you need support for other dataset
+formats, please consider contributing.
+
 **Note**: VisionArena’s `dataset-name` should be set to `hf`

 ---
@ -76,10 +82,10 @@ Then run the benchmarking script
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
 NUM_PROMPTS=10
-BACKEND="openai-chat"
+BACKEND="vllm"
 DATASET_NAME="sharegpt"
 DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
-python3 benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
+python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
 ```

 If successful, you will see the following output
@ -123,7 +129,7 @@ DATASET_NAME="hf"
 DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
 DATASET_SPLIT='train'

-python3 benchmarks/benchmark_serving.py \
+python3 vllm/benchmarks/benchmark_serving.py \
  --backend "${BACKEND}" \
  --model "${MODEL_NAME}" \
  --endpoint "/v1/chat/completions" \
@ -140,35 +146,65 @@ python3 benchmarks/benchmark_serving.py \
 MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
 NUM_PROMPTS=10
 DATASET_NAME="sonnet"
-DATASET_PATH="benchmarks/sonnet.txt"
+DATASET_PATH="vllm/benchmarks/sonnet.txt"

-python3 benchmarks/benchmark_throughput.py \
+python3 vllm/benchmarks/benchmark_throughput.py \
  --model "${MODEL_NAME}" \
  --dataset-name "${DATASET_NAME}" \
  --dataset-path "${DATASET_PATH}" \
  --num-prompts "${NUM_PROMPTS}"
-  ```
+```

 If successful, you will see the following output

 ```
-Throughput: 7.35 requests/s, 4789.20 total tokens/s, 1102.83 output tokens/s
+Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
+Total num prompt tokens:  5014
+Total num output tokens:  1500
+```
+
+### VisionArena Benchmark for Vision Language Models
+
+``` bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+DATASET_NAME="hf"
+DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
+DATASET_SPLIT="train"
+
+python3 vllm/benchmarks/benchmark_throughput.py \
+  --model "${MODEL_NAME}" \
+  --backend "vllm-chat" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --num-prompts "${NUM_PROMPTS}" \
+  --hf-split "${DATASET_SPLIT}"
+```
+
+The `num prompt tokens` now includes image token counts
+
+```
+Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
+Total num prompt tokens:  14527
+Total num output tokens:  1280
 ```

 ### Benchmark with LoRA Adapters

 ``` bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 MODEL_NAME="meta-llama/Llama-2-7b-hf"
 BACKEND="vllm"
 DATASET_NAME="sharegpt"
-DATASET_PATH="/home/jovyan/data/vllm_benchmark_datasets/ShareGPT_V3_unfiltered_cleaned_split.json"
+DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
 NUM_PROMPTS=10
 MAX_LORAS=2
 MAX_LORA_RANK=8
 ENABLE_LORA="--enable-lora"
 LORA_PATH="yard1/llama-2-7b-sql-lora-test"

-python3 benchmarks/benchmark_throughput.py \
+python3 vllm/benchmarks/benchmark_throughput.py \
  --model "${MODEL_NAME}" \
  --backend "${BACKEND}" \
  --dataset_path "${DATASET_PATH}" \
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -14,7 +14,8 @@ from tqdm.asyncio import tqdm
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

-from vllm.model_executor.model_loader.weight_utils import get_lock
+# NOTE(simon): do not import vLLM here so the benchmark script
+# can run without vLLM installed.

 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)

@ -333,7 +334,7 @@ async def async_request_openai_chat_completions(
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
-        "chat/completions"
+        ("chat/completions", "profile")
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

    async with aiohttp.ClientSession(trust_env=True,
@ -427,6 +428,8 @@ def get_model(pretrained_model_name_or_path: str) -> str:
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download

+        from vllm.model_executor.model_loader.weight_utils import get_lock
+
        # Use file lock to prevent multiple processes from
        # downloading the same model weights at the same time.
        with get_lock(pretrained_model_name_or_path):
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -46,7 +46,7 @@ class SampleRequest:
    Represents a single inference request for benchmarking.
    """

-    prompt: str
+    prompt: Union[str, Any]
    prompt_len: int
    expected_output_len: int
    multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
@ -84,6 +84,20 @@ class BenchmarkDataset(ABC):
                            if random_seed is not None else self.DEFAULT_SEED)
        self.data = None

+    def apply_multimodal_chat_transformation(
+            self,
+            prompt: str,
+            mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
+        """
+        Transform a prompt and optional multimodal content into a chat format.
+        This method is used for chat models that expect a specific
+        conversation format.
+
+        Args:
+            prompt: Text placed first in the message content, as a
+                ``{"text": ..., "type": "text"}`` part.
+            mm_content: Optional multimodal part appended after the text
+                part; nothing is appended when it is None.
+
+        Returns:
+            A single-element list holding one ``"user"`` message whose
+            ``"content"`` is the list of parts built above.
+        """
+        content = [{"text": prompt, "type": "text"}]
+        if mm_content is not None:
+            content.append(mm_content)
+        return [{"role": "user", "content": content}]
+
    def load_data(self) -> None:
        """
        Load data from the dataset path into self.data.
@ -338,6 +352,7 @@ class ShareGPTDataset(BenchmarkDataset):
               lora_path: Optional[str] = None,
               max_loras: Optional[int] = None,
               output_len: Optional[int] = None,
+               enable_multimodal_chat: bool = False,
               **kwargs) -> list:
        samples: list = []
        for entry in self.data:
@ -358,6 +373,9 @@ class ShareGPTDataset(BenchmarkDataset):
                                     skip_min_output_len_check=output_len
                                     is not None):
                continue
+            if enable_multimodal_chat:
+                prompt = self.apply_multimodal_chat_transformation(
+                    prompt, None)
            samples.append(
                SampleRequest(
                    prompt=prompt,
@ -550,10 +568,13 @@ class HuggingFaceDataset(BenchmarkDataset):
            split=self.dataset_split,
            streaming=True,
        )
-
-        if "conversations" not in self.data.features:
-            raise ValueError("HF Dataset must have a 'conversations' column.")
-
+        if self.data.features is None or "conversations" \
+            not in self.data.features:
+            raise ValueError(
+                "HuggingFaceDataset currently only supports datasets with "
+                "a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
+                "Please consider contributing if you would like to add "
+                "support for additional dataset formats.")
        # Shuffle and filter examples with at least 2 conversations.
        self.data = self.data.shuffle(seed=self.random_seed).filter(
            lambda x: len(x["conversations"]) >= 2)
@ -561,9 +582,8 @@ class HuggingFaceDataset(BenchmarkDataset):
    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
-               lora_path: Optional[str] = None,
-               max_loras: Optional[int] = None,
               output_len: Optional[int] = None,
+               enable_multimodal_chat: bool = False,
               **kwargs) -> list:
        sampled_requests = []
        dynamic_output = output_len is None
@ -571,13 +591,9 @@ class HuggingFaceDataset(BenchmarkDataset):
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
-
            conv = item["conversations"]
            prompt, completion = conv[0]["value"], conv[1]["value"]

-            lora_request, tokenizer = self.get_random_lora_request(
-                tokenizer, lora_path=lora_path, max_loras=max_loras)
-
            prompt_ids = tokenizer(prompt).input_ids
            completion_ids = tokenizer(completion).input_ids
            prompt_len = len(prompt_ids)
@ -587,16 +603,20 @@ class HuggingFaceDataset(BenchmarkDataset):
            if dynamic_output and not is_valid_sequence(
                    prompt_len, completion_len):
                continue
-
            mm_content = process_image(
                item["image"]) if "image" in item else None
+            if enable_multimodal_chat:
+                # Note: when chat is enabled the request prompt_len is no longer
+                # accurate and we will be using request output to count the
+                # actual prompt len and output len
+                prompt = self.apply_multimodal_chat_transformation(
+                    prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
-                    lora_request=lora_request,
                ))
        return sampled_requests

@ -606,7 +626,7 @@ class HuggingFaceDataset(BenchmarkDataset):
 # -----------------------------------------------------------------------------


-class VisionArenaDataset(BenchmarkDataset):
+class VisionArenaDataset(HuggingFaceDataset):
    """
    Vision Arena Dataset.
    """
@ -617,14 +637,9 @@ class VisionArenaDataset(BenchmarkDataset):

    def __init__(
        self,
-        dataset_split: str,
-        dataset_subset: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
-        self.dataset_split = dataset_split
-        self.dataset_subset = dataset_subset
-
        if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
            raise ValueError(f"Only support Vision Arena dataset.\
                    This data path {self.dataset_path} is not valid.")
@ -645,9 +660,9 @@ class VisionArenaDataset(BenchmarkDataset):
    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
-               output_len: int = DEFAULT_OUTPUT_LEN,
+               output_len: Optional[int] = None,
+               enable_multimodal_chat: bool = False,
               **kwargs) -> list:
-        # TODO (jenniferzhao): Add support for offline benchmark sampling
        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        sampled_requests = []
@ -655,8 +670,14 @@ class VisionArenaDataset(BenchmarkDataset):
            if len(sampled_requests) >= num_requests:
                break
            prompt = item["turns"][0][0]["content"]
-            prompt_len = len(tokenizer(prompt).input_ids)
            mm_content = process_image(item["images"][0])
+            prompt_len = len(tokenizer(prompt).input_ids)
+            if enable_multimodal_chat:
+                # Note: when chat is enabled the request prompt_len is no longer
+                # accurate and we will be using request output to count the
+                # actual prompt len
+                prompt = self.apply_multimodal_chat_transformation(
+                    prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -684,6 +684,15 @@ def main(args: argparse.Namespace):
                        "Invalid metadata format. Please use KEY=VALUE format."
                    )

+        if not args.save_detailed:
+            # Remove fields with too many data points
+            for field in [
+                    "input_lens", "output_lens", "ttfts", "itls",
+                    "generated_texts", "errors"
+            ]:
+                if field in result_json:
+                    del result_json[field]
+
        # Traffic
        result_json["request_rate"] = (args.request_rate if args.request_rate
                                       < float("inf") else "inf")
@ -828,6 +837,12 @@ if __name__ == "__main__":
        action="store_true",
        help="Specify to save benchmark results to a json file",
    )
+    # Opt-in flag: when it is not set, main() drops the per-request fields
+    # (input_lens, output_lens, ttfts, itls, generated_texts, errors) from
+    # the saved JSON.
+    parser.add_argument(
+        "--save-detailed",
+        action="store_true",
+        help="When saving the results, whether to include per request "
+        "information such as response, error, ttfts, tpots, etc.",
+    )
    parser.add_argument(
        "--metadata",
        metavar="KEY=VALUE",
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -11,8 +11,9 @@ from typing import Any, Optional, Union

 import torch
 import uvloop
-from benchmark_dataset import (BurstGPTDataset, RandomDataset, SampleRequest,
-                               ShareGPTDataset, SonnetDataset)
+from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
+                               RandomDataset, SampleRequest, ShareGPTDataset,
+                               SonnetDataset, VisionArenaDataset)
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
@ -23,6 +24,7 @@ from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
 from vllm.inputs import TextPrompt, TokensPrompt
 from vllm.lora.request import LoRARequest
+from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators

@ -32,7 +34,7 @@ def run_vllm(
    n: int,
    engine_args: EngineArgs,
    disable_detokenize: bool = False,
-) -> float:
+) -> tuple[float, Optional[list[RequestOutput]]]:
    from vllm import LLM, SamplingParams
    llm = LLM(**dataclasses.asdict(engine_args))
    assert all(
@ -66,9 +68,10 @@ def run_vllm(

    use_beam_search = False

+    outputs = None
    if not use_beam_search:
        start = time.perf_counter()
-        llm.generate(prompts,
+        outputs = llm.generate(prompts,
                               sampling_params,
                               lora_request=lora_requests,
                               use_tqdm=True)
@ -89,7 +92,46 @@ def run_vllm(
                ignore_eos=True,
            ))
        end = time.perf_counter()
-    return end - start
+    return end - start, outputs
+
+
+def run_vllm_chat(
+        requests: list[SampleRequest],
+        n: int,
+        engine_args: EngineArgs,
+        disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
+    """
+    Benchmark vLLM through its chat entry point and return the elapsed time
+    together with the request outputs. Recommended ONLY for multimodal models
+    (multimodal inputs, chat formatting); otherwise use run_vllm() instead.
+    """
+    from vllm import LLM, SamplingParams
+    llm = LLM(**dataclasses.asdict(engine_args))
+
+    # Every request (prompt plus expected output) must fit in the context.
+    assert all(
+        req.prompt_len + req.expected_output_len <=
+        llm.llm_engine.model_config.max_model_len
+        for req in requests), (
+            "Please ensure that max_model_len is greater than the sum of "
+            "prompt_len and expected_output_len for all requests.")
+
+    prompts = [req.prompt for req in requests]
+    sampling_params: list[SamplingParams] = [
+        SamplingParams(
+            n=n,
+            temperature=1.0,
+            top_p=1.0,
+            ignore_eos=True,
+            max_tokens=req.expected_output_len,
+            detokenize=not disable_detokenize,
+        ) for req in requests
+    ]
+    # Only the chat call itself is timed; engine construction is excluded.
+    start = time.perf_counter()
+    outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
+    elapsed = time.perf_counter() - start
+    return elapsed, outputs


 async def run_vllm_async(
@ -264,6 +306,8 @@ def get_requests(args, tokenizer):
        dataset_cls = RandomDataset
    elif args.dataset_name == "sharegpt":
        dataset_cls = ShareGPTDataset
+        if args.backend == "vllm-chat":
+            sample_kwargs["enable_multimodal_chat"] = True
    elif args.dataset_name == "sonnet":
        assert tokenizer.chat_template or tokenizer.default_chat_template, (
            "Tokenizer/model must have chat template for sonnet dataset.")
@ -272,6 +316,19 @@ def get_requests(args, tokenizer):
        sample_kwargs["return_prompt_formatted"] = True
    elif args.dataset_name == "burstgpt":
        dataset_cls = BurstGPTDataset
+    elif args.dataset_name == "hf":
+        if args.backend != "vllm-chat":
+            raise ValueError(
+                "hf datasets only are supported by vllm-chat backend")
+        # Choose between VisionArenaDataset and HuggingFaceDataset based on
+        # provided parameters.
+        dataset_cls = (VisionArenaDataset if args.dataset_path
+                       == VisionArenaDataset.VISION_ARENA_DATASET_PATH
+                       and args.hf_subset is None else HuggingFaceDataset)
+        common_kwargs['dataset_subset'] = args.hf_subset
+        common_kwargs['dataset_split'] = args.hf_split
+        sample_kwargs["enable_multimodal_chat"] = True
+
    else:
        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
    # Remove None values
@ -290,6 +347,7 @@ def main(args: argparse.Namespace):
    requests = get_requests(args, tokenizer)
    is_multi_modal = any(request.multi_modal_data is not None
                         for request in requests)
+    request_outputs: Optional[list[RequestOutput]] = None
    if args.backend == "vllm":
        if args.async_engine:
            elapsed_time = uvloop.run(
@ -301,8 +359,8 @@ def main(args: argparse.Namespace):
                    args.disable_detokenize,
                ))
        else:
-            elapsed_time = run_vllm(requests, args.n,
-                                    EngineArgs.from_cli_args(args),
+            elapsed_time, request_outputs = run_vllm(
+                requests, args.n, EngineArgs.from_cli_args(args),
                args.disable_detokenize)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
@ -312,20 +370,45 @@ def main(args: argparse.Namespace):
    elif args.backend == "mii":
        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                               args.output_len)
+    elif args.backend == "vllm-chat":
+        elapsed_time, request_outputs = run_vllm_chat(
+            requests, args.n, EngineArgs.from_cli_args(args),
+            args.disable_detokenize)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
-    total_num_tokens = sum(request.prompt_len + request.expected_output_len
-                           for request in requests)
-    total_output_tokens = sum(request.expected_output_len
-                              for request in requests)
-    if is_multi_modal:
-        print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
+
+    if request_outputs:
+        # Note: with the vllm and vllm-chat backends,
+        # we have request_outputs, which we use to count tokens.
+        total_prompt_tokens = 0
+        total_output_tokens = 0
+        for ro in request_outputs:
+            if not isinstance(ro, RequestOutput):
+                continue
+            total_prompt_tokens += len(
+                ro.prompt_token_ids) if ro.prompt_token_ids else 0
+            total_output_tokens += sum(
+                len(o.token_ids) for o in ro.outputs if o)
+        total_num_tokens = total_prompt_tokens + total_output_tokens
+    else:
+        total_num_tokens = sum(r.prompt_len + r.expected_output_len
+                               for r in requests)
+        total_output_tokens = sum(r.expected_output_len for r in requests)
+        total_prompt_tokens = total_num_tokens - total_output_tokens
+
+    if is_multi_modal and args.backend != "vllm-chat":
+        print("\033[91mWARNING\033[0m: Multi-modal request with "
+              f"{args.backend} backend detected. The "
              "following metrics are not accurate because image tokens are not"
              " counted. See vllm-project/vllm/issues/9778 for details.")
        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
+        # vllm-chat backend counts the image tokens now
+
    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
          f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
          f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
+    print(f"Total num prompt tokens:  {total_prompt_tokens}")
+    print(f"Total num output tokens:  {total_output_tokens}")

    # Output JSON results if specified
    if args.output_json:
@ -341,15 +424,98 @@ def main(args: argparse.Namespace):
        save_to_pytorch_benchmark_format(args, results)


+def validate_args(args):
+    """
+    Validate CLI args, filling defaults in place. Raises ValueError for
+    unsupported combinations and warns about options that will be ignored.
+    """
+    # === Deprecation and Defaulting ===
+    if args.dataset is not None:
+        warnings.warn(
+            "The '--dataset' argument will be deprecated in the next release. "
+            "Please use '--dataset-name' and '--dataset-path' instead.",
+            stacklevel=2)
+        args.dataset_path = args.dataset
+
+    if not getattr(args, "tokenizer", None):
+        args.tokenizer = args.model
+
+    # === Backend Validation ===
+    valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
+    if args.backend not in valid_backends:
+        raise ValueError(f"Unsupported backend: {args.backend}")
+
+    # === Dataset Configuration ===
+    if not args.dataset and not args.dataset_path:
+        print(
+            "When dataset path is not set, it will default to random dataset")
+        args.dataset_name = 'random'
+        if args.input_len is None:
+            raise ValueError("input_len must be provided for a random dataset")
+
+    # === Dataset Name Specific Checks ===
+    # --hf-subset and --hf-split: only used
+    # when dataset_name is 'hf'
+    if args.dataset_name != "hf" and (
+            getattr(args, "hf_subset", None) is not None
+            or getattr(args, "hf_split", None) is not None):
+        warnings.warn("--hf-subset and --hf-split will be ignored "
+                      "since --dataset-name is not 'hf'.",
+                      stacklevel=2)
+    elif args.dataset_name == "hf" and args.backend != "vllm-chat":
+        raise ValueError(
+            "When --dataset-name is 'hf', backend must be 'vllm-chat'")
+
+    # --random-range-ratio: only used when dataset_name is 'random'
+    if args.dataset_name != 'random' and args.random_range_ratio is not None:
+        warnings.warn("--random-range-ratio will be ignored since "
+                      "--dataset-name is not 'random'.",
+                      stacklevel=2)
+
+    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
+    # set.
+    if args.dataset_name not in {"random", "sonnet", None
+                                 } and args.prefix_len is not None:
+        warnings.warn("--prefix-len will be ignored since --dataset-name "
+                      "is not 'random', 'sonnet', or not set.",
+                      stacklevel=2)
+
+    # === LoRA Settings ===
+    if getattr(args, "enable_lora", False) and args.backend != "vllm":
+        raise ValueError(
+            "LoRA benchmarking is only supported for vLLM backend")
+    if getattr(args, "enable_lora", False) and args.lora_path is None:
+        raise ValueError("LoRA path must be provided when enable_lora is True")
+
+    # === Backend-specific Validations ===
+    if args.backend == "hf" and args.hf_max_batch_size is None:
+        raise ValueError("HF max batch size is required for HF backend")
+    if args.backend != "hf" and args.hf_max_batch_size is not None:
+        raise ValueError("HF max batch size is only for HF backend.")
+
+    if args.backend in {"hf", "mii"} and getattr(args, "quantization",
+                                                 None) is not None:
+        raise ValueError("Quantization is only for vLLM backend.")
+
+    if args.backend == "mii" and args.dtype != "auto":
+        raise ValueError("dtype must be auto for MII backend.")
+    if args.backend == "mii" and args.n != 1:
+        raise ValueError("n must be 1 for MII backend.")
+    if args.backend == "mii" and args.tokenizer != args.model:
+        raise ValueError(
+            "Tokenizer must be the same as the model for MII backend.")
+
+
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument("--backend",
                        type=str,
-                        choices=["vllm", "hf", "mii"],
+                        choices=["vllm", "hf", "mii", "vllm-chat"],
                        default="vllm")
-    parser.add_argument("--dataset-name",
+    parser.add_argument(
+        "--dataset-name",
        type=str,
-                        choices=["sharegpt", "random", "sonnet", "burstgpt"],
+        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
        help="Name of the dataset to benchmark on.",
        default="sharegpt")
    parser.add_argument(
@ -419,55 +585,24 @@ if __name__ == "__main__":
    parser.add_argument(
        "--random-range-ratio",
        type=float,
-        default=1.0,
+        default=None,
        help="Range of sampled ratio of input/output length, "
        "used only for RandomDataSet.",
    )

+    # hf dataset
+    parser.add_argument("--hf-subset",
+                        type=str,
+                        default=None,
+                        help="Subset of the HF dataset.")
+    parser.add_argument("--hf-split",
+                        type=str,
+                        default=None,
+                        help="Split of the HF dataset.")
+
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
-    if args.dataset is not None:
-        warnings.warn(
-            "The '--dataset' argument will be deprecated in the next "
-            "release. Please use '--dataset-name' and "
-            "'--dataset-path' in the future runs.",
-            stacklevel=2)
-        args.dataset_path = args.dataset
-    if args.dataset is None and args.dataset_path is None:
-        # for random dataset, the default sampling setting is in
-        # benchmark_dataset.RandomDataset
-        print("When dataset is not set, it will default to random dataset")
-    else:
-        assert args.input_len is None
-    if args.enable_lora:
-        assert args.lora_path is not None
-
-    if args.backend == "vllm":
-        if args.hf_max_batch_size is not None:
-            raise ValueError("HF max batch size is only for HF backend.")
-    elif args.backend == "hf":
-        if args.hf_max_batch_size is None:
-            raise ValueError("HF max batch size is required for HF backend.")
-        if args.quantization is not None:
-            raise ValueError("Quantization is only for vLLM backend.")
-        if args.enable_lora is not None:
-            raise ValueError("LoRA benchmarking is only supported for vLLM"
-                             " backend")
-    elif args.backend == "mii":
-        if args.dtype != "auto":
-            raise ValueError("dtype must be auto for MII backend.")
-        if args.n != 1:
-            raise ValueError("n must be 1 for MII backend.")
-        if args.quantization is not None:
-            raise ValueError("Quantization is only for vLLM backend.")
-        if args.hf_max_batch_size is not None:
-            raise ValueError("HF max batch size is only for HF backend.")
-        if args.tokenizer != args.model:
-            raise ValueError("Tokenizer must be the same as the model for MII "
-                             "backend.")
-        if args.enable_lora is not None:
-            raise ValueError("LoRA benchmarking is only supported for vLLM"
-                             " backend")
+    validate_args(args)
    main(args)
--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
@ -3,6 +3,12 @@

 #include "cpu_types.hpp"

+#if defined(__x86_64__)
+  #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2
+#else
+  #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES
+#endif
+
 namespace {
 template <typename scalar_t>
 void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
@ -95,8 +101,7 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
  }

  const int element_num_per_block = key_caches[0][0].numel();
-  VLLM_DISPATCH_FLOATING_TYPES(
-      key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
+  DISPATCH_MACRO(key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
    CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
    copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping,
                                   element_num_per_block, num_layers);
@ -118,14 +123,13 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
  int key_stride = key.stride(0);
  int value_stride = value.stride(0);

-  VLLM_DISPATCH_FLOATING_TYPES(
-      key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
+  DISPATCH_MACRO(key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
    CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
    reshape_and_cache_cpu_impl<scalar_t>(
        key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
        key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
-            slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride,
-            value_stride, num_heads, head_size, block_size, x);
+        slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride, value_stride,
+        num_heads, head_size, block_size, x);
    CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
  });
 }
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@ -16,9 +16,18 @@ namespace vec_op {
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)

+#define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...)        \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)
+
 #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

+#define VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME,                                \
+                     VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(__VA_ARGS__))
+
 #ifndef CPU_OP_GUARD
  #define CPU_KERNEL_GUARD_IN(NAME)
  #define CPU_KERNEL_GUARD_OUT(NAME)
--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
@ -170,7 +170,7 @@ void rotary_embedding_gptj_impl(
 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                      torch::Tensor& key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox) {
-  int num_tokens = query.numel() / query.size(-1);
+  int num_tokens = positions.numel();
  int rot_dim = cos_sin_cache.size(1);
  int num_heads = query.size(-1) / head_size;
  int num_kv_heads = key.size(-1) / head_size;
--- a/csrc/prepare_inputs/advance_step.cu
+++ b/csrc/prepare_inputs/advance_step.cu
@ -274,7 +274,7 @@ void advance_step_flashinfer(
  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
  cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);

-  int block_tables_stride = block_tables.stride(0);
+  [[maybe_unused]] int block_tables_stride = block_tables.stride(0);
  TORCH_CHECK((blocks * threads > num_queries),
              "multi-step: not enough threads to map to num_queries = ",
              num_queries, " block_tables.stride(0) = ", block_tables.stride(0),
--- a/csrc/quantization/fp8/amd/quant_utils.cuh
+++ b/csrc/quantization/fp8/amd/quant_utils.cuh
@ -19,12 +19,24 @@ __device__ __forceinline__ fp8_type cvt_c10(float const r) {
  return {};
 }

+// __hip_fp8_e4m3 only exists starting in ROCm 6.3. The macro
+// HIP_FP8_TYPE_OCP comes from the hip_fp8.h header and also makes
+// its first appearance in ROCm 6.3. Since VLLM_DISPATCH_FP8_TYPES
+// on ROCm instantiates both OCP and FNUZ kernels, we need to replace
+// the new HW cvt with something reasonable that doesn't rely on the
+// ROCm 6.3 feature. This allows compiling on ROCm 6.2 or newer.
 template <>
 __device__ __forceinline__ c10::Float8_e4m3fn cvt_c10(float const r) {
+    #if HIP_FP8_TYPE_OCP
  return c10::Float8_e4m3fn(
      __hip_cvt_float_to_fp8(r, __hip_fp8_e4m3::__default_saturation,
                             __hip_fp8_e4m3::__default_interpret),
      c10::Float8_e4m3fn::from_bits());
+    #else
+  // Cast implemented by pytorch. Uses bit manipulation instead of HW cvt.
+  // HW cvt above is faster when it is available (ROCm 6.3 or newer).
+  return static_cast<c10::Float8_e4m3fn>(r);
+    #endif
 }

 template <>
@ -434,7 +446,7 @@ scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, float scale) {
 template <>
 __inline__ __device__ uint32_t
 scaled_vec_conversion<uint32_t, uint16_t>(const uint16_t& a, float scale) {
-  __half2_raw h2r =
+  [[maybe_unused]] __half2_raw h2r =
      __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
  union {
    __half2_raw h2r;
--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@ -206,8 +206,8 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
  int offset_m = blockIdx.y * m_count;
  int offset_k = blockIdx.z * BLOCK_KN_SIZE;

-  int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  int end_m = min(offset_m + m_count, size_m);
+  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  int n = offset_n + t * 4;
@ -344,8 +344,8 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
  int offset_m = blockIdx.y * m_count;
  int offset_k = blockIdx.z * BLOCK_KN_SIZE;

-  int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  int end_m = min(offset_m + m_count, size_m);
+  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  int n = offset_n + t * 4;
@ -465,8 +465,8 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
  int offset_m = blockIdx.y * m_count;
  int offset_k = blockIdx.z * BLOCK_KN_SIZE;

-  int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  int end_m = min(offset_m + m_count, size_m);
+  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  int n = offset_n + t * 4;
@ -593,8 +593,8 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
  int offset_m = blockIdx.y * m_count;
  int offset_k = blockIdx.z * BLOCK_KN_SIZE;

-  int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  int end_m = min(offset_m + m_count, size_m);
+  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  int n = offset_n + t * 4;
--- a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu
+++ b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu
@ -437,9 +437,10 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
      for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) {
  #pragma unroll
        for (int k_idx = 0; k_idx < 2; ++k_idx) {
-          FType low16 = static_cast<FType>(C_frag[m_idx][n_idx][k_idx * 2]);
+          FType low16 =
+              ScalarType<FType>::float2num(C_frag[m_idx][n_idx][k_idx * 2]);
          FType high16 =
-              static_cast<FType>(C_frag[m_idx][n_idx][k_idx * 2 + 1]);
+              ScalarType<FType>::float2num(C_frag[m_idx][n_idx][k_idx * 2 + 1]);
          uint32_t tmp = (reinterpret_cast<uint32_t&>(low16) & 0xffff) |
                         (reinterpret_cast<uint32_t&>(high16) << 16);
          int sts_offset =
@ -793,7 +794,7 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
  FT scale_reg[4];
  *(reinterpret_cast<uint2*>(scale_reg)) =
      *(reinterpret_cast<const uint2*>(scales + params_nidx));
-  FT zero_reg[4] = {0};
+  FT zero_reg[4];
  if (zeros != nullptr) {
    *(reinterpret_cast<uint2*>(zero_reg)) =
        *(reinterpret_cast<const uint2*>(zeros + params_nidx));
@ -809,8 +810,10 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
        reinterpret_cast<typename HalfType<FT>::T2*>(&(fval_reg[ni * 4])));
  #pragma unroll
    for (int ki = 0; ki < 4; ++ki) {
-      fval_reg[ni * 4 + ki] =
-          (fval_reg[ni * 4 + ki] - zero_reg[ni]) * scale_reg[ni];
+      if (zeros != nullptr) {
+        fval_reg[ni * 4 + ki] = __hsub(fval_reg[ni * 4 + ki], zero_reg[ni]);
+      }
+      fval_reg[ni * 4 + ki] = __hmul(fval_reg[ni * 4 + ki], scale_reg[ni]);
      int sts_offset = sts_base_offset + ((ki / 2) * 8 + (ki % 2)) * 32 +
                       ((ni + lane_id % 4) % 4) * 8;
      smem[sts_offset] = fval_reg[ni * 4 + ki];
--- a/csrc/quantization/gptq_allspark/allspark_utils.cuh
+++ b/csrc/quantization/gptq_allspark/allspark_utils.cuh
@ -7,6 +7,8 @@
 #include <cuda_fp16.h>
 #include <cuda_bf16.h>
 #include <iostream>
+#include "../gptq_marlin/marlin_dtypes.cuh"
+using marlin::ScalarType;

 namespace allspark {

@ -66,14 +68,14 @@ __global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C,
    return;
  }

-  FType sum(0);
+  float sum = 0.f;

  int n_mat = N_MATRIX > 0 ? N_MATRIX : (int)n_matrix;
  for (int i = 0; i < n_mat; ++i) {
-    sum += C_split[idx + i * matrix_size];
+    sum += ScalarType<FType>::num2float(C_split[idx + i * matrix_size]);
  }

-  C[idx] = sum;
+  C[idx] = ScalarType<FType>::float2num(sum);
 }

 template <typename FType>
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@ -308,8 +308,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(

  constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4);

-  __shared__ float shared_qk_max[NWARPS][16 + 1];
-  __shared__ float shared_exp_sum[NWARPS][16 + 1];
+  [[maybe_unused]] __shared__ float shared_qk_max[NWARPS][16 + 1];
+  [[maybe_unused]] __shared__ float shared_exp_sum[NWARPS][16 + 1];
  // shared_logits is used for multiple purposes
  __shared__ _B16x4 shared_logits[NWARPS][4][16][4];

@ -426,7 +426,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
    const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
    const int klocal_token_idx =
        TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
-    const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
+    [[maybe_unused]] const int kglobal_token_idx =
+        partition_start_token_idx + klocal_token_idx;
    const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
    const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;

--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -370,7 +370,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> "
      "bool");
  ops.impl("cutlass_scaled_mm_supports_block_fp8",
-           &cutlass_scaled_mm_supports_fp8);
+           &cutlass_scaled_mm_supports_block_fp8);

  // Check if cutlass sparse scaled_mm is supported for CUDA devices of the
  // given capability
--- a/docs/source/community/meetups.md
+++ b/docs/source/community/meetups.md
@ -4,6 +4,7 @@

 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:

+- [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)
 - [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing)
 - [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing)
 - [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing)
--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@ -34,7 +34,8 @@ Further update the model as follows:
            image_features = self.vision_encoder(image_input)
            return self.multi_modal_projector(image_features)

-        def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]:
+        def get_multimodal_embeddings(
+                self, **kwargs: object) -> Optional[MultiModalEmbeddings]:

            # Validate the multimodal input keyword arguments
            image_input = self._parse_and_validate_image_input(**kwargs)
@ -61,7 +62,7 @@ Further update the model as follows:
        def get_input_embeddings(
            self,
            input_ids: torch.Tensor,
-            multimodal_embeddings: Optional[NestedTensors] = None,
+            multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
        ) -> torch.Tensor:

            # `get_input_embeddings` should already be implemented for the language 
--- a/docs/source/deployment/frameworks/helm.md
+++ b/docs/source/deployment/frameworks/helm.md
@ -4,9 +4,9 @@

 A Helm chart to deploy vLLM for Kubernetes

-Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLMm Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variables values.
+Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.

-This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm install and documentation on architecture and values file.
+This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file.

 ## Prerequisites

--- a/docs/source/deployment/k8s.md
+++ b/docs/source/deployment/k8s.md
@ -4,17 +4,19 @@

 Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.

--------
-
-Alternatively, you can also deploy Kubernetes using [helm chart](https://docs.vllm.ai/en/latest/deployment/frameworks/helm.html). There are also open-source projects available to make your deployment even smoother.
-
-* [vLLM production-stack](https://github.com/vllm-project/production-stack): Born out of a Berkeley-UChicago collaboration, vLLM production stack is a project that contains latest research and community effort, while still delivering production-level stability and performance. Checkout the [documentation page](https://docs.vllm.ai/en/latest/deployment/integrations/production-stack.html) for more details and examples.
-
--------
+Alternatively, you can deploy vLLM to Kubernetes using any of the following:
+* [Helm](frameworks/helm.md)
+* [InftyAI/llmaz](integrations/llmaz.md)
+* [KServe](integrations/kserve.md)
+* [kubernetes-sigs/lws](frameworks/lws.md)
+* [meta-llama/llama-stack](integrations/llamastack.md)
+* [substratusai/kubeai](integrations/kubeai.md)
+* [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
+* [vllm-project/production-stack](integrations/production-stack.md)

 ## Pre-requisite

-Ensure that you have a running Kubernetes environment with GPU (you can follow [this tutorial](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-install-kubernetes-env.md) to install a Kubernetes environment on a bare-medal GPU machine).
+Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).

 ## Deployment using native K8s

--- a/docs/source/design/kernel/paged_attention.md
+++ b/docs/source/design/kernel/paged_attention.md
@ -419,7 +419,7 @@ List of `v_vec` for one thread
  which is also `V_VEC_SIZE` elements from `logits`. Overall, with
  multiple inner iterations, each warp will process one block of value
  tokens. And with multiple outer iterations, the whole context value
-  tokens are processd
+  tokens are processed

  ```cpp
  float accs[NUM_ROWS_PER_THREAD];
--- a/docs/source/design/v1/metrics.md
+++ b/docs/source/design/v1/metrics.md
@ -13,7 +13,7 @@ Ensure the v1 LLM Engine exposes a superset of the metrics available in v0.
 Metrics in vLLM can be categorized as follows:

 1. Server-level metrics: these are global metrics that track the state and performance of the LLM engine. These are typically exposed as Gauges or Counters in Prometheus.
-2. Request-level metrics: these are metrics that track the characteristics - e.g. size and timing - of individual requests. These are typically exposed as Histrograms in Prometheus, and are often the SLO that an SRE monitoring vLLM will be tracking.
+2. Request-level metrics: these are metrics that track the characteristics - e.g. size and timing - of individual requests. These are typically exposed as Histograms in Prometheus, and are often the SLO that an SRE monitoring vLLM will be tracking.

 The mental model is that the "Server-level Metrics" explain why the "Request-level Metrics" are what they are.

@ -47,7 +47,7 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics`
 - `vllm:tokens_total` (Counter)
 - `vllm:iteration_tokens_total` (Histogram)
 - `vllm:time_in_queue_requests` (Histogram)
- `vllm:model_forward_time_milliseconds` (Histogram
+- `vllm:model_forward_time_milliseconds` (Histogram)
 - `vllm:model_execute_time_milliseconds` (Histogram)
 - `vllm:request_params_n` (Histogram)
 - `vllm:request_params_max_tokens` (Histogram)
--- a/docs/source/features/lora.md
+++ b/docs/source/features/lora.md
@ -110,7 +110,7 @@ In addition to serving LoRA adapters at server startup, the vLLM server now supp
 LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility
 to change models on-the-fly is needed.

-Note: Enabling this feature in production environments is risky as user may participate model adapter management.
+Note: Enabling this feature in production environments is risky as users may participate in model adapter management.

 To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING`
 is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active.
--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@ -162,7 +162,7 @@ A variety of speculative models of this type are available on HF hub:
 ## Speculating using EAGLE based draft models

 The following code configures vLLM to use speculative decoding where proposals are generated by
-an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model.
+an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract the request-level acceptance rate, can be found [here](<gh-file:examples/offline_inference/eagle.py>).

 ```python
 from vllm import LLM, SamplingParams
--- a/docs/source/getting_started/faq.md
+++ b/docs/source/getting_started/faq.md
@ -15,7 +15,7 @@ more are listed [here](#supported-models).

 By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B),
 [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models,
-but they are expected be inferior to models that are specifically trained on embedding tasks.
+but they are expected to be inferior to models that are specifically trained on embedding tasks.

 ______________________________________________________________________

--- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
@ -119,7 +119,7 @@ If you're observing the following error: `docker: Error response from daemon: Un

 ## Supported configurations

-The following configurations have been validated to be function with
+The following configurations have been validated to function with
 Gaudi2 devices. Configurations that are not listed may or may not work.

 - [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b)
--- a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
@ -19,7 +19,7 @@ Currently, there are no pre-built OpenVINO wheels.

 ### Build wheel from source

-First, install Python and ensure you lave the latest pip. For example, on Ubuntu 22.04, you can run:
+First, install Python and ensure you have the latest pip. For example, on Ubuntu 22.04, you can run:

 ```console
 sudo apt-get update  -y
--- a/docs/source/getting_started/installation/cpu.md
+++ b/docs/source/getting_started/installation/cpu.md
@ -189,12 +189,13 @@ vLLM CPU backend supports the following vLLM features:
 - Model Quantization (`INT8 W8A8, AWQ, GPTQ`)
 - Chunked-prefill
 - Prefix-caching
- FP8-E5M2 KV-Caching (TODO)
+- FP8-E5M2 KV cache

 ## Related runtime environment variables

 - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
 - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
+- `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).

 ## Performance tips

--- a/docs/source/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/source/getting_started/installation/gpu/cuda.inc.md
@ -131,6 +131,8 @@ Building from source requires a lot of compilation. If you are building from sou
 For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` .
 As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.

+When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built.
+
 [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
 The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
 :::
--- a/docs/source/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/source/getting_started/installation/gpu/xpu.inc.md
@ -1,6 +1,6 @@
 # Installation

-vLLM initially supports basic model inferencing and serving on Intel GPU platform.
+vLLM initially supports basic model inference and serving on Intel GPU platform.

 :::{attention}
 There are no pre-built wheels or images for this device, so you must build vLLM from source.
@ -65,7 +65,7 @@ $ docker run -it \

 ## Supported features

-XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following:
+XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution looks like the following:

 ```console
 python -m vllm.entrypoints.openai.api_server \
@ -78,6 +78,6 @@ python -m vllm.entrypoints.openai.api_server \
     -tp=8
 ```

-By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
+By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equal to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.

-There are some new features coming with ipex-xpu 2.6, eg: **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc.
+There are some new features coming with ipex-xpu 2.6, e.g. **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc.
--- a/docs/source/getting_started/v1_user_guide.md
+++ b/docs/source/getting_started/v1_user_guide.md
@ -0,0 +1,159 @@
+# vLLM V1 User Guide
+
+V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
+
+## Why vLLM V1?
+
+vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.
+
+Building on V0’s success, vLLM V1 retains the stable and proven components from V0
+(such as the models, GPU kernels, and utilities). At the same time, it significantly
+re-architects the core systems, covering the scheduler, KV cache manager, worker,
+sampler, and API server, to provide a cohesive, maintainable framework that better
+accommodates continued growth and innovation.
+
+Specifically, V1 aims to:
+
+- Provide a **simple, modular, and easy-to-hack codebase**.
+- Ensure **high performance** with near-zero CPU overhead.
+- **Combine key optimizations** into a unified architecture.
+- Require **zero configs** by enabling features/optimizations by default.
+
+We see significant performance improvements from upgrading to the V1 core engine, in
+particular for long context scenarios. Please see the performance benchmark (to be
+added).
+
+For more details, check out the vLLM V1 blog post [vLLM V1: A Major
+Upgrade to vLLM’s Core Architecture](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html) (published Jan 27, 2025).
+
+This living user guide outlines a few known **important changes and limitations** introduced by vLLM V1. The team has been working actively to make V1 the default engine; therefore, this guide will be updated constantly as more features get supported on vLLM V1.
+
+### Support Overview
+#### Hardware
+
+| Hardware | Status                                   |
+|----------|------------------------------------------|
+| **NVIDIA** | <nobr>🚀 Natively Supported</nobr>         |
+| **AMD**    | <nobr>🚧 WIP</nobr>           |
+| **TPU**    | <nobr>🚧 WIP</nobr>           |
+#### Feature / Model
+
+| Feature / Model | Status |
+|-----------------|-----------------------------------------------------------------------------------|
+| **Prefix Caching**                    | <nobr>🚀 Optimized</nobr>                                                        |
+| **Chunked Prefill**                    | <nobr>🚀 Optimized</nobr>                                                        |
+| **Logprobs Calculation**                    | <nobr>🟢 Functional</nobr>                                                        |
+| **LoRA**                                    | <nobr>🟢 Functional ([PR #13096](https://github.com/vllm-project/vllm/pull/13096))</nobr>|
+| **Multimodal Models**                       | <nobr>🟢 Functional</nobr>                                                        |
+| **Spec Decode**                             | <nobr>🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))</nobr>|
+| **Prompt Logprobs with Prefix Caching**     | <nobr>🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))</nobr>|
+| **FP8 KV Cache**                            | <nobr>🟡 Planned</nobr>                                                           |
+| **Structured Output Alternative Backends**  | <nobr>🟡 Planned</nobr>                                                           |
+| **Embedding Models**                        | <nobr>🟡 Planned ([RFC #12249](https://github.com/vllm-project/vllm/issues/12249))</nobr> |
+| **Mamba Models**                            | <nobr>🟡 Planned</nobr>                                                           |
+| **Encoder-Decoder Models**                  | <nobr>🟡 Planned</nobr>                                                           |
+| **Request-level Structured Output Backend** | <nobr>🔴 Deprecated</nobr>                                                        |
+| **best_of**                                 | <nobr>🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))</nobr>|
+| **Per-Request Logits Processors**           | <nobr>🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360))</nobr> |
+| **GPU <> CPU KV Cache Swapping**            | <nobr>🔴 Deprecated</nobr>                                                        |
+
+- **🚀 Optimized**: Nearly fully optimized, with no further work currently planned.
+- **🟢 Functional**: Fully operational, with ongoing optimizations.  
+- **🚧 WIP**: Under active development.  
+- **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs).  
+- **🔴 Deprecated**: Not planned for V1 unless there is strong demand.
+
+**Note**: vLLM V1’s unified scheduler treats both prompt and output tokens the same
+way by using a simple dictionary (e.g., {request_id: num_tokens}) to dynamically
+allocate a fixed token budget per request, enabling features like chunked prefills,
+prefix caching, and speculative decoding without a strict separation between prefill
+and decode phases.
+
+### Semantic Changes and Deprecated Features
+
+#### Logprobs
+
+vLLM V1 supports logprobs and prompt logprobs. However, there are some important semantic
+differences compared to V0:
+
+**Logprobs Calculation**
+
+Logprobs in V1 are now returned immediately once computed from the model’s raw output (i.e.
+before applying any logits post-processing such as temperature scaling or penalty
+adjustments). As a result, the returned logprobs do not reflect the final adjusted
+probabilities used during sampling.
+
+Support for logprobs with post-sampling adjustments is in progress and will be added in future updates.
+
+**Prompt Logprobs with Prefix Caching**
+
+Currently prompt logprobs are only supported when prefix caching is turned off via `--no-enable-prefix-caching`. In a future release, prompt logprobs will be compatible with prefix caching, but a recomputation will be triggered to recover the full prompt logprobs even upon a prefix cache hit. See details in [RFC #13414](https://github.com/vllm-project/vllm/issues/13414).
+
+#### Deprecated Features
+
+As part of the major architectural rework in vLLM V1, several legacy features have been deprecated.
+
+**Sampling features**
+
+- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](https://github.com/vllm-project/vllm/issues/13361).
+- **Per-Request Logits Processors**: In V0, users could pass custom
+  processing functions to adjust logits on a per-request basis. In vLLM V1, this
+  feature has been deprecated. Instead, the design is moving toward supporting **global logits
+  processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](https://github.com/vllm-project/vllm/pull/13360).
+
+**KV Cache features**
+
+- **GPU <> CPU KV Cache Swapping**: with the new simplified core architecture, vLLM V1 no longer requires KV cache swapping
+to handle request preemptions.
+
+**Structured Output features**
+
+- **Request-level Structured Output Backend**: Deprecated; support for alternative backends
+  (outlines, guidance) with fallbacks is WIP.
+### Feature & Model Support in Progress
+
+Although we have re-implemented and partially optimized many features and models from V0 in vLLM V1, optimization work is still ongoing for some, and others remain unsupported.
+
+#### Features to Be Optimized
+
+These features are already supported in vLLM V1, but their optimization is still
+in progress.
+
+- **LoRA**: LoRA is functionally working on vLLM V1 but its performance is
+  inferior to that of V0. The team is actively working on improving its
+  performance
+(e.g., see [PR #13096](https://github.com/vllm-project/vllm/pull/13096)).
+
+- **Spec Decode**: Currently, only ngram-based spec decode is supported in V1. There
+  will be follow-up work to support other types of spec decode (e.g., see [PR #13933](https://github.com/vllm-project/vllm/pull/13933)). We will prioritize support for EAGLE and MTP over draft-model-based spec decode.
+
+#### Features to Be Supported
+
+- **FP8 KV Cache**: While vLLM V1 introduces new FP8 kernels for model weight quantization, support for an FP8 key–value cache is not yet available. Users must continue using FP16 (or other supported precisions) for the KV cache.
+
+- **Structured Output Alternative Backends**: Structured output alternative backends (outlines, guidance) support is planned. V1 currently
+  supports only the `xgrammar:no_fallback` mode, meaning that it will error out if the output schema is unsupported by xgrammar.
+  Details about the structured outputs can be found
+  [here](https://docs.vllm.ai/en/latest/features/structured_outputs.html).
+
+#### Models to Be Supported
+
+vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol,
+and the majority fall into the following categories. V1 support for these models will be added eventually.
+
+**Embedding Models**  
+Instead of having a separate model runner, a hidden states processor ([RFC #12249](https://github.com/vllm-project/vllm/issues/12249)), which is based on the global logits processor ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360)), has been proposed to enable simultaneous generation and embedding using the same engine instance in V1. It is still in the planning stage.
+
+**Mamba Models**  
+Models using selective state-space mechanisms (instead of standard transformer attention)
+are not yet supported (e.g., `MambaForCausalLM`, `JambaForCausalLM`).
+
+**Encoder-Decoder Models**  
+vLLM V1 is currently optimized for decoder-only transformers. Models requiring
+  cross-attention between separate encoder and decoder are not yet supported (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`).
+
+For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html).
+
+## FAQ
+
+TODO
--- a/docs/source/index.md
+++ b/docs/source/index.md
@ -67,6 +67,8 @@ getting_started/quickstart
 getting_started/examples/examples_index
 getting_started/troubleshooting
 getting_started/faq
+getting_started/v1_user_guide
+
 :::

 % What does vLLM support?
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@ -101,7 +101,7 @@ class MyAttention(nn.Module):

  def forward(self, hidden_states, **kwargs): # <- kwargs are required
    ...
-    attention_interface = attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+    attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
    attn_output, attn_weights = attention_interface(
      self,
      query_states,
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@ -20,7 +20,7 @@ There is one edge case: if the model fits in a single node with multiple GPUs, b

 ## Running vLLM on a single node

-vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
+vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inference currently requires Ray.

 Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured `tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the `LLM` class `distributed_executor_backend` argument or `--distributed-executor-backend` API server argument. Set it to `mp` for multiprocessing or `ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.

@ -29,7 +29,7 @@ To run multi-GPU inference with the `LLM` class, set the `tensor_parallel_size`
 ```python
 from vllm import LLM
 llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
-output = llm.generate("San Franciso is a")
+output = llm.generate("San Francisco is a")
 ```

 To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
--- a/docs/source/serving/metrics.md
+++ b/docs/source/serving/metrics.md
@ -39,7 +39,16 @@ The following metrics are exposed:

 The following metrics are deprecated and due to be removed in a future version:

- *(No metrics are currently deprecated)*
+- `vllm:num_requests_swapped`, `vllm:cpu_cache_usage_perc`, and
+  `vllm:cpu_prefix_cache_hit_rate` because KV cache offloading is not
+  used in V1.
+- `vllm:gpu_prefix_cache_hit_rate` is replaced by queries+hits
+  counters in V1.
+- `vllm:time_in_queue_requests` because it duplicates
+  `vllm:request_queue_time_seconds`.
+- `vllm:model_forward_time_milliseconds` and
+  `vllm:model_execute_time_milliseconds` because
+  prefill/decode/inference time metrics should be used instead.

 Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
 but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
--- a/docs/source/training/rlhf.md
+++ b/docs/source/training/rlhf.md
@ -1,6 +1,6 @@
 # Reinforcement Learning from Human Feedback

-Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviours.
+Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors.

 vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl).

--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@ -76,5 +76,10 @@ if __name__ == "__main__":
                             GPUs_per_dp_rank))
        proc.start()
        procs.append(proc)
+    exit_code = 0
    for proc in procs:
        proc.join()
+        if proc.exitcode:
+            exit_code = proc.exitcode
+
+    exit(exit_code)
--- a/examples/offline_inference/eagle.py
+++ b/examples/offline_inference/eagle.py
@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+import json
+import os
+
+from transformers import AutoTokenizer
+
+from vllm import LLM, SamplingParams
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    "--dataset",
+    type=str,
+    default="./examples/data/gsm8k.jsonl",
+    help="downloaded from the eagle repo " \
+    "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
+)
+parser.add_argument("--max_num_seqs", type=int, default=8)
+parser.add_argument("--num_prompts", type=int, default=80)
+parser.add_argument("--num_spec_tokens", type=int, default=2)
+parser.add_argument("--tp", type=int, default=1)
+parser.add_argument("--draft_tp", type=int, default=1)
+parser.add_argument("--enforce_eager", action='store_true')
+parser.add_argument("--enable_chunked_prefill", action='store_true')
+parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
+parser.add_argument("--temp", type=float, default=0)
+
+args = parser.parse_args()
+
+print(args)
+
+model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
+eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"
+
+max_model_len = 2048
+
+tokenizer = AutoTokenizer.from_pretrained(model_dir)
+
+if os.path.exists(args.dataset):
+    prompts = []
+    num_prompts = args.num_prompts
+    with open(args.dataset) as f:
+        for line in f:
+            data = json.loads(line)
+            prompts.append(data["turns"][0])
+else:
+    prompts = ["The future of AI is", "The president of the United States is"]
+
+prompts = prompts[:args.num_prompts]
+num_prompts = len(prompts)
+
+prompt_ids = [
+    tokenizer.apply_chat_template([{
+        "role": "user",
+        "content": prompt
+    }],
+                                  add_generation_prompt=True)
+    for prompt in prompts
+]
+
+llm = LLM(
+    model=model_dir,
+    trust_remote_code=True,
+    tensor_parallel_size=args.tp,
+    enable_chunked_prefill=args.enable_chunked_prefill,
+    max_num_batched_tokens=args.max_num_batched_tokens,
+    enforce_eager=args.enforce_eager,
+    max_model_len=max_model_len,
+    max_num_seqs=args.max_num_seqs,
+    gpu_memory_utilization=0.8,
+    speculative_model=eagle_dir,
+    num_speculative_tokens=args.num_spec_tokens,
+    speculative_draft_tensor_parallel_size=args.draft_tp,
+    speculative_max_model_len=max_model_len,
+    disable_log_stats=False,
+)
+
+sampling_params = SamplingParams(temperature=args.temp, max_tokens=256)
+
+outputs = llm.generate(prompt_token_ids=prompt_ids,
+                       sampling_params=sampling_params)
+
+# calculate the average number of accepted tokens per forward pass, +1 is
+# to account for the token from the target model that's always going to be
+# accepted
+acceptance_counts = [0] * (args.num_spec_tokens + 1)
+for output in outputs:
+    for step, count in enumerate(output.metrics.spec_token_acceptance_counts):
+        acceptance_counts[step] += count
+
+print(f"mean acceptance length: \
+    {sum(acceptance_counts) / acceptance_counts[0]:.2f}")
--- a/examples/offline_inference/pixtral.py
+++ b/examples/offline_inference/pixtral.py
@ -43,12 +43,18 @@ from vllm.sampling_params import SamplingParams
 #     python demo.py advanced


-def run_simple_demo():
+def run_simple_demo(args: argparse.Namespace):
    model_name = "mistralai/Pixtral-12B-2409"
    sampling_params = SamplingParams(max_tokens=8192)

-    # Lower max_num_seqs or max_model_len on low-VRAM GPUs.
-    llm = LLM(model=model_name, tokenizer_mode="mistral")
+    # Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
+    llm = LLM(
+        model=model_name,
+        tokenizer_mode="mistral",
+        max_model_len=4096,
+        max_num_seqs=2,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )

    prompt = "Describe this image in one sentence."
    image_url = "https://picsum.photos/id/237/200/300"
@ -76,7 +82,7 @@ def run_simple_demo():
    print(outputs[0].outputs[0].text)


-def run_advanced_demo():
+def run_advanced_demo(args: argparse.Namespace):
    model_name = "mistralai/Pixtral-12B-2409"
    max_img_per_msg = 5
    max_tokens_per_img = 4096
@ -87,6 +93,7 @@ def run_advanced_demo():
        tokenizer_mode="mistral",
        limit_mm_per_prompt={"image": max_img_per_msg},
        max_model_len=max_img_per_msg * max_tokens_per_img,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompt = "Describe the following image."
@ -153,14 +160,19 @@ def main():
        help="Specify the demo mode: 'simple' or 'advanced'",
    )

+    parser.add_argument(
+        '--disable-mm-preprocessor-cache',
+        action='store_true',
+        help='If True, disables caching of multi-modal preprocessor/mapper.')
+
    args = parser.parse_args()

    if args.mode == "simple":
        print("Running simple demo...")
-        run_simple_demo()
+        run_simple_demo(args)
    elif args.mode == "advanced":
        print("Running advanced demo...")
-        run_advanced_demo()
+        run_advanced_demo(args)


 if __name__ == "__main__":
--- a/examples/online_serving/api_client.py
+++ b/examples/online_serving/api_client.py
@ -42,7 +42,7 @@ def post_http_request(prompt: str,
 def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
    for chunk in response.iter_lines(chunk_size=8192,
                                     decode_unicode=False,
-                                     delimiter=b"\0"):
+                                     delimiter=b"\n"):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"]
--- a/examples/online_serving/gradio_webserver.py
+++ b/examples/online_serving/gradio_webserver.py
@ -21,7 +21,7 @@ def http_bot(prompt):

    for chunk in response.iter_lines(chunk_size=8192,
                                     decode_unicode=False,
-                                     delimiter=b"\0"):
+                                     delimiter=b"\n"):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"][0]
--- a/examples/other/logging_configuration.md
+++ b/examples/other/logging_configuration.md
@ -127,7 +127,7 @@ configuration for the root vLLM logger and for the logger you wish to silence:
    "vllm": {
      "handlers": ["vllm"],
      "level": "DEBUG",
-      "propagage": false
+      "propagate": false
    },
    "vllm.example_noisy_logger": {
      "propagate": false
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,7 +6,7 @@ requires = [
    "packaging",
    "setuptools>=61",
    "setuptools-scm>=8.0",
-    "torch == 2.5.1",
+    "torch == 2.6.0",
    "wheel",
    "jinja2",
 ]
--- a/requirements/build.txt
+++ b/requirements/build.txt
@ -4,6 +4,6 @@ ninja
 packaging
 setuptools>=61
 setuptools-scm>=8
-torch==2.5.1
+torch==2.6.0
 wheel
-jinja2
+jinja2>=3.1.6
--- a/requirements/common.txt
+++ b/requirements/common.txt
@ -1,3 +1,4 @@
+cachetools
 psutil
 sentencepiece  # Required for LLaMA tokenizer.
 numpy < 2.0.0
@ -38,3 +39,4 @@ cloudpickle # allows pickling lambda functions in model_executor/models/registry
 watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/other/logging_configuration.md
 scipy # Required for phi-4-multimodal-instruct
+ninja # Required for xgrammar, rocm, tpu, xpu
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@ -4,9 +4,9 @@
 numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding

 # Dependencies for NVIDIA GPUs
-ray[cgraph] >= 2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch == 2.5.1
-torchaudio==2.5.1
+ray[cgraph]>=2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
+torch==2.6.0
+torchaudio==2.6.0
 # These must be updated alongside torch
-torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.5.1
+torchvision==0.21.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+xformers==0.0.29.post2; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.6.0
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@ -9,6 +9,7 @@ msgspec
 cloudpickle

 # packages to install to build the documentation
+cachetools
 pydantic >= 2.8
 -f https://download.pytorch.org/whl/cpu
 torch
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@ -7,10 +7,9 @@ torchvision==0.20.1
 torchaudio==2.5.1

 cmake>=3.26
-ninja
 packaging
 setuptools>=61
 setuptools-scm>=8
 wheel
-jinja2
+jinja2>=3.1.6
 amdsmi==6.2.4
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@ -0,0 +1,23 @@
+
+# entrypoints test
+# librosa==0.10.2.post1 # required by audio tests in entrypoints/openai
+audioread==3.0.1
+cffi==1.17.1
+decorator==5.2.1
+lazy-loader==0.4
+platformdirs==4.3.6
+pooch==1.8.2
+#pycparser==2.22
+soundfile==0.13.1
+soxr==0.5.0.post1
+librosa==0.10.2.post1
+
+# entrypoints test
+#vllm[video] # required by entrypoints/openai/test_video.py
+decord==0.6.0
+
+# entrypoints test
+#sentence-transformers # required by entrypoints/openai/test_score.py
+sentence-transformers==3.4.1
+
+
--- a/requirements/test.in
+++ b/requirements/test.in
@ -8,6 +8,7 @@ pytest-shard

 # testing utils
 awscli
+backoff # required for phi4mm test
 decord # required for video tests
 einops # required for MPT, qwen-vl and Mamba
 httpx
@ -21,8 +22,9 @@ sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
 timm # required for internvl test
-torch==2.5.1
-torchaudio==2.5.1
+torch==2.6.0
+torchaudio==2.6.0
+torchvision==0.21.0
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[opencv] >= 1.5.0 # required for pixtral test
@ -30,7 +32,7 @@ datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.4 # required for model evaluation test
 transformers==4.48.2 
 # quantization
-bitsandbytes>=0.45.0
+bitsandbytes>=0.45.3
 buildkite-test-collector==0.1.9

 genai_perf==0.0.8
--- a/requirements/test.txt
+++ b/requirements/test.txt
@ -8,7 +8,7 @@ accelerate==1.0.1
    #   peft
 aiohappyeyeballs==2.4.3
    # via aiohttp
-aiohttp==3.10.10
+aiohttp==3.10.11
    # via
    #   datasets
    #   fsspec
@ -33,7 +33,9 @@ audioread==3.0.1
    # via librosa
 awscli==1.35.23
    # via -r requirements/test.in
-bitsandbytes==0.45.0
+backoff==2.2.1
+    # via -r requirements/test.in
+bitsandbytes==0.45.3
    # via -r requirements/test.in
 black==24.10.0
    # via datamodel-code-generator
@ -127,7 +129,6 @@ filelock==3.16.1
    #   ray
    #   torch
    #   transformers
-    #   triton
 fonttools==4.54.1
    # via matplotlib
 frozendict==2.4.6
@ -182,7 +183,7 @@ iniconfig==2.0.0
    # via pytest
 isort==5.13.2
    # via datamodel-code-generator
-jinja2==3.1.4
+jinja2==3.1.6
    # via
    #   datamodel-code-generator
    #   torch
@ -320,6 +321,8 @@ nvidia-cusparse-cu12==12.3.1.170
    # via
    #   nvidia-cusolver-cu12
    #   torch
+nvidia-cusparselt-cu12==0.6.2
+    # via torch
 nvidia-nccl-cu12==2.21.5
    # via torch
 nvidia-nvjitlink-cu12==12.4.127
@ -591,7 +594,7 @@ timm==1.0.11
    # via -r requirements/test.in
 tokenizers==0.21.0
    # via transformers
-torch==2.5.1
+torch==2.6.0
    # via
    #   -r requirements/test.in
    #   accelerate
@ -607,13 +610,15 @@ torch==2.5.1
    #   torchvision
    #   vector-quantize-pytorch
    #   vocos
-torchaudio==2.5.1
+torchaudio==2.6.0
    # via
    #   -r requirements/test.in
    #   encodec
    #   vocos
-torchvision==0.20.1
-    # via timm
+torchvision==0.21.0
+    # via
+    #   -r requirements/test.in
+    #   timm
 tqdm==4.66.6
    # via
    #   datasets
@ -638,7 +643,7 @@ transformers==4.48.2
    #   transformers-stream-generator
 transformers-stream-generator==0.0.5
    # via -r requirements/test.in
-triton==3.1.0
+triton==3.2.0
    # via torch
 tritonclient==2.51.0
    # via
@ -651,7 +656,6 @@ typepy==1.3.2
    #   tabledata
 typing-extensions==4.12.2
    # via
-    #   bitsandbytes
    #   huggingface-hub
    #   librosa
    #   mistral-common
--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@ -3,11 +3,10 @@

 # Dependencies for TPU
 cmake>=3.26
-ninja
 packaging
 setuptools-scm>=8
 wheel
-jinja2
+jinja2>=3.1.6
 ray[default]
 ray[data]

--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@ -3,12 +3,11 @@

 ray>=2.9
 cmake>=3.26
-ninja
 packaging
 setuptools-scm>=8
 setuptools>=75.8.0
 wheel
-jinja2
+jinja2>=3.1.6
 datasets # for benchmark scripts

 torch==2.6.0+xpu
--- a/tests/async_engine/conftest.py
+++ b/tests/async_engine/conftest.py
@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0

+import os
 import subprocess
 import sys
 import time
@ -44,7 +45,10 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
        distributed_executor_backend,
    ]

-    uvicorn_process = subprocess.Popen(commands)
+    # API Server Test Requires V0.
+    my_env = os.environ.copy()
+    my_env["VLLM_USE_V1"] = "0"
+    uvicorn_process = subprocess.Popen(commands, env=my_env)
    yield
    uvicorn_process.terminate()

--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@ -151,6 +151,10 @@ def uid() -> str:

@pytest_asyncio.fixture(scope="module")
 async def async_engine():
+    # We cannot use monkeypatch since this is a module
+    # scoped fixture and monkeypatch is function scoped.
+    previous_value = os.getenv("VLLM_USE_V1", None)
+    os.environ["VLLM_USE_V1"] = "0"
    engine = await asyncio.get_event_loop().run_in_executor(executor=None,
                                                            func=start_engine)
    try:
@ -161,6 +165,11 @@ async def async_engine():
        await asyncio.sleep(0.1)
        cleanup_dist_env_and_memory()

+        if previous_value:
+            os.environ["VLLM_USE_V1"] = previous_value
+        else:
+            del os.environ["VLLM_USE_V1"]
+

@pytest.fixture()
 def should_do_global_cleanup_after_test(request) -> bool:
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@ -47,6 +47,7 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False])
 def test_models(
+    monkeypatch: pytest.MonkeyPatch,
    hf_runner,
    model: str,
    backend: str,
@ -63,7 +64,8 @@ def test_models(
        pytest.skip(
            f"{backend} does not support gemma2 with full context length.")

-    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", backend)

        # 5042 tokens for gemma2
        # gemma2 has alternating sliding window size of 4096
@ -80,7 +82,8 @@ def test_models(
                        dtype=dtype,
                        enforce_eager=enforce_eager,
                        gpu_memory_utilization=0.7) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)

        check_outputs_equal(
            outputs_0_lst=hf_outputs,
@ -104,6 +107,7 @@ def test_models(
        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
    ])
 def test_models_distributed(
+    monkeypatch: pytest.MonkeyPatch,
    hf_runner,
    vllm_runner,
    example_prompts,
@ -116,13 +120,17 @@ def test_models_distributed(
    if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")

+    with monkeypatch.context() as monkeypatch_context:
        if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
            # test Ray Compiled Graph
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+            monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+            monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")

        if attention_backend:
-        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
+            monkeypatch_context.setenv(
+                "VLLM_ATTENTION_BACKEND",
+                attention_backend,
+            )

        dtype = "half"
        max_tokens = 5
@ -130,13 +138,16 @@ def test_models_distributed(
        # NOTE: take care of the order. run vLLM first, and then run HF.
        # vLLM needs a fresh new process without cuda initialization.
        # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model,
+        # will hurt multiprocessing backend with fork method
+        # (the default method).
+        with vllm_runner(
+                model,
                dtype=dtype,
                tensor_parallel_size=2,
-                     distributed_executor_backend=distributed_executor_backend
+                distributed_executor_backend=distributed_executor_backend,
        ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)

        with hf_runner(model, dtype=dtype) as hf_model:
            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@ -7,22 +7,39 @@ prefill requests are chunked.

 Run `pytest tests/models/test_chunked_prefill.py`.
 """
-import os
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING

 import pytest

-from tests.kernels.utils import override_backend_env_variable
 from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR

 from ..models.utils import check_logprobs_close, check_outputs_equal
 from ..utils import multi_gpu_test

+if TYPE_CHECKING:
+    from .conftest import HfRunner, VllmRunner
+
 MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-3.2-1B-Instruct",
 ]


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the file.
+    """
+    with monkeypatch.context() as m:
+        m.setenv('VLLM_USE_V1', '0')
+        yield
+
+
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@ -33,8 +50,8 @@ MODELS = [
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
    example_prompts,
    model: str,
    dtype: str,
@ -43,13 +60,14 @@ def test_models(
    enforce_eager: bool,
    tensor_parallel_size: int,
    attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """
    Checks exact match decode between huggingface model and vllm runner with
    chunked prefill.
    """
-    override_backend_env_variable(monkeypatch, attention_backend)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)

        max_num_seqs = chunked_prefill_token_size
        max_num_batched_tokens = chunked_prefill_token_size
@ -66,7 +84,8 @@ def test_models(
                enforce_eager=enforce_eager,
                max_num_seqs=max_num_seqs,
        ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)

        check_outputs_equal(
            outputs_0_lst=hf_outputs,
@ -81,21 +100,21 @@ def test_models(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models_distributed(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
    attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    override_backend_env_variable(monkeypatch, attention_backend)
-
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
        if (model == "meta-llama/Llama-3.2-1B-Instruct"
                and distributed_executor_backend == "ray"):
            # test Ray Compiled Graph
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+            m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+            m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")

        dtype = "half"
        max_tokens = 5
@ -110,7 +129,8 @@ def test_models_distributed(
        # NOTE: take care of the order. run vLLM first, and then run HF.
        # vLLM needs a fresh new process without cuda initialization.
        # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
+        # will hurt multiprocessing backend with
+        # fork method (the default method).

        with vllm_runner(
                model,
@ -121,7 +141,10 @@ def test_models_distributed(
                max_num_batched_tokens=max_num_batched_tokens,
                distributed_executor_backend=distributed_executor_backend,
        ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+            vllm_outputs = vllm_model.generate_greedy(
+                example_prompts,
+                max_tokens,
+            )

        with hf_runner(model, dtype=dtype) as hf_model:
            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
@ -149,7 +172,7 @@ def test_models_distributed(
 # the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True])
 def test_models_with_fp8_kv_cache(
-    vllm_runner,
+    vllm_runner: VllmRunner,
    example_prompts,
    kv_cache_dtype: str,
    model: str,
@ -209,7 +232,7 @@ def test_models_with_fp8_kv_cache(
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"])
 def test_with_prefix_caching(
-    vllm_runner,
+    vllm_runner: VllmRunner,
    max_tokens: int,
    enforce_eager: bool,
    chunk_size: int,
@ -245,8 +268,10 @@ def test_with_prefix_caching(
        ) as vllm_model:
            outputs[enable] = []
            for prompt in full_prompts:
-                outputs[enable] += vllm_model.generate_greedy([prompt],
-                                                              max_tokens)
+                outputs[enable] += vllm_model.generate_greedy(
+                    [prompt],
+                    max_tokens,
+                )

    check_outputs_equal(
        outputs_0_lst=outputs[False],
@ -257,7 +282,7 @@ def test_with_prefix_caching(


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["bfloat16", "half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("enforce_eager", [False])
@ -265,8 +290,8 @@ def test_with_prefix_caching(
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
 def test_models_cpu(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
    example_prompts,
    model: str,
    dtype: str,
@ -274,7 +299,7 @@ def test_models_cpu(
    chunked_prefill_token_size: int,
    enforce_eager: bool,
    attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    test_models(
        hf_runner,
@ -294,11 +319,11 @@ def test_models_cpu(
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32])
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["bfloat16", "half"])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
 def test_with_prefix_caching_cpu(
-    vllm_runner,
+    vllm_runner: VllmRunner,
    max_tokens: int,
    enforce_eager: bool,
    chunk_size: int,
--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
@ -1,8 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0

+import pytest
+
 from ..utils import compare_two_settings


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 def test_cpu_offload():
    compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
                         ["--cpu-offload-gb", "1"])
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@ -123,9 +123,9 @@ def test_cumem_with_cudagraph():
        # sleep mode with pytorch checkpoint
        ("facebook/opt-125m", False),
    ])
-def test_end_to_end(model: str, use_v1: bool):
-    import os
-    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
+def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
        free, total = torch.cuda.mem_get_info()
        used_bytes_baseline = total - free  # in case other process is running
        llm = LLM(model, enable_sleep_mode=True)
@ -158,5 +158,3 @@ def test_end_to_end(model: str, use_v1: bool):

        # cmp output
        assert output[0].outputs[0].text == output2[0].outputs[0].text
-
-    del os.environ["VLLM_USE_V1"]
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@ -21,6 +21,15 @@ MODELS = [
 ]


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT is
+    required by these tests, so use VLLM_USE_V1=0 for all tests in the file.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
@pytest.fixture(scope="module", autouse=True)
 def check_settings():
    assert ENABLE_ARTIFICIAL_PREEMPT is True, (
--- a/tests/compile/backend.py
+++ b/tests/compile/backend.py
@ -6,6 +6,7 @@ from typing import Callable, Union
 from torch import fx

 from vllm.compilation.inductor_pass import InductorPass
+from vllm.config import get_current_vllm_config


 class TestBackend:
@ -17,13 +18,14 @@ class TestBackend:
    Inductor config can be modified directly by editing the inductor_config
    property. This can be helpful for adding passes like the
    'pre_grad_custom_pass' and the 'post_grad_custom_pre_pass'.
+    Inductor config is default-initialized from VllmConfig.CompilationConfig.
    """

    def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph],
                                                             None]]):
        self.custom_passes = list(passes)
-        from torch._inductor import config
-        self.inductor_config = config.shallow_copy_dict()
+        compile_config = get_current_vllm_config().compilation_config
+        self.inductor_config = compile_config.inductor_compile_config
        self.inductor_config['force_disable_caches'] = True
        self.inductor_config['post_grad_custom_post_pass'] = self.post_pass

--- a/tests/compile/conftest.py
+++ b/tests/compile/conftest.py
@ -0,0 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+
+# TEST V1: this should be removed. Right now V1 overrides
+# all the torch compile logic. We should re-enable this
+# as we add torch compile support back to V1.
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations

 import dataclasses
-from typing import Optional

 import pytest

@ -22,8 +22,11 @@ class TestSetting:
    fullgraph: bool


-# representative settings for testing
-test_settings = [
+# we cannot afford testing the full Cartesian product
+# of all models and all levels
+@pytest.mark.parametrize(
+    "test_setting",
+    [
        # basic llama model
        TestSetting(
            model="meta-llama/Llama-3.2-1B-Instruct",
@ -84,13 +87,11 @@ test_settings = [
            method="generate_with_image",
            fullgraph=False,
        ),
-]
-
-
-# we cannot afford testing the full Catesian product
-# of all models and all levels
-@pytest.mark.parametrize("test_setting", test_settings)
-def test_compile_correctness(test_setting: TestSetting):
+    ])
+def test_compile_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    test_setting: TestSetting,
+):
    # this test is run under multiple suits, with different GPUs.
    # make sure we only run the test with correct CUDA devices.
    # don't use "<", as it will duplicate the tests.
@ -103,13 +104,17 @@ def test_compile_correctness(test_setting: TestSetting):
    fullgraph = test_setting.fullgraph
    if cuda_device_count_stateless() != pp_size * tp_size:
        pytest.skip("Not correct CUDA devices for the test.")
-    import os
-    os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
-    final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
-                ["-tp", str(tp_size)]
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+        final_args = [
+            "--enforce-eager", *model_args, "-pp",
+            str(pp_size), "-tp",
+            str(tp_size)
+        ]

        all_args: list[list[str]] = []
-    all_envs: list[Optional[dict[str, str]]] = []
+        all_envs: list[dict[str, str] | None] = []

        for level in [
                CompilationLevel.NO_COMPILATION,
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@ -1,22 +1,115 @@
 # SPDX-License-Identifier: Apache-2.0

-import pytest
+from __future__ import annotations

+from typing import Any
+
+import pytest
+import torch
+
+from tests.quantization.utils import is_quant_method_supported
+from vllm import LLM, SamplingParams
 from vllm.config import CompilationLevel
+from vllm.platforms import current_platform

 from ..utils import fork_new_process_for_each_test
-from .utils import TEST_MODELS, check_full_graph_support


-@pytest.mark.parametrize("model_info", TEST_MODELS)
+@pytest.fixture(params=None, name="model_info")
+def models_list_fixture(request):
+    TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
+        ("facebook/opt-125m", {}),
+        ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
+            "dtype": torch.float16,
+            "quantization": "compressed-tensors"
+        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
+            "dtype": torch.float16,
+            "quantization": "compressed-tensors"
+        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
+            "quantization": "compressed-tensors"
+        }),
+        ("meta-llama/Llama-3.2-1B-Instruct", {}),
+    ]
+
+    if is_quant_method_supported("aqlm"):
+        TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
+            "quantization": "aqlm"
+        }))
+
+    # TODO: figure out why this fails.
+    if False and is_quant_method_supported("gguf"):  # noqa: SIM223
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
+            "quantization": "gguf"
+        }))
+
+    if is_quant_method_supported("gptq"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
+            "quantization": "gptq"
+        }))
+
+    if is_quant_method_supported("gptq_marlin"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
+            "quantization": "gptq_marlin"
+        }))
+
+    if is_quant_method_supported("gptq_marlin_24"):
+        TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
+            "quantization": "gptq_marlin_24"
+        }))
+
+    if is_quant_method_supported("marlin"):
+        TEST_MODELS.append(
+            ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
+                "quantization": "marlin"
+            }))
+
+    if not current_platform.is_rocm() and is_quant_method_supported("awq"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
+            "quantization": "AWQ"
+        }))
+
+    return TEST_MODELS
+
+
@pytest.mark.parametrize(
    "optimization_level",
-    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
+    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
+)
+@pytest.mark.parametrize("model_info", "", indirect=True)
@fork_new_process_for_each_test
-def test_full_graph(model_info, optimization_level):
-    model = model_info[0]
-    model_kwargs = model_info[1]
-    check_full_graph_support(model,
-                             model_kwargs,
-                             optimization_level,
-                             tp_size=1)
+def test_full_graph(
+    monkeypatch: pytest.MonkeyPatch,
+    model_info: tuple[str, dict[str, Any]],
+    optimization_level: int,
+):
+    model, model_kwargs = model_info
+
+    with monkeypatch.context() as m:
+        # make sure these models can be captured in full graph mode
+        m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
+        print(f"MODEL={model}")
+
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        sampling_params = SamplingParams(temperature=0)
+        llm = LLM(
+            model=model,
+            enforce_eager=True,
+            tensor_parallel_size=1,
+            disable_custom_all_reduce=True,
+            compilation_config=optimization_level,
+            **model_kwargs,
+        )
+        outputs = llm.generate(prompts, sampling_params)
+
+        # Print the outputs.
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
@ -1,93 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-
-import torch
-
-from tests.quantization.utils import is_quant_method_supported
-from vllm import LLM, SamplingParams
-from vllm.platforms import current_platform
-
-TEST_MODELS = [
-    ("facebook/opt-125m", {}),
-    ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
-        "dtype": torch.float16,
-        "quantization": "compressed-tensors"
-    }),
-    ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
-        "dtype": torch.float16,
-        "quantization": "compressed-tensors"
-    }),
-    ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
-        "quantization": "compressed-tensors"
-    }),
-    ("meta-llama/Llama-3.2-1B-Instruct", {}),
-]
-
-if is_quant_method_supported("aqlm"):
-    TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
-        "quantization": "aqlm"
-    }))
-
-# TODO: figure out why this fails.
-if False and is_quant_method_supported("gguf"):  # noqa: SIM223
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
-        "quantization": "gguf"
-    }))
-
-if is_quant_method_supported("gptq"):
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
-        "quantization": "gptq"
-    }))
-
-if is_quant_method_supported("gptq_marlin"):
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
-        "quantization": "gptq_marlin"
-    }))
-
-if is_quant_method_supported("gptq_marlin_24"):
-    TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
-        "quantization": "gptq_marlin_24"
-    }))
-
-if is_quant_method_supported("marlin"):
-    TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
-        "quantization": "marlin"
-    }))
-
-if not current_platform.is_rocm() and is_quant_method_supported("awq"):
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
-        "quantization": "AWQ"
-    }))
-
-
-def check_full_graph_support(model,
-                             model_kwargs,
-                             optimization_level,
-                             tp_size=1):
-    # make sure these models can be captured in full graph mode
-    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
-
-    print(f"MODEL={model}")
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0)
-    llm = LLM(model=model,
-              enforce_eager=True,
-              tensor_parallel_size=tp_size,
-              disable_custom_all_reduce=True,
-              compilation_config=optimization_level,
-              **model_kwargs)
-
-    outputs = llm.generate(prompts, sampling_params)
-
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -111,6 +111,26 @@ VIDEO_ASSETS = _VideoAssets()
 """Singleton instance of :class:`_VideoAssets`."""


+@pytest.fixture(scope="function", autouse=True)
+def cleanup_VLLM_USE_V1(monkeypatch):
+    """
+    The V1 oracle sets "VLLM_USE_V1" during loading. This means
+    that each invocation of a test changes the env variable.
+
+    If we touch "VLLM_USE_V1" with monkeypatch, then any changes
+    made during the test run by vLLM will be cleaned up.
+
+    This fixture is used by every test.
+    """
+
+    # If VLLM_USE_V1 is not set, set it and then delete it. This will
+    # cause monkeypatch to clean up VLLM_USE_V1 upon exit
+    # if VLLM modifies the value of envs.VLLM_USE_V1.
+    if "VLLM_USE_V1" not in os.environ:
+        monkeypatch.setenv("VLLM_USE_V1", "")
+        monkeypatch.delenv("VLLM_USE_V1")
+
+
@pytest.fixture(params=[True, False])
 def run_with_both_engines(request, monkeypatch):
    # Automatically runs tests twice, once with V1 and once without
@ -661,6 +681,17 @@ def hf_runner():


 class VllmRunner:
+    """
+    The default values of some arguments have been modified from
+    :class:`~vllm.LLM` as follows:
+    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
+    - `seed`: Set to `0` instead of `None` for test reproducibility.
+    - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
+    - `block_size`: Set to `16` instead of `None` to reduce memory usage.
+    - `enable_chunked_prefill`: Set to `False` instead of `None` for
+      test reproducibility.
+    - `enforce_eager`: Set to `False` instead of `None` to test CUDA graph.
+    """

    def __init__(
        self,
@ -668,6 +699,8 @@ class VllmRunner:
        task: TaskOption = "auto",
        tokenizer_name: Optional[str] = None,
        tokenizer_mode: str = "auto",
+        trust_remote_code: bool = True,
+        seed: Optional[int] = 0,
        # Use smaller max model length, otherwise bigger model cannot run due
        # to kv cache size limit.
        max_model_len: int = 1024,
@ -675,7 +708,7 @@ class VllmRunner:
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16,
-        enable_chunked_prefill: bool = False,
+        enable_chunked_prefill: Optional[bool] = False,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,
        **kwargs,
@ -685,8 +718,9 @@ class VllmRunner:
            task=task,
            tokenizer=tokenizer_name,
            tokenizer_mode=tokenizer_mode,
-            trust_remote_code=True,
+            trust_remote_code=trust_remote_code,
            dtype=dtype,
+            seed=seed,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            disable_log_stats=disable_log_stats,
--- a/tests/core/conftest.py
+++ b/tests/core/conftest.py
@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/engine/output_processor/init.py
+++ b/tests/engine/output_processor/init.py
--- a/tests/detokenizer/conftest.py
+++ b/tests/detokenizer/conftest.py
@ -0,0 +1,10 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test.
+    # This can be promoted up to conftest.py to run for every
+    # test in a package.
+    pass
--- a/tests/detokenizer/test_disable_detokenization.py
+++ b/tests/detokenizer/test_disable_detokenization.py
@ -6,6 +6,7 @@ from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams


+@pytest.mark.skip_v1
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_computed_prefix_blocks(model: str):
    # This test checks if the engine generates completions both with and
--- a/tests/engine/output_processor/test_stop_checker.py
+++ b/tests/engine/output_processor/test_stop_checker.py
--- a/tests/detokenizer/test_stop_reason.py
+++ b/tests/detokenizer/test_stop_reason.py
--- a/tests/detokenizer/test_stop_strings.py
+++ b/tests/detokenizer/test_stop_strings.py
@ -0,0 +1,141 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import pytest
+
+from vllm import LLM, SamplingParams, envs
+
+MODEL = "meta-llama/llama-2-7b-hf"
+MAX_TOKENS = 200
+
+
+def _test_stopping(llm: LLM,
+                   expected_output: str,
+                   expected_reason: Any,
+                   stop: Optional[list[str]] = None,
+                   stop_token_ids: Optional[list[int]] = None,
+                   include_in_output: bool = False) -> None:
+    output = llm.generate(
+        "A story about vLLM:\n",
+        SamplingParams(
+            temperature=0.0,
+            max_tokens=MAX_TOKENS,
+            stop=stop,
+            stop_token_ids=stop_token_ids,
+            include_stop_str_in_output=include_in_output,
+        ))[0].outputs[0]
+
+    assert output is not None
+    assert output.text == expected_output
+    assert output.stop_reason == expected_reason
+
+
+def _set_async_mode(llm, is_async):
+    llm.llm_engine.scheduler[0].use_async_output_proc = is_async
+
+
+def _stop_basic(llm):
+    _test_stopping(llm,
+                   stop=["."],
+                   include_in_output=False,
+                   expected_output="VLLM is a 100% volunteer organization",
+                   expected_reason=".")
+
+    _test_stopping(llm,
+                   stop=["."],
+                   include_in_output=True,
+                   expected_output="VLLM is a 100% volunteer organization.",
+                   expected_reason=".")
+
+
+def _stop_multi_tokens(llm):
+    _test_stopping(
+        llm,
+        stop=["group of peo", "short"],
+        include_in_output=False,
+        expected_output="VLLM is a 100% volunteer organization. We are a ",
+        expected_reason="group of peo")
+
+    _test_stopping(
+        llm,
+        stop=["group of peo", "short"],
+        include_in_output=True,
+        expected_output=
+        "VLLM is a 100% volunteer organization. We are a group of peo",
+        expected_reason="group of peo")
+
+
+def _stop_partial_token(llm):
+    _test_stopping(llm,
+                   stop=["gani"],
+                   include_in_output=False,
+                   expected_output="VLLM is a 100% volunteer or",
+                   expected_reason="gani")
+
+    _test_stopping(llm,
+                   stop=["gani"],
+                   include_in_output=True,
+                   expected_output="VLLM is a 100% volunteer organi",
+                   expected_reason="gani")
+
+
+def _stop_token_id(llm):
+    # token id 13013 => " organization"
+
+    _test_stopping(llm,
+                   stop_token_ids=[13013],
+                   include_in_output=False,
+                   expected_output="VLLM is a 100% volunteer",
+                   expected_reason=13013)
+
+    _test_stopping(llm,
+                   stop_token_ids=[13013],
+                   include_in_output=True,
+                   expected_output="VLLM is a 100% volunteer organization",
+                   expected_reason=13013)
+
+
+@pytest.mark.skip_global_cleanup
+def test_stop_strings():
+    # If V0, must set enforce_eager=False since we use
+    # async output processing below.
+    vllm_model = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
+
+    if envs.VLLM_USE_V1:
+        _stop_basic(vllm_model)
+    else:
+        _set_async_mode(vllm_model, True)
+        _stop_basic(vllm_model)
+
+        _set_async_mode(vllm_model, False)
+        _stop_basic(vllm_model)
+
+    if envs.VLLM_USE_V1:
+        _stop_multi_tokens(vllm_model)
+    else:
+        _set_async_mode(vllm_model, True)
+        _stop_multi_tokens(vllm_model)
+
+        _set_async_mode(vllm_model, False)
+        _stop_multi_tokens(vllm_model)
+
+    if envs.VLLM_USE_V1:
+        _stop_partial_token(vllm_model)
+    else:
+        _set_async_mode(vllm_model, True)
+        _stop_partial_token(vllm_model)
+
+        _set_async_mode(vllm_model, False)
+        _stop_partial_token(vllm_model)
+
+    if envs.VLLM_USE_V1:
+        # FIXME: this does not respect include_in_output=False
+        # _stop_token_id(vllm_model)
+        pass
+    else:
+        _set_async_mode(vllm_model, True)
+        _stop_token_id(vllm_model)
+
+        _set_async_mode(vllm_model, False)
+        _stop_token_id(vllm_model)
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@ -3,7 +3,10 @@

 Run `pytest tests/distributed/test_comm_ops.py`.
 """
-import os
+
+from __future__ import annotations
+
+from typing import Any, Callable

 import pytest
 import ray
@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel


@ray.remote(num_gpus=1, max_calls=1)
-def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
-                           distributed_init_port: str):
+def all_reduce_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,


@ray.remote(num_gpus=1, max_calls=1)
-def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
-                           distributed_init_port: str):
+def all_gather_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,


@ray.remote(num_gpus=1, max_calls=1)
-def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
-                                      distributed_init_port: str):
+def broadcast_tensor_dict_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,


@ray.remote(num_gpus=1, max_calls=1)
-def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
-                                      distributed_init_port: str):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def send_recv_tensor_dict_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,


@ray.remote(num_gpus=1, max_calls=1)
-def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
-                          distributed_init_port: str):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def send_recv_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
    all_reduce_test_worker, all_gather_test_worker,
    broadcast_tensor_dict_test_worker
 ])
-def test_multi_process_tensor_parallel(tp_size, test_target):
-    multi_process_parallel(tp_size, 1, test_target)
+def test_multi_process_tensor_parallel(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    test_target: Callable[..., Any],
+):
+    multi_process_parallel(monkeypatch, tp_size, 1, test_target)


@pytest.mark.skipif(torch.cuda.device_count() < 2,
@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target):
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
    "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
-def test_multi_process_pipeline_parallel(pp_size, test_target):
-    multi_process_parallel(1, pp_size, test_target)
+def test_multi_process_pipeline_parallel(
+    monkeypatch: pytest.MonkeyPatch,
+    pp_size: int,
+    test_target: Callable[..., Any],
+):
+    multi_process_parallel(monkeypatch, 1, pp_size, test_target)


@pytest.mark.skipif(torch.cuda.device_count() < 4,
@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target):
    broadcast_tensor_dict_test_worker
 ])
 def test_multi_process_tensor_parallel_pipeline_parallel(
-        tp_size, pp_size, test_target):
-    multi_process_parallel(tp_size, pp_size, test_target)
+    tp_size: int,
+    pp_size: int,
+    test_target: Callable[..., Any],
+    monkeypatch: pytest.MonkeyPatch,
+):
+    multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0

-import os
 import random

 import pytest
@ -23,8 +22,15 @@ for i, v in enumerate(test_sizes):


@ray.remote(num_gpus=1, max_calls=1)
-def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def graph_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pp_size,
+    rank,
+    distributed_init_port,
+):
+    with monkeypatch.context() as m:
+        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)
        init_test_distributed_environment(tp_size, pp_size, rank,
@ -79,8 +85,15 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):


@ray.remote(num_gpus=1, max_calls=1)
-def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def eager_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pp_size,
+    rank,
+    distributed_init_port,
+):
+    with monkeypatch.context() as m:
+        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)
        init_test_distributed_environment(tp_size, pp_size, rank,
@ -110,8 +123,14 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
-def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
+def test_custom_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pipeline_parallel_size,
+    test_target,
+):
    world_size = tp_size * pipeline_parallel_size
    if world_size > torch.cuda.device_count():
        pytest.skip("Not enough GPUs to run the test.")
-    multi_process_parallel(tp_size, pipeline_parallel_size, test_target)
+    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
+                           test_target)
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@ -24,6 +24,18 @@ logger = init_logger("test_pipeline_parallel")
 VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    For PP, we fall back to V0 by default. This means
+    that the TP baseline runs with V1 while the PP engine
+    runs with V0. This gives divergent results with dummy
+    weights. Once we enable V1 by default for PP, we can
+    remove this.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 class ParallelSetup(NamedTuple):
    tp_size: int
    pp_size: int
@ -214,7 +226,7 @@ MULTIMODAL_MODELS = {
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
    "allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
-    "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(),
+    "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(),
    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
    "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
@ -237,7 +249,7 @@ TEST_MODELS = [
    "BAAI/bge-multilingual-gemma2",
    # [MULTIMODAL GENERATION]
    "OpenGVLab/InternVL2-1B",
-    "microsoft/Phi-3-vision-128k-instruct",
+    "microsoft/Phi-3.5-vision-instruct",
    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
    # [LANGUAGE GENERATION - HYBRID ARCH]
    "ai21labs/Jamba-tiny-dev",
@ -338,6 +350,10 @@ def _compare_tp(
    else:
        pp_env = None

+    tp_env = {
+        "VLLM_USE_V1": vllm_major_version,
+    }
+
    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
@ -362,14 +378,20 @@ def _compare_tp(
    ]

    try:
-        compare_two_settings(model_id, pp_args, tp_args, pp_env, method=method)
+        compare_two_settings(model_id,
+                             pp_args,
+                             tp_args,
+                             pp_env,
+                             tp_env,
+                             method=method)
    except Exception:
-        if pp_env is None:
-            raise
-        else:
-            # Ray Compiled Graph tests are flaky,
+        testing_ray_compiled_graph = pp_env is not None
+        if testing_ray_compiled_graph and vllm_major_version == "0":
+            # Ray Compiled Graph tests are flaky for V0,
            # so we don't want to fail the test
            logger.exception("Ray Compiled Graph tests failed")
+        else:
+            raise


@pytest.mark.parametrize(
--- a/tests/distributed/test_pipeline_partition.py
+++ b/tests/distributed/test_pipeline_partition.py
@ -7,15 +7,17 @@ import pytest
 from vllm.distributed.utils import get_pp_indices


-def test_custom_layer_partition():
+def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
+
+    with monkeypatch.context() as m:

        def _verify(partition_str, num_layers, pp_size, goldens):
            bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
-        os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str
+            m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
            for pp_rank, golden in enumerate(goldens):
                assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
            if bak is not None:
-            os.environ["VLLM_PP_LAYER_PARTITION"] = bak
+                m.setenv("VLLM_PP_LAYER_PARTITION", bak)

        # Even partition
        _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
@ -55,6 +57,10 @@ def test_custom_layer_partition():
        (5, 3, 1, (2, 4)),
        (5, 3, 2, (4, 5)),
    ])
-def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int,
-                               pp_rank: int, indices: tuple[int, int]):
+def test_uneven_auto_partition(
+    num_hidden_layers: int,
+    pp_size: int,
+    pp_rank: int,
+    indices: tuple[int, int],
+):
    assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)
--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py
@ -1,11 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations

-import os
+from typing import TYPE_CHECKING

 import pytest

 from ..utils import compare_two_settings, fork_new_process_for_each_test

+if TYPE_CHECKING:
+    from typing_extensions import LiteralString
+

@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
    (2, "JackFram/llama-160m"),
@ -15,7 +19,13 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test
    "FLASHINFER",
 ])
@fork_new_process_for_each_test
-def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
+def test_pp_cudagraph(
+    monkeypatch: pytest.MonkeyPatch,
+    PP_SIZE: int,
+    MODEL_NAME: str,
+    ATTN_BACKEND: LiteralString,
+):
+    with monkeypatch.context() as m:
        cudagraph_args = [
            # use half precision for speed and memory savings in CI environment
            "--dtype",
@ -25,7 +35,7 @@ def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
            "--distributed-executor-backend",
            "mp",
        ]
-    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
+        m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)

        eager_args = cudagraph_args + ["--enforce-eager"]

--- a/tests/encoder_decoder/test_e2e_correctness.py
+++ b/tests/encoder_decoder/test_e2e_correctness.py
@ -21,6 +21,15 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [
 ]


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
    decoder_prompt_type: DecoderPromptType,
--- a/tests/engine/conftest.py
+++ b/tests/engine/conftest.py
@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/engine/test_multi_step_output_processor.py
+++ b/tests/engine/test_multi_step_output_processor.py
@ -15,7 +15,7 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
 from vllm.transformers_utils.detokenizer import Detokenizer
 from vllm.utils import Counter

-from ...core.utils import create_seq_group
+from ..core.utils import create_seq_group


@pytest.mark.parametrize("seq_output_len", [128])
--- a/tests/engine/test_stop_strings.py
+++ b/tests/engine/test_stop_strings.py
@ -1,165 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from typing import Any, Optional
-
-import pytest
-
-from vllm import CompletionOutput, LLMEngine, SamplingParams
-
-MODEL = "meta-llama/llama-2-7b-hf"
-MAX_TOKENS = 200
-
-IS_ASYNC = False
-
-
-@pytest.fixture(scope="session")
-def vllm_model(vllm_runner):
-    with vllm_runner(MODEL) as vllm_model:
-        yield vllm_model
-
-
-def _test_stopping(llm_engine: LLMEngine,
-                   expected_output: str,
-                   expected_reason: Any,
-                   stop: Optional[list[str]] = None,
-                   stop_token_ids: Optional[list[int]] = None,
-                   include_in_output: bool = False,
-                   use_async_output_proc: bool = False) -> None:
-    llm_engine.add_request(
-        "id", "A story about vLLM:\n",
-        SamplingParams(
-            temperature=0.0,
-            max_tokens=MAX_TOKENS,
-            stop=stop,
-            stop_token_ids=stop_token_ids,
-            include_stop_str_in_output=include_in_output,
-        ), None)
-
-    output: Optional[CompletionOutput] = None
-    output_text = ""
-    stop_reason = None
-
-    if use_async_output_proc:
-        llm_engine.step()
-
-    while llm_engine.has_unfinished_requests():
-        (request_output, ) = llm_engine.step()
-        (output, ) = request_output.outputs
-
-        # Ensure we don't backtrack
-        assert output.text.startswith(output_text)
-        output_text = output.text
-        stop_reason = output.stop_reason
-
-    assert output is not None
-    assert output_text == expected_output
-    assert stop_reason == expected_reason
-
-
-def _set_async_mode(llm_engine, is_async):
-    llm_engine.scheduler[0].use_async_output_proc = is_async
-
-
-def _stop_basic(llm_engine, is_async):
-    _test_stopping(llm_engine,
-                   stop=["."],
-                   include_in_output=False,
-                   expected_output="VLLM is a 100% volunteer organization",
-                   expected_reason=".",
-                   use_async_output_proc=is_async)
-
-    _test_stopping(llm_engine,
-                   stop=["."],
-                   include_in_output=True,
-                   expected_output="VLLM is a 100% volunteer organization.",
-                   expected_reason=".",
-                   use_async_output_proc=is_async)
-
-
-def _stop_multi_tokens(llm_engine, is_async):
-    _test_stopping(
-        llm_engine,
-        stop=["group of peo", "short"],
-        include_in_output=False,
-        expected_output="VLLM is a 100% volunteer organization. We are a ",
-        expected_reason="group of peo",
-        use_async_output_proc=is_async)
-
-    _test_stopping(
-        llm_engine,
-        stop=["group of peo", "short"],
-        include_in_output=True,
-        expected_output=
-        "VLLM is a 100% volunteer organization. We are a group of peo",
-        expected_reason="group of peo",
-        use_async_output_proc=is_async)
-
-
-def _stop_partial_token(llm_engine, is_async):
-    _test_stopping(llm_engine,
-                   stop=["gani"],
-                   include_in_output=False,
-                   expected_output="VLLM is a 100% volunteer or",
-                   expected_reason="gani",
-                   use_async_output_proc=is_async)
-
-    _test_stopping(llm_engine,
-                   stop=["gani"],
-                   include_in_output=True,
-                   expected_output="VLLM is a 100% volunteer organi",
-                   expected_reason="gani",
-                   use_async_output_proc=is_async)
-
-
-def _stop_token_id(llm_engine, is_async):
-    # token id 13013 => " organization"
-
-    _test_stopping(llm_engine,
-                   stop_token_ids=[13013],
-                   include_in_output=False,
-                   expected_output="VLLM is a 100% volunteer",
-                   expected_reason=13013,
-                   use_async_output_proc=is_async)
-
-    _test_stopping(llm_engine,
-                   stop_token_ids=[13013],
-                   include_in_output=True,
-                   expected_output="VLLM is a 100% volunteer organization",
-                   expected_reason=13013,
-                   use_async_output_proc=is_async)
-
-
-@pytest.mark.skip_global_cleanup
-def test_stop_basic(vllm_model):
-    _set_async_mode(vllm_model.model.llm_engine, True)
-    _stop_basic(vllm_model.model.llm_engine, is_async=True)
-
-    _set_async_mode(vllm_model.model.llm_engine, False)
-    _stop_basic(vllm_model.model.llm_engine, is_async=False)
-
-
-@pytest.mark.skip_global_cleanup
-def test_stop_multi_tokens(vllm_model):
-    _set_async_mode(vllm_model.model.llm_engine, True)
-    _stop_multi_tokens(vllm_model.model.llm_engine, is_async=True)
-
-    _set_async_mode(vllm_model.model.llm_engine, False)
-    _stop_multi_tokens(vllm_model.model.llm_engine, is_async=False)
-
-
-@pytest.mark.skip_global_cleanup
-def test_stop_partial_token(vllm_model):
-    _set_async_mode(vllm_model.model.llm_engine, True)
-    _stop_partial_token(vllm_model.model.llm_engine, is_async=True)
-
-    _set_async_mode(vllm_model.model.llm_engine, False)
-    _stop_partial_token(vllm_model.model.llm_engine, is_async=False)
-
-
-@pytest.mark.skip_global_cleanup
-def test_stop_token_id(vllm_model):
-    _set_async_mode(vllm_model.model.llm_engine, True)
-    _stop_token_id(vllm_model.model.llm_engine, is_async=True)
-
-    _set_async_mode(vllm_model.model.llm_engine, False)
-    _stop_token_id(vllm_model.model.llm_engine, is_async=False)
--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@ -49,7 +49,7 @@ TPU_TP_TEST_STR = ""  #"tensor_parallel_size=4"
@pytest.mark.skipif(not current_platform.is_cuda()
                    and not current_platform.is_tpu(),
                    reason="V1 is currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch):
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
    """Run with the V1 Engine."""

    with monkeypatch.context() as m:
@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
        run_test(more_args)


-def test_lm_eval_accuracy_v0_engine(monkeypatch):
+def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
    """Run with the V0 Engine."""

    with monkeypatch.context() as m:
--- a/tests/entrypoints/llm/test_lazy_outlines.py
+++ b/tests/entrypoints/llm/test_lazy_outlines.py
@ -3,12 +3,21 @@
 import sys
 from contextlib import nullcontext

+import pytest
 from vllm_test_utils import BlameResult, blame

 from vllm import LLM, SamplingParams
 from vllm.distributed import cleanup_dist_env_and_memory


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    V1 only supports xgrammar, so these tests are only relevant to V0.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 def run_normal_opt125m():
    prompts = [
        "Hello, my name is",
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@ -53,21 +53,29 @@ def cache_models():

@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
-def test_offline_mode(monkeypatch):
+def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
    # Set HF to offline mode and ensure we can still construct an LLM
+    with monkeypatch.context() as m:
        try:
-        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
-        monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1")
+            m.setenv("HF_HUB_OFFLINE", "1")
+            m.setenv("VLLM_NO_USAGE_STATS", "1")

            def disable_connect(*args, **kwargs):
                raise RuntimeError("No http calls allowed")

-        monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect",
-                            disable_connect)
-        monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect",
-                            disable_connect)
+            m.setattr(
+                urllib3.connection.HTTPConnection,
+                "connect",
+                disable_connect,
+            )
+            m.setattr(
+                urllib3.connection.HTTPSConnection,
+                "connect",
+                disable_connect,
+            )

-        # Need to re-import huggingface_hub and friends to setup offline mode
+            # Need to re-import huggingface_hub
+            # and friends to set up offline mode
            _re_import_modules()
            # Cached model files should be used in offline mode
            for model_config in MODEL_CONFIGS:
@ -75,10 +83,7 @@ def test_offline_mode(monkeypatch):
        finally:
            # Reset the environment after the test
            # NB: Assuming tests are run in online mode
-        monkeypatch.delenv("HF_HUB_OFFLINE")
-        monkeypatch.delenv("VLLM_NO_USAGE_STATS")
            _re_import_modules()
-        pass


 def _re_import_modules():
--- a/tests/entrypoints/openai/correctness/test_lmeval.py
+++ b/tests/entrypoints/openai/correctness/test_lmeval.py
@ -70,7 +70,7 @@ def run_test(more_args):
@pytest.mark.skipif(not current_platform.is_cuda()
                    and not current_platform.is_tpu(),
                    reason="V1 currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch):
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
    """Run with the V1 Engine."""

    with monkeypatch.context() as m:
@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):


@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
-def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
+def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
+                                    more_args):
    """Run with the V0 Engine."""

    with monkeypatch.context() as m:
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@ -171,3 +171,51 @@ async def test_request_wrong_content_type(server: RemoteOpenAIServer):
            extra_headers={
                "Content-Type": "application/x-www-form-urlencoded"
            })
+
+
+@pytest.mark.parametrize(
+    "server_args",
+    [
+        pytest.param(["--enable-server-load-tracking"],
+                     id="enable-server-load-tracking")
+    ],
+    indirect=True,
+)
+@pytest.mark.asyncio
+async def test_server_load(server: RemoteOpenAIServer):
+    # Check initial server load
+    response = requests.get(server.url_for("load"))
+    assert response.status_code == HTTPStatus.OK
+    assert response.json().get("server_load") == 0
+
+    def make_long_completion_request():
+        return requests.post(
+            server.url_for("v1/completions"),
+            headers={"Content-Type": "application/json"},
+            json={
+                "prompt": "Give me a long story",
+                "max_tokens": 1000,
+                "temperature": 0,
+            },
+        )
+
+    # Start the completion request in a background thread.
+    completion_future = asyncio.create_task(
+        asyncio.to_thread(make_long_completion_request))
+
+    # Give a short delay to ensure the request has started.
+    await asyncio.sleep(0.1)
+
+    # Check server load while the completion request is running.
+    response = requests.get(server.url_for("load"))
+    assert response.status_code == HTTPStatus.OK
+    assert response.json().get("server_load") == 1
+
+    # Wait for the completion request to finish.
+    await completion_future
+    await asyncio.sleep(0.1)
+
+    # Check server load after the completion request has finished.
+    response = requests.get(server.url_for("load"))
+    assert response.status_code == HTTPStatus.OK
+    assert response.json().get("server_load") == 0
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Lily Liu	8d6cf89526	[V1] [Spec Decode] Support random sampling for spec decode (#13933 ) Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>	2025-03-16 22:00:20 -07:00
Simon Mo	583a9778e0	[Benchmark] Do not save detailed info to json by default (#14879 ) Signed-off-by: simon-mo <simon.mo@hey.com>	2025-03-16 21:48:11 -07:00
Sibi	a73e183e36	[Misc] Replace os environ to monkeypatch in test suite (#14516 ) Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com> Signed-off-by: Aaron Pham <contact@aarnphm.xyz> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Aaron Pham <contact@aarnphm.xyz>	2025-03-16 20:35:57 -07:00
Lucas Wilkinson	1e799b7ec1	[BugFix] Fix MLA + V1 + TP==1 causing reinitialization of cuda context (#14910 )	2025-03-17 03:35:37 +00:00
Woosuk Kwon	7f6c5ee06c	[V1][Minor] Add __repr__ to ConstantList (#14907 ) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>	2025-03-16 20:20:15 -07:00
Woosuk Kwon	faa0275730	[V1] Optimize the overhead of rewinding (#14905 ) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>	2025-03-16 20:19:30 -07:00
Cyrus Leung	8a5a9b70d7	[CI/Build] Update defaults for test reproducibility (#14893 ) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>	2025-03-17 10:38:15 +08:00
Robert Shaw	bb3aeddfaf	[CI] Nightly Tests (#14898 ) Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com> Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>	2025-03-17 02:06:43 +00:00
Robert Shaw	aecc780dba	[V1] Enable Entrypoints Tests (#14903 )	2025-03-16 17:56:16 -07:00
Vadim Gimpelson	90df7f23aa	[Doc] Add guidance for using `ccache` with `pip install -e .` in doc (#14901 )	2025-03-16 23:10:04 +00:00
Rui Qiao	b9b5bdfc7d	[Misc] Catching Ray Compiled Graph PP test failures for V1 (#14847 )	2025-03-16 15:46:42 -07:00
Woosuk Kwon	31060b2757	[V1][BugFix] Detect interleaved sliding window attention (#14896 ) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>	2025-03-16 14:53:53 -07:00
Nick Hill	fc1f67715d	[BugFix][V1] Fix overhead related to bad_words sampling when not in use (#14894 ) Signed-off-by: Nick Hill <nhill@redhat.com>	2025-03-16 14:53:34 -07:00
Cyrus Leung	f6137adbcb	Revert "[Bugfix] Limit profiling run sequence length by max_model_len (#14785 )" (#14892 ) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>	2025-03-16 09:13:46 -07:00
Cyrus Leung	e53b1350f2	[Bugfix] Explicitly disable Phi-4-multimodal in V1 (#14889 ) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>	2025-03-16 09:05:40 -07:00
Kyle Sayers	d30aa7e9e6	[Bugfix] Limit profiling run sequence length by max_model_len (#14785 ) Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>	2025-03-16 07:44:19 -07:00
Lily Liu	d1ad2a57af	[V1] [Spec Decode] Fix ngram tests (#14878 )	2025-03-16 00:29:22 -07:00
Nick Hill	b82662d952	[BugFix] Fix torch distributed stateless PG backend init (#14870 ) Signed-off-by: Nick Hill <nhill@redhat.com>	2025-03-15 20:26:19 -07:00
Simon Mo	71c1e07107	[Kernel] Add more tuned configs (#14877 ) Signed-off-by: simon-mo <simon.mo@hey.com>	2025-03-15 20:25:03 -07:00
Roger Wang	b30c75dda4	[V1] Remove V0 fallback for mistral-tokenizer (#14873 ) Signed-off-by: Roger Wang <ywang@roblox.com>	2025-03-15 20:21:11 -07:00
Isotr0py	def232e122	[VLM] Clean up Phi-4-MM ViT implementation (#14812 ) Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>	2025-03-15 18:53:52 -07:00
Roger Wang	3453b964a3	[Misc][Doc] Minor benchmark README update (#14874 ) Signed-off-by: Roger Wang <ywang@roblox.com>	2025-03-16 09:46:17 +08:00
Rémi Delacourt	61c6a5a796	[VLM] Merged multi-modal processor for Pixtral (#12211 ) Signed-off-by: remi <remi@mistral.ai> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>	2025-03-15 06:28:27 -07:00
Jun Duan	74bc397b0a	[Core] Expose API endpoint `/is_sleeping` (#14312 ) Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>	2025-03-15 06:28:14 -07:00
Kunshang Ji	f58aea002c	[CI][Intel GPU] refine intel GPU ci docker build (#14860 ) Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>	2025-03-15 11:58:53 +00:00
Cyrus Leung	3556a41434	[VLM] Limit multimodal input cache by memory (#14805 ) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>	2025-03-15 02:52:05 -07:00
Bryan Lu	9ed6ee92d6	[Bugfix] EAGLE output norm bug (#14464 ) Signed-off-by: Bryan Lu <yuzhelu@amazon.com>	2025-03-15 06:50:33 +00:00
Russell Bryant	ee3778d5fc	[Build/CI] Upgrade jinja2 to get 3 moderate CVE fixes (#14839 ) Signed-off-by: Russell Bryant <rbryant@redhat.com>	2025-03-15 05:38:19 +00:00
Jennifer Zhao	aaacf17324	[Doc] V1 user guide (#13991 ) Signed-off-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com> Signed-off-by: Roger Wang <ywang@roblox.com> Signed-off-by: Jennifer Zhao <ai.jenniferzhao@gmail.com> Co-authored-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com> Co-authored-by: Jennifer Zhao <JenZhao@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> Co-authored-by: Roger Wang <ywang@roblox.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>	2025-03-14 22:17:59 -07:00
Aaron Pham	4c7629cae9	[V1][Structured Output] calculate vocab_size eagerly (#14851 ) Signed-off-by: Aaron Pham <contact@aarnphm.xyz>	2025-03-14 22:09:51 -07:00
Jee Jee Li	e0fdfa1608	[CI/Build] Delete LoRA bias test (#14849 ) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>	2025-03-14 22:09:25 -07:00
Lucas Wilkinson	5952d8ab61	[Attention] Get rid of mla cache alignment (#14842 ) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>	2025-03-15 05:08:25 +00:00
Li, Jiang	a2ae496589	[CPU] Support FP8 KV cache (#14741 ) Signed-off-by: jiang1.li <jiang1.li@intel.com>	2025-03-14 22:07:36 -07:00
Simon Mo	877e352262	[Docs] Add new East Coast vLLM Meetup slides to README and meetups.md (#14852 )	2025-03-14 22:06:38 -07:00
Robert Shaw	d4d93db2c5	[V1] V1 Enablement Oracle (#13726 ) Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Michael Goin <michael@neuralmagic.com>	2025-03-14 22:02:20 -07:00
Lu Fang	8c0d15d5c5	[Misc][Easy] Annotate unused vars in the csrc files (#14798 ) Signed-off-by: Lu Fang <lufang@fb.com>	2025-03-15 12:40:09 +08:00
Isotr0py	97ac781c62	[Misc] Remove misleading message in gemma2 and gemma3 (#14850 ) Signed-off-by: Isotr0py <2037008807@qq.com>	2025-03-14 21:35:12 -07:00
Russell Bryant	776dcec8fe	Disable outlines cache by default (#14837 )	2025-03-15 03:57:55 +00:00
Tyler Michael Smith	ccf02fcbae	Revert "[Model] Mamba2 Prefill Performance Tweaks: Fixing Flurry of U… (#14848 )	2025-03-14 20:45:42 -07:00
DefTruth	acaea3bb07	[Bugfix][V1] Fix flashinfer sampling (#14815 )	2025-03-14 20:42:38 -07:00
Liangfu Chen	9f37422779	[Neuron][CI] update docker run command (#14829 ) Signed-off-by: Liangfu Chen <liangfc@amazon.com>	2025-03-14 18:51:35 -07:00
yarongmu-google	dd344e0342	[Bugfix] Fix torch_xla in V0 which can't handle None seed introduced … (#14844 ) Signed-off-by: Yarong Mu <ymu@google.com>	2025-03-15 00:41:15 +00:00
Yuan Tang	54a8804455	[Doc] More neutral K8s deployment guide (#14084 ) Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>	2025-03-14 16:12:36 -07:00
Russell Bryant	bbd94a19fc	[Build/CI] Upgrade aiohttp to include CVE fix (#14840 ) Signed-off-by: Russell Bryant <rbryant@redhat.com>	2025-03-14 23:11:28 +00:00
Russell Bryant	233ffce1eb	[Build/CI] Move ninja to common deps (#14835 ) Signed-off-by: Russell Bryant <rbryant@redhat.com>	2025-03-14 21:25:28 +00:00
Richard Liu	40677783aa	[CI] Add TPU v1 test (#14834 ) Signed-off-by: Richard Liu <ricliu@google.com>	2025-03-14 17:13:30 -04:00
Michael Goin	14f301b541	Update to torch==2.6.0 (#12721 ) Signed-off-by: mgoin <michael@neuralmagic.com> Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: luka <luka@neuralmagic.com> Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>	2025-03-14 16:58:30 -04:00
Russell Bryant	46f98893dd	[V1] Fix model parameterization for structured output tests (#14833 ) Signed-off-by: Russell Bryant <rbryant@redhat.com>	2025-03-14 20:55:18 +00:00
Chih-Chieh Yang	fe66b34728	[Model] Mamba2 Prefill Performance Tweaks: Fixing Flurry of Unnecessary Memory Copies (#14778 ) Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>	2025-03-14 16:36:18 -04:00
Alexei-V-Ivanov-AMD	270a5da495	Re-enable the AMD Entrypoints Test (#14711 ) Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>	2025-03-14 12:18:13 -07:00
Kevin H. Luu	7097b4cc1c	[release] Remove log cleanup commands from TPU job (#14838 )	2025-03-14 11:59:52 -07:00
Yajie Wang	977a16772c	[Bugfix][Kernel]: Fix AllSpark kernel compilation errors and enable for CUDA < 12.0 (#14430 ) Signed-off-by: wyj371990 <wyj371990@alibaba-inc.com>	2025-03-14 09:55:14 -07:00
daniel-salib	73deea2fdb	[Frontend] track server_load (#13950 )	2025-03-14 09:53:17 -07:00
Mark McLoughlin	9d2b4a70f4	[V1][Metrics] Updated list of deprecated metrics in v0.8 (#14695 ) Signed-off-by: Mark McLoughlin <markmc@redhat.com>	2025-03-15 00:45:25 +08:00
Russell Bryant	0b0d6421b2	[Frontend] Fix log message to use http vs https (#14774 ) Signed-off-by: Russell Bryant <rbryant@redhat.com>	2025-03-14 09:21:09 -07:00
Russell Bryant	1140991a7b	[V1] Fix vocab size calculation for structured output (#14826 ) Signed-off-by: Russell Bryant <rbryant@redhat.com>	2025-03-14 09:18:38 -07:00
Cyrus Leung	613c5bb945	[Bugfix] Fix Aria test loading (#14823 ) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>	2025-03-14 09:11:23 -07:00
Guillaume Calmettes	fd8e055ffb	[BugFix]: properly catch templating error when preprocess input (#13976 ) Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>	2025-03-14 05:58:34 -07:00
Cyrus Leung	ab93f1360f	[VLM] Various cleanup and fixes (#14806 ) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>	2025-03-14 05:58:19 -07:00
DefTruth	40253bab44	[Bugfix][W8A8] fixed cutlass block fp8 binding (#14796 )	2025-03-14 03:32:42 -07:00
Woosuk Kwon	c77620d22d	[V1][Minor] Minor code cleanup for scheduling metrics (#14800 ) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>	2025-03-14 08:21:28 +00:00
Jee Jee Li	989ecd2007	[Misc] Gemma3ForConditionalGeneration supports LoRA (#14797 ) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>	2025-03-14 01:07:30 -07:00
WeiCheng	54cc46f3eb	[Bugfix] Fix small typo in the example of Streaming delimiter (#14793 )	2025-03-14 08:05:17 +00:00
Cyrus Leung	601bd3268e	[Misc] Clean up type annotation for `SupportsMultiModal` (#14794 ) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>	2025-03-14 00:59:56 -07:00
Li Wang	09269b3127	[BugFix]Fix performance serving benchmark when enable profiling (#14737 ) Signed-off-by: wangli <wangli858794774@gmail.com>	2025-03-14 07:02:05 +00:00
Thien Tran	27b50f1fe6	[Bugfix][Kernel][CPU] Fix num_tokens in CPU rotary embedding kernel (#14667 ) Signed-off-by: Thien Tran <gau.nernst@yahoo.com.sg>	2025-03-13 23:47:49 -07:00
Lucas Wilkinson	9532c49836	[Attention] MLA get rid of materialization (#14770 ) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>	2025-03-13 23:39:02 -07:00
Roger Wang	0c2af17c76	[CI] Fix missing example model id in processor test (#14787 ) Signed-off-by: Roger Wang <ywang@roblox.com>	2025-03-14 13:52:15 +08:00
Jennifer Zhao	a6e0d096dd	[Feature] Add visionarena offline support for benchmark_throughput (#14654 ) Signed-off-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com> Signed-off-by: Jennifer Zhao <ai.jenniferzhao@gmail.com> Co-authored-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com> Co-authored-by: Jennifer Zhao <JenZhao@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>	2025-03-14 04:07:54 +00:00
Liangfu Chen	d3d4956261	[Neuron] flatten test parameterization for neuron attention kernels (#14712 )	2025-03-13 20:46:56 -07:00
Nick Hill	4059adc31b	[Misc][Minor] Simplify `SamplingParams.__post_init__()` (#14772 ) Signed-off-by: Nick Hill <nhill@redhat.com>	2025-03-14 11:44:20 +08:00
Kevin H. Luu	f1f632d9ec	[ci] Reduce number of tests in fastcheck (#14782 )	2025-03-13 20:43:45 -07:00
Thien Tran	95d680b862	[Bugfix][IPEX] Add `VLLM_CPU_MOE_PREPACK` to allow disabling MoE prepack when CPU does not support it (#14681 ) Signed-off-by: Thien Tran <gau.nernst@yahoo.com.sg>	2025-03-13 20:43:18 -07:00
Thomas Parnell	fb4c7f8ef0	[Kernel] [V1] Further optimizations to ROCm (Triton) Backend to better handle GQA. (#14431 ) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Jan van Lunteren <jvl@zurich.ibm.com> Co-authored-by: Burkhard Ringlein <ngl@zurich.ibm.com> Co-authored-by: Chih-Chieh Yang <chih.chieh.yang@ibm.com>	2025-03-13 20:42:27 -07:00
Varun Sundar Rabindranath	0b1cfa6180	[Kernel] LoRA - Enable CUDAGraphs for V1 (#14626 ) Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com> Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>	2025-03-13 20:42:04 -07:00
Woosuk Kwon	32ef4983cd	[V1] Temporarily disable FlashInfer Rejection Sampler (#14788 ) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>	2025-03-13 20:40:35 -07:00
Roger Wang	ad19c8a003	[V1] Move OOM check into sampler run (#14728 ) Signed-off-by: Roger Wang <ywang@roblox.com> Co-authored-by: Simon Mo <simon.mo@hey.com>	2025-03-13 20:40:23 -07:00
Jeff Daily	2a602b055a	forward fix PR 14245, restore build on ROCm 6.2 (#14709 ) Signed-off-by: Jeff Daily <jeff.daily@amd.com>	2025-03-13 20:40:15 -07:00
Alexander Matveev	7888e1d0a3	[V1] TPU - Enable prefix caching by default (#14773 )	2025-03-13 20:40:05 -07:00
Chen Zhang	60c872d4b6	[Doc] Fix small typo in Transformers fallback (#14791 ) Signed-off-by: Chen Zhang <zhangch99@outlook.com>	2025-03-13 20:33:12 -07:00
yasu52	3fb17d26c8	[Doc] Fix typo in documentation (#14783 ) Signed-off-by: yasu52 <tsuguro4649@gmail.com>	2025-03-13 20:33:09 -07:00
Lucas Wilkinson	d47807ba08	[Attention] Remove slow setattr in MLA (#14769 ) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>	2025-03-13 21:31:14 +00:00
afeldman-nm	02fcaa3d0a	[V1] Detokenizer: Respect Stop Tokens + not include_stop_str_in_output (#14624 ) Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com>	2025-03-13 19:07:34 +00:00
Aaron Pham	8a4a2efc6f	[V1][Core] using cached vocab_size for Structured Outputs (#14630 ) Signed-off-by: Aaron Pham <contact@aarnphm.xyz>	2025-03-13 11:39:28 -07:00