Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-21 15:43:52 +08:00
Compare commits: v0.6.6.pos ... correct-do (115 commits)

Commits (SHA1):
c1d1875ba3 973f5dc581 c994223d56 869579a702 c0efe92d8b
d9fa1c05ad 2de197bdd4 869e829b85 8f37be38eb 8082ad7950
1e4ce295ae ce1917fcf2 e512f76a89 898cdf033e 0f3f3c86ec
b278557935 8ceffbf315 d93d2d74fd d0169e1b0f 08fb75c72e
91b361ae89 e20c92bb61 32c9eff2ff 4ca5d40adc 9279b9f83d
ee77fdb5de 996357e480 2a622d704a 9c749713f6 022c5c6944
f8fcca100b 06bfb51963 408e560015 402d378360 9e764e7b10
33fc1e2e86 eba17173d3 635b897246 4068f4b5b5 47831430cc
65c08928c2 ba214dffbe eed11ebee9 300acb8347 d91457d529
fbf2564554 d1d49397e7 9c93636d84 e5d7ed0c53 ad0d567e1c
bf0d97d786 a655eb3025 1543914c04 61fed92c7e 80c751e7f6
e1a5c2f0a1 fd3a62a122 07064cb1d4 2f1e8e8f54 68d37809b9
5dba257506 187e32997c b55ed6ef8a 2f385183f3 84c35c374a
8c38ee7007 b6087a6bee 23c1b10a4c a115ac46b5 73001445fb
6d70198b17 f962f426bc 11d8a091c6 365801fedd 4db72e57f6
0c6f998554 e7c7c5e822 8c3230d8c1 2c5718809b 82c49d3260
74fa1d123c a2a40bcd0d ccb1aabcca 36e7670045 5886aa496e
8d9b6721e7 b12e87f942 5dbf854553 970d6d0776 628ec6c17b
3682e33f9f 0aa38d16f5 faef77c0d6 dba4d9dec6 32b4c63f02
4fb8e329fd 328841d002 d427e5cfda 42bb201fd6 59d6bb4c86
b7dcc003dc d34be24bb1 b5cbe8eeb3 df04dffade a60731247f
ac79799403 dde1fa18c9 0240402c46 55509c2114 101418096f
5ce4627a7e 7af553ea30 2c9b8ea2b0 d003f3ea39 6c6f7fe8a8
@@ -1,5 +1,6 @@
steps:
- label: "Wait for container to be ready"
key: wait-for-container-image
agents:
queue: A100
plugins:
@@ -10,12 +11,11 @@ steps:
command:
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh

- wait

- label: "A100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: A100
depends_on: wait-for-container-image
plugins:
- kubernetes:
podSpec:
@@ -49,6 +49,7 @@ steps:
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H200
depends_on: wait-for-container-image
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@@ -73,7 +74,7 @@ steps:
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H100
depends_on: block-h100
depends_on: wait-for-container-image
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT

@@ -9,31 +9,31 @@ CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}

# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .

# Setup cleanup
remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
remove_docker_container() { docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2

function cpu_tests() {
set -e
export NUMA_NODE=$2

# offline inference
docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
set -e
python3 examples/offline_inference.py"

# Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pip install pytest pytest-asyncio \
decord einops librosa peft Pillow sentence-transformers soundfile \
@@ -46,26 +46,26 @@ function cpu_tests() {
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

# Run AWQ test
docker exec cpu-test-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_ipex_quant.py"

# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py"

# online inference
docker exec cpu-test-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=$1

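The point of threading $BUILDKITE_BUILD_NUMBER into every image tag, container name, and docker exec target above is to keep concurrent CI builds on the same host from clobbering each other's containers. A minimal Python sketch of the resulting naming scheme (the build number and NUMA node values here are hypothetical, chosen only for illustration):

# Illustration only: how the container names are composed after this change.
# In CI the concrete values come from $BUILDKITE_BUILD_NUMBER and $NUMA_NODE.
build_number = "1234"
numa_node = "1"

names = [
    f"cpu-test-{build_number}-{numa_node}",        # AVX512 container
    f"cpu-test-{build_number}-avx2-{numa_node}",   # AVX2-only container
]
print(names)  # ['cpu-test-1234-1', 'cpu-test-1234-avx2-1']
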
@@ -3,6 +3,18 @@
# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
set -e
set -v

image_name="neuron/vllm-ci"
container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"

NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
@@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
last_build=$(cat /tmp/neuron-docker-build-timestamp)
current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then
docker image prune -f
docker system prune -f
rm -rf "${HF_MOUNT:?}/*"
rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
echo "$current_time" > /tmp/neuron-docker-build-timestamp
fi
else
date "+%s" > /tmp/neuron-docker-build-timestamp
fi

docker build -t neuron -f Dockerfile.neuron .
docker build -t "${image_name}" -f Dockerfile.neuron .

# Setup cleanup
remove_docker_container() { docker rm -f neuron || true; }
remove_docker_container() {
docker image rm -f "${image_name}" || true;
}
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &

# Wait for the server to start
wait_for_server_to_start() {
timeout=300
counter=0

while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
echo "Timeout after $timeout seconds"
break
fi
done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'
docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"

@@ -106,14 +106,12 @@ steps:
source_file_dependencies:
- vllm/
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

@@ -244,7 +242,7 @@ steps:
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
parallelism: 4

- label: "PyTorch Fullgraph Smoke Test" # 9min
@@ -333,8 +331,6 @@ steps:
- vllm/
- tests/models
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_initialization.py

@@ -360,23 +356,25 @@ steps:
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/language -m 'not core_model'

- label: Multi-Modal Models Test (Standard) # 28min
- label: Multi-Modal Models Test (Standard) # 40min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/audio_language
- tests/models/encoder_decoder/vision_language
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model

- label: Multi-Modal Models Test (Extended) 1 # 1h16m
- label: Multi-Modal Models Test (Extended) 1 # 48m
optional: true
source_file_dependencies:
- vllm/
@@ -469,11 +467,28 @@ steps:
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py

- label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
fast_check: true
source_file_dependencies:
- vllm/plugins/
- tests/plugins/
commands:
# begin platform plugin tests, all the code in-between runs on dummy platform
- pip install -e ./plugins/vllm_add_dummy_platform
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
# other tests continue here:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process

- label: Multi-step Tests (4 GPUs) # 36min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
@@ -520,6 +535,7 @@ steps:
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_minicpmv_tp.py


- label: Weight Loading Multiple GPU Test # 33min

@@ -9,7 +9,7 @@ body:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

#### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
- type: textarea
attributes:
label: The model to consider.

@@ -223,13 +223,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227
GIT_TAG v3.6.0
GIT_PROGRESS TRUE

# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
GIT_SHALLOW FALSE
GIT_SHALLOW TRUE
)
endif()
FetchContent_MakeAvailable(cutlass)
@@ -550,7 +550,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb
GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

Dockerfile (13 changes)

@@ -234,8 +234,8 @@ RUN mv vllm test_docs/
#################### TEST IMAGE ####################

#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
@@ -247,5 +247,14 @@ RUN --mount=type=cache,target=/root/.cache/pip \

ENV VLLM_USAGE_SOURCE production-docker-image

# define sagemaker first, so it is not default from `docker build`
FROM vllm-openai-base AS vllm-sagemaker

COPY examples/sagemaker-entrypoint.sh .
RUN chmod +x sagemaker-entrypoint.sh
ENTRYPOINT ["./sagemaker-entrypoint.sh"]

FROM vllm-openai-base AS vllm-openai

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################

@@ -1,6 +1,6 @@
# default base image
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"

FROM $BASE_IMAGE

@@ -15,16 +15,17 @@ RUN apt-get update && \
ffmpeg libsm6 libxext6 libgl1

### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
# When launching the container, mount the code directory to /workspace
ARG APP_MOUNT=/workspace
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}/vllm

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install pytest

COPY . .
ARG GIT_REPO_CHECK=0
@@ -42,4 +43,7 @@ RUN --mount=type=bind,source=.git,target=.git \
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils

# overwrite entrypoint to run bash script
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py

CMD ["/bin/bash"]

@@ -77,7 +77,7 @@ pip install vllm
Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
- [List of Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)

## Contributing

@@ -4,7 +4,7 @@

If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.

Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/contributing/vulnerability_management/).

---

benchmarks/benchmark_long_document_qa_throughput.py (new file, 184 lines)

@@ -0,0 +1,184 @@
"""
Offline benchmark to test the long document QA throughput.

Example usage:
    # This command run the vllm with 50GB CPU memory for offloading
    # The workload samples 8 different prompts with a default input
    # length of 20000 tokens, then replicates each prompt 2 times
    # in random order.
    python benchmark_long_document_qa_throughput.py \
        --model meta-llama/Llama-2-7b-chat-hf \
        --enable-prefix-caching \
        --num-documents 8 \
        --repeat-count 2

Commandline arguments:
    --num-documents: The number of documents to sample prompts from.

    --document-length: The length of each document in tokens.
                       (Optional, default: 20000)

    --output-len: The number of tokens to generate for each prompt.
                  (Optional, default: 10)

    --repeat-count: The number of times to repeat each prompt.
                    (Optional, default: 2)

    --repeat-mode: The mode to repeat prompts. The supported modes are:
        - 'random': shuffle the prompts randomly. (Default)
        - 'tile': the entire prompt list is repeated in sequence. (Potentially
                  lowest cache hit)
        - 'interleave': each prompt is repeated consecutively before
                        moving to the next element. (Highest cache hit)

    --shuffle-seed: Random seed when the repeat mode is "random".
                    (Optional, default: 0)

In the meantime, it also supports all the vLLM engine args to initialize the
LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more
details.
"""

import dataclasses
import random
import time

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser


def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
    """
    Test long document QA with the given prompts and sampling parameters.
    Print the time spent in processing all the prompts.

    Args:
        llm: The language model used for generating responses.
        sampling_params: Sampling parameter used to generate the response.
        prompts: A list of prompt strings to be processed by the LLM.
    """
    start_time = time.time()
    llm.generate(prompts, sampling_params=sampling_params)
    end_time = time.time()
    print(f"Time to execute all requests: {end_time - start_time:.4f} secs")


def repeat_prompts(prompts, repeat_count, mode: str):
    """
    Repeat each prompt in the list for a specified number of times.
    The order of prompts in the output list depends on the mode.

    Args:
        prompts: A list of prompts to be repeated.
        repeat_count: The number of times each prompt is repeated.
        mode: The mode of repetition. Supported modes are:
            - 'random': Shuffle the prompts randomly after repetition.
            - 'tile': Repeat the entire prompt list in sequence.
              Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
            - 'interleave': Repeat each prompt consecutively before moving to
              the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].

    Returns:
        A list of repeated prompts in the specified order.

    Raises:
        ValueError: If an invalid mode is provided.
    """
    print("Repeat mode: ", mode)
    if mode == 'random':
        repeated_prompts = prompts * repeat_count
        random.shuffle(repeated_prompts)
        return repeated_prompts
    elif mode == 'tile':
        return prompts * repeat_count
    elif mode == 'interleave':
        repeated_prompts = []
        for prompt in prompts:
            repeated_prompts.extend([prompt] * repeat_count)
        return repeated_prompts
    else:
        raise ValueError(f"Invalid mode: {mode}, only support "
                         "'random', 'tile', 'interleave'")


def main(args):
    random.seed(args.shuffle_seed)

    # Prepare the prompts:
    # we append the document id at the beginning to avoid any of the document
    # being the prefix of other documents
    prompts = [
        str(i) + ' '.join(['hi'] * args.document_length)
        for i in range(args.num_documents)
    ]

    prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)

    warmup_prompts = [
        "This is warm up request " + str(i) + \
            ' '.join(['hi'] * args.document_length)
        for i in range(args.num_documents)]

    # Create the LLM engine
    engine_args = EngineArgs.from_cli_args(args)
    llm = LLM(**dataclasses.asdict(engine_args))
    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)

    print("------warm up------")
    test_long_document_qa(
        llm=llm,
        prompts=warmup_prompts,
        sampling_params=sampling_params,
    )

    print("------start generating------")
    test_long_document_qa(
        llm=llm,
        prompts=prompts,
        sampling_params=sampling_params,
    )


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description=
        'Benchmark the performance with or without automatic prefix caching.')

    parser.add_argument(
        '--document-length',
        type=int,
        # Roughly the number of tokens for a system paper,
        # excluding images
        default=20000,
        help='Range of input lengths for sampling prompts,'
        'specified as "min:max" (e.g., "128:256").')

    parser.add_argument('--num-documents',
                        type=int,
                        default=8,
                        help='Range of input lengths for sampling prompts,'
                        'specified as "min:max" (e.g., "128:256").')

    parser.add_argument('--output-len', type=int, default=10)

    parser.add_argument('--repeat-count',
                        type=int,
                        default=2,
                        help='Number of times to repeat each prompt')

    parser.add_argument("--repeat-mode",
                        type=str,
                        default='random',
                        help='The mode to repeat prompts. The supported '
                        'modes are "random", "tile", and "interleave". '
                        'See repeat_prompts() in the source code for details.')

    parser.add_argument("--shuffle-seed",
                        type=int,
                        default=0,
                        help='Random seed when the repeat mode is "random"')

    parser = EngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    main(args)

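The three --repeat-mode settings documented in the new benchmark differ only in how the repeated prompt list is ordered, which in turn drives how often the prefix cache can hit. A small, standalone sketch of the behaviour described above (toy prompts standing in for the benchmark's generated documents):

import random

prompts = ["doc0 ...", "doc1 ...", "doc2 ..."]
repeat_count = 2

# 'tile': repeat the whole list in sequence -> lowest expected cache hit rate.
tile = prompts * repeat_count
# ['doc0 ...', 'doc1 ...', 'doc2 ...', 'doc0 ...', 'doc1 ...', 'doc2 ...']

# 'interleave': repeat each prompt back-to-back -> highest expected cache hit rate.
interleave = [p for p in prompts for _ in range(repeat_count)]
# ['doc0 ...', 'doc0 ...', 'doc1 ...', 'doc1 ...', 'doc2 ...', 'doc2 ...']

# 'random': repeat, then shuffle (seeded via --shuffle-seed for reproducibility).
random.seed(0)
shuffled = prompts * repeat_count
random.shuffle(shuffled)
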
@@ -53,7 +53,7 @@ void paged_attention_v1_launcher(
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
const std::optional<torch::Tensor>& alibi_slopes, float k_scale,
float v_scale, const int tp_rank, const int blocksparse_local_blocks,
const int blocksparse_vert_stride, const int blocksparse_block_size,
const int blocksparse_head_sliding_step) {
@@ -176,7 +176,7 @@ void paged_attention_v1(
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
torch::Tensor& seq_lens, // [num_seqs]
int64_t block_size, int64_t max_seq_len,
const c10::optional<torch::Tensor>& alibi_slopes,
const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,

@@ -54,7 +54,7 @@ void paged_attention_v2_launcher(
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
const std::optional<torch::Tensor>& alibi_slopes, float k_scale,
float v_scale, const int tp_rank, const int blocksparse_local_blocks,
const int blocksparse_vert_stride, const int blocksparse_block_size,
const int blocksparse_head_sliding_step) {
@@ -187,7 +187,7 @@ void paged_attention_v2(
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
torch::Tensor& seq_lens, // [num_seqs]
int64_t block_size, int64_t max_seq_len,
const c10::optional<torch::Tensor>& alibi_slopes,
const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,

@@ -386,7 +386,7 @@ void paged_attention_v1_impl_launcher(
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
const c10::optional<torch::Tensor>& alibi_slopes) {
const std::optional<torch::Tensor>& alibi_slopes) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@@ -459,7 +459,7 @@ void paged_attention_v1(
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
@@ -702,7 +702,7 @@ void paged_attention_v2_impl_launcher(
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes) {
int max_seq_len, const std::optional<torch::Tensor>& alibi_slopes) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@@ -781,7 +781,7 @@ void paged_attention_v2(
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,

@@ -359,7 +359,7 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major
const torch::Tensor& b, // [IC, OC], column-major
const torch::Tensor& a_scales, // [1] or [M]
const torch::Tensor& b_scales, // [1] or [OC]
const c10::optional<torch::Tensor>& bias // [OC]
const std::optional<torch::Tensor>& bias // [OC]
) {
CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
// Checks for conformality
@@ -442,8 +442,8 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major
const torch::Tensor& a_scales, // [1] or [M]
const torch::Tensor& b_scales, // [1] or [OC]
const torch::Tensor& azp_adj, // [OC]
const c10::optional<torch::Tensor>& azp, // [1] or [M]
const c10::optional<torch::Tensor>& bias // [OC]
const std::optional<torch::Tensor>& azp, // [1] or [M]
const std::optional<torch::Tensor>& bias // [OC]
) {
CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp)
// Checks for conformality
@@ -561,7 +561,7 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
const torch::Tensor& input, // [..., hidden_size]
const torch::Tensor& scale,
c10::optional<torch::Tensor> const& azp) {
std::optional<torch::Tensor> const& azp) {
CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
@@ -590,7 +590,7 @@ void dynamic_scaled_int8_quant(
torch::Tensor& out, // [..., hidden_size]
const torch::Tensor& input, // [..., hidden_size]
torch::Tensor& scale, // [..., 1]
c10::optional<torch::Tensor> const& azp) {
std::optional<torch::Tensor> const& azp) {
CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());

@@ -9,14 +9,14 @@ std::string init_cpu_threads_env(const std::string& cpu_ids);
void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
const torch::Tensor& b, const torch::Tensor& a_scales,
const torch::Tensor& b_scales,
const c10::optional<torch::Tensor>& bias);
const std::optional<torch::Tensor>& bias);

void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
const torch::Tensor& b, const torch::Tensor& a_scales,
const torch::Tensor& b_scales,
const torch::Tensor& azp_adj,
const c10::optional<torch::Tensor>& azp,
const c10::optional<torch::Tensor>& bias);
const std::optional<torch::Tensor>& azp,
const std::optional<torch::Tensor>& bias);

TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops

@@ -68,7 +68,7 @@ struct ScaledEpilogueBase {
// This overload handles the case where there might not be a tensor, in which
// case a nullptr is passed and a constant (0) is used.
template <typename Descriptor, typename T>
static auto args_from_tensor(c10::optional<torch::Tensor> const& tensor) {
static auto args_from_tensor(std::optional<torch::Tensor> const& tensor) {
static_assert(std::is_same_v<Descriptor, RowOrZeroLoad<T>>);
using Arguments = typename Descriptor::Arguments;
auto* data_ptr = tensor ? static_cast<T*>(tensor->data_ptr()) : nullptr;
@@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp
static ArgumentType prepare_args(torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
c10::optional<torch::Tensor> const& bias) {
std::optional<torch::Tensor> const& bias) {
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
@@ -301,7 +301,7 @@ struct ScaledEpilogueBiasAzpToken
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
torch::Tensor const& azp,
c10::optional<torch::Tensor> const& bias) {
std::optional<torch::Tensor> const& bias) {
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);

@@ -67,7 +67,7 @@ struct ScaledEpilogueBase {
// This overload handles the case where there might not be a tensor, in which
// case a nullptr is passed and a constant (0) is used.
template <typename Descriptor, typename T>
static auto args_from_tensor(c10::optional<torch::Tensor> const& tensor) {
static auto args_from_tensor(std::optional<torch::Tensor> const& tensor) {
using Arguments = typename Descriptor::Arguments;
auto* data_ptr = tensor ? static_cast<T*>(tensor->data_ptr()) : nullptr;
static_assert(std::is_same_v<Descriptor, ColLoad<T, true>> ||
@@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp
static ArgumentType prepare_args(torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
c10::optional<torch::Tensor> const& bias) {
std::optional<torch::Tensor> const& bias) {
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
@@ -299,7 +299,7 @@ struct ScaledEpilogueBiasAzpToken
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
torch::Tensor const& azp,
c10::optional<torch::Tensor> const& bias) {
std::optional<torch::Tensor> const& bias) {
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);

@@ -97,7 +97,7 @@ static inline auto make_cute_layout(torch::Tensor const& tensor,

template <typename Stride>
static inline auto maybe_make_cute_layout(
c10::optional<torch::Tensor> const& tensor,
std::optional<torch::Tensor> const& tensor,
std::string_view name = "tensor") {
using Layout = decltype(make_cute_layout<Stride>(*tensor));

@@ -14,9 +14,9 @@ class VLLMDataType(enum.Enum):


class MixedInputKernelScheduleType(enum.Enum):
TmaWarpSpecializedMixedInput = enum_auto()
TmaWarpSpecializedPingpongMixedInput = enum_auto()
TmaWarpSpecializedCooperativeMixedInput = enum_auto()
TmaWarpSpecialized = enum_auto()
TmaWarpSpecializedPingpong = enum_auto()
TmaWarpSpecializedCooperative = enum_auto()


VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = {
@@ -68,11 +68,11 @@ VLLMKernelScheduleTag: Dict[Union[
MixedInputKernelScheduleType, KernelScheduleType], str] = {
**KernelScheduleTag, # type: ignore
**{
MixedInputKernelScheduleType.TmaWarpSpecializedMixedInput:
"cutlass::gemm::KernelTmaWarpSpecializedMixedInput",
MixedInputKernelScheduleType.TmaWarpSpecializedPingpongMixedInput:
"cutlass::gemm::KernelTmaWarpSpecializedPingpongMixedInput",
MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput:
"cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput",
MixedInputKernelScheduleType.TmaWarpSpecialized:
"cutlass::gemm::KernelTmaWarpSpecialized",
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong:
"cutlass::gemm::KernelTmaWarpSpecializedPingpong",
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative:
"cutlass::gemm::KernelTmaWarpSpecializedCooperative",
}
}

@@ -53,12 +53,12 @@ void set_conv_params_fwd(ConvParamsBase &params,
const at::Tensor x,
const at::Tensor weight,
const at::Tensor out,
const c10::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& bias,
bool silu_activation,
int64_t pad_slot_id,
const c10::optional<at::Tensor>& query_start_loc = std::nullopt,
const c10::optional<at::Tensor>& cache_indices = std::nullopt,
const c10::optional<at::Tensor>& has_initial_state = std::nullopt) {
const std::optional<at::Tensor>& query_start_loc = std::nullopt,
const std::optional<at::Tensor>& cache_indices = std::nullopt,
const std::optional<at::Tensor>& has_initial_state = std::nullopt) {

// Reset the parameters
memset(&params, 0, sizeof(params));
@@ -93,11 +93,11 @@ void set_conv_params_fwd(ConvParamsBase &params,


void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
const c10::optional<at::Tensor> &bias_,
const c10::optional<at::Tensor> &conv_states,
const c10::optional<at::Tensor> &query_start_loc,
const c10::optional<at::Tensor> &cache_indices,
const c10::optional<at::Tensor> &has_initial_state,
const std::optional<at::Tensor> &bias_,
const std::optional<at::Tensor> &conv_states,
const std::optional<at::Tensor> &query_start_loc,
const std::optional<at::Tensor> &cache_indices,
const std::optional<at::Tensor> &has_initial_state,
bool silu_activation,
// used to identify padding entries if cache_indices provided
// in case of padding, the kernel will return early
@@ -194,10 +194,10 @@ void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
void causal_conv1d_update(const at::Tensor &x,
const at::Tensor &conv_state,
const at::Tensor &weight,
const c10::optional<at::Tensor> &bias_,
const std::optional<at::Tensor> &bias_,
bool silu_activation,
const c10::optional<at::Tensor> &cache_seqlens_,
const c10::optional<at::Tensor> &conv_state_indices_,
const std::optional<at::Tensor> &cache_seqlens_,
const std::optional<at::Tensor> &conv_state_indices_,
// used to identify padding entries if cache_indices provided
// in case of padding, the kernel will return early
int64_t pad_slot_id) {

@@ -402,14 +402,14 @@ void set_ssm_params_fwd(SSMParamsBase &params,
const torch::Tensor out,
const torch::Tensor z,
const torch::Tensor out_z,
const c10::optional<at::Tensor>& D,
const c10::optional<at::Tensor>& delta_bias,
const std::optional<at::Tensor>& D,
const std::optional<at::Tensor>& delta_bias,
const torch::Tensor ssm_states,
bool has_z,
bool delta_softplus,
const c10::optional<at::Tensor>& query_start_loc,
const c10::optional<at::Tensor>& cache_indices,
const c10::optional<at::Tensor>& has_initial_state,
const std::optional<at::Tensor>& query_start_loc,
const std::optional<at::Tensor>& cache_indices,
const std::optional<at::Tensor>& has_initial_state,
bool varlen,
int64_t pad_slot_id) {

@@ -504,13 +504,13 @@ void set_ssm_params_fwd(SSMParamsBase &params,

void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C,
const c10::optional<torch::Tensor> &D_,
const c10::optional<torch::Tensor> &z_,
const c10::optional<torch::Tensor> &delta_bias_,
const std::optional<torch::Tensor> &D_,
const std::optional<torch::Tensor> &z_,
const std::optional<torch::Tensor> &delta_bias_,
bool delta_softplus,
const c10::optional<torch::Tensor> &query_start_loc,
const c10::optional<torch::Tensor> &cache_indices,
const c10::optional<torch::Tensor> &has_initial_state,
const std::optional<torch::Tensor> &query_start_loc,
const std::optional<torch::Tensor> &cache_indices,
const std::optional<torch::Tensor> &has_initial_state,
const torch::Tensor &ssm_states,
// used to identify padding entries if cache_indices provided
// in case of padding, the kernel will return early

csrc/ops.h (46 changes)

@@ -33,7 +33,7 @@ void paged_attention_v1(
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
@@ -44,7 +44,7 @@ void paged_attention_v2(
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
@@ -153,15 +153,15 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b, torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
c10::optional<torch::Tensor> const& bias);
std::optional<torch::Tensor> const& bias);

void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias);
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias);

bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability);

@@ -169,7 +169,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b, torch::Tensor const& e,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
c10::optional<torch::Tensor> const& bias);
std::optional<torch::Tensor> const& bias);

bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed,
torch::Tensor& e, torch::Tensor const& a);
@@ -177,11 +177,11 @@ bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed,

void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
torch::Tensor const& scale,
c10::optional<torch::Tensor> const& azp);
std::optional<torch::Tensor> const& azp);

void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
torch::Tensor& scales,
c10::optional<torch::Tensor> const& azp);
std::optional<torch::Tensor> const& azp);

torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
torch::Tensor b_gptq_qzeros,
@@ -198,34 +198,34 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,

void dynamic_per_token_scaled_fp8_quant(
torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
c10::optional<torch::Tensor> const& scale_ub);
std::optional<torch::Tensor> const& scale_ub);

void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
const torch::Tensor& A, const torch::Tensor& B,
const torch::Tensor& C,
const c10::optional<torch::Tensor>& D_,
const c10::optional<torch::Tensor>& z_,
const c10::optional<torch::Tensor>& delta_bias_,
const std::optional<torch::Tensor>& D_,
const std::optional<torch::Tensor>& z_,
const std::optional<torch::Tensor>& delta_bias_,
bool delta_softplus,
const c10::optional<torch::Tensor>& query_start_loc,
const c10::optional<torch::Tensor>& cache_indices,
const c10::optional<torch::Tensor>& has_initial_state,
const std::optional<torch::Tensor>& query_start_loc,
const std::optional<torch::Tensor>& cache_indices,
const std::optional<torch::Tensor>& has_initial_state,
const torch::Tensor& ssm_states, int64_t pad_slot_id);

void causal_conv1d_update(const at::Tensor& x, const at::Tensor& conv_state,
const at::Tensor& weight,
const c10::optional<at::Tensor>& bias_,
const std::optional<at::Tensor>& bias_,
bool silu_activation,
const c10::optional<at::Tensor>& cache_seqlens_,
const c10::optional<at::Tensor>& conv_state_indices_,
const std::optional<at::Tensor>& cache_seqlens_,
const std::optional<at::Tensor>& conv_state_indices_,
int64_t pad_slot_id);

void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
const c10::optional<at::Tensor>& bias_,
const c10::optional<at::Tensor>& conv_states,
const c10::optional<at::Tensor>& query_start_loc,
const c10::optional<at::Tensor>& cache_indices,
const c10::optional<at::Tensor>& has_initial_state,
const std::optional<at::Tensor>& bias_,
const std::optional<at::Tensor>& conv_states,
const std::optional<at::Tensor>& query_start_loc,
const std::optional<at::Tensor>& cache_indices,
const std::optional<at::Tensor>& has_initial_state,
bool silu_activation, int64_t pad_slot_id);

#ifndef USE_ROCM

@@ -226,7 +226,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
torch::Tensor const& input, // [..., hidden_size]
torch::Tensor const& scale,
c10::optional<torch::Tensor> const& azp) {
std::optional<torch::Tensor> const& azp) {
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(scale.numel() == 1);
@@ -257,7 +257,7 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
void dynamic_scaled_int8_quant(
torch::Tensor& out, // [..., hidden_size]
torch::Tensor const& input, // [..., hidden_size]
torch::Tensor& scales, c10::optional<torch::Tensor> const& azp) {
torch::Tensor& scales, std::optional<torch::Tensor> const& azp) {
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(scales.is_contiguous());

@@ -39,7 +39,7 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
c10::optional<torch::Tensor> const& bias) {
std::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
if (bias) {
@@ -58,8 +58,8 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias) {
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);

@@ -94,7 +94,7 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
c10::optional<torch::Tensor> const& bias) {
std::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
if (bias) {
@@ -113,8 +113,8 @@ void cutlass_scaled_mm_azp_sm80(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias) {
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);

@@ -165,7 +165,7 @@ void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
c10::optional<torch::Tensor> const& bias) {
std::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
if (bias) {
@@ -184,8 +184,8 @@ void cutlass_scaled_mm_azp_sm89(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias) {
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);

@@ -51,7 +51,7 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
c10::optional<torch::Tensor> const& bias) {
std::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
if (bias) {
@@ -70,8 +70,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias) {
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);

@@ -9,26 +9,26 @@ void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
c10::optional<torch::Tensor> const& bias);
std::optional<torch::Tensor> const& bias);

void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
c10::optional<torch::Tensor> const& bias);
std::optional<torch::Tensor> const& bias);

void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
c10::optional<torch::Tensor> const& bias);
std::optional<torch::Tensor> const& bias);

#if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
c10::optional<torch::Tensor> const& bias);
std::optional<torch::Tensor> const& bias);
#endif

void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
@@ -36,24 +36,24 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias);
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias);

void cutlass_scaled_mm_azp_sm80(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias);
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias);

void cutlass_scaled_mm_azp_sm89(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias);
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias);

#if defined CUDA_VERSION && CUDA_VERSION >= 12000
void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a,
@@ -61,8 +61,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias);
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias);
#endif

bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
@@ -84,7 +84,7 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b, torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
c10::optional<torch::Tensor> const& bias) {
std::optional<torch::Tensor> const& bias) {
// Checks for conformality
TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
@@ -148,8 +148,8 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
// Checks for conformality
|
||||
TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
|
||||
TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
|
||||
|
@ -834,6 +834,7 @@ __global__ void Marlin(
|
||||
int4* sh_g_idx = sh_b + (stages * b_sh_stage);
|
||||
int4* sh_zp = sh_g_idx + (stages * g_idx_stage);
|
||||
int4* sh_s = sh_zp + (stages * zp_sh_stage);
|
||||
int4* sh_red = sh_s + (stages * s_sh_stage);
|
||||
|
||||
// Register storage for double buffer of shared memory reads.
|
||||
FragA frag_a[2][thread_m_blocks];
|
||||
@ -932,11 +933,11 @@ __global__ void Marlin(
|
||||
int4* sh_s_stage = sh_s + s_sh_stage * pipe;
|
||||
|
||||
if constexpr (group_blocks >= thread_k_blocks) {
|
||||
if (s_sh_wr_pred) {
|
||||
cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
|
||||
}
|
||||
// Only fetch scales if this tile starts a new group
|
||||
if (pipe % (group_blocks / thread_k_blocks) == 0) {
|
||||
if (s_sh_wr_pred) {
|
||||
cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
|
||||
}
|
||||
if ((pipe + 1) % (group_blocks / thread_k_blocks) == 0) {
|
||||
s_gl_rd += s_gl_rd_delta;
|
||||
}
|
||||
} else {
|
||||
@ -1038,9 +1039,7 @@ __global__ void Marlin(
|
||||
// No act-order case
|
||||
if constexpr (group_blocks != -1) {
|
||||
if constexpr (group_blocks >= thread_k_blocks) {
|
||||
int4* sh_s_stage =
|
||||
sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
|
||||
(pipe / (group_blocks / thread_k_blocks)));
|
||||
int4* sh_s_stage = sh_s + s_sh_stage * pipe;
|
||||
reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
|
||||
} else {
|
||||
int warp_id = threadIdx.x / 32;
|
||||
@ -1339,15 +1338,15 @@ __global__ void Marlin(
|
||||
int red_sh_wr =
|
||||
red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
|
||||
if (i < red_off) {
|
||||
float* c_rd =
|
||||
reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
|
||||
float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
|
||||
float* c_rd = reinterpret_cast<float*>(
|
||||
&sh_red[red_sh_delta * j + red_sh_rd]);
|
||||
float* c_wr = reinterpret_cast<float*>(&sh_red[red_sh_wr]);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 4; k++)
|
||||
reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
|
||||
c_rd[k] + c_wr[k];
|
||||
}
|
||||
sh[red_sh_wr] =
|
||||
sh_red[red_sh_wr] =
|
||||
reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
|
||||
}
|
||||
}
|
||||
@ -1357,7 +1356,7 @@ __global__ void Marlin(
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4 * 2; i++) {
|
||||
float* c_rd =
|
||||
reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
|
||||
reinterpret_cast<float*>(&sh_red[red_sh_delta * i + red_sh_rd]);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 4; j++)
|
||||
reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
|
||||
@ -1397,7 +1396,7 @@ __global__ void Marlin(
|
||||
#pragma unroll
|
||||
for (int i = 0; i < thread_m_blocks * 4; i++) {
|
||||
cp_async4_pred(
|
||||
&sh[c_sh_wr + c_sh_wr_delta * i],
|
||||
&sh_red[c_sh_wr + c_sh_wr_delta * i],
|
||||
&C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
|
||||
c_gl_wr_delta_i * (i % 2)],
|
||||
i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m);
|
||||
@ -1410,7 +1409,7 @@ __global__ void Marlin(
|
||||
for (int i = 0; i < thread_m_blocks * 4; i++) {
|
||||
if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
|
||||
if (!first) {
|
||||
int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
|
||||
int4 c_red = sh_red[c_sh_wr + i * c_sh_wr_delta];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 2 * 4; j++) {
|
||||
reinterpret_cast<float*>(
|
||||
@ -1461,10 +1460,10 @@ __global__ void Marlin(
|
||||
float* frag_c_ptr = reinterpret_cast<float*>(&frag_c);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < th_size; k++) {
|
||||
sh[threadIdx.x] =
|
||||
sh_red[threadIdx.x] =
|
||||
C_tmp[c_cur_offset + active_threads * k + threadIdx.x];
|
||||
|
||||
float* sh_c_ptr = reinterpret_cast<float*>(&sh[threadIdx.x]);
|
||||
float* sh_c_ptr = reinterpret_cast<float*>(&sh_red[threadIdx.x]);
|
||||
#pragma unroll
|
||||
for (int f = 0; f < 4; f++) {
|
||||
frag_c_ptr[k * 4 + f] += sh_c_ptr[f];
|
||||
@ -1515,7 +1514,7 @@ __global__ void Marlin(
|
||||
res = __hmul2(res, s[0]);
|
||||
}
|
||||
|
||||
((scalar_t2*)sh)[idx] = res;
|
||||
((scalar_t2*)sh_red)[idx] = res;
|
||||
};
|
||||
|
||||
if (threadIdx.x / 32 < thread_n_blocks / 4) {
|
||||
@ -1543,7 +1542,7 @@ __global__ void Marlin(
|
||||
i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
|
||||
i++) {
|
||||
if (c_gl_wr < c_gl_wr_end) {
|
||||
C[c_gl_wr] = sh[c_sh_rd];
|
||||
C[c_gl_wr] = sh_red[c_sh_rd];
|
||||
c_gl_wr += c_gl_wr_delta;
|
||||
c_sh_rd += c_sh_rd_delta;
|
||||
}
|
||||
@ -1865,9 +1864,12 @@ bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks,
|
||||
|
||||
float pipe_size = (a_size + b_size) * pipe_stages;
|
||||
|
||||
float reduce_size = max(th_config.num_threads * 32 * 4,
|
||||
(tb_n / 64) * 32 * (tb_max_m / 16) * 4 * 2 * 4 * 2);
|
||||
|
||||
TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity
|
||||
|
||||
return pipe_size < 0.95f * (max_shared_mem - scales_cache_size);
|
||||
return pipe_size + reduce_size < 0.95f * (max_shared_mem - scales_cache_size);
|
||||
}
|
||||
|
||||
bool is_valid_config(thread_config_t const& th_config, int max_m_blocks,
|
||||
|
@ -63,7 +63,7 @@ torch::Tensor mm_dispatch_{{type_sig}}(MMArgs args) {
|
||||
|
||||
|
||||
static inline std::optional<at::ScalarType> maybe_scalartype(
|
||||
c10::optional<at::Tensor> const& t) {
|
||||
std::optional<at::Tensor> const& t) {
|
||||
if (!t) {
|
||||
return std::nullopt;
|
||||
} else {
|
||||
@ -189,7 +189,7 @@ using Kernel_{{type_sig}} = MacheteKernelTemplate<
|
||||
{{DataTypeTag[t.b_group_zeropoint]}}, // GroupZeroT
|
||||
{{DataTypeTag[t.b_channel_scale]}}, // ChannelScaleT
|
||||
{{DataTypeTag[t.a_token_scale]}}, // TokenScaleT
|
||||
cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput,
|
||||
cutlass::gemm::KernelTmaWarpSpecializedCooperative,
|
||||
Sch>;
|
||||
|
||||
{% for sch in schs %}
|
||||
@ -223,7 +223,7 @@ torch::Tensor prepack_B_dispatch(PrepackBArgs args) {
|
||||
{{DataTypeTag[t.convert]}}, // ElementConvert
|
||||
{{DataTypeTag[t.accumulator]}}, // Accumulator
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput>
|
||||
cutlass::gemm::KernelTmaWarpSpecializedCooperative>
|
||||
>(args.B);
|
||||
}
|
||||
{%- endfor %}
|
||||
@ -239,7 +239,7 @@ torch::Tensor prepack_B_dispatch(PrepackBArgs args) {
|
||||
}; // namespace machete
|
||||
"""
|
||||
|
||||
TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput
|
||||
TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperative
|
||||
TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative
|
||||
|
||||
|
||||
@ -300,7 +300,7 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
|
||||
# mostly unique shorter sch_sig
|
||||
def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str:
|
||||
kernel_terse_names_replace = {
|
||||
"KernelTmaWarpSpecializedCooperativeMixedInput_": "TmaMI_",
|
||||
"KernelTmaWarpSpecializedCooperative": "TmaMI_",
|
||||
"TmaWarpSpecializedCooperative_": "TmaCoop_",
|
||||
"StreamKScheduler": "streamK",
|
||||
}
|
||||
|
@ -18,16 +18,14 @@ struct VLLMCollectiveBuilder<
|
||||
ElementAccumulator, TileShape_MNK, ClusterShape_MNK, StageCountType,
|
||||
KernelScheduleType,
|
||||
cute::enable_if_t<(
|
||||
cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecialized> ||
|
||||
cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpong> ||
|
||||
cute::is_same_v<KernelScheduleType,
|
||||
KernelTmaWarpSpecializedMixedInput> ||
|
||||
cute::is_same_v<KernelScheduleType,
|
||||
KernelTmaWarpSpecializedPingpongMixedInput> ||
|
||||
cute::is_same_v<KernelScheduleType,
|
||||
KernelTmaWarpSpecializedCooperativeMixedInput>)>> {
|
||||
KernelTmaWarpSpecializedCooperative>)>> {
|
||||
using CollectiveOp = machete::MacheteCollectiveMma<
|
||||
ElementPairA_, GmemLayoutA_, AlignmentA, ElementPairB_, GmemLayoutB_,
|
||||
AlignmentB, ElementAccumulator, TileShape_MNK, ClusterShape_MNK,
|
||||
StageCountType, KernelScheduleType>;
|
||||
};
|
||||
|
||||
}; // namespace cutlass::gemm::collective
|
||||
}; // namespace cutlass::gemm::collective
|
||||
|
@ -66,13 +66,11 @@ struct MacheteCollectiveMma {
|
||||
using Schedule = KernelScheduleType;
|
||||
static_assert(
|
||||
cute::is_same_v<Schedule, KernelTmaWarpSpecialized> ||
|
||||
cute::is_same_v<Schedule, KernelTmaWarpSpecializedMixedInput> ||
|
||||
cute::is_same_v<Schedule, KernelTmaWarpSpecialized> ||
|
||||
cute::is_same_v<Schedule, KernelTmaWarpSpecializedPingpong> ||
|
||||
cute::is_same_v<Schedule, KernelTmaWarpSpecializedPingpong> ||
|
||||
cute::is_same_v<Schedule,
|
||||
KernelTmaWarpSpecializedPingpongMixedInput> ||
|
||||
cute::is_same_v<Schedule, KernelTmaWarpSpecializedCooperative> ||
|
||||
cute::is_same_v<Schedule,
|
||||
KernelTmaWarpSpecializedCooperativeMixedInput>,
|
||||
cute::is_same_v<Schedule, KernelTmaWarpSpecializedCooperative>,
|
||||
"KernelSchedule must be one of the warp specialized policies");
|
||||
|
||||
public:
|
||||
@ -113,8 +111,7 @@ struct MacheteCollectiveMma {
|
||||
// For coop schedules we have two warp groups cooperatively issuing wgmma
|
||||
// instructions so we use 2 atoms along the M dim (one for each warpgroup)
|
||||
using AtomLayoutMNK = cute::conditional_t<
|
||||
cute::is_same_v<KernelScheduleType,
|
||||
KernelTmaWarpSpecializedCooperativeMixedInput>,
|
||||
cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>,
|
||||
Layout<Shape<_2, _1, _1>>, Layout<Shape<_1, _1, _1>>>;
|
||||
|
||||
using TiledMma = decltype(cute::make_tiled_mma(
|
||||
|
@ -183,11 +183,11 @@ struct MacheteKernelTemplate {
|
||||
torch::Tensor const& A, // MxK matrix
|
||||
torch::Tensor const& B, // KxN prepacked matrix
|
||||
torch::Tensor& D, // MxN matrix
|
||||
c10::optional<torch::Tensor> const& maybe_g_scales, // scale_KxN matrix
|
||||
c10::optional<torch::Tensor> const& maybe_g_zeros, // scale_KxN matrix
|
||||
c10::optional<int64_t> maybe_group_size,
|
||||
c10::optional<torch::Tensor> const& maybe_ch_scales, // len N vector
|
||||
c10::optional<torch::Tensor> const& maybe_tok_scales) // len M vector
|
||||
std::optional<torch::Tensor> const& maybe_g_scales, // scale_KxN matrix
|
||||
std::optional<torch::Tensor> const& maybe_g_zeros, // scale_KxN matrix
|
||||
std::optional<int64_t> maybe_group_size,
|
||||
std::optional<torch::Tensor> const& maybe_ch_scales, // len N vector
|
||||
std::optional<torch::Tensor> const& maybe_tok_scales) // len M vector
|
||||
{
|
||||
static_assert(!with_group_zeropoints || with_group_scales);
|
||||
|
||||
|
@ -13,23 +13,23 @@ struct MMArgs {
|
||||
torch::Tensor const& A;
|
||||
torch::Tensor const& B;
|
||||
vllm::ScalarType const& b_type;
|
||||
c10::optional<at::ScalarType> const& maybe_out_type;
|
||||
c10::optional<torch::Tensor> const& maybe_group_scales;
|
||||
c10::optional<torch::Tensor> const& maybe_group_zeros;
|
||||
c10::optional<int64_t> maybe_group_size;
|
||||
c10::optional<torch::Tensor> const& maybe_channel_scales;
|
||||
c10::optional<torch::Tensor> const& maybe_token_scales;
|
||||
c10::optional<std::string> maybe_schedule;
|
||||
std::optional<at::ScalarType> const& maybe_out_type;
|
||||
std::optional<torch::Tensor> const& maybe_group_scales;
|
||||
std::optional<torch::Tensor> const& maybe_group_zeros;
|
||||
std::optional<int64_t> maybe_group_size;
|
||||
std::optional<torch::Tensor> const& maybe_channel_scales;
|
||||
std::optional<torch::Tensor> const& maybe_token_scales;
|
||||
std::optional<std::string> maybe_schedule;
|
||||
};
|
||||
|
||||
struct SupportedSchedulesArgs {
|
||||
at::ScalarType a_type;
|
||||
vllm::ScalarType b_type;
|
||||
c10::optional<at::ScalarType> maybe_group_scales_type;
|
||||
c10::optional<at::ScalarType> maybe_group_zeros_type;
|
||||
c10::optional<at::ScalarType> maybe_channel_scales_type;
|
||||
c10::optional<at::ScalarType> maybe_token_scales_type;
|
||||
c10::optional<at::ScalarType> maybe_out_type;
|
||||
std::optional<at::ScalarType> maybe_group_scales_type;
|
||||
std::optional<at::ScalarType> maybe_group_zeros_type;
|
||||
std::optional<at::ScalarType> maybe_channel_scales_type;
|
||||
std::optional<at::ScalarType> maybe_token_scales_type;
|
||||
std::optional<at::ScalarType> maybe_out_type;
|
||||
};
|
||||
|
||||
torch::Tensor mm_dispatch(MMArgs args);
|
||||
|
@ -10,7 +10,7 @@ struct PrepackBArgs {
|
||||
torch::Tensor const& B;
|
||||
at::ScalarType a_type;
|
||||
vllm::ScalarType b_type;
|
||||
c10::optional<at::ScalarType> maybe_group_scales_type;
|
||||
std::optional<at::ScalarType> maybe_group_scales_type;
|
||||
};
|
||||
|
||||
template <typename PrepackedLayoutB>
|
||||
|
@ -98,8 +98,7 @@ struct PrepackedLayoutBTemplate {
|
||||
// For coop schedules we have two warp groups cooperatively issuing wgmma
|
||||
// instructions so we use 2 atoms along the M dim (one for each warpgroup)
|
||||
using AtomLayoutMNK = cute::conditional_t<
|
||||
cute::is_same_v<KernelSchedule,
|
||||
KernelTmaWarpSpecializedCooperativeMixedInput>,
|
||||
cute::is_same_v<KernelSchedule, KernelTmaWarpSpecializedCooperative>,
|
||||
Layout<Shape<_2, _1, _1>>, Layout<Shape<_1, _1, _1>>>;
|
||||
|
||||
using TiledMma = decltype(cute::make_tiled_mma(
|
||||
@ -247,4 +246,4 @@ struct PrepackedLayoutBTemplate {
|
||||
}
|
||||
};
|
||||
|
||||
}; // namespace machete
|
||||
}; // namespace machete
|
||||
|
@ -10,11 +10,11 @@ using namespace vllm;
|
||||
|
||||
std::vector<std::string> supported_schedules(
|
||||
at::ScalarType a_type, int64_t b_type_id,
|
||||
c10::optional<at::ScalarType> maybe_group_scales_type,
|
||||
c10::optional<at::ScalarType> maybe_group_zeros_type,
|
||||
c10::optional<at::ScalarType> maybe_channel_scales_type,
|
||||
c10::optional<at::ScalarType> maybe_token_scales_type,
|
||||
c10::optional<at::ScalarType> maybe_out_type) {
|
||||
std::optional<at::ScalarType> maybe_group_scales_type,
|
||||
std::optional<at::ScalarType> maybe_group_zeros_type,
|
||||
std::optional<at::ScalarType> maybe_channel_scales_type,
|
||||
std::optional<at::ScalarType> maybe_token_scales_type,
|
||||
std::optional<at::ScalarType> maybe_out_type) {
|
||||
ScalarType const b_type = ScalarType::from_id(b_type_id);
|
||||
return supported_schedules_dispatch({
|
||||
.a_type = a_type,
|
||||
@ -29,13 +29,13 @@ std::vector<std::string> supported_schedules(
|
||||
|
||||
torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B,
|
||||
int64_t b_type_id,
|
||||
c10::optional<at::ScalarType> const& maybe_out_type,
|
||||
c10::optional<torch::Tensor> const& maybe_group_scales,
|
||||
c10::optional<torch::Tensor> const& maybe_group_zeros,
|
||||
c10::optional<int64_t> maybe_group_size,
|
||||
c10::optional<torch::Tensor> const& maybe_channel_scales,
|
||||
c10::optional<torch::Tensor> const& maybe_token_scales,
|
||||
c10::optional<std::string> maybe_schedule) {
|
||||
std::optional<at::ScalarType> const& maybe_out_type,
|
||||
std::optional<torch::Tensor> const& maybe_group_scales,
|
||||
std::optional<torch::Tensor> const& maybe_group_zeros,
|
||||
std::optional<int64_t> maybe_group_size,
|
||||
std::optional<torch::Tensor> const& maybe_channel_scales,
|
||||
std::optional<torch::Tensor> const& maybe_token_scales,
|
||||
std::optional<std::string> maybe_schedule) {
|
||||
ScalarType const b_type = ScalarType::from_id(b_type_id);
|
||||
return mm_dispatch({.A = A,
|
||||
.B = B,
|
||||
@ -51,7 +51,7 @@ torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B,
|
||||
|
||||
torch::Tensor prepack_B(
|
||||
torch::Tensor const& B, at::ScalarType const& a_type, int64_t b_type_id,
|
||||
c10::optional<at::ScalarType> const& maybe_group_scales_type) {
|
||||
std::optional<at::ScalarType> const& maybe_group_scales_type) {
|
||||
ScalarType const b_type = ScalarType::from_id(b_type_id);
|
||||
return prepack_B_dispatch(
|
||||
{.B = B,
|
||||
|
@ -928,7 +928,7 @@ void paged_attention_custom_launcher(
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, const int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& context_lens,
|
||||
int max_context_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
int max_context_len, const std::optional<torch::Tensor>& alibi_slopes,
|
||||
float k_scale, float v_scale) {
|
||||
int num_seqs = query.size(0);
|
||||
int num_heads = query.size(1);
|
||||
@ -1086,7 +1086,7 @@ void paged_attention(
|
||||
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
torch::Tensor& context_lens, // [num_seqs]
|
||||
int64_t block_size, int64_t max_context_len,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale) {
|
||||
const int head_size = query.size(2);
|
||||
if (kv_cache_dtype == "auto") {
|
||||
|
@ -9,6 +9,6 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
|
||||
double scale, torch::Tensor& block_tables,
|
||||
torch::Tensor& context_lens, int64_t block_size,
|
||||
int64_t max_context_len,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale,
|
||||
double v_scale);
|
||||
|
@ -286,7 +286,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& bt_meta,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
if (bias) {
|
||||
|
@ -22,7 +22,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& e,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
#endif
|
||||
|
||||
void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
|
||||
@ -30,7 +30,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& bt_meta,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
// Checks for conformality
|
||||
TORCH_CHECK(a.dim() == 2 && bt_nzs.dim() == 2 && c.dim() == 2);
|
||||
TORCH_CHECK(c.size(1) == bt_nzs.size(0) && bt_nzs.size(1) * 2 == a.size(1) &&
|
||||
|
@ -19,3 +19,4 @@ openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entr
|
||||
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
||||
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
||||
requests
|
||||
zmq
|
||||
|
Before Width: | Height: | Size: 968 KiB After Width: | Height: | Size: 968 KiB |
Before Width: | Height: | Size: 102 KiB After Width: | Height: | Size: 102 KiB |
Before Width: | Height: | Size: 173 KiB After Width: | Height: | Size: 173 KiB |
@ -191,6 +191,7 @@ def linkcode_resolve(domain, info):
|
||||
|
||||
# Mock out external dependencies here, otherwise the autodoc pages may be blank.
|
||||
autodoc_mock_imports = [
|
||||
"blake3",
|
||||
"compressed_tensors",
|
||||
"cpuinfo",
|
||||
"cv2",
|
||||
@ -207,7 +208,7 @@ autodoc_mock_imports = [
|
||||
"tensorizer",
|
||||
"pynvml",
|
||||
"outlines",
|
||||
"xgrammar,"
|
||||
"xgrammar",
|
||||
"librosa",
|
||||
"soundfile",
|
||||
"gguf",
|
||||
|
@ -1,7 +1,7 @@
|
||||
# Dockerfile
|
||||
|
||||
We provide a <gh-file:Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
|
||||
More information about deploying with Docker can be found [here](../../serving/deploying_with_docker.md).
|
||||
More information about deploying with Docker can be found [here](#deployment-docker).
|
||||
|
||||
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
|
||||
|
||||
@ -11,11 +11,11 @@ Below is a visual representation of the multi-stage Dockerfile. The build graph
|
||||
|
||||
The edges of the build graph represent:
|
||||
|
||||
- FROM ... dependencies (with a solid line and a full arrow head)
|
||||
- `FROM ...` dependencies (with a solid line and a full arrow head)
|
||||
|
||||
- COPY --from=... dependencies (with a dashed line and an empty arrow head)
|
||||
- `COPY --from=...` dependencies (with a dashed line and an empty arrow head)
|
||||
|
||||
- RUN --mount=(.\*)from=... dependencies (with a dotted line and an empty diamond arrow head)
|
||||
- `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head)
|
||||
|
||||
> ```{figure} ../../assets/dev/dockerfile-stages-dependency.png
|
||||
> :align: center
|
||||
|
115
docs/source/contributing/model/basic.md
Normal file
115
docs/source/contributing/model/basic.md
Normal file
@ -0,0 +1,115 @@
|
||||
(new-model-basic)=
|
||||
|
||||
# Basic Implementation
|
||||
|
||||
This guide walks you through the steps to implement a basic vLLM model.
|
||||
|
||||
## 1. Bring your model code
|
||||
|
||||
First, clone the PyTorch model code from the source repository.
|
||||
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
|
||||
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
|
||||
|
||||
```{warning}
|
||||
Make sure to review and adhere to the original code's copyright and licensing terms!
|
||||
```
|
||||
|
||||
## 2. Make your code compatible with vLLM
|
||||
|
||||
To ensure compatibility with vLLM, your model must meet the following requirements:
|
||||
|
||||
### Initialization Code
|
||||
|
||||
All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for:
|
||||
|
||||
- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts.
|
||||
- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode.
|
||||
|
||||
The initialization code should look like this:
|
||||
|
||||
```python
|
||||
from torch import nn
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.attention import Attention
|
||||
|
||||
class MyAttention(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
super().__init__()
|
||||
self.attn = Attention(prefix=f"{prefix}.attn")
|
||||
|
||||
class MyDecoderLayer(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
super().__init__()
|
||||
self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")
|
||||
|
||||
class MyModel(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
super().__init__()
|
||||
self.layers = nn.ModuleList(
|
||||
[MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
|
||||
)
|
||||
|
||||
class MyModelForCausalLM(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
|
||||
```
|
||||
|
||||
### Computation Code
|
||||
|
||||
Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
|
||||
|
||||
```python
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: List[torch.Tensor],
|
||||
attn_metadata: AttentionMetadata,
|
||||
) -> torch.Tensor:
|
||||
...
|
||||
```
|
||||
|
||||
```{note}
|
||||
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
|
||||
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
|
||||
```
|
||||
|
||||
For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
|
||||
|
||||
## 3. (Optional) Implement tensor parallelism and quantization support
|
||||
|
||||
If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
|
||||
To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
|
||||
For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`.
|
||||
When it comes to the linear layers, we provide the following options to parallelize them:
|
||||
|
||||
- `ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
|
||||
- `RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
|
||||
- `ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
|
||||
- `MergedColumnParallelLinear`: Column-parallel linear that merges multiple `ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
|
||||
- `QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices.
|
||||
|
||||
Note that all the linear layers above take `linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
|
||||
|
||||
## 4. Implement the weight loading logic
|
||||
|
||||
You now need to implement the `load_weights` method in your `*ForCausalLM` class.
|
||||
This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for `MergedColumnParallelLinear` and `QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
|
||||
|
||||
## 5. Register your model
|
||||
|
||||
See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM.
|
||||
|
||||
## Frequently Asked Questions
|
||||
|
||||
### How to support models with interleaving sliding windows?
|
||||
|
||||
For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `mistralai/Ministral-8B-Instruct-2410`), the scheduler will treat the model as a full-attention model, i.e., kv-cache of all tokens will not be dropped. This is to make sure prefix caching works with these models. Sliding window only appears as a parameter to the attention kernel computation.
|
||||
|
||||
To support a model with interleaving sliding windows, we need to take care of the following details:
|
||||
|
||||
- Make sure [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/config.py#L308) evaluates `has_interleaved_attention` to `True` for this model, and set `self.hf_text_config.interleaved_sliding_window` to the format of interleaving sliding windows the model can understand. Then, `self.hf_text_config.sliding_window` will be deleted, and the model will be treated as a full-attention model.
|
||||
- In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171).
|
||||
|
||||
With these two steps, interleave sliding windows should work with the model.
|
26
docs/source/contributing/model/index.md
Normal file
26
docs/source/contributing/model/index.md
Normal file
@ -0,0 +1,26 @@
|
||||
(new-model)=
|
||||
|
||||
# Adding a New Model
|
||||
|
||||
This section provides more information on how to integrate a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM.
|
||||
|
||||
```{toctree}
|
||||
:caption: Contents
|
||||
:maxdepth: 1
|
||||
|
||||
basic
|
||||
registration
|
||||
multimodal
|
||||
```
|
||||
|
||||
```{note}
|
||||
The complexity of adding a new model depends heavily on the model's architecture.
|
||||
The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
|
||||
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
|
||||
```
|
||||
|
||||
```{tip}
|
||||
If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
|
||||
or ask on our [developer slack](https://slack.vllm.ai).
|
||||
We will be happy to help you out!
|
||||
```
|
@ -2,15 +2,11 @@
|
||||
|
||||
# Enabling Multimodal Inputs
|
||||
|
||||
This document walks you through the steps to extend a vLLM model so that it accepts [multi-modal inputs](#multimodal-inputs).
|
||||
|
||||
```{seealso}
|
||||
[Adding a New Model](adding-a-new-model)
|
||||
```
|
||||
This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs).
|
||||
|
||||
## 1. Update the base vLLM model
|
||||
|
||||
It is assumed that you have already implemented the model in vLLM according to [these steps](#adding-a-new-model).
|
||||
It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic).
|
||||
Further update the model as follows:
|
||||
|
||||
- Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
|
56
docs/source/contributing/model/registration.md
Normal file
56
docs/source/contributing/model/registration.md
Normal file
@ -0,0 +1,56 @@
|
||||
(new-model-registration)=
|
||||
|
||||
# Model Registration
|
||||
|
||||
vLLM relies on a model registry to determine how to run each model.
|
||||
A list of pre-registered architectures can be found [here](#supported-models).
|
||||
|
||||
If your model is not on this list, you must register it to vLLM.
|
||||
This page provides detailed instructions on how to do so.
|
||||
|
||||
## Built-in models
|
||||
|
||||
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](#build-from-source).
|
||||
This gives you the ability to modify the codebase and test your model.
|
||||
|
||||
After you have implemented your model (see [tutorial](#new-model-basic)), put it into the <gh-dir:vllm/model_executor/models> directory.
|
||||
Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
|
||||
You should also include an example HuggingFace repository for this model in <gh-file:tests/models/registry.py> to run the unit tests.
|
||||
Finally, update our [list of supported models](#supported-models) to promote your model!
|
||||
|
||||
```{important}
|
||||
The list of models in each section should be maintained in alphabetical order.
|
||||
```
|
||||
|
||||
## Out-of-tree models
|
||||
|
||||
You can load an external model using a plugin without modifying the vLLM codebase.
|
||||
|
||||
```{seealso}
|
||||
[vLLM's Plugin System](#plugin-system)
|
||||
```
|
||||
|
||||
To register the model, use the following code:
|
||||
|
||||
```python
|
||||
from vllm import ModelRegistry
|
||||
from your_code import YourModelForCausalLM
|
||||
ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
|
||||
```
|
||||
|
||||
If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
|
||||
|
||||
```python
|
||||
from vllm import ModelRegistry
|
||||
|
||||
ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
|
||||
```
|
||||
|
||||
```{important}
|
||||
If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
|
||||
Read more about that [here](#enabling-multimodal-inputs).
|
||||
```
|
||||
|
||||
```{note}
|
||||
Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
|
||||
```
|
@ -34,7 +34,7 @@ pytest tests/
|
||||
```
|
||||
|
||||
```{note}
|
||||
Currently, the repository does not pass the `mypy` tests.
|
||||
Currently, the repository is not fully checked by `mypy`.
|
||||
```
|
||||
|
||||
# Contribution Guidelines
|
||||
|
43
docs/source/contributing/vulnerability_management.md
Normal file
43
docs/source/contributing/vulnerability_management.md
Normal file
@ -0,0 +1,43 @@
|
||||
# Vulnerability Management
|
||||
|
||||
## Reporting Vulnerabilities
|
||||
|
||||
As mentioned in the [security
|
||||
policy](https://github.com/vllm-project/vllm/tree/main/SECURITY.md), security
|
||||
vulnerabilities may be reported privately to the project via
|
||||
[GitHub](https://github.com/vllm-project/vllm/security/advisories/new).
|
||||
|
||||
## Vulnerability Management Team
|
||||
|
||||
Once a vulnerability has been reported to the project, the Vulnerability
|
||||
Management Team (VMT) is responsible for managing the vulnerability. The VMT is
|
||||
responsible for:
|
||||
|
||||
- Triaging the vulnerability.
|
||||
- Coordinating with reporters and project maintainers on vulnerability analysis
|
||||
and resolution.
|
||||
- Drafting of security advisories for confirmed vulnerabilities, as appropriate.
|
||||
- Coordination with project maintainers on a coordinated release of the fix and
|
||||
security advisory.
|
||||
|
||||
### Security Advisories
|
||||
|
||||
Advisories are published via GitHub through the same system used to report
|
||||
vulnerabilities. More information on the process can be found in the [GitHub
|
||||
documentation](https://docs.github.com/en/code-security/security-advisories/working-with-repository-security-advisories/about-repository-security-advisories).
|
||||
|
||||
### Team Members
|
||||
|
||||
We prefer to keep all vulnerability-related communication on the security report
|
||||
on GitHub. However, if you need to contact the VMT directly for an urgent issue,
|
||||
you may contact the following individuals:
|
||||
|
||||
- Simon Mo - simon.mo@hey.com
|
||||
- Russell Bryant - rbryant@redhat.com
|
||||
|
||||
## Slack Discussion
|
||||
|
||||
You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai)
|
||||
to discuss security-related topics. However, please do not disclose any
|
||||
vulnerabilities in this channel. If you need to report a vulnerability, please
|
||||
use the GitHub security advisory system or contact a VMT member privately.
|
@ -1,6 +1,6 @@
|
||||
(deploying-with-docker)=
|
||||
(deployment-docker)=
|
||||
|
||||
# Deploying with Docker
|
||||
# Using Docker
|
||||
|
||||
## Use vLLM's Official Docker Image
|
||||
|
@ -1,6 +1,6 @@
|
||||
(deploying-with-bentoml)=
|
||||
(deployment-bentoml)=
|
||||
|
||||
# Deploying with BentoML
|
||||
# BentoML
|
||||
|
||||
[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes.
|
||||
|
@ -1,6 +1,6 @@
|
||||
(deploying-with-cerebrium)=
|
||||
(deployment-cerebrium)=
|
||||
|
||||
# Deploying with Cerebrium
|
||||
# Cerebrium
|
||||
|
||||
```{raw} html
|
||||
<p align="center">
|
||||
@ -33,7 +33,7 @@ docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
|
||||
vllm = "latest"
|
||||
```
|
||||
|
||||
Next, let us add our code to handle inference for the LLM of your choice(`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your main.py\`:
|
||||
Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your `main.py`:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
@ -55,13 +55,13 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
|
||||
return {"results": results}
|
||||
```
|
||||
|
||||
Then, run the following code to deploy it to the cloud
|
||||
Then, run the following code to deploy it to the cloud:
|
||||
|
||||
```console
|
||||
$ cerebrium deploy
|
||||
```
|
||||
|
||||
If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case /run)
|
||||
If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case` /run`)
|
||||
|
||||
```python
|
||||
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
|
@ -1,6 +1,6 @@
|
||||
(deploying-with-dstack)=
|
||||
(deployment-dstack)=
|
||||
|
||||
# Deploying with dstack
|
||||
# dstack
|
||||
|
||||
```{raw} html
|
||||
<p align="center">
|
||||
@ -25,7 +25,7 @@ $ cd vllm-dstack
|
||||
$ dstack init
|
||||
```
|
||||
|
||||
Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
|
||||
Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
|
||||
|
||||
```yaml
|
||||
type: service
|
250
docs/source/deployment/frameworks/helm.md
Normal file
250
docs/source/deployment/frameworks/helm.md
Normal file
@ -0,0 +1,250 @@
|
||||
(deployment-helm)=
|
||||
|
||||
# Helm
|
||||
|
||||
A Helm chart to deploy vLLM for Kubernetes
|
||||
|
||||
Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLMm Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variables values.
|
||||
|
||||
This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm install and documentation on architecture and values file.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before you begin, ensure that you have the following:
|
||||
|
||||
- A running Kubernetes cluster
|
||||
- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
|
||||
- Available GPU resources in your cluster
|
||||
- S3 with the model which will be deployed
|
||||
|
||||
## Installing the chart
|
||||
|
||||
To install the chart with the release name `test-vllm`:
|
||||
|
||||
```console
|
||||
helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY
|
||||
```
|
||||
|
||||
## Uninstalling the Chart
|
||||
|
||||
To uninstall the `test-vllm` deployment:
|
||||
|
||||
```console
|
||||
helm uninstall test-vllm --namespace=ns-vllm
|
||||
```
|
||||
|
||||
The command removes all the Kubernetes components associated with the
|
||||
chart **including persistent volumes** and deletes the release.
|
||||
|
||||
## Architecture
|
||||
|
||||
```{image} /assets/deployment/architecture_helm_deployment.png
|
||||
```
|
||||
|
||||
## Values
|
||||
|
||||
```{list-table}
|
||||
:widths: 25 25 25 25
|
||||
:header-rows: 1
|
||||
|
||||
* - Key
|
||||
- Type
|
||||
- Default
|
||||
- Description
|
||||
* - autoscaling
|
||||
- object
|
||||
- {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
|
||||
- Autoscaling configuration
|
||||
* - autoscaling.enabled
|
||||
- bool
|
||||
- false
|
||||
- Enable autoscaling
|
||||
* - autoscaling.maxReplicas
|
||||
- int
|
||||
- 100
|
||||
- Maximum replicas
|
||||
* - autoscaling.minReplicas
|
||||
- int
|
||||
- 1
|
||||
- Minimum replicas
|
||||
* - autoscaling.targetCPUUtilizationPercentage
|
||||
- int
|
||||
- 80
|
||||
- Target CPU utilization for autoscaling
|
||||
* - configs
|
||||
- object
|
||||
- {}
|
||||
- Configmap
|
||||
* - containerPort
|
||||
- int
|
||||
- 8000
|
||||
- Container port
|
||||
* - customObjects
|
||||
- list
|
||||
- []
|
||||
- Custom Objects configuration
|
||||
* - deploymentStrategy
|
||||
- object
|
||||
- {}
|
||||
- Deployment strategy configuration
|
||||
* - externalConfigs
|
||||
- list
|
||||
- []
|
||||
- External configuration
|
||||
* - extraContainers
|
||||
- list
|
||||
- []
|
||||
- Additional containers configuration
|
||||
* - extraInit
|
||||
- object
|
||||
- {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
|
||||
- Additional configuration for the init container
|
||||
* - extraInit.pvcStorage
|
||||
- string
|
||||
- "50Gi"
|
||||
- Storage size of the s3
|
||||
* - extraInit.s3modelpath
|
||||
- string
|
||||
- "relative_s3_model_path/opt-125m"
|
||||
- Path of the model on the s3 which hosts model weights and config files
|
||||
* - extraInit.awsEc2MetadataDisabled
|
||||
- boolean
|
||||
- true
|
||||
- Disables the use of the Amazon EC2 instance metadata service
|
||||
* - extraPorts
|
||||
- list
|
||||
- []
|
||||
- Additional ports configuration
|
||||
* - gpuModels
|
||||
- list
|
||||
- ["TYPE_GPU_USED"]
|
||||
- Type of gpu used
|
||||
* - image
|
||||
- object
|
||||
- {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
|
||||
- Image configuration
|
||||
* - image.command
|
||||
- list
|
||||
- ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
|
||||
- Container launch command
|
||||
* - image.repository
|
||||
- string
|
||||
- "vllm/vllm-openai"
|
||||
- Image repository
|
||||
* - image.tag
|
||||
- string
|
||||
- "latest"
|
||||
- Image tag
|
||||
* - livenessProbe
|
||||
- object
|
||||
- {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
|
||||
- Liveness probe configuration
|
||||
* - livenessProbe.failureThreshold
|
||||
- int
|
||||
- 3
|
||||
- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
|
||||
* - livenessProbe.httpGet
|
||||
- object
|
||||
- {"path":"/health","port":8000}
|
||||
- Configuration of the Kubelet http request on the server
|
||||
* - livenessProbe.httpGet.path
|
||||
- string
|
||||
- "/health"
|
||||
- Path to access on the HTTP server
|
||||
* - livenessProbe.httpGet.port
|
||||
- int
|
||||
- 8000
|
||||
- Name or number of the port to access on the container, on which the server is listening
|
||||
* - livenessProbe.initialDelaySeconds
|
||||
- int
|
||||
- 15
|
||||
- Number of seconds after the container has started before liveness probe is initiated
|
||||
* - livenessProbe.periodSeconds
|
||||
- int
|
||||
- 10
|
||||
- How often (in seconds) to perform the liveness probe
|
||||
* - maxUnavailablePodDisruptionBudget
|
||||
- string
|
||||
- ""
|
||||
- Disruption Budget Configuration
|
||||
* - readinessProbe
|
||||
- object
|
||||
- {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
|
||||
- Readiness probe configuration
|
||||
* - readinessProbe.failureThreshold
|
||||
- int
|
||||
- 3
|
||||
- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
|
||||
* - readinessProbe.httpGet
|
||||
- object
|
||||
- {"path":"/health","port":8000}
|
||||
- Configuration of the Kubelet http request on the server
|
||||
* - readinessProbe.httpGet.path
|
||||
- string
|
||||
- "/health"
|
||||
- Path to access on the HTTP server
|
||||
* - readinessProbe.httpGet.port
|
||||
- int
|
||||
- 8000
|
||||
- Name or number of the port to access on the container, on which the server is listening
|
||||
* - readinessProbe.initialDelaySeconds
|
||||
- int
|
||||
- 5
|
||||
- Number of seconds after the container has started before readiness probe is initiated
|
||||
* - readinessProbe.periodSeconds
|
||||
- int
|
||||
- 5
|
||||
- How often (in seconds) to perform the readiness probe
|
||||
* - replicaCount
|
||||
- int
|
||||
- 1
|
||||
- Number of replicas
|
||||
* - resources
|
||||
- object
|
||||
- {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
|
||||
- Resource configuration
|
||||
* - resources.limits."nvidia.com/gpu"
|
||||
- int
|
||||
- 1
|
||||
- Number of gpus used
|
||||
* - resources.limits.cpu
|
||||
- int
|
||||
- 4
|
||||
- Number of CPUs
|
||||
* - resources.limits.memory
|
||||
- string
|
||||
- "16Gi"
|
||||
- CPU memory configuration
|
||||
* - resources.requests."nvidia.com/gpu"
|
||||
- int
|
||||
- 1
|
||||
- Number of gpus used
|
||||
* - resources.requests.cpu
|
||||
- int
|
||||
- 4
|
||||
- Number of CPUs
|
||||
* - resources.requests.memory
|
||||
- string
|
||||
- "16Gi"
|
||||
- CPU memory configuration
|
||||
* - secrets
|
||||
- object
|
||||
- {}
|
||||
- Secrets configuration
|
||||
* - serviceName
|
||||
- string
|
||||
-
|
||||
- Service name
|
||||
* - servicePort
|
||||
- int
|
||||
- 80
|
||||
- Service port
|
||||
* - labels.environment
|
||||
- string
|
||||
- test
|
||||
- Environment name
|
||||
* - labels.release
|
||||
- string
|
||||
- test
|
||||
- Release name
|
||||
```
|
13
docs/source/deployment/frameworks/index.md
Normal file
13
docs/source/deployment/frameworks/index.md
Normal file
@ -0,0 +1,13 @@
|
||||
# Using other frameworks
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
bentoml
|
||||
cerebrium
|
||||
dstack
|
||||
helm
|
||||
lws
|
||||
skypilot
|
||||
triton
|
||||
```
|
@ -1,6 +1,6 @@
|
||||
(deploying-with-lws)=
|
||||
(deployment-lws)=
|
||||
|
||||
# Deploying with LWS
|
||||
# LWS
|
||||
|
||||
LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads.
|
||||
A major use case is for multi-host/multi-node distributed inference.
|
@ -1,6 +1,6 @@
|
||||
(on-cloud)=
|
||||
(deployment-skypilot)=
|
||||
|
||||
# Deploying and scaling up with SkyPilot
|
||||
# SkyPilot
|
||||
|
||||
```{raw} html
|
||||
<p align="center">
|
||||
@ -12,9 +12,9 @@ vLLM can be **run and scaled to multiple service replicas on clouds and Kubernet
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model {code}`meta-llama/Meta-Llama-3-8B-Instruct`.
|
||||
- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model `meta-llama/Meta-Llama-3-8B-Instruct`.
|
||||
- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)).
|
||||
- Check that {code}`sky check` shows clouds or Kubernetes are enabled.
|
||||
- Check that `sky check` shows clouds or Kubernetes are enabled.
|
||||
|
||||
```console
|
||||
pip install skypilot-nightly
|
@ -1,5 +1,5 @@
|
||||
(deploying-with-triton)=
|
||||
(deployment-triton)=
|
||||
|
||||
# Deploying with NVIDIA Triton
|
||||
# NVIDIA Triton
|
||||
|
||||
The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details.
|
9
docs/source/deployment/integrations/index.md
Normal file
9
docs/source/deployment/integrations/index.md
Normal file
@ -0,0 +1,9 @@
|
||||
# External Integrations
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
kserve
|
||||
kubeai
|
||||
llamastack
|
||||
```
|
@ -1,6 +1,6 @@
|
||||
(deploying-with-kserve)=
|
||||
(deployment-kserve)=
|
||||
|
||||
# Deploying with KServe
|
||||
# KServe
|
||||
|
||||
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
|
||||
|
@ -1,6 +1,6 @@
|
||||
(deploying-with-kubeai)=
|
||||
(deployment-kubeai)=
|
||||
|
||||
# Deploying with KubeAI
|
||||
# KubeAI
|
||||
|
||||
[KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.
|
||||
|
@ -1,6 +1,6 @@
|
||||
(run-on-llamastack)=
|
||||
(deployment-llamastack)=
|
||||
|
||||
# Serving with Llama Stack
|
||||
# Llama Stack
|
||||
|
||||
vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) .
|
||||
|
@ -1,6 +1,6 @@
|
||||
(deploying-with-k8s)=
|
||||
(deployment-k8s)=
|
||||
|
||||
# Deploying with Kubernetes
|
||||
# Using Kubernetes
|
||||
|
||||
Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing.
|
||||
|
||||
@ -43,11 +43,15 @@ metadata:
|
||||
name: hf-token-secret
|
||||
namespace: default
|
||||
type: Opaque
|
||||
data:
|
||||
stringData:
|
||||
token: "REPLACE_WITH_TOKEN"
|
||||
```

Create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model:
Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.

Here are two examples for using NVIDIA GPU and AMD GPU.

- NVIDIA GPU

```yaml
apiVersion: apps/v1

@ -119,6 +123,79 @@ spec:
          periodSeconds: 5
```

- AMD GPU

You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X.

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mistral-7b
  namespace: default
  labels:
    app: mistral-7b
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mistral-7b
  template:
    metadata:
      labels:
        app: mistral-7b
    spec:
      volumes:
        # PVC
        - name: cache-volume
          persistentVolumeClaim:
            claimName: mistral-7b
        # vLLM needs to access the host's shared memory for tensor parallel inference.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: "8Gi"
      hostNetwork: true
      hostIPC: true
      containers:
        - name: mistral-7b
          image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
          securityContext:
            seccompProfile:
              type: Unconfined
            runAsGroup: 44
            capabilities:
              add:
                - SYS_PTRACE
          command: ["/bin/sh", "-c"]
          args: [
            "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
          ]
          env:
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: token
          ports:
            - containerPort: 8000
          resources:
            limits:
              cpu: "10"
              memory: 20G
              amd.com/gpu: "1"
            requests:
              cpu: "6"
              memory: 6G
              amd.com/gpu: "1"
          volumeMounts:
            - name: cache-volume
              mountPath: /root/.cache/huggingface
            - name: shm
              mountPath: /dev/shm
```

You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.

2. **Create a Kubernetes Service for vLLM**

   Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:

@ -1,6 +1,6 @@
(nginxloadbalancer)=

# Deploying with Nginx Loadbalancer
# Using Nginx

This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.

@ -57,7 +57,7 @@ More API details can be found in the {doc}`Offline Inference

The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
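
For orientation, here is a minimal sketch of driving the `LLM` class directly for offline inference (the model name and sampling settings are only illustrative):

```python
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # any supported Hugging Face model id
params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)
outputs = llm.generate(["Hello, my name is"], params)
print(outputs[0].outputs[0].text)
```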

### OpenAI-compatible API server
### OpenAI-Compatible API Server

The second primary interface to vLLM is via its OpenAI-compatible API server.
This server can be started using the `vllm serve` command.
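
As a rough illustration of talking to that server, the official `openai` Python client can be pointed at the local endpoint; the port and model name below assume the defaults of a plain `vllm serve` invocation:

```python
from openai import OpenAI

# `vllm serve <model>` listens on http://localhost:8000/v1 by default.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # must match the served model
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.choices[0].message.content)
```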

@ -77,8 +77,7 @@ python -m vllm.entrypoints.openai.api_server --model <model>

That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.

More details on the API server can be found in the {doc}`OpenAI Compatible
Server </serving/openai_compatible_server>` document.
More details on the API server can be found in the [OpenAI-Compatible Server](#openai-compatible-server) document.

## LLM Engine

@ -1,6 +1,8 @@
# Implementation
(design-automatic-prefix-caching)=

The core idea of PagedAttention is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
# Automatic Prefix Caching

The core idea of [PagedAttention](#design-paged-attention) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.

To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block.
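
As a standalone sketch of that observation (not vLLM's actual hashing code), a block's identity can be derived from its own tokens plus all preceding tokens, so shared prefixes map to identical block ids:

```python
from typing import Sequence

BLOCK_SIZE = 16  # illustrative number of tokens per KV block

def block_hashes(token_ids: Sequence[int]) -> list[int]:
    """Hash each full block together with the prefix that precedes it."""
    return [
        hash(tuple(token_ids[: start + BLOCK_SIZE]))
        for start in range(0, len(token_ids) - BLOCK_SIZE + 1, BLOCK_SIZE)
    ]

# Two prompts sharing their first two blocks produce the same ids for those
# blocks, so the corresponding cached KV blocks could be reused.
a = block_hashes(list(range(48)))
b = block_hashes(list(range(32)) + [999] * 16)
assert a[:2] == b[:2] and a[2] != b[2]
```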

@ -1,3 +1,5 @@
(design-paged-attention)=

# vLLM Paged Attention

- Currently, vLLM utilizes its own implementation of a multi-head query

@ -45,31 +45,23 @@ adding_multimodal_plugin
### Base Classes

```{eval-rst}
.. autodata:: vllm.multimodal.NestedTensors
```

```{eval-rst}
.. autodata:: vllm.multimodal.BatchedTensorInputs
```

```{eval-rst}
.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
.. automodule:: vllm.multimodal.base
    :members:
    :show-inheritance:
```

```{eval-rst}
.. autodata:: vllm.multimodal.MultiModalDataDict
```

### Input Classes

```{eval-rst}
.. autoclass:: vllm.multimodal.MultiModalKwargs
.. automodule:: vllm.multimodal.inputs
    :members:
    :show-inheritance:
```

### Audio Classes

```{eval-rst}
.. autoclass:: vllm.multimodal.MultiModalPlugin
.. automodule:: vllm.multimodal.audio
    :members:
    :show-inheritance:
```

@ -81,3 +73,11 @@
    :members:
    :show-inheritance:
```

### Video Classes

```{eval-rst}
.. automodule:: vllm.multimodal.video
    :members:
    :show-inheritance:
```

@ -2,7 +2,7 @@

## Debugging

Please see the [Debugging Tips](#debugging-python-multiprocessing)
Please see the [Troubleshooting](#troubleshooting-python-multiprocessing)
page for information on known issues and how to solve them.

## Introduction

@ -41,9 +41,11 @@ Every plugin has three parts:
2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.

## What Can Plugins Do?
## Types of supported plugins

Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM.
- **General plugins** (with group name `vllm.general_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model inside the plugin function.

- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.
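
For illustration, a general plugin's entry point could look roughly like the sketch below. The package and model names are hypothetical, and the lazy `"module:Class"` string form is used so the model class is only imported when actually needed:

```python
# my_vllm_plugin/__init__.py, registered in pyproject.toml under the
# "vllm.general_plugins" entry-point group as
# register_my_model = "my_vllm_plugin:register"
def register():
    from vllm import ModelRegistry

    # Guard against double registration when the plugin is loaded twice.
    if "MyOPTForCausalLM" not in ModelRegistry.get_supported_archs():
        ModelRegistry.register_model(
            "MyOPTForCausalLM", "my_vllm_plugin.my_opt:MyOPTForCausalLM"
        )
```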

## Guidelines for Writing Plugins

@ -1,6 +1,7 @@
# Offline Inference

```{toctree}
:caption: Contents
:maxdepth: 1

llm

@ -1,13 +1,13 @@
(apc)=
(automatic-prefix-caching)=

# Introduction
# Automatic Prefix Caching

## What is Automatic Prefix Caching
## Introduction

Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.

```{note}
Technical details on how vLLM implements APC are in the next page.
Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching).
```

## Enabling APC in vLLM
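
A minimal sketch of switching APC on through the `LLM` constructor; the model choice and prompts are illustrative, and `enable_prefix_caching=True` is the relevant flag:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct", enable_prefix_caching=True)

shared_prefix = "<a long document that both queries share>\n\n"
params = SamplingParams(temperature=0.0, max_tokens=64)

# The second call can reuse the KV cache computed for the shared prefix.
print(llm.generate([shared_prefix + "Q: Summarize the text."], params)[0].outputs[0].text)
print(llm.generate([shared_prefix + "Q: List three key points."], params)[0].outputs[0].text)
```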

@ -32,7 +32,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar

* - Feature
  - [CP](#chunked-prefill)
  - [APC](#apc)
  - [APC](#automatic-prefix-caching)
  - [LoRA](#lora-adapter)
  - <abbr title="Prompt Adapter">prmpt adptr</abbr>
  - [SD](#spec_decode)

@ -64,7 +64,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar
  -
  -
  -
* - [APC](#apc)
* - [APC](#automatic-prefix-caching)
  - ✅
  -
  -

@ -345,7 +345,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar
  - ✅
  - ✅
  - ✅
* - [APC](#apc)
* - [APC](#automatic-prefix-caching)
  - [✗](gh-issue:3687)
  - ✅
  - ✅

@ -1,8 +1,12 @@
(disagg-prefill)=

# Disaggregated prefilling (experimental)
# Disaggregated Prefilling (experimental)

This page introduces you the disaggregated prefilling feature in vLLM. This feature is experimental and subject to change.
This page introduces you the disaggregated prefilling feature in vLLM.

```{note}
This feature is experimental and subject to change.
```

## Why disaggregated prefilling?

@ -41,13 +45,13 @@ Key abstractions for disaggregated prefilling:

Here is a figure illustrating how the above 3 abstractions are organized:

```{image} /assets/usage/disagg_prefill/abstraction.jpg
```{image} /assets/features/disagg_prefill/abstraction.jpg
:alt: Disaggregated prefilling abstractions
```

The workflow of disaggregated prefilling is as follows:

```{image} /assets/usage/disagg_prefill/overview.jpg
```{image} /assets/features/disagg_prefill/overview.jpg
:alt: Disaggregated prefilling workflow
```

@ -37,3 +37,10 @@ model_id = "huggyllama/llama-7b"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
          quantization="bitsandbytes", load_format="bitsandbytes")
```

## OpenAI Compatible Server

Append the following to your 4bit model arguments:

```
--quantization bitsandbytes --load-format bitsandbytes
```
docs/source/features/quantization/index.md (new file, 19 lines)
@ -0,0 +1,19 @@
(quantization-index)=

# Quantization

Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices.

```{toctree}
:caption: Contents
:maxdepth: 1

supported_hardware
auto_awq
bnb
gguf
int8
fp8
fp8_e5m2_kvcache
fp8_e4m3_kvcache
```
docs/source/features/quantization/supported_hardware.md (new file, 131 lines)
@ -0,0 +1,131 @@
(quantization-supported-hardware)=

# Supported Hardware

The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:

| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | AWS Inferentia | Google TPU |
|----------------|-------|--------|--------|-----|--------|---------|-----------|---------|----------------|------------|
| AWQ | ✗ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✅︎ | ✅︎ | ✗ | ✗ |
| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✅︎ | ✅︎ | ✗ | ✗ |
| Marlin (GPTQ/AWQ/FP8) | ✗ | ✗ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✗ | ✗ | ✗ |
| INT8 (W8A8) | ✗ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✅︎ | ✗ | ✗ |
| FP8 (W8A8) | ✗ | ✗ | ✗ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✗ | ✗ |
| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✗ | ✗ | ✗ |
| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✗ | ✗ | ✗ |
| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✗ | ✗ | ✗ |
| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✗ | ✗ | ✗ |

- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
- "✅︎" indicates that the quantization method is supported on the specified hardware.
- "✗" indicates that the quantization method is not supported on the specified hardware.

```{note}
This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.

For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
```
@ -1,6 +1,6 @@
(spec-decode)=

# Speculative decoding
# Speculative Decoding

```{warning}
Please note that speculative decoding in vLLM is not yet optimized and does

@ -159,6 +159,72 @@ A variety of speculative models of this type are available on HF hub:
- [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator)
- [granite-20b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator)

## Speculating using EAGLE based draft models

The following code configures vLLM to use speculative decoding where proposals are generated by
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model.

```python
from vllm import LLM, SamplingParams

prompts = [
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    tensor_parallel_size=4,
    speculative_model="path/to/modified/eagle/model",
    speculative_draft_tensor_parallel_size=1,
)

outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

A few important things to consider when using the EAGLE based draft models:

1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) cannot be
   used directly with vLLM due to differences in the expected layer names and model definition.
   To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d)
   to convert them. Note that this script does not modify the model's weights.

   In the above example, use the script to first convert
   the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model
   and then use the converted checkpoint as the draft model in vLLM.

2. The EAGLE based draft models need to be run without tensor parallelism
   (i.e. speculative_draft_tensor_parallel_size is set to 1), although
   it is possible to run the main model using tensor parallelism (see example above).

3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
   reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
   investigation and tracked here: [https://github.com/vllm-project/vllm/issues/9565](https://github.com/vllm-project/vllm/issues/9565).

A variety of EAGLE draft models are available on the Hugging Face hub:

| Base Model | EAGLE on Hugging Face | # EAGLE Parameters |
|------------|-----------------------|--------------------|
| Vicuna-7B-v1.3 | yuhuili/EAGLE-Vicuna-7B-v1.3 | 0.24B |
| Vicuna-13B-v1.3 | yuhuili/EAGLE-Vicuna-13B-v1.3 | 0.37B |
| Vicuna-33B-v1.3 | yuhuili/EAGLE-Vicuna-33B-v1.3 | 0.56B |
| LLaMA2-Chat 7B | yuhuili/EAGLE-llama2-chat-7B | 0.24B |
| LLaMA2-Chat 13B | yuhuili/EAGLE-llama2-chat-13B | 0.37B |
| LLaMA2-Chat 70B | yuhuili/EAGLE-llama2-chat-70B | 0.99B |
| Mixtral-8x7B-Instruct-v0.1 | yuhuili/EAGLE-mixtral-instruct-8x7B | 0.28B |
| LLaMA3-Instruct 8B | yuhuili/EAGLE-LLaMA3-Instruct-8B | 0.25B |
| LLaMA3-Instruct 70B | yuhuili/EAGLE-LLaMA3-Instruct-70B | 0.99B |
| Qwen2-7B-Instruct | yuhuili/EAGLE-Qwen2-7B-Instruct | 0.26B |
| Qwen2-72B-Instruct | yuhuili/EAGLE-Qwen2-72B-Instruct | 1.05B |

## Lossless guarantees of Speculative Decoding

In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of

@ -182,7 +248,7 @@ speculative decoding, breaking down the guarantees into three key areas:
3. **vLLM Logprob Stability**
   \- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the
   same request across runs. For more details, see the FAQ section
   titled *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs <faq>`.
   titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq).

**Conclusion**

@ -195,7 +261,7 @@ can occur due to following factors:

**Mitigation Strategies**

For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs <faq>`.
For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq).

## Resources for vLLM contributors

@ -2,7 +2,7 @@

# Structured Outputs

vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines) or [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer) as backends for the guided decoding.
vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding.
This document shows you some examples of the different options that are available to generate structured outputs.

## Online Inference (OpenAI API)

@ -18,7 +18,7 @@ The following parameters are supported, which must be added as extra parameters:
- `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding.
- `guided_decoding_backend`: used to select the guided decoding backend to use.

You can see the complete list of supported parameters on the [OpenAI Compatible Server](../serving/openai_compatible_server.md) page.
You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server)page.

Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one:
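
As a sketch of what such a request can look like with the `openai` client (server address, model name, and labels are placeholders), the `guided_choice` option is passed through `extra_body`:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}],
    extra_body={"guided_choice": ["positive", "negative"]},
)
print(completion.choices[0].message.content)
```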

@ -10,7 +10,7 @@ Start the server with tool calling enabled. This example uses Meta's Llama 3.1 8

vllm serve meta-llama/Llama-3.1-8B-Instruct \
    --enable-auto-tool-choice \
    --tool-call-parser llama3_json \
    --chat-template examples/tool_chat_template_llama3_json.jinja
    --chat-template examples/tool_chat_template_llama3.1_json.jinja
```

Next, make a request to the model that should result in it using the available tools:
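
One way to issue such a request is with the `openai` Python client; the weather tool below is purely illustrative and not part of vLLM:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # hypothetical tool defined by the caller
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]
response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "What is the weather like in Dallas today?"}],
    tools=tools,
    tool_choice="auto",
)
print(response.choices[0].message.tool_calls)
```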

@ -2,7 +2,7 @@

# Installation for ARM CPUs

vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering:
vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:

- CPU backend inference capabilities
- Relevant runtime environment variables

@ -20,7 +20,7 @@ Contents:
## Requirements

- **Operating System**: Linux or macOS
- **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended)
- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended)
- **Instruction Set Architecture (ISA)**: NEON support is required

(arm-backend-quick-start-dockerfile)=

@ -1,6 +1,6 @@
(installation-cpu)=
(installation-x86)=

# Installation with CPU
# Installation for x86 CPUs

vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:

@ -24,7 +24,7 @@ Table of contents:
## Requirements

- OS: Linux
- Compiler: gcc/g++>=12.3.0 (optional, recommended)
- Compiler: `gcc/g++>=12.3.0` (optional, recommended)
- Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)

(cpu-backend-quick-start-dockerfile)=

@ -69,7 +69,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install

```{note}
- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building.
- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building.
```

(env-intro)=

@ -151,4 +151,4 @@ $ python examples/offline_inference.py
$ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
```

- Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx.md) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
- Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
@ -1,8 +1,8 @@
(installation)=
(installation-cuda)=

# Installation
# Installation for CUDA

vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.4) binaries.

## Requirements

@ -12,24 +12,43 @@ vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) bin

## Install released versions

You can install vLLM using pip:
### Create a new Python environment

You can create a new Python environment using `conda`:

```console
$ # (Recommended) Create a new conda environment.
$ conda create -n myenv python=3.12 -y
$ conda activate myenv

$ # Install vLLM with CUDA 12.1.
$ pip install vllm
```

```{note}
Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. In particular, the PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
```

````{note}
As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default.
We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:

```console
$ # (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
$ uv venv myenv --python 3.12 --seed
$ source myenv/bin/activate
```

In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.

Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-from-source) for more details.

### Install vLLM

You can install vLLM using either `pip` or `uv pip`:

```console
$ # Install vLLM with CUDA 12.4.
$ pip install vllm # If you are using pip.
$ uv pip install vllm # If you are using uv.
```

As of now, vLLM's binaries are compiled with CUDA 12.4 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:

```console
$ # Install vLLM with CUDA 11.8.

@ -38,29 +57,47 @@ $ export PYTHON_VERSION=310
$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
```

In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.

Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
````

(install-the-latest-code)=

## Install the latest code

LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. You can download and install it with the following command:
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`.

### Install the latest code using `pip`

```console
$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
$ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
```

If you want to access the wheels for previous commits, you can specify the commit hash in the URL:
`--pre` is required for `pip` to consider pre-released versions.

If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL:

```console
$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
```

Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.

### Install the latest code using `uv`

Another way to install the latest code is to use `uv`:

```console
$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly
```

If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:

```console
$ export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
```

The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.

### Install the latest code using `docker`

Another way to access the latest code is to use the docker images:

@ -89,7 +126,7 @@ $ cd vllm
$ VLLM_USE_PRECOMPILED=1 pip install --editable .
```

This will download the latest nightly wheel and use the compiled libraries from there in the install.
This will download the latest nightly wheel from https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation.

The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files):
@ -1,6 +1,6 @@
(installation-rocm)=

# Installation with ROCm
# Installation for ROCm

vLLM supports AMD GPUs with ROCm 6.2.

@ -148,7 +148,7 @@ $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
$ python3 setup.py develop
```

This may take 5-10 minutes. Currently, {code}`pip install .` does not work for ROCm installation.
This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.

```{tip}
- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
@ -1,4 +1,6 @@
# Installation with Intel® Gaudi® AI Accelerators
(installation-gaudi)=

# Installation for Intel® Gaudi®

This README provides instructions on running vLLM with Intel Gaudi devices.

@ -80,7 +82,7 @@ $ python setup.py develop

## Supported Features

- [Offline batched inference](#offline-batched-inference)
- [Offline inference](#offline-inference)
- Online inference via [OpenAI-Compatible Server](#openai-compatible-server)
- HPU autodetection - no need to manually select device within vLLM
- Paged KV cache with algorithms enabled for Intel Gaudi accelerators

@ -141,32 +143,33 @@ Gaudi2 devices. Configurations that are not listed may or may not work.

Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag.

```{eval-rst}
.. list-table:: vLLM execution modes
   :widths: 25 25 50
   :header-rows: 1
```{list-table} vLLM execution modes
:widths: 25 25 50
:header-rows: 1

* - ``PT_HPU_LAZY_MODE``
  - ``enforce_eager``
  - execution mode
* - 0
  - 0
  - torch.compile
* - 0
  - 1
  - PyTorch eager mode
* - 1
  - 0
  - HPU Graphs
* - 1
  - 1
  - PyTorch lazy mode
* - `PT_HPU_LAZY_MODE`
  - `enforce_eager`
  - execution mode
* - 0
  - 0
  - torch.compile
* - 0
  - 1
  - PyTorch eager mode
* - 1
  - 0
  - HPU Graphs
* - 1
  - 1
  - PyTorch lazy mode
```

```{warning}
In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
```

(gaudi-bucketing-mechanism)=

### Bucketing mechanism

Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.

@ -185,7 +188,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, ma
INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
```

`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling -- `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
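
As a standalone illustration of that ramp-up rule (a sketch, not vLLM's implementation), the bucket boundaries for a given `(min, step, max)` configuration can be enumerated as follows:

```python
def bucket_boundaries(bmin: int, step: int, bmax: int) -> list[int]:
    """Powers-of-two ramp-up below `step`, then linear steps up to `max`."""
    ramp_up, value = [], bmin
    while value < step:
        ramp_up.append(value)
        value *= 2
    return ramp_up + list(range(step, bmax + 1, step))

print(bucket_boundaries(2, 32, 64))  # -> [2, 4, 8, 16, 32, 64]
```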

Example (with ramp-up)

@ -214,7 +217,7 @@ If a request exceeds maximum bucket size in any dimension, it will be processed
As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket.

```{note}
Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
```

### Warmup

@ -235,7 +238,7 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size
INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
```

This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.

```{tip}
Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
Some files were not shown because too many files have changed in this diff.