Compare commits

..

84 Commits

Author SHA1 Message Date
0a02744dc8 fix TP 2025-01-31 01:18:56 +00:00
984ffddda6 add cuda graph support to triton_mla attention 2025-01-30 21:12:00 +00:00
135c404fbb review comments
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-30 15:11:58 +00:00
7241acbd64 review comments
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-30 14:09:46 +00:00
2b140debbb Update vllm/config.py
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2025-01-30 08:51:42 -05:00
2326814c11 renaming for consistency
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-30 04:00:26 +00:00
534cd0006d review comments
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-30 03:52:59 +00:00
aa19f297d2 Update vllm/attention/backends/mla/utils.py
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-01-29 22:51:37 -05:00
4880a43d20 Update utils.py
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-01-29 22:46:43 -05:00
3895bba85a more cleanups
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-30 03:18:46 +00:00
f23d126a07 fix VLLM_MLA_PERFORM_MATRIX_ABSORPTION=0
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-30 03:18:46 +00:00
ec8c1cf732 squashed commits
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: simon-mo <simon.mo@hey.com>
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-30 03:18:46 +00:00
f17f1d4608 [V1][Metrics] Add GPU cache usage % gauge (#12561)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-01-29 18:31:01 -08:00
1c1bb0bbf2 [Misc][MoE] add Deepseek-V3 moe tuning support (#12558)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-01-30 00:47:30 +00:00
e0cc5f259a [V1][BugFix] Free encoder cache for aborted requests (#12545)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-29 13:47:33 -08:00
73aa6cfdf7 Revert "[Build/CI] Fix libcuda.so linkage" (#12552) 2025-01-29 21:12:24 +00:00
27b78c73ca [Kernel] add triton fused moe kernel for gptq/awq (#12185) 2025-01-29 09:07:09 -05:00
b02fd288b2 [Hardware][NV] Fix Modelopt model loading for k-v-scales for Llama models. (#11787)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
2025-01-29 01:46:12 -08:00
ff7424f491 [Frontend] Support override generation config in args (#12409)
Signed-off-by: liuyanyi <wolfsonliu@163.com>
2025-01-29 01:41:01 -08:00
d93bf4da85 [Model] Refactoring of MiniCPM-V and add MiniCPM-o-2.6 support for vLLM (#12069)
Signed-off-by: hzh <hezhihui_thu@163.com>
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Signed-off-by: shaochangxu.scx <shaochangxu.scx@antgroup.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
Signed-off-by: Akshat Tripathi <akshat@krai.ai>
Signed-off-by: Oleg Mosalov <oleg@krai.ai>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
Signed-off-by: Chenguang Li <757486878@qq.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Shanshan Shen <467638484@qq.com>
Signed-off-by: elijah <f1renze.142857@gmail.com>
Signed-off-by: Yikun <yikunkero@gmail.com>
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Co-authored-by: shaochangxu <85155497+shaochangxu@users.noreply.github.com>
Co-authored-by: shaochangxu.scx <shaochangxu.scx@antgroup.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: sixgod <evethwillbeok@outlook.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Rafael Vasquez <rafvasq21@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Akshat Tripathi <Akshat.tripathi6568@gmail.com>
Co-authored-by: Oleg Mosalov <oleg@krai.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Avshalom Manevich <12231371+avshalomman@users.noreply.github.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Co-authored-by: Yangcheng Li <liyangcheng.lyc@alibaba-inc.com>
Co-authored-by: Siyuan Li <94890248+liaoyanqing666@users.noreply.github.com>
Co-authored-by: Concurrensee <yida.wu@amd.com>
Co-authored-by: Chenguang Li <757486878@qq.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Alex Brooks <alex.brooks@ibm.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Shanshan Shen <467638484@qq.com>
Co-authored-by: elijah <30852919+e1ijah1@users.noreply.github.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Steve Luo <36296769+SunflowerAries@users.noreply.github.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Konrad Zawora <kzawora@habana.ai>
Co-authored-by: TJian <tunjian1996@gmail.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: maang-h <55082429+maang-h@users.noreply.github.com>
Co-authored-by: Elfie Guo <164945471+elfiegg@users.noreply.github.com>
Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-01-29 09:24:59 +00:00
036ca94c25 [Bugfix] handle alignment of arguments in convert_sparse_cross_attention_mask_to_dense (#12347)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Signed-off-by: Wallas Santos <wallashss@ibm.com>
Co-authored-by: Wallas Santos <wallashss@ibm.com>
2025-01-29 08:54:35 +00:00
ef001d98ef Fix the pydantic logging validator (#12420)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-01-29 07:53:13 +00:00
5f671cb4c3 [V1] Improve Error Message for Unsupported Config (#12535)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2025-01-29 04:56:56 +00:00
bd02164cf9 Bugfix for whisper quantization due to fake k_proj bias (#12524)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-01-29 04:49:03 +00:00
46fb056749 [V1][Metrics] Add TTFT and TPOT histograms (#12530)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-01-29 04:11:16 +00:00
dd6a3a02cb [Doc] Convert docs to use colon fences (#12471)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-01-29 11:38:29 +08:00
a7e3eba66f [Frontend] Support reasoning content for deepseek r1 (#12473)
Signed-off-by: Ce Gao <cegao@tensorchord.ai>
Co-authored-by: Rafael Vasquez <rafvasq21@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
2025-01-29 11:38:08 +08:00
fbb5bd4cef [TPU] Add example for profiling TPU inference (#12531)
Signed-off-by: mgoin <mgoin@redhat.com>
2025-01-29 03:16:47 +00:00
80fcc3ed1c [Kernel] Pipe attn_logits_soft_cap through paged attention TPU kernels (#12482)
Signed-off-by: Fenghui Zhang <fhzhang@google.com>
2025-01-28 22:36:44 +00:00
c386c43ca3 [V1][Metrics] Add per-request prompt/generation_tokens histograms (#12516)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-01-28 22:07:22 +00:00
f26d790718 Do not run suggestion pre-commit hook multiple times (#12521)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-01-28 20:05:27 +00:00
0f657bdc52 Replace missed warning_once for rerank API (#12472)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-01-28 19:06:32 +00:00
3fd1fb63ef [V1][Metrics] Hook up IterationStats for Prometheus metrics (#12478)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-01-28 16:38:38 +00:00
925d2f1908 [Doc] Fix typo for x86 CPU installation (#12514)
Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
2025-01-28 16:37:10 +00:00
8f58a51358 [VLM] Merged multi-modal processor and V1 support for Qwen-VL (#12504)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-28 16:25:05 +00:00
2079e43bee [Core] Make raw_request optional in ServingCompletion (#12503)
Signed-off-by: Sebastian Schönnenbeck <sebastian.schoennenbeck@comma-soft.com>
2025-01-28 10:56:45 +00:00
e29d4358ef [V1] Include Engine Version in Logs (#12496)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
2025-01-28 08:27:41 +00:00
8cbc424975 Update README.md with V1 alpha release (#12495) 2025-01-28 08:22:41 +00:00
dd66fd2b01 [CI] fix pre-commit error (#12494)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-01-28 06:11:05 +00:00
0f465ab533 [FEATURE] Enables offline /score for embedding models (#12021)
Signed-off-by: Gabriel Marinho <gmarinho@ibm.com>
2025-01-28 11:30:13 +08:00
23a7cbc88b [CI/Build] Fixed the xla nightly issue report in #12451 (#12453) 2025-01-28 11:18:07 +08:00
426a5c3625 Fix bad path in prometheus example (#12481)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-01-27 18:56:31 -07:00
ddee88d0ff [Neuron][Kernel] NKI-based flash-attention kernel with paged KV cache (#11277)
Signed-off-by: Liangfu Chen <liangfc@amazon.com>
Co-authored-by: Jiangfei Duan <jfduan@outlook.com>
2025-01-27 17:31:16 -08:00
823ab79633 Update pre-commit hooks (#12475)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-01-27 17:23:08 -07:00
6116ca8cd7 [Feature] [Spec decode]: Enable MLPSpeculator/Medusa and prompt_logprobs with ChunkedPrefill (#10132)
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: wallashss <wallashss@ibm.com>
Co-authored-by: wallashss <wallashss@ibm.com>
2025-01-27 13:38:35 -08:00
2bc3fbba0c [FlashInfer] Upgrade to 0.2.0 (#11194)
Signed-off-by: Bowen Wang <abmfy@icloud.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-01-27 18:19:24 +00:00
3f1fc7425a [V1][CI/Test] Do basic test for top-p & top-k sampling (#12469)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-27 09:40:04 -08:00
01ba927040 [V1][Metrics] Add initial Prometheus logger (#12416)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-01-27 12:26:28 -05:00
103bd17ac5 [Build] Only build 9.0a for scaled_mm and sparse kernels (#12339)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-27 10:40:00 -05:00
ce69f7f754 [Bugfix] Fix gpt2 GGUF inference (#12467)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-27 18:31:49 +08:00
624a1e4711 [V1][Minor] Minor optimizations for update_from_output (#12454)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-27 01:09:27 -08:00
372bf0890b [Bugfix] Fix missing seq_start_loc in xformers prefill metadata (#12464)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-27 07:25:30 +00:00
5204ff5c3f [Bugfix] Fix Granite 3.0 MoE model loading (#12446)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-26 21:26:44 -08:00
0cc6b383d7 [Frontend] Support scores endpoint in run_batch (#12430)
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
2025-01-27 04:30:17 +00:00
28e0750847 [V1] Avoid list creation in input preparation (#12457)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-26 19:57:56 -08:00
582cf78798 [DOC] Add link to vLLM blog (#12460)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-01-27 03:46:19 +00:00
0034b09ceb [Frontend] Rerank API (Jina- and Cohere-compatible API) (#12376)
Signed-off-by: Kyle Mistele <kyle@mistele.com>
2025-01-26 19:58:45 -07:00
72bac73067 [Build/CI] Fix libcuda.so linkage (#12424)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-01-26 21:18:19 +00:00
68f11149d8 [Bugfix][Kernel] Fix perf regression caused by PR #12405 (#12434)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-26 11:09:34 -08:00
72f4880425 [Bugfix/CI] Fix broken kernels/test_mha.py (#12450) 2025-01-26 10:39:03 -08:00
aa2cd2c43d [Bugfix] Disable w16a16 2of4 sparse CompressedTensors24 (#12417)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
2025-01-26 19:59:58 +08:00
9ddc35220b [Frontend] generation_config.json for maximum tokens (#12242)
Signed-off-by: Matthew Hendrey <matthew.hendrey@gmail.com>
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: shangmingc <caishangming@linux.alibaba.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Yuan Tang <terrytangyuan@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-01-26 19:59:25 +08:00
a5255270c3 [Misc] Revert FA on ViT #12355 and #12435 (#12445) 2025-01-26 03:56:34 -08:00
0ee349b553 [V1][Bugfix] Fix assertion when mm hashing is turned off (#12439)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-26 00:47:42 -08:00
fa63e710c7 [V1][Perf] Reduce scheduling overhead in model runner after cuda sync (#12094)
Signed-off-by: Keyun Tong <tongkeyun@gmail.com>
2025-01-26 00:42:37 -08:00
2a0309a646 [Misc][Bugfix] FA3 support to ViT MHA layer (#12435)
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-01-26 05:00:31 +00:00
324960a95c [TPU][CI] Update torchxla version in requirement-tpu.txt (#12422)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-01-25 07:23:03 +00:00
f1fc0510df [Misc] Add FA2 support to ViT MHA layer (#12355)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-25 15:07:35 +08:00
bf21481dde [ROCm][MoE] MI300 tuned configs Mixtral-8x(7B,22B) | fp16, fp8 (#12408)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-01-25 12:17:19 +08:00
fb30ee92ee [Bugfix] Fix BLIP-2 processing (#12412)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-25 11:42:42 +08:00
221d388cc5 [Bugfix][Kernel] Fix moe align block issue for mixtral (#12413) 2025-01-25 01:49:28 +00:00
3132a933b6 [Bugfix][Kernel] FA3 Fix - RuntimeError: This flash attention build only supports pack_gqa (for build size reasons). (#12405)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-24 20:20:59 +00:00
df5dafaa5b [Misc] Remove deprecated code (#12383)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-24 14:45:20 -05:00
ab5bbf5ae3 [Bugfix][Kernel] Fix CUDA 11.8 being broken by FA3 build (#12375)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-24 15:27:59 +00:00
3bb8e2c9a2 [Misc] Enable proxy support in benchmark script (#12356)
Signed-off-by: Junichi Sato <junichi.sato@sbintuitions.co.jp>
2025-01-24 14:58:26 +00:00
e784c6b998 [ci/build] sync default value for wheel size (#12398)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-24 17:54:29 +08:00
9a0f3bdbe5 [Hardware][Gaudi][Doc] Add missing step in setup instructions (#12382) 2025-01-24 09:43:49 +00:00
c7c9851036 [ci/build] fix wheel size check (#12396)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-24 17:31:25 +08:00
3c818bdb42 [Misc] Use VisionArena Dataset for VLM Benchmarking (#12389)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-24 00:22:04 -08:00
6dd94dbe94 [perf] fix perf regression from #12253 (#12380)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-24 11:34:27 +08:00
0e74d797ce [V1] Increase default batch size for H100/H200 (#12369)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-24 03:19:55 +00:00
55ef66edf4 Update compressed-tensors version (#12367) 2025-01-24 11:19:42 +08:00
5e5630a478 [Bugfix] Path join when building local path for S3 clone (#12353)
Signed-off-by: Omer Dayan (SW-GPU) <omer@run.ai>
2025-01-24 11:06:07 +08:00
d3d6bb13fb Set weights_only=True when using torch.load() (#12366)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-01-24 02:17:30 +00:00
305 changed files with 15866 additions and 6459 deletions

View File

@@ -2,8 +2,11 @@ import os
 import sys
 import zipfile

-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
+# Note that we have 400 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/3792 .
+# Please also sync the value with the one in Dockerfile.
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))


 def print_top_10_largest_files(zip_file):
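For context, the check this script enforces boils down to comparing the built wheel's on-disk size against the `VLLM_MAX_SIZE_MB` environment variable, which now defaults to 300. The following is a minimal sketch of that logic; the function name, messages, and command-line handling are illustrative, not the script's actual implementation:

```python
import os
import sys


def check_wheel_size(wheel_path: str, max_size_mb: int) -> int:
    """Return 0 if the wheel is within the size budget, 1 otherwise."""
    size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
    if size_mb > max_size_mb:
        print(f"Wheel {wheel_path} is {size_mb:.1f} MB, "
              f"exceeding the {max_size_mb} MB limit.")
        return 1
    print(f"Wheel {wheel_path} is {size_mb:.1f} MB, within the limit.")
    return 0


if __name__ == "__main__":
    # Same default as the diff above: 300 MiB unless overridden.
    limit = int(os.environ.get("VLLM_MAX_SIZE_MB", 300))
    sys.exit(check_wheel_size(sys.argv[1], limit))
```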

View File

@@ -54,4 +54,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
     -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
     --name "${container_name}" \
     ${image_name} \
-    /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py"
+    /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"

.buildkite/run-tpu-test.sh: Normal file → Executable file (0 lines changed)
View File

View File

@@ -183,7 +183,16 @@ steps:
     - vllm/
     - tests/v1
   commands:
-    - VLLM_USE_V1=1 pytest -v -s v1
+    # split the test to avoid interference
+    - VLLM_USE_V1=1 pytest -v -s v1/core
+    - VLLM_USE_V1=1 pytest -v -s v1/engine
+    - VLLM_USE_V1=1 pytest -v -s v1/sample
+    - VLLM_USE_V1=1 pytest -v -s v1/worker
+    - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
+    - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
+    # TODO: accuracy does not match, whether setting
+    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+    - VLLM_USE_V1=1 pytest -v -s v1/e2e

 - label: Examples Test # 25min
   working_dir: "/vllm-workspace/examples"

View File

@@ -3,18 +3,18 @@ default_stages:
   - manual # Run in CI
 repos:
 - repo: https://github.com/google/yapf
-  rev: v0.32.0
+  rev: v0.43.0
   hooks:
   - id: yapf
     args: [--in-place, --verbose]
     additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.6.5
+  rev: v0.9.3
   hooks:
   - id: ruff
     args: [--output-format, github]
 - repo: https://github.com/codespell-project/codespell
-  rev: v2.3.0
+  rev: v2.4.0
   hooks:
   - id: codespell
     exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
@@ -23,7 +23,7 @@ repos:
   hooks:
   - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v18.1.5
+  rev: v19.1.7
   hooks:
   - id: clang-format
     exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
@@ -35,7 +35,7 @@ repos:
   - id: pymarkdown
     files: docs/.*
 - repo: https://github.com/rhysd/actionlint
-  rev: v1.7.6
+  rev: v1.7.7
   hooks:
   - id: actionlint
 - repo: local
@@ -89,4 +89,5 @@ repos:
       name: Suggestion
      entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
       language: system
       verbose: true
+      pass_filenames: false

CMakeLists.txt: Normal file → Executable file (10 lines changed)
View File

@@ -275,7 +275,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Only build Marlin kernels if we are building for at least some compatible archs.
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS})
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)
     set(MARLIN_SRCS
       "csrc/quantization/fp8/fp8_marlin.cu"
@@ -296,8 +296,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()

   # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
-  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
+  # CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
+  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
     set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
     set_gencode_flags_for_srcs(
@@ -351,7 +351,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # 2:4 Sparse Kernels

   # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
-  # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now).
+  # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
     set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
              "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
@@ -576,7 +576,7 @@ else()
   FetchContent_Declare(
         vllm-flash-attn
         GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-        GIT_TAG 90eacc1af2a7c3de62ea249e929ed5faccf38954
+        GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c
         GIT_PROGRESS TRUE
         # Don't share the vllm-flash-attn build between build types
         BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@@ -126,8 +126,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
-# Default max size of the wheel is 250MB
-ARG VLLM_MAX_SIZE_MB=250
+# sync the default value with .buildkite/check-wheel-size.py
+ARG VLLM_MAX_SIZE_MB=300
 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
 ARG RUN_WHEEL_CHECK=true
 RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@@ -149,7 +149,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
+# TODO: Restore to base image after FlashInfer AOT wheel fixed
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
@@ -194,12 +195,30 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install dist/*.whl --verbose

+# How to build this FlashInfer wheel:
+# $ export FLASHINFER_ENABLE_AOT=1
+# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
+# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
+# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
+# $ cd flashinfer
+# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
+# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
+
 RUN --mount=type=cache,target=/root/.cache/pip \
     . /etc/environment && \
     if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-        python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
+        python3 -m pip install https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.0.post1-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
     fi
 COPY examples examples
+
+# Although we build Flashinfer with AOT mode, there's still
+# some issues w.r.t. JIT compilation. Therefore we need to
+# install build dependencies for JIT compilation.
+# TODO: Remove this once FlashInfer AOT wheel is fixed
+COPY requirements-build.txt requirements-build.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -r requirements-build.txt
+
 #################### vLLM installation IMAGE ####################

 #################### TEST IMAGE ####################
View File

@@ -1,4 +1,4 @@
-ARG NIGHTLY_DATE="20250122"
+ARG NIGHTLY_DATE="20250124"
 ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"

 FROM $BASE_IMAGE

View File

@@ -16,6 +16,7 @@ Easy, fast, and cheap LLM serving for everyone
 ---

 *Latest News* 🔥
+- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
 - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing).
 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).

View File

@@ -51,7 +51,8 @@ async def async_request_tgi(
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")

-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
         params = {
             "best_of": request_func_input.best_of,
             "max_new_tokens": request_func_input.output_len,
@@ -123,7 +124,8 @@ async def async_request_trt_llm(
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")

-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
         assert request_func_input.best_of == 1
         payload = {
             "accumulate_tokens": True,
@@ -187,7 +189,8 @@ async def async_request_deepspeed_mii(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
         assert request_func_input.best_of == 1

         payload = {
@@ -235,7 +238,8 @@ async def async_request_openai_completions(
         ("completions", "profile")
     ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
         payload = {
             "model": request_func_input.model_name \
                 if request_func_input.model_name else request_func_input.model,
@@ -333,7 +337,8 @@ async def async_request_openai_chat_completions(
         "chat/completions"
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
         content = [{"type": "text", "text": request_func_input.prompt}]
         if request_func_input.multi_modal_content:
             content.append(request_func_input.multi_modal_content)
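The only functional change in this file is the added `trust_env=True` flag, which makes `aiohttp` honor proxy-related environment variables (`HTTP_PROXY`, `HTTPS_PROXY`, `NO_PROXY`) and `~/.netrc` instead of ignoring them. A minimal sketch of the effect; the URL is a placeholder, not part of the benchmark script:

```python
import asyncio

import aiohttp


async def fetch(url: str) -> int:
    # trust_env=True lets the session pick up HTTP_PROXY / HTTPS_PROXY /
    # NO_PROXY from the environment, which is what enables proxy support
    # in the benchmark scripts.
    async with aiohttp.ClientSession(trust_env=True) as session:
        async with session.get(url) as resp:
            return resp.status


if __name__ == "__main__":
    print(asyncio.run(fetch("http://localhost:8000/v1/models")))
```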

View File

@@ -200,7 +200,7 @@ def sample_sonnet_requests(
     return sampled_requests


-def sample_mmmu_pro_vision_requests(
+def sample_vision_arena_requests(
     dataset,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
@@ -212,13 +212,7 @@ def sample_mmmu_pro_vision_requests(
         if len(sampled_requests) == num_requests:
             break

-        # MMMU-Pro vision direct prompt
-        # Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5
-        prompt = (
-            "Answer with the option letter from the given choices directly. "
-            "The last line of your response should be of the following "
-            "format: 'Answer: $LETTER' (without quotes) where LETTER is one of "
-            "options.")
+        prompt = data["turns"][0][0]['content']

         prompt_token_ids = tokenizer(prompt).input_ids
         if fixed_output_len is None:
@@ -230,10 +224,10 @@ def sample_mmmu_pro_vision_requests(
             output_len = fixed_output_len

         assert isinstance(
-            data["image"],
+            data["images"][0],
             Image), ("Input image format must be `PIL.Image.Image`, "
                      f"given {type(data['image'])}.")
-        image: Image = data["image"]
+        image: Image = data["images"][0]
         image = image.convert("RGB")
         image_data = io.BytesIO()
         image.save(image_data, format='JPEG')
@@ -252,7 +246,7 @@ def sample_mmmu_pro_vision_requests(

 def sample_hf_requests(
     dataset_path: str,
-    dataset_subset: str,
+    dataset_subset: Optional[str],
     dataset_split: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
@@ -260,19 +254,17 @@ def sample_hf_requests(
     fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:

-    # Special case for MMMU-Pro vision dataset
-    if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision':
-        assert dataset_split == "test"
+    # Special case for vision_arena dataset
+    if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \
+            and dataset_subset is None:
+        assert dataset_split == "train"
         dataset = load_dataset(dataset_path,
                                name=dataset_subset,
                                split=dataset_split,
                                streaming=True)
-        assert "image" in dataset.features, (
-            "MMMU/MMMU_Pro vision dataset must have 'image' column.")
-        filter_func = lambda x: isinstance(x["image"], Image)
-        dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
-        return sample_mmmu_pro_vision_requests(dataset, num_requests,
-                                               tokenizer, fixed_output_len)
+        dataset = dataset.shuffle(seed=random_seed)
+        return sample_vision_arena_requests(dataset, num_requests, tokenizer,
+                                            fixed_output_len)

     dataset = load_dataset(dataset_path,
                            name=dataset_subset,
@@ -934,8 +926,8 @@ def main(args: argparse.Namespace):
         )

     # Traffic
-    result_json["request_rate"] = (
-        args.request_rate if args.request_rate < float("inf") else "inf")
+    result_json["request_rate"] = (args.request_rate if args.request_rate
+                                   < float("inf") else "inf")
     result_json["burstiness"] = args.burstiness
     result_json["max_concurrency"] = args.max_concurrency
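The new special case keys off the `lmarena-ai/vision-arena-bench-v0.1` dataset, whose rows carry the prompt under `turns[0][0]['content']` and the image under `images[0]`. A rough sketch of pulling one such row outside the benchmark script, assuming the `datasets` package is installed and the schema is as shown in the diff:

```python
from datasets import load_dataset

# Stream the dataset so nothing is downloaded up front; shuffle as the
# benchmark does before sampling requests.
dataset = load_dataset("lmarena-ai/vision-arena-bench-v0.1",
                       split="train",
                       streaming=True)
dataset = dataset.shuffle(seed=0)

row = next(iter(dataset))
prompt = row["turns"][0][0]["content"]  # text of the first conversation turn
image = row["images"][0]                # a PIL.Image.Image per the assert above
print(prompt[:80], image.size)
```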

View File

@@ -450,7 +450,8 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,

 def main(args: argparse.Namespace):
     print(args)

-    config = AutoConfig.from_pretrained(args.model)
+    config = AutoConfig.from_pretrained(
+        args.model, trust_remote_code=args.trust_remote_code)
     if config.architectures[0] == "DbrxForCausalLM":
         E = config.ffn_config.moe_num_experts
         topk = config.ffn_config.moe_top_k
@@ -461,6 +462,11 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] == "DeepseekV3ForCausalLM":
+        E = config.n_routed_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
     else:
         # Default: Mixtral.
         E = config.num_local_experts
@@ -538,6 +544,7 @@ if __name__ == "__main__":
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument("--batch-size", type=int, required=False)
     parser.add_argument("--tune", action="store_true")
+    parser.add_argument("--trust-remote-code", action="store_true")
     args = parser.parse_args()
     main(args)
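The new `DeepseekV3ForCausalLM` branch and the `--trust-remote-code` flag go together: DeepSeek-V3's configuration lives in custom modeling code, so `AutoConfig` needs `trust_remote_code=True` to expose fields like `n_routed_experts`. A small sketch of what the updated config path resolves to; the model id is only an example and `tp_size` is an assumed value:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-V3",
                                    trust_remote_code=True)

if config.architectures[0] == "DeepseekV3ForCausalLM":
    num_experts = config.n_routed_experts
    topk = config.num_experts_per_tok
    intermediate_size = config.moe_intermediate_size
    # With tensor parallelism, each rank holds a shard of the 2x (gate + up)
    # intermediate dimension, mirroring the benchmark's computation above.
    tp_size = 8
    shard_intermediate_size = 2 * intermediate_size // tp_size
    print(num_experts, topk, shard_intermediate_size)
```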

View File

@@ -259,7 +259,7 @@ endmacro()
 # in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
 # We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is
 # in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add
-# 9.0a to the result.
+# 9.0a to the result (and remove 9.0 from TGT_CUDA_ARCHS).
 # The result is stored in `OUT_CUDA_ARCHS`.
 #
 # Example:
@@ -270,34 +270,47 @@ endmacro()
 #
 function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
   list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)
+  set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS})

   # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should
   # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS
   set(_CUDA_ARCHS)
   if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
     list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")
-    if ("9.0" IN_LIST TGT_CUDA_ARCHS)
+    if ("9.0" IN_LIST TGT_CUDA_ARCHS_)
+      list(REMOVE_ITEM TGT_CUDA_ARCHS_ "9.0")
       set(_CUDA_ARCHS "9.0a")
     endif()
   endif()

   list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)

-  # for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is
-  # less or eqault to ARCH
-  foreach(_ARCH ${CUDA_ARCHS})
-    set(_TMP_ARCH)
-    foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
-      if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
-        set(_TMP_ARCH ${_SRC_ARCH})
-      else()
-        break()
+  # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that
+  # is less or equal to ARCH (but has the same major version since SASS binary
+  # compatibility is only forward compatible within the same major version).
+  foreach(_ARCH ${TGT_CUDA_ARCHS_})
+    set(_TMP_ARCH)
+    # Extract the major version of the target arch
+    string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}")
+    foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
+      # Extract the major version of the source arch
+      string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}")
+      # Check major-version match AND version-less-or-equal
+      if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
+        if (SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR)
+          set(_TMP_ARCH "${_SRC_ARCH}")
+        endif()
+      else()
+        # If we hit a version greater than the target, we can break
+        break()
       endif()
     endforeach()
-    if (_TMP_ARCH)
-      list(APPEND _CUDA_ARCHS ${_TMP_ARCH})
+
+    # If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS
+    if (_TMP_ARCH)
+      list(APPEND _CUDA_ARCHS "${_TMP_ARCH}")
     endif()
   endforeach()

   list(REMOVE_DUPLICATES _CUDA_ARCHS)
   set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
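The CMake change above is easier to follow as plain pseudocode: for each target arch, pick the highest source arch that is less than or equal to it and shares its major version, with 9.0a handled specially. The following Python sketch is illustrative only (it assumes arch strings like "8.6" or "9.0a") and is not the CMake implementation itself:

```python
def cuda_archs_loose_intersection(src_archs, tgt_archs):
    """For every target arch, pick the highest source arch that is <= it and
    has the same major version (SASS is only forward compatible within a
    major version). 9.0a is special-cased as in cmake/utils.cmake."""
    src = sorted(set(src_archs), key=lambda a: float(a.rstrip("a")))
    tgt = list(tgt_archs)
    result = set()

    # Special handling for 9.0a: it replaces 9.0 in the result.
    if "9.0a" in src:
        src.remove("9.0a")
        if "9.0" in tgt:
            tgt.remove("9.0")
            result.add("9.0a")

    for arch in tgt:
        major = arch.split(".")[0]
        best = None
        for s in src:  # src is sorted ascending
            if float(s.rstrip("a")) <= float(arch.rstrip("a")):
                if s.split(".")[0] == major:
                    best = s
            else:
                break  # everything after this is greater than the target
        if best is not None:
            result.add(best)
    return sorted(result)


# e.g. the Marlin arch list intersected with a build targeting 8.9 and 9.0a
print(cuda_archs_loose_intersection(
    ["8.0", "8.6", "8.7", "8.9", "9.0"], ["8.9", "9.0a"]))
```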

View File

@@ -28,6 +28,11 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
                              const std::string& kv_cache_dtype,
                              torch::Tensor& k_scale, torch::Tensor& v_scale);

+void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
+                          torch::Tensor& kv_cache, torch::Tensor& slot_mapping,
+                          const std::string& kv_cache_dtype,
+                          torch::Tensor& scale);
+
 // Just for unittest
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                  const double scale, const std::string& kv_cache_dtype);

View File

@@ -245,6 +245,51 @@ __global__ void reshape_and_cache_flash_kernel(
     }
   }
 }

+template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
+__global__ void concat_and_cache_mla_kernel(
+    const scalar_t* __restrict__ kv_c,  // [num_tokens, kv_lora_rank]
+    const scalar_t* __restrict__ k_pe,  // [num_tokens, pe_dim]
+    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank
+                                     //  + pe_dim)]
+    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
+    const int block_stride,                    //
+    const int kv_c_stride,                     //
+    const int k_pe_stride,                     //
+    const int kv_lora_rank,                    //
+    const int pe_dim,                          //
+    const int block_size,                      //
+    const float* scale                         //
+) {
+  const int64_t token_idx = blockIdx.x;
+  const int64_t slot_idx = slot_mapping[token_idx];
+  // NOTE: slot_idx can be -1 if the token is padded
+  if (slot_idx < 0) {
+    return;
+  }
+  const int64_t block_idx = slot_idx / block_size;
+  const int64_t block_offset = slot_idx % block_size;
+
+  auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst,
+                  int src_stride, int dst_stride, int size, int offset) {
+    for (int i = threadIdx.x; i < size; i += blockDim.x) {
+      const int64_t src_idx = token_idx * src_stride + i;
+      const int64_t dst_idx = block_idx * block_stride +
+                              block_offset * (kv_lora_rank + pe_dim) + i +
+                              offset;
+      if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+        dst[dst_idx] = src[src_idx];
+      } else {
+        dst[dst_idx] =
+            fp8::scaled_convert<cache_t, scalar_t, kv_dt>(src[src_idx], *scale);
+      }
+    }
+  };
+
+  copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
+  copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
+}
+
 }  // namespace vllm

 // KV_T is the stored data type of kv-cache.
@@ -343,6 +388,56 @@ void reshape_and_cache_flash(
       CALL_RESHAPE_AND_CACHE_FLASH);
 }

+// KV_T is the stored data type of kv-cache.
+// CACHE_T is the data type of key and value tensors.
+// KV_DTYPE is the real data type of kv-cache.
+#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE)              \
+  vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE>            \
+      <<<grid, block, 0, stream>>>(                                     \
+          reinterpret_cast<KV_T*>(kv_c.data_ptr()),                     \
+          reinterpret_cast<KV_T*>(k_pe.data_ptr()),                     \
+          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),              \
+          slot_mapping.data_ptr<int64_t>(), block_stride, kv_c_stride,  \
+          k_pe_stride, kv_lora_rank, pe_dim, block_size,                \
+          reinterpret_cast<const float*>(scale.data_ptr()));
+
+void concat_and_cache_mla(
+    torch::Tensor& kv_c,          // [num_tokens, kv_lora_rank]
+    torch::Tensor& k_pe,          // [num_tokens, pe_dim]
+    torch::Tensor& kv_cache,      // [num_blocks, block_size, (kv_lora_rank +
+                                  //  pe_dim)]
+    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
+    const std::string& kv_cache_dtype, torch::Tensor& scale) {
+  // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
+  // slot_mapping.size(0) because of padding for CUDA graphs.
+  // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
+  // both include padding.
+  // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
+  // since key includes padding for CUDA graphs, while slot_mapping does not.
+  // In this case, slot_mapping.size(0) represents the actual number of tokens
+  // before padding.
+  // For compatibility with both cases, we use slot_mapping.size(0) as the
+  // number of tokens.
+  int num_tokens = slot_mapping.size(0);
+  int kv_lora_rank = kv_c.size(1);
+  int pe_dim = k_pe.size(1);
+  int block_size = kv_cache.size(1);
+
+  TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
+
+  int kv_c_stride = kv_c.stride(0);
+  int k_pe_stride = k_pe.stride(0);
+  int block_stride = kv_cache.stride(0);
+
+  dim3 grid(num_tokens);
+  dim3 block(std::min(kv_lora_rank, 512));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
+                             CALL_CONCAT_AND_CACHE_MLA);
+}
+
 namespace vllm {

 template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
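In plain terms, the new op scatters each token's compressed KV vector (`kv_c`, width `kv_lora_rank`) and rotary positional part (`k_pe`, width `pe_dim`) into one row of a paged cache laid out as `[num_blocks, block_size, kv_lora_rank + pe_dim]`, indexed by `slot_mapping`. The following pure-PyTorch reference covers only the unscaled (`kv_cache_dtype="auto"`) path and is meant as a mental model, not the CUDA implementation:

```python
import torch


def concat_and_cache_mla_ref(kv_c: torch.Tensor, k_pe: torch.Tensor,
                             kv_cache: torch.Tensor,
                             slot_mapping: torch.Tensor) -> None:
    """kv_cache has shape [num_blocks, block_size, kv_lora_rank + pe_dim]."""
    num_tokens = slot_mapping.size(0)
    kv_lora_rank = kv_c.size(1)
    block_size = kv_cache.size(1)
    for i in range(num_tokens):
        slot = int(slot_mapping[i])
        if slot < 0:  # padded token, nothing to write
            continue
        block_idx, block_offset = divmod(slot, block_size)
        # concat: compressed KV first, then the rotary positional part
        kv_cache[block_idx, block_offset, :kv_lora_rank] = kv_c[i]
        kv_cache[block_idx, block_offset, kv_lora_rank:] = k_pe[i]


# tiny smoke test with made-up sizes
kv_c = torch.randn(4, 8)
k_pe = torch.randn(4, 2)
kv_cache = torch.zeros(2, 4, 10)
slot_mapping = torch.tensor([0, 1, 5, -1])
concat_and_cache_mla_ref(kv_c, k_pe, kv_cache, slot_mapping)
print(kv_cache[1, 1])  # slot 5 maps to block 1, offset 1
```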

View File

@@ -38,9 +38,13 @@ struct Signal {
   alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
 };

-struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };
+struct __align__(16) RankData {
+  const void* __restrict__ ptrs[8];
+};

-struct __align__(16) RankSignals { Signal* signals[8]; };
+struct __align__(16) RankSignals {
+  Signal* signals[8];
+};

 // like std::array, but aligned
 template <typename T, int sz>

View File

@@ -138,8 +138,8 @@ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
   // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
   // directly into `SUB` and `ADD`.
   const int SUB = 0x64086408;
@@ -182,8 +182,8 @@ __device__ inline FragB dequant<vllm::kU4.id()>(int q) {
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
   const int SUB = 0x64006400;
   const int MUL = 0x2c002c00;

View File

@@ -33,7 +33,9 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
   extern __shared__ int32_t shared_mem[];
   int32_t* cumsum = shared_mem;  // 1d tensor with shape (num_experts + 1)
-  token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + blockDim.x + 1);
+  token_cnts_t* tokens_cnts =
+      (token_cnts_t*)(shared_mem + num_experts +
+                      1);  // 2d tensor with shape (blockDim.x + 1, num_experts)

   for (int i = 0; i < num_experts; ++i) {
     tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
View File

@@ -173,8 +173,8 @@ dequant<half, vllm::kU4B8.id()>(int q) {
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
   // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
   // directly into `SUB` and `ADD`.
   const int SUB = 0x64086408;
@@ -197,9 +197,9 @@ dequant<nv_bfloat16, vllm::kU4B8.id()>(int q) {
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
   q >>= 4;
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);

   typename ScalarType<nv_bfloat16>::FragB frag_b;
   static constexpr uint32_t MUL = 0x3F803F80;
@@ -221,8 +221,8 @@ dequant<half, vllm::kU4.id()>(int q) {
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
   const int SUB = 0x64006400;
   const int MUL = 0x2c002c00;
@@ -244,9 +244,9 @@ dequant<nv_bfloat16, vllm::kU4.id()>(int q) {
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
   q >>= 4;
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);

   typename ScalarType<nv_bfloat16>::FragB frag_b;
   static constexpr uint32_t MUL = 0x3F803F80;

View File

@@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) {
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
   // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
   // directly into `SUB` and `ADD`.
   const int SUB = 0x64086408;

View File

@@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) {
   static constexpr uint32_t HI = 0x00f000f0;
   static constexpr uint32_t EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
   // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
   // directly into `SUB` and `ADD`.
   static constexpr uint32_t SUB = 0x64086408;

View File

@@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) {
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
   // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
   // directly into `SUB` and `ADD`.
   const int SUB = 0x64086408;

View File

@@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
     const scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads,
                                            //  max_num_partitions, head_size]
     const int* __restrict__ context_lens,  // [num_seqs]
-    const int max_num_partitions){UNREACHABLE_CODE}
+    const int max_num_partitions) {
+  UNREACHABLE_CODE
+}

 #endif  // defined(__HIP__MI300_MI250__) TODO: Add NAVI support

View File

@@ -463,6 +463,15 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
   cache_ops.impl("reshape_and_cache_flash", torch::kCUDA,
                  &reshape_and_cache_flash);

+  // Concat kv_c and k_pe and cache them.
+  cache_ops.def(
+      "concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
+      "                     Tensor! kv_cache,"
+      "                     Tensor slot_mapping,"
+      "                     str kv_cache_dtype,"
+      "                     Tensor scale) -> ()");
+  cache_ops.impl("concat_and_cache_mla", torch::kCUDA, &concat_and_cache_mla);
+
   // Convert the key and value cache to fp8 data type.
   cache_ops.def(
       "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "

View File

@@ -1,10 +1,10 @@
 sphinx==6.2.1
+sphinx-argparse==0.4.0
 sphinx-book-theme==1.0.1
 sphinx-copybutton==0.5.2
-myst-parser==3.0.1
-sphinx-argparse==0.4.0
 sphinx-design==0.6.1
 sphinx-togglebutton==0.3.2
+myst-parser==3.0.1
 msgspec
 cloudpickle

View File

@@ -8,10 +8,10 @@
 .. currentmodule:: vllm.engine
 ```

-```{toctree}
+:::{toctree}
 :caption: Engines
 :maxdepth: 2

 llm_engine
 async_llm_engine
-```
+:::

View File

@@ -2,10 +2,10 @@

 ## Submodules

-```{toctree}
+:::{toctree}
 :maxdepth: 1

 interfaces_base
 interfaces
 adapters
-```
+:::

View File

@@ -17,7 +17,7 @@ Looking to add your own multi-modal model? Please follow the instructions listed

 ## Submodules

-```{toctree}
+:::{toctree}
 :maxdepth: 1

 inputs
@@ -25,4 +25,4 @@ parse
 processing
 profiling
 registry
-```
+:::

View File

@@ -1,9 +1,9 @@
 # Offline Inference

-```{toctree}
+:::{toctree}
 :caption: Contents
 :maxdepth: 1

 llm
 llm_inputs
-```
+:::

View File

@@ -0,0 +1,3 @@
+# vLLM Blog
+
+vLLM blog posts are published [here](https://blog.vllm.ai/).

View File

@@ -17,11 +17,11 @@ The edges of the build graph represent:

 - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head)

-> ```{figure} /assets/contributing/dockerfile-stages-dependency.png
+> :::{figure} /assets/contributing/dockerfile-stages-dependency.png
 > :align: center
 > :alt: query
 > :width: 100%
-> ```
+> :::
 >
 > Made using: <https://github.com/patrickhoefler/dockerfilegraph>
 >


@ -10,9 +10,9 @@ First, clone the PyTorch model code from the source repository.
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
```{warning} :::{warning}
Make sure to review and adhere to the original code's copyright and licensing terms! Make sure to review and adhere to the original code's copyright and licensing terms!
``` :::
## 2. Make your code compatible with vLLM ## 2. Make your code compatible with vLLM
@ -80,10 +80,10 @@ def forward(
... ...
``` ```
```{note} :::{note}
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
``` :::
For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples. For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.


@ -4,7 +4,7 @@
This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM. This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM.
```{toctree} :::{toctree}
:caption: Contents :caption: Contents
:maxdepth: 1 :maxdepth: 1
@ -12,16 +12,16 @@ basic
registration registration
tests tests
multimodal multimodal
``` :::
```{note} :::{note}
The complexity of adding a new model depends heavily on the model's architecture. The complexity of adding a new model depends heavily on the model's architecture.
The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
``` :::
```{tip} :::{tip}
If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
or ask on our [developer slack](https://slack.vllm.ai). or ask on our [developer slack](https://slack.vllm.ai).
We will be happy to help you out! We will be happy to help you out!
``` :::


@ -48,9 +48,9 @@ Further update the model as follows:
return vision_embeddings return vision_embeddings
``` ```
```{important} :::{important}
The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request.
``` :::
- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. - Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
@ -89,10 +89,10 @@ Further update the model as follows:
+ class YourModelForImage2Seq(nn.Module, SupportsMultiModal): + class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
``` ```
```{note} :::{note}
The model class does not have to be named {code}`*ForCausalLM`. The model class does not have to be named {code}`*ForCausalLM`.
Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples.
``` :::
## 2. Specify processing information ## 2. Specify processing information
@ -120,8 +120,8 @@ When calling the model, the output embeddings from the visual encoder are assign
containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal
to the size of the output embeddings. to the size of the output embeddings.
::::{tab-set} :::::{tab-set}
:::{tab-item} Basic example: LLaVA ::::{tab-item} Basic example: LLaVA
:sync: llava :sync: llava
Looking at the code of HF's `LlavaForConditionalGeneration`: Looking at the code of HF's `LlavaForConditionalGeneration`:
@ -254,12 +254,12 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
return {"image": self.get_max_image_tokens()} return {"image": self.get_max_image_tokens()}
``` ```
```{note} :::{note}
Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP. Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP.
```
::: :::
:::: ::::
:::::
## 3. Specify dummy inputs ## 3. Specify dummy inputs
@ -315,17 +315,17 @@ def get_dummy_processor_inputs(
Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`
to fill in the missing details about HF processing. to fill in the missing details about HF processing.
```{seealso} :::{seealso}
[Multi-Modal Data Processing](#mm-processing) [Multi-Modal Data Processing](#mm-processing)
``` :::
### Multi-modal fields ### Multi-modal fields
Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to
return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items. return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items.
::::{tab-set} :::::{tab-set}
:::{tab-item} Basic example: LLaVA ::::{tab-item} Basic example: LLaVA
:sync: llava :sync: llava
Looking at the model's `forward` method: Looking at the model's `forward` method:
@ -367,13 +367,13 @@ def _get_mm_fields_config(
) )
``` ```
```{note} :::{note}
Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument. pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument.
```
::: :::
:::: ::::
:::::
### Prompt replacements ### Prompt replacements


@ -17,17 +17,17 @@ After you have implemented your model (see [tutorial](#new-model-basic)), put it
Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM. Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
Finally, update our [list of supported models](#supported-models) to promote your model! Finally, update our [list of supported models](#supported-models) to promote your model!
```{important} :::{important}
The list of models in each section should be maintained in alphabetical order. The list of models in each section should be maintained in alphabetical order.
``` :::
## Out-of-tree models ## Out-of-tree models
You can load an external model using a plugin without modifying the vLLM codebase. You can load an external model using a plugin without modifying the vLLM codebase.
```{seealso} :::{seealso}
[vLLM's Plugin System](#plugin-system) [vLLM's Plugin System](#plugin-system)
``` :::
To register the model, use the following code: To register the model, use the following code:
@ -45,11 +45,11 @@ from vllm import ModelRegistry
ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
``` ```
```{important} :::{important}
If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
Read more about that [here](#supports-multimodal). Read more about that [here](#supports-multimodal).
``` :::
```{note} :::{note}
Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
``` :::
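A minimal sketch of what such a plugin package could look like; the entry-point group name `vllm.general_plugins` is an assumption based on vLLM's plugin system, and everything named `your_code` / `register` is hypothetical:

```python
# your_code/__init__.py  (hypothetical package)

def register():
    # Import lazily so the plugin can load before any GPU initialization.
    from vllm import ModelRegistry
    ModelRegistry.register_model(
        "YourModelForCausalLM", "your_code:YourModelForCausalLM")

# Exposed as an entry point so vLLM can discover it automatically, e.g. in
# pyproject.toml (illustrative):
# [project.entry-points."vllm.general_plugins"]
# register_your_model = "your_code:register"
```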


@ -14,14 +14,14 @@ Without them, the CI for your PR will fail.
Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>. Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM. This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
```{important} :::{important}
The list of models in each section should be maintained in alphabetical order. The list of models in each section should be maintained in alphabetical order.
``` :::
```{tip} :::{tip}
If your model requires a development version of HF Transformers, you can set If your model requires a development version of HF Transformers, you can set
`min_transformers_version` to skip the test in CI until the model is released. `min_transformers_version` to skip the test in CI until the model is released.
``` :::
## Optional Tests ## Optional Tests


@ -35,17 +35,17 @@ pre-commit run --all-files
pytest tests/ pytest tests/
``` ```
```{note} :::{note}
Currently, the repository is not fully checked by `mypy`. Currently, the repository is not fully checked by `mypy`.
``` :::
## Issues ## Issues
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
```{important} :::{important}
If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability). If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
``` :::
## Pull Requests & Code Reviews ## Pull Requests & Code Reviews
@ -81,9 +81,9 @@ appropriately to indicate the type of change. Please use one of the following:
- `[Misc]` for PRs that do not fit the above categories. Please use this - `[Misc]` for PRs that do not fit the above categories. Please use this
sparingly. sparingly.
```{note} :::{note}
If the PR spans more than one category, please include all relevant prefixes. If the PR spans more than one category, please include all relevant prefixes.
``` :::
### Code Quality ### Code Quality


@ -6,21 +6,21 @@ The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` en
When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag. When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag.
```{warning} :::{warning}
Only enable profiling in a development environment. Only enable profiling in a development environment.
``` :::
Traces can be visualized using <https://ui.perfetto.dev/>. Traces can be visualized using <https://ui.perfetto.dev/>.
```{tip} :::{tip}
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
``` :::
```{tip} :::{tip}
To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
`export VLLM_RPC_TIMEOUT=1800000` `export VLLM_RPC_TIMEOUT=1800000`
``` :::
## Example commands and usage ## Example commands and usage


@ -21,11 +21,11 @@ $ docker run --runtime nvidia --gpus all \
You can add any other <project:#engine-args> you need after the image tag (`vllm/vllm-openai:latest`). You can add any other <project:#engine-args> you need after the image tag (`vllm/vllm-openai:latest`).
```{note} :::{note}
You can either use the `ipc=host` flag or `--shm-size` flag to allow the You can either use the `ipc=host` flag or `--shm-size` flag to allow the
container to access the host's shared memory. vLLM uses PyTorch, which uses shared container to access the host's shared memory. vLLM uses PyTorch, which uses shared
memory to share data between processes under the hood, particularly for tensor parallel inference. memory to share data between processes under the hood, particularly for tensor parallel inference.
``` :::
(deployment-docker-build-image-from-source)= (deployment-docker-build-image-from-source)=
@ -38,25 +38,25 @@ You can build and run vLLM from source via the provided <gh-file:Dockerfile>. To
DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
``` ```
```{note} :::{note}
By default vLLM will build for all GPU types for widest distribution. If you are just building for the By default vLLM will build for all GPU types for widest distribution. If you are just building for the
current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""` current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
for vLLM to find the current GPU type and build for that. for vLLM to find the current GPU type and build for that.
If you are using Podman instead of Docker, you might need to disable SELinux labeling by If you are using Podman instead of Docker, you might need to disable SELinux labeling by
adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184). adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
``` :::
## Building for Arm64/aarch64 ## Building for Arm64/aarch64
A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use
of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
```{note} :::{note}
Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=` Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits. flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits.
Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
``` :::
```console ```console
# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
@ -85,6 +85,6 @@ $ docker run --runtime nvidia --gpus all \
The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command). The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
```{note} :::{note}
**For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` . **For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` .
``` :::


@ -2,11 +2,11 @@
# Cerebrium # Cerebrium
```{raw} html :::{raw} html
<p align="center"> <p align="center">
<img src="https://i.ibb.co/hHcScTT/Screenshot-2024-06-13-at-10-14-54.png" alt="vLLM_plus_cerebrium"/> <img src="https://i.ibb.co/hHcScTT/Screenshot-2024-06-13-at-10-14-54.png" alt="vLLM_plus_cerebrium"/>
</p> </p>
``` :::
vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications.


@ -2,11 +2,11 @@
# dstack # dstack
```{raw} html :::{raw} html
<p align="center"> <p align="center">
<img src="https://i.ibb.co/71kx6hW/vllm-dstack.png" alt="vLLM_plus_dstack"/> <img src="https://i.ibb.co/71kx6hW/vllm-dstack.png" alt="vLLM_plus_dstack"/>
</p> </p>
``` :::
vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment. vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment.
@ -97,6 +97,6 @@ completion = client.chat.completions.create(
print(completion.choices[0].message.content) print(completion.choices[0].message.content)
``` ```
```{note} :::{note}
dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm) dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm)
``` :::


@ -38,213 +38,213 @@ chart **including persistent volumes** and deletes the release.
## Architecture ## Architecture
```{image} /assets/deployment/architecture_helm_deployment.png :::{image} /assets/deployment/architecture_helm_deployment.png
``` :::
## Values ## Values
```{list-table} :::{list-table}
:widths: 25 25 25 25 :widths: 25 25 25 25
:header-rows: 1 :header-rows: 1
* - Key - * Key
- Type * Type
- Default * Default
- Description * Description
* - autoscaling - * autoscaling
- object * object
- {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} * {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
- Autoscaling configuration * Autoscaling configuration
* - autoscaling.enabled - * autoscaling.enabled
- bool * bool
- false * false
- Enable autoscaling * Enable autoscaling
* - autoscaling.maxReplicas - * autoscaling.maxReplicas
- int * int
- 100 * 100
- Maximum replicas * Maximum replicas
* - autoscaling.minReplicas - * autoscaling.minReplicas
- int * int
- 1 * 1
- Minimum replicas * Minimum replicas
* - autoscaling.targetCPUUtilizationPercentage - * autoscaling.targetCPUUtilizationPercentage
- int * int
- 80 * 80
- Target CPU utilization for autoscaling * Target CPU utilization for autoscaling
* - configs - * configs
- object * object
- {} * {}
- Configmap * Configmap
* - containerPort - * containerPort
- int * int
- 8000 * 8000
- Container port * Container port
* - customObjects - * customObjects
- list * list
- [] * []
- Custom Objects configuration * Custom Objects configuration
* - deploymentStrategy - * deploymentStrategy
- object * object
- {} * {}
- Deployment strategy configuration * Deployment strategy configuration
* - externalConfigs - * externalConfigs
- list * list
- [] * []
- External configuration * External configuration
* - extraContainers - * extraContainers
- list * list
- [] * []
- Additional containers configuration * Additional containers configuration
* - extraInit - * extraInit
- object * object
- {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} * {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
- Additional configuration for the init container * Additional configuration for the init container
* - extraInit.pvcStorage - * extraInit.pvcStorage
- string * string
- "50Gi" * "50Gi"
- Storage size of the s3 * Storage size of the s3
* - extraInit.s3modelpath - * extraInit.s3modelpath
- string * string
- "relative_s3_model_path/opt-125m" * "relative_s3_model_path/opt-125m"
- Path of the model on the s3 which hosts model weights and config files * Path of the model on the s3 which hosts model weights and config files
* - extraInit.awsEc2MetadataDisabled - * extraInit.awsEc2MetadataDisabled
- boolean * boolean
- true * true
- Disables the use of the Amazon EC2 instance metadata service * Disables the use of the Amazon EC2 instance metadata service
* - extraPorts - * extraPorts
- list * list
- [] * []
- Additional ports configuration * Additional ports configuration
* - gpuModels - * gpuModels
- list * list
- ["TYPE_GPU_USED"] * ["TYPE_GPU_USED"]
- Type of gpu used * Type of gpu used
* - image - * image
- object * object
- {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} * {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
- Image configuration * Image configuration
* - image.command - * image.command
- list * list
- ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] * ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
- Container launch command * Container launch command
* - image.repository - * image.repository
- string * string
- "vllm/vllm-openai" * "vllm/vllm-openai"
- Image repository * Image repository
* - image.tag - * image.tag
- string * string
- "latest" * "latest"
- Image tag * Image tag
* - livenessProbe - * livenessProbe
- object * object
- {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
- Liveness probe configuration * Liveness probe configuration
* - livenessProbe.failureThreshold - * livenessProbe.failureThreshold
- int * int
- 3 * 3
- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
* - livenessProbe.httpGet - * livenessProbe.httpGet
- object * object
- {"path":"/health","port":8000} * {"path":"/health","port":8000}
- Configuration of the Kubelet http request on the server * Configuration of the Kubelet http request on the server
* - livenessProbe.httpGet.path - * livenessProbe.httpGet.path
- string * string
- "/health" * "/health"
- Path to access on the HTTP server * Path to access on the HTTP server
* - livenessProbe.httpGet.port - * livenessProbe.httpGet.port
- int * int
- 8000 * 8000
- Name or number of the port to access on the container, on which the server is listening * Name or number of the port to access on the container, on which the server is listening
* - livenessProbe.initialDelaySeconds - * livenessProbe.initialDelaySeconds
- int * int
- 15 * 15
- Number of seconds after the container has started before liveness probe is initiated * Number of seconds after the container has started before liveness probe is initiated
* - livenessProbe.periodSeconds - * livenessProbe.periodSeconds
- int * int
- 10 * 10
- How often (in seconds) to perform the liveness probe * How often (in seconds) to perform the liveness probe
* - maxUnavailablePodDisruptionBudget - * maxUnavailablePodDisruptionBudget
- string * string
- "" * ""
- Disruption Budget Configuration * Disruption Budget Configuration
* - readinessProbe - * readinessProbe
- object * object
- {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
- Readiness probe configuration * Readiness probe configuration
* - readinessProbe.failureThreshold - * readinessProbe.failureThreshold
- int * int
- 3 * 3
- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
* - readinessProbe.httpGet - * readinessProbe.httpGet
- object * object
- {"path":"/health","port":8000} * {"path":"/health","port":8000}
- Configuration of the Kubelet http request on the server * Configuration of the Kubelet http request on the server
* - readinessProbe.httpGet.path - * readinessProbe.httpGet.path
- string * string
- "/health" * "/health"
- Path to access on the HTTP server * Path to access on the HTTP server
* - readinessProbe.httpGet.port - * readinessProbe.httpGet.port
- int * int
- 8000 * 8000
- Name or number of the port to access on the container, on which the server is listening * Name or number of the port to access on the container, on which the server is listening
* - readinessProbe.initialDelaySeconds - * readinessProbe.initialDelaySeconds
- int * int
- 5 * 5
- Number of seconds after the container has started before readiness probe is initiated * Number of seconds after the container has started before readiness probe is initiated
* - readinessProbe.periodSeconds - * readinessProbe.periodSeconds
- int * int
- 5 * 5
- How often (in seconds) to perform the readiness probe * How often (in seconds) to perform the readiness probe
* - replicaCount - * replicaCount
- int * int
- 1 * 1
- Number of replicas * Number of replicas
* - resources - * resources
- object * object
- {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} * {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
- Resource configuration * Resource configuration
* - resources.limits."nvidia.com/gpu" - * resources.limits."nvidia.com/gpu"
- int * int
- 1 * 1
- Number of gpus used * Number of gpus used
* - resources.limits.cpu - * resources.limits.cpu
- int * int
- 4 * 4
- Number of CPUs * Number of CPUs
* - resources.limits.memory - * resources.limits.memory
- string * string
- "16Gi" * "16Gi"
- CPU memory configuration * CPU memory configuration
* - resources.requests."nvidia.com/gpu" - * resources.requests."nvidia.com/gpu"
- int * int
- 1 * 1
- Number of gpus used * Number of gpus used
* - resources.requests.cpu - * resources.requests.cpu
- int * int
- 4 * 4
- Number of CPUs * Number of CPUs
* - resources.requests.memory - * resources.requests.memory
- string * string
- "16Gi" * "16Gi"
- CPU memory configuration * CPU memory configuration
* - secrets - * secrets
- object * object
- {} * {}
- Secrets configuration * Secrets configuration
* - serviceName - * serviceName
- string * string
- *
- Service name * Service name
* - servicePort - * servicePort
- int * int
- 80 * 80
- Service port * Service port
* - labels.environment - * labels.environment
- string * string
- test * test
- Environment name * Environment name
* - labels.release - * labels.release
- string * string
- test * test
- Release name * Release name
``` :::


@ -1,6 +1,6 @@
# Using other frameworks # Using other frameworks
```{toctree} :::{toctree}
:maxdepth: 1 :maxdepth: 1
bentoml bentoml
@ -11,4 +11,4 @@ lws
modal modal
skypilot skypilot
triton triton
``` :::


@ -2,11 +2,11 @@
# SkyPilot # SkyPilot
```{raw} html :::{raw} html
<p align="center"> <p align="center">
<img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/> <img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/>
</p> </p>
``` :::
vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html).
@ -104,10 +104,10 @@ service:
max_completion_tokens: 1 max_completion_tokens: 1
``` ```
```{raw} html :::{raw} html
<details> <details>
<summary>Click to see the full recipe YAML</summary> <summary>Click to see the full recipe YAML</summary>
``` :::
```yaml ```yaml
service: service:
@ -153,9 +153,9 @@ run: |
2>&1 | tee api_server.log 2>&1 | tee api_server.log
``` ```
```{raw} html :::{raw} html
</details> </details>
``` :::
Start the serving the Llama-3 8B model on multiple replicas: Start the serving the Llama-3 8B model on multiple replicas:
@ -169,10 +169,10 @@ Wait until the service is ready:
watch -n10 sky serve status vllm watch -n10 sky serve status vllm
``` ```
```{raw} html :::{raw} html
<details> <details>
<summary>Example outputs:</summary> <summary>Example outputs:</summary>
``` :::
```console ```console
Services Services
@ -185,9 +185,9 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R
vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4
``` ```
```{raw} html :::{raw} html
</details> </details>
``` :::
After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
@ -223,10 +223,10 @@ service:
This will scale the service up to when the QPS exceeds 2 for each replica. This will scale the service up to when the QPS exceeds 2 for each replica.
```{raw} html :::{raw} html
<details> <details>
<summary>Click to see the full recipe YAML</summary> <summary>Click to see the full recipe YAML</summary>
``` :::
```yaml ```yaml
service: service:
@ -275,9 +275,9 @@ run: |
2>&1 | tee api_server.log 2>&1 | tee api_server.log
``` ```
```{raw} html :::{raw} html
</details> </details>
``` :::
To update the service with the new config: To update the service with the new config:
@ -295,10 +295,10 @@ sky serve down vllm
It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas. It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas.
```{raw} html :::{raw} html
<details> <details>
<summary>Click to see the full GUI YAML</summary> <summary>Click to see the full GUI YAML</summary>
``` :::
```yaml ```yaml
envs: envs:
@ -328,9 +328,9 @@ run: |
--stop-token-ids 128009,128001 | tee ~/gradio.log --stop-token-ids 128009,128001 | tee ~/gradio.log
``` ```
```{raw} html :::{raw} html
</details> </details>
``` :::
1. Start the chat web UI: 1. Start the chat web UI:


@ -1,9 +1,9 @@
# External Integrations # External Integrations
```{toctree} :::{toctree}
:maxdepth: 1 :maxdepth: 1
kserve kserve
kubeai kubeai
llamastack llamastack
``` :::


@ -105,9 +105,9 @@ docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-si
docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf
``` ```
```{note} :::{note}
If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
``` :::
(nginxloadbalancer-nginx-launch-nginx)= (nginxloadbalancer-nginx-launch-nginx)=


@ -4,19 +4,19 @@
This document provides an overview of the vLLM architecture. This document provides an overview of the vLLM architecture.
```{contents} Table of Contents :::{contents} Table of Contents
:depth: 2 :depth: 2
:local: true :local: true
``` :::
## Entrypoints ## Entrypoints
vLLM provides a number of entrypoints for interacting with the system. The vLLM provides a number of entrypoints for interacting with the system. The
following diagram shows the relationship between them. following diagram shows the relationship between them.
```{image} /assets/design/arch_overview/entrypoints.excalidraw.png :::{image} /assets/design/arch_overview/entrypoints.excalidraw.png
:alt: Entrypoints Diagram :alt: Entrypoints Diagram
``` :::
### LLM Class ### LLM Class
@ -84,9 +84,9 @@ More details on the API server can be found in the [OpenAI-Compatible Server](#o
The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of
the vLLM system, handling model inference and asynchronous request processing. the vLLM system, handling model inference and asynchronous request processing.
```{image} /assets/design/arch_overview/llm_engine.excalidraw.png :::{image} /assets/design/arch_overview/llm_engine.excalidraw.png
:alt: LLMEngine Diagram :alt: LLMEngine Diagram
``` :::
### LLMEngine ### LLMEngine
@ -144,11 +144,11 @@ configurations affect the class we ultimately get.
The following figure shows the class hierarchy of vLLM: The following figure shows the class hierarchy of vLLM:
> ```{figure} /assets/design/hierarchy.png > :::{figure} /assets/design/hierarchy.png
> :align: center > :align: center
> :alt: query > :alt: query
> :width: 100% > :width: 100%
> ``` > :::
There are several important design choices behind this class hierarchy: There are several important design choices behind this class hierarchy:
@ -178,7 +178,7 @@ of a vision model and a language model. By making the constructor uniform, we
can easily create a vision model and a language model and compose them into a can easily create a vision model and a language model and compose them into a
vision-language model. vision-language model.
````{note} :::{note}
To support this change, all vLLM models' signatures have been updated to: To support this change, all vLLM models' signatures have been updated to:
```python ```python
@ -215,7 +215,7 @@ else:
``` ```
This way, the model can work with both old and new versions of vLLM. This way, the model can work with both old and new versions of vLLM.
```` :::
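The Python block inside this note is elided by the diff view; as a rough sketch of the uniform, config-driven constructor convention being described (the exact import path and field names here are assumptions, not the elided lines):

```python
import torch.nn as nn
from vllm.config import VllmConfig  # assumed import path

class MyModel(nn.Module):
    # Keyword-only constructor so that e.g. a vision model and a language model
    # can be built the same way and composed into a vision-language model.
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
        super().__init__()
        self.config = vllm_config.model_config.hf_config
        # ... build sub-modules here, passing `prefix` down for weight naming ...
```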
3\. **Sharding and Quantization at Initialization**: Certain features require 3\. **Sharding and Quantization at Initialization**: Certain features require
changing the model weights. For example, tensor parallelism needs to shard the changing the model weights. For example, tensor parallelism needs to shard the


@ -139,26 +139,26 @@
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
``` ```
```{figure} ../../assets/kernel/query.png :::{figure} ../../assets/kernel/query.png
:align: center :align: center
:alt: query :alt: query
:width: 70% :width: 70%
Query data of one token at one head Query data of one token at one head
``` :::
- Each thread defines its own `q_ptr` which points to the assigned - Each thread defines its own `q_ptr` which points to the assigned
query token data on global memory. For example, if `VEC_SIZE` is 4 query token data on global memory. For example, if `VEC_SIZE` is 4
and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains
total of 128 elements divided into 128 / 4 = 32 vecs. total of 128 elements divided into 128 / 4 = 32 vecs.
```{figure} ../../assets/kernel/q_vecs.png :::{figure} ../../assets/kernel/q_vecs.png
:align: center :align: center
:alt: q_vecs :alt: q_vecs
:width: 70% :width: 70%
`q_vecs` for one thread group `q_vecs` for one thread group
``` :::
```cpp ```cpp
__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
@ -195,13 +195,13 @@
points to key token data based on `k_cache` at assigned block, points to key token data based on `k_cache` at assigned block,
assigned head and assigned token. assigned head and assigned token.
```{figure} ../../assets/kernel/key.png :::{figure} ../../assets/kernel/key.png
:align: center :align: center
:alt: key :alt: key
:width: 70% :width: 70%
Key data of all context tokens at one head Key data of all context tokens at one head
``` :::
- The diagram above illustrates the memory layout for key data. It - The diagram above illustrates the memory layout for key data. It
assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
@ -214,13 +214,13 @@
elements for one token) that will be processed by 2 threads (one elements for one token) that will be processed by 2 threads (one
thread group) separately. thread group) separately.
```{figure} ../../assets/kernel/k_vecs.png :::{figure} ../../assets/kernel/k_vecs.png
:align: center :align: center
:alt: k_vecs :alt: k_vecs
:width: 70% :width: 70%
`k_vecs` for one thread `k_vecs` for one thread
``` :::
```cpp ```cpp
K_vec k_vecs[NUM_VECS_PER_THREAD] K_vec k_vecs[NUM_VECS_PER_THREAD]
@ -289,14 +289,14 @@
should be performed across the entire thread block, encompassing should be performed across the entire thread block, encompassing
results between the query token and all context key tokens. results between the query token and all context key tokens.
```{math} :::{math}
:nowrap: true :nowrap: true
\begin{gather*} \begin{gather*}
m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\
\quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)}
\end{gather*} \end{gather*}
``` :::
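As a quick sanity check of the formula, here is the same numerically stable ("safe") softmax written out in plain Python; this is illustrative only, while the kernel computes it with block-wide reductions:

```python
import math

def safe_softmax(x: list[float]) -> list[float]:
    m = max(x)                        # m(x): shift so all exponents are <= 0
    f = [math.exp(v - m) for v in x]  # f(x): shifted exponentials
    l = sum(f)                        # l(x): normalizer
    return [v / l for v in f]

print(safe_softmax([3.0, 1.0, 0.2]))  # ~[0.836, 0.113, 0.051]
```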
### `qk_max` and `logits` ### `qk_max` and `logits`
@ -379,29 +379,29 @@
## Value ## Value
```{figure} ../../assets/kernel/value.png :::{figure} ../../assets/kernel/value.png
:align: center :align: center
:alt: value :alt: value
:width: 70% :width: 70%
Value data of all context tokens at one head Value data of all context tokens at one head
``` :::
```{figure} ../../assets/kernel/logits_vec.png :::{figure} ../../assets/kernel/logits_vec.png
:align: center :align: center
:alt: logits_vec :alt: logits_vec
:width: 50% :width: 50%
`logits_vec` for one thread `logits_vec` for one thread
``` :::
```{figure} ../../assets/kernel/v_vec.png :::{figure} ../../assets/kernel/v_vec.png
:align: center :align: center
:alt: v_vec :alt: v_vec
:width: 70% :width: 70%
List of `v_vec` for one thread List of `v_vec` for one thread
``` :::
- Now we need to retrieve the value data and perform dot multiplication - Now we need to retrieve the value data and perform dot multiplication
with `logits`. Unlike query and key, there is no thread group with `logits`. Unlike query and key, there is no thread group


@ -7,9 +7,9 @@ page for information on known issues and how to solve them.
## Introduction ## Introduction
```{important} :::{important}
The source code references are to the state of the code at the time of writing in December, 2024. The source code references are to the state of the code at the time of writing in December, 2024.
``` :::
The use of Python multiprocessing in vLLM is complicated by: The use of Python multiprocessing in vLLM is complicated by:


@ -6,9 +6,9 @@
Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
```{note} :::{note}
Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching). Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching).
``` :::
## Enabling APC in vLLM ## Enabling APC in vLLM
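The hunk ends at this heading; for context, APC is toggled through the `enable_prefix_caching` engine argument. A hedged sketch (not the doc's own example; model name and prompts are placeholders):

```python
from vllm import LLM, SamplingParams

# enable_prefix_caching=True turns on Automatic Prefix Caching.
llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)

prompts = ["<shared long prefix> question A", "<shared long prefix> question B"]
outputs = llm.generate(prompts, SamplingParams(max_tokens=16))
for out in outputs:
    print(out.outputs[0].text)
```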


@ -4,13 +4,13 @@
The tables below show mutually exclusive features and the support on some hardware. The tables below show mutually exclusive features and the support on some hardware.
```{note} :::{note}
Check the '✗' with links to see tracking issue for unsupported feature/hardware combination. Check the '✗' with links to see tracking issue for unsupported feature/hardware combination.
``` :::
## Feature x Feature ## Feature x Feature
```{raw} html :::{raw} html
<style> <style>
/* Make smaller to try to improve readability */ /* Make smaller to try to improve readability */
td { td {
@ -23,448 +23,447 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar
font-size: 0.8rem; font-size: 0.8rem;
} }
</style> </style>
``` :::
```{list-table} :::{list-table}
:header-rows: 1 :header-rows: 1
:stub-columns: 1 :stub-columns: 1
:widths: auto :widths: auto
* - Feature - * Feature
- [CP](#chunked-prefill) * [CP](#chunked-prefill)
- [APC](#automatic-prefix-caching) * [APC](#automatic-prefix-caching)
- [LoRA](#lora-adapter) * [LoRA](#lora-adapter)
- <abbr title="Prompt Adapter">prmpt adptr</abbr> * <abbr title="Prompt Adapter">prmpt adptr</abbr>
- [SD](#spec_decode) * [SD](#spec_decode)
- CUDA graph * CUDA graph
- <abbr title="Pooling Models">pooling</abbr> * <abbr title="Pooling Models">pooling</abbr>
- <abbr title="Encoder-Decoder Models">enc-dec</abbr> * <abbr title="Encoder-Decoder Models">enc-dec</abbr>
- <abbr title="Logprobs">logP</abbr> * <abbr title="Logprobs">logP</abbr>
- <abbr title="Prompt Logprobs">prmpt logP</abbr> * <abbr title="Prompt Logprobs">prmpt logP</abbr>
- <abbr title="Async Output Processing">async output</abbr> * <abbr title="Async Output Processing">async output</abbr>
- multi-step * multi-step
- <abbr title="Multimodal Inputs">mm</abbr> * <abbr title="Multimodal Inputs">mm</abbr>
- best-of * best-of
- beam-search * beam-search
- <abbr title="Guided Decoding">guided dec</abbr> * <abbr title="Guided Decoding">guided dec</abbr>
* - [CP](#chunked-prefill) - * [CP](#chunked-prefill)
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
* - [APC](#automatic-prefix-caching) - * [APC](#automatic-prefix-caching)
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
* - [LoRA](#lora-adapter) - * [LoRA](#lora-adapter)
- [✗](gh-pr:9057) * [](gh-pr:9057)
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
* - <abbr title="Prompt Adapter">prmpt adptr</abbr> - * <abbr title="Prompt Adapter">prmpt adptr</abbr>
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
* - [SD](#spec_decode) - * [SD](#spec_decode)
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
* - CUDA graph - * CUDA graph
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
* - <abbr title="Pooling Models">pooling</abbr> - * <abbr title="Pooling Models">pooling</abbr>
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
* - <abbr title="Encoder-Decoder Models">enc-dec</abbr> - * <abbr title="Encoder-Decoder Models">enc-dec</abbr>
- *
- [✗](gh-issue:7366) * [](gh-issue:7366)
- *
- *
- [✗](gh-issue:7366) * [](gh-issue:7366)
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
* - <abbr title="Logprobs">logP</abbr> - * <abbr title="Logprobs">logP</abbr>
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
* - <abbr title="Prompt Logprobs">prmpt logP</abbr> - * <abbr title="Prompt Logprobs">prmpt logP</abbr>
- *
- *
- *
- *
- [✗](gh-pr:8199) * [](gh-pr:8199)
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
* - <abbr title="Async Output Processing">async output</abbr> - * <abbr title="Async Output Processing">async output</abbr>
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
- *
* - multi-step - * multi-step
- *
- *
- *
- *
- *
- *
- *
- *
- *
- [✗](gh-issue:8198) * [](gh-issue:8198)
- *
- *
- *
- *
- *
- *
* - <abbr title="Multimodal Inputs">mm</abbr> - * <abbr title="Multimodal Inputs">mm</abbr>
- *
- [✗](gh-pr:8348) * [](gh-pr:8348)
- [✗](gh-pr:7199) * [](gh-pr:7199)
- ? * ?
- ? * ?
- *
- *
- *
- *
- *
- *
- ? * ?
- *
- *
- *
- *
* - best-of - * best-of
- *
- *
- *
- *
- [✗](gh-issue:6137) * [](gh-issue:6137)
- *
- *
- *
- *
- *
- ? * ?
- [✗](gh-issue:7968) * [](gh-issue:7968)
- *
- *
- *
- *
* - beam-search - * beam-search
- *
- *
- *
- *
- [✗](gh-issue:6137) * [](gh-issue:6137)
- *
- *
- *
- *
- *
- ? * ?
- [✗](gh-issue:7968>) * [](gh-issue:7968>)
- ? * ?
- *
- *
- *
* - <abbr title="Guided Decoding">guided dec</abbr> - * <abbr title="Guided Decoding">guided dec</abbr>
- *
- *
- ? * ?
- ? * ?
- [✗](gh-issue:11484) * [](gh-issue:11484)
- *
- *
- ? * ?
- *
- *
- *
- [✗](gh-issue:9893) * [](gh-issue:9893)
- ? * ?
- *
- *
- *
:::
```
(feature-x-hardware)= (feature-x-hardware)=
## Feature x Hardware ## Feature x Hardware
```{list-table} :::{list-table}
:header-rows: 1 :header-rows: 1
:stub-columns: 1 :stub-columns: 1
:widths: auto :widths: auto
* - Feature - * Feature
- Volta * Volta
- Turing * Turing
- Ampere * Ampere
- Ada * Ada
- Hopper * Hopper
- CPU * CPU
- AMD * AMD
* - [CP](#chunked-prefill) - * [CP](#chunked-prefill)
- [✗](gh-issue:2729) * [](gh-issue:2729)
- *
- *
- *
- *
- *
- *
* - [APC](#automatic-prefix-caching) - * [APC](#automatic-prefix-caching)
- [✗](gh-issue:3687) * [](gh-issue:3687)
- *
- *
- *
- *
- *
- *
* - [LoRA](#lora-adapter) - * [LoRA](#lora-adapter)
- *
- *
- *
- *
- *
- *
- *
* - <abbr title="Prompt Adapter">prmpt adptr</abbr> - * <abbr title="Prompt Adapter">prmpt adptr</abbr>
- *
- *
- *
- *
- *
- [✗](gh-issue:8475) * [](gh-issue:8475)
- *
* - [SD](#spec_decode) - * [SD](#spec_decode)
- *
- *
- *
- *
- *
- *
- *
* - CUDA graph - * CUDA graph
- *
- *
- *
- *
- *
- *
- *
* - <abbr title="Pooling Models">pooling</abbr> - * <abbr title="Pooling Models">pooling</abbr>
- *
- *
- *
- *
- *
- *
- ? * ?
* - <abbr title="Encoder-Decoder Models">enc-dec</abbr> - * <abbr title="Encoder-Decoder Models">enc-dec</abbr>
- *
- *
- *
- *
- *
- *
- *
* - <abbr title="Multimodal Inputs">mm</abbr> - * <abbr title="Multimodal Inputs">mm</abbr>
- *
- *
- *
- *
- *
- *
- *
* - <abbr title="Logprobs">logP</abbr> - * <abbr title="Logprobs">logP</abbr>
- *
- *
- *
- *
- *
- *
- *
* - <abbr title="Prompt Logprobs">prmpt logP</abbr> - * <abbr title="Prompt Logprobs">prmpt logP</abbr>
- *
- *
- *
- *
- *
- *
- *
* - <abbr title="Async Output Processing">async output</abbr> - * <abbr title="Async Output Processing">async output</abbr>
- *
- *
- *
- *
- *
- *
- *
* - multi-step - * multi-step
- *
- *
- *
- *
- *
- [✗](gh-issue:8477) * [](gh-issue:8477)
- *
* - best-of - * best-of
- *
- *
- *
- *
- *
- *
- *
* - beam-search - * beam-search
- *
- *
- *
- *
- *
- *
- *
* - <abbr title="Guided Decoding">guided dec</abbr> - * <abbr title="Guided Decoding">guided dec</abbr>
- *
- *
- *
- *
- *
- *
- *
``` :::


@ -4,9 +4,9 @@
This page introduces you the disaggregated prefilling feature in vLLM. This page introduces you the disaggregated prefilling feature in vLLM.
```{note} :::{note}
This feature is experimental and subject to change. This feature is experimental and subject to change.
``` :::
## Why disaggregated prefilling? ## Why disaggregated prefilling?
@ -15,9 +15,9 @@ Two main reasons:
- **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling put prefill and decode phase of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. `tp` and `pp`) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT. - **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling put prefill and decode phase of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. `tp` and `pp`) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT.
- **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL. - **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL.
```{note} :::{note}
Disaggregated prefill DOES NOT improve throughput. Disaggregated prefill DOES NOT improve throughput.
``` :::
## Usage example ## Usage example
@ -39,21 +39,21 @@ Key abstractions for disaggregated prefilling:
- **LookupBuffer**: LookupBuffer provides two API: `insert` KV cache and `drop_select` KV cache. The semantics of `insert` and `drop_select` are similar to SQL, where `insert` inserts a KV cache into the buffer, and `drop_select` returns the KV cache that matches the given condition and drop it from the buffer. - **LookupBuffer**: LookupBuffer provides two API: `insert` KV cache and `drop_select` KV cache. The semantics of `insert` and `drop_select` are similar to SQL, where `insert` inserts a KV cache into the buffer, and `drop_select` returns the KV cache that matches the given condition and drop it from the buffer.
- **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports `send_tensor` and `recv_tensor`. - **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports `send_tensor` and `recv_tensor`.
```{note} :::{note}
`insert` is a non-blocking operation but `drop_select` is a blocking operation. `insert` is a non-blocking operation but `drop_select` is a blocking operation.
``` :::
Here is a figure illustrating how the above 3 abstractions are organized: Here is a figure illustrating how the above 3 abstractions are organized:
```{image} /assets/features/disagg_prefill/abstraction.jpg :::{image} /assets/features/disagg_prefill/abstraction.jpg
:alt: Disaggregated prefilling abstractions :alt: Disaggregated prefilling abstractions
``` :::
The workflow of disaggregated prefilling is as follows: The workflow of disaggregated prefilling is as follows:
```{image} /assets/features/disagg_prefill/overview.jpg :::{image} /assets/features/disagg_prefill/overview.jpg
:alt: Disaggregated prefilling workflow :alt: Disaggregated prefilling workflow
``` :::
The `buffer` corresponds to the `insert` API in LookupBuffer, and the `drop_select` corresponds to the `drop_select` API in LookupBuffer. The `buffer` corresponds to the `insert` API in LookupBuffer, and the `drop_select` corresponds to the `drop_select` API in LookupBuffer.
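To make the `insert`/`drop_select` semantics above concrete, here is a toy, self-contained sketch. `ToyLookupBuffer` is a hypothetical stand-in and not vLLM's actual connector implementation, which also handles tensors, matching conditions, and the transport layer.

```python
# Toy illustration of the LookupBuffer semantics described above.
# Not the vLLM implementation; only the blocking/non-blocking behaviour is modelled.
import threading
from typing import Any, Dict


class ToyLookupBuffer:
    def __init__(self) -> None:
        self._entries: Dict[Any, Any] = {}
        self._cv = threading.Condition()

    def insert(self, key: Any, kv_cache: Any) -> None:
        # Non-blocking: store the KV cache and wake up any waiting consumer.
        with self._cv:
            self._entries[key] = kv_cache
            self._cv.notify_all()

    def drop_select(self, key: Any) -> Any:
        # Blocking: wait until a matching KV cache arrives, then remove and return it.
        with self._cv:
            while key not in self._entries:
                self._cv.wait()
            return self._entries.pop(key)


# Producer (prefill instance) and consumer (decode instance) sides:
buf = ToyLookupBuffer()
threading.Thread(target=lambda: buf.insert("req-0", "kv-cache-tensors")).start()
print(buf.drop_select("req-0"))  # blocks until the KV cache for "req-0" is inserted
```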

View File

@ -60,9 +60,9 @@ vllm serve meta-llama/Llama-2-7b-hf \
--lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
``` ```
```{note} :::{note}
The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one.
``` :::
The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`, The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`,
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along

View File

@ -2,11 +2,11 @@
# AutoAWQ # AutoAWQ
```{warning} :::{warning}
Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better
accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency
inference with a small number of concurrent requests. vLLM's AWQ implementation has lower throughput than the unquantized version. inference with a small number of concurrent requests. vLLM's AWQ implementation has lower throughput than the unquantized version.
``` :::
To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%.
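For orientation, a quantization run with AutoAWQ typically looks like the sketch below; the model name, output path, and `quant_config` values are illustrative, and the exact API may differ between AutoAWQ releases.

```python
# Sketch of 4-bit AWQ quantization with AutoAWQ; treat as an outline, not a fixed recipe.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "mistralai/Mistral-7B-Instruct-v0.2"   # example model
quant_path = "mistral-instruct-v0.2-awq"            # example output directory
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# Load the FP16 model and its tokenizer.
model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Quantize to 4-bit and save in a format vLLM can load with quantization="awq".
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
```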

View File

@ -14,10 +14,10 @@ The FP8 types typically supported in hardware have two distinct representations,
- **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and `nan`. - **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and `nan`.
- **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- `inf`, and `nan`. The tradeoff for the increased dynamic range is lower precision of the stored values. - **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- `inf`, and `nan`. The tradeoff for the increased dynamic range is lower precision of the stored values.
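These ranges can be checked directly in PyTorch, assuming a PyTorch build recent enough to ship the FP8 dtypes:

```python
# Inspect the numeric limits of the two FP8 formats described above.
# Requires a PyTorch version that ships float8 dtypes (>= 2.1).
import torch

for dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
    info = torch.finfo(dtype)
    print(f"{dtype}: max={info.max}, smallest normal={info.tiny}")
# torch.float8_e4m3fn reports max=448.0; torch.float8_e5m2 reports max=57344.0
```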
```{note} :::{note}
FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper). FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper).
FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin.
``` :::
## Quick Start with Online Dynamic Quantization ## Quick Start with Online Dynamic Quantization
@ -32,9 +32,9 @@ model = LLM("facebook/opt-125m", quantization="fp8")
result = model.generate("Hello, my name is") result = model.generate("Hello, my name is")
``` ```
```{warning} :::{warning}
Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model.
``` :::
## Installation ## Installation
@ -110,9 +110,9 @@ model.generate("Hello my name is")
Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`): Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`):
```{note} :::{note}
Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations. Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations.
``` :::
```console ```console
$ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic
@ -137,10 +137,10 @@ If you encounter any issues or have feature requests, please open an issue on th
## Deprecated Flow ## Deprecated Flow
```{note} :::{note}
The following information is preserved for reference and search purposes. The following information is preserved for reference and search purposes.
The quantization method described below is deprecated in favor of the `llmcompressor` method described above. The quantization method described below is deprecated in favor of the `llmcompressor` method described above.
``` :::
For static per-tensor offline quantization to FP8, please install the [AutoFP8 library](https://github.com/neuralmagic/autofp8). For static per-tensor offline quantization to FP8, please install the [AutoFP8 library](https://github.com/neuralmagic/autofp8).

View File

@ -2,13 +2,13 @@
# GGUF # GGUF
```{warning} :::{warning}
Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, and it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team. Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, and it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team.
``` :::
```{warning} :::{warning}
Currently, vLLM only supports loading single-file GGUF models. If you have a multi-file GGUF model, you can use the [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge the files into a single-file model. Currently, vLLM only supports loading single-file GGUF models. If you have a multi-file GGUF model, you can use the [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge the files into a single-file model.
``` :::
To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
@ -25,9 +25,9 @@ You can also add `--tensor-parallel-size 2` to enable tensor parallelism inferen
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
``` ```
```{warning} :::{warning}
We recommend using the tokenizer from the base model instead of the GGUF model, because the tokenizer conversion from GGUF is time-consuming and unstable, especially for models with a large vocabulary size. We recommend using the tokenizer from the base model instead of the GGUF model, because the tokenizer conversion from GGUF is time-consuming and unstable, especially for models with a large vocabulary size.
``` :::
You can also use the GGUF model directly through the LLM entrypoint: You can also use the GGUF model directly through the LLM entrypoint:
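The entrypoint example itself falls outside this hunk; a minimal sketch reusing the TinyLlama GGUF file and base-model tokenizer from the commands above might look like this:

```python
# Minimal sketch of offline inference with a local GGUF file.
# Paths mirror the `vllm serve` example above; adjust to your environment.
from vllm import LLM, SamplingParams

llm = LLM(
    model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # use the base-model tokenizer
)
outputs = llm.generate(
    ["How many helicopters can a human eat in one sitting?"],
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```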

View File

@ -4,7 +4,7 @@
Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices.
```{toctree} :::{toctree}
:caption: Contents :caption: Contents
:maxdepth: 1 :maxdepth: 1
@ -15,4 +15,4 @@ gguf
int8 int8
fp8 fp8
quantized_kvcache quantized_kvcache
``` :::

View File

@ -7,9 +7,9 @@ This quantization method is particularly useful for reducing model size while ma
Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415). Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415).
```{note} :::{note}
INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper). INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).
``` :::
## Prerequisites ## Prerequisites
@ -119,9 +119,9 @@ $ lm_eval --model vllm \
--batch_size 'auto' --batch_size 'auto'
``` ```
```{note} :::{note}
Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations.
``` :::
## Best Practices ## Best Practices

View File

@ -4,128 +4,129 @@
The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
```{list-table} :::{list-table}
:header-rows: 1 :header-rows: 1
:widths: 20 8 8 8 8 8 8 8 8 8 8 :widths: 20 8 8 8 8 8 8 8 8 8 8
* - Implementation - * Implementation
- Volta * Volta
- Turing * Turing
- Ampere * Ampere
- Ada * Ada
- Hopper * Hopper
- AMD GPU * AMD GPU
- Intel GPU * Intel GPU
- x86 CPU * x86 CPU
- AWS Inferentia * AWS Inferentia
- Google TPU * Google TPU
* - AWQ - * AWQ
- *
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- *
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- *
- *
* - GPTQ - * GPTQ
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- *
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- *
- *
* - Marlin (GPTQ/AWQ/FP8) - * Marlin (GPTQ/AWQ/FP8)
- *
- *
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- *
- *
- *
- *
- *
* - INT8 (W8A8) - * INT8 (W8A8)
- *
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- *
- *
- ✅︎ * ✅︎
- *
- *
* - FP8 (W8A8) - * FP8 (W8A8)
- *
- *
- *
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- *
- *
- *
- *
* - AQLM - * AQLM
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- *
- *
- *
- *
- *
* - bitsandbytes - * bitsandbytes
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- *
- *
- *
- *
- *
* - DeepSpeedFP - * DeepSpeedFP
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- *
- *
- *
- *
- *
* - GGUF - * GGUF
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- ✅︎ * ✅︎
- *
- *
- *
- *
```
:::
- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
- "✅︎" indicates that the quantization method is supported on the specified hardware. - "✅︎" indicates that the quantization method is supported on the specified hardware.
- "✗" indicates that the quantization method is not supported on the specified hardware. - "✗" indicates that the quantization method is not supported on the specified hardware.
```{note} :::{note}
This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team. For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
``` :::

View File

@ -0,0 +1,151 @@
(reasoning-outputs)=
# Reasoning Outputs
vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions.
Reasoning models return an additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models.
## Supported Models
vLLM currently supports the following reasoning models:
- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) (`deepseek_r1`, which looks for `<think> ... </think>`)
## Quickstart
To use reasoning models, you need to specify the `--enable-reasoning` and `--reasoning-parser` flags when launching the server. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output.
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--enable-reasoning --reasoning-parser deepseek_r1
```
Next, make a request to the model that should return the reasoning content in the response.
```python
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Round 1
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
response = client.chat.completions.create(model=model, messages=messages)
reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content
print("reasoning_content:", reasoning_content)
print("content:", content)
```
The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
## Streaming chat completions
Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
```json
{
  "id": "chatcmpl-123",
  "object": "chat.completion.chunk",
  "created": 1694268190,
  "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
  "system_fingerprint": "fp_44709d6fcb",
  "choices": [
    {
      "index": 0,
      "delta": {
        "role": "assistant",
        "reasoning_content": "is"
      },
      "logprobs": null,
      "finish_reason": null
    }
  ]
}
```
Please note that the `reasoning_content` field in streaming responses is not compatible with the OpenAI Python client library, so you can use the `requests` library to make streaming requests.
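A minimal sketch of such a streaming request with `requests` is shown below; the payload follows the OpenAI-compatible chat API, and the SSE parsing is deliberately simplified.

```python
# Stream a chat completion and print reasoning_content deltas as they arrive.
# Simplified SSE handling: each "data:" line carries a JSON chunk; "[DONE]" ends the stream.
import json
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "messages": [{"role": "user", "content": "9.11 and 9.8, which is greater?"}],
        "stream": True,
    },
    stream=True,
)
for line in resp.iter_lines():
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    delta = json.loads(payload)["choices"][0]["delta"]
    if "reasoning_content" in delta:
        print(delta["reasoning_content"], end="", flush=True)
```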
## How to support a new reasoning model
You can add a new `ReasoningParser` similar to `vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py`.
```python
# import the required packages
from typing import Optional, Sequence, Tuple, Union

from vllm.entrypoints.openai.reasoning_parsers.abs_reasoning_parsers import (
    ReasoningParser, ReasoningParserManager)
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage)
# AnyTokenizer is vLLM's tokenizer union type; the import path below assumes
# the layout used by the existing parsers.
from vllm.transformers_utils.tokenizer import AnyTokenizer


# define a reasoning parser and register it to vllm
# the name list in register_module can be used
# in --reasoning-parser.
@ReasoningParserManager.register_module(["example"])
class ExampleParser(ReasoningParser):
    def __init__(self, tokenizer: AnyTokenizer):
        super().__init__(tokenizer)

    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> Union[DeltaMessage, None]:
        """
        Instance method that should be implemented for extracting reasoning
        from an incomplete response; for use when handling reasoning calls and
        streaming. Has to be an instance method because it requires state -
        the current tokens/diffs, but also the information about what has
        previously been parsed and extracted (see constructor)
        """

    def extract_reasoning_content(
        self, model_output: str, request: ChatCompletionRequest
    ) -> Tuple[Optional[str], Optional[str]]:
        """
        Extract reasoning content from a complete model-generated string.

        Used for non-streaming responses where we have the entire model
        response available before sending to the client.

        Parameters:
        model_output: str
            The model-generated string to extract reasoning content from.
        request: ChatCompletionRequest
            The request object that was used to generate the model_output.

        Returns:
        Tuple[Optional[str], Optional[str]]
            A tuple containing the reasoning content and the content.
        """
```
After defining the reasoning parser, you can use it by specifying the `--reasoning-parser` flag when launching the server.
```bash
vllm serve <model_tag> \
--enable-reasoning --reasoning-parser example
```
## Limitations
- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).
- It is not compatible with the [`structured_outputs`](#structured_outputs) and [`tool_calling`](#tool_calling) features.
- The reasoning content is not available for all models. Check the model's documentation to see if it supports reasoning.

View File

@ -2,15 +2,15 @@
# Speculative Decoding # Speculative Decoding
```{warning} :::{warning}
Please note that speculative decoding in vLLM is not yet optimized and does Please note that speculative decoding in vLLM is not yet optimized and does
not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. not usually yield inter-token latency reductions for all prompt datasets or sampling parameters.
The work to optimize it is ongoing and can be followed here: <gh-issue:4630> The work to optimize it is ongoing and can be followed here: <gh-issue:4630>
``` :::
```{warning} :::{warning}
Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
``` :::
This document shows how to use [Speculative Decoding](https://x.com/karpathy/status/1697318534555336961) with vLLM. This document shows how to use [Speculative Decoding](https://x.com/karpathy/status/1697318534555336961) with vLLM.
Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference. Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference.
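As a concrete starting point, a draft-model setup might look like the sketch below; the model pair and `num_speculative_tokens` value are illustrative, and the exact keyword arguments depend on the vLLM version in use.

```python
# Sketch of offline speculative decoding with a small draft model proposing tokens
# for a larger target model; adjust names and settings to your environment.
from vllm import LLM, SamplingParams

llm = LLM(
    model="facebook/opt-6.7b",            # target model (example)
    speculative_model="facebook/opt-125m",  # draft model (example)
    num_speculative_tokens=5,             # tokens proposed per step (illustrative)
)
outputs = llm.generate(
    ["The future of AI is"],
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```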

View File

@ -95,10 +95,10 @@ completion = client.chat.completions.create(
print(completion.choices[0].message.content) print(completion.choices[0].message.content)
``` ```
```{tip} :::{tip}
While not strictly necessary, it is normally better to indicate in the prompt that a JSON needs to be generated, and to specify which fields the LLM should fill and how. While not strictly necessary, it is normally better to indicate in the prompt that a JSON needs to be generated, and to specify which fields the LLM should fill and how.
This can improve the results notably in most cases. This can improve the results notably in most cases.
``` :::
Finally we have `guided_grammar`, which is probably the most difficult one to use but is really powerful, as it allows us to define complete languages like SQL queries. Finally we have `guided_grammar`, which is probably the most difficult one to use but is really powerful, as it allows us to define complete languages like SQL queries.
It works by using a context-free EBNF grammar, which we can use, for example, to define a specific format of simplified SQL queries, like in the example below: It works by using a context-free EBNF grammar, which we can use, for example, to define a specific format of simplified SQL queries, like in the example below:

View File

@ -57,9 +57,9 @@ class Index:
def generate(self) -> str: def generate(self) -> str:
content = f"# {self.title}\n\n{self.description}\n\n" content = f"# {self.title}\n\n{self.description}\n\n"
content += "```{toctree}\n" content += ":::{toctree}\n"
content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n" content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n"
content += "\n".join(self.documents) + "\n```\n" content += "\n".join(self.documents) + "\n:::\n"
return content return content

View File

@ -59,6 +59,7 @@ To build and install vLLM from source, run:
```console ```console
git clone https://github.com/vllm-project/vllm.git git clone https://github.com/vllm-project/vllm.git
cd vllm cd vllm
pip install -r requirements-hpu.txt
python setup.py develop python setup.py develop
``` ```
@ -68,6 +69,7 @@ Currently, the latest features and performance optimizations are developed in Ga
git clone https://github.com/HabanaAI/vllm-fork.git git clone https://github.com/HabanaAI/vllm-fork.git
cd vllm-fork cd vllm-fork
git checkout habana_main git checkout habana_main
pip install -r requirements-hpu.txt
python setup.py develop python setup.py develop
``` ```
@ -84,9 +86,9 @@ docker build -f Dockerfile.hpu -t vllm-hpu-env .
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
``` ```
```{tip} :::{tip}
If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered.
``` :::
## Extra information ## Extra information
@ -153,30 +155,30 @@ Gaudi2 devices. Configurations that are not listed may or may not work.
Currently, vLLM for HPU supports four execution modes, depending on the selected HPU PyTorch Bridge backend (via the `PT_HPU_LAZY_MODE` environment variable) and the `--enforce-eager` flag. Currently, vLLM for HPU supports four execution modes, depending on the selected HPU PyTorch Bridge backend (via the `PT_HPU_LAZY_MODE` environment variable) and the `--enforce-eager` flag.
```{list-table} vLLM execution modes :::{list-table} vLLM execution modes
:widths: 25 25 50 :widths: 25 25 50
:header-rows: 1 :header-rows: 1
* - `PT_HPU_LAZY_MODE` - * `PT_HPU_LAZY_MODE`
- `enforce_eager` * `enforce_eager`
- execution mode * execution mode
* - 0 - * 0
- 0 * 0
- torch.compile * torch.compile
* - 0 - * 0
- 1 * 1
- PyTorch eager mode * PyTorch eager mode
* - 1 - * 1
- 0 * 0
- HPU Graphs * HPU Graphs
* - 1 - * 1
- 1 * 1
- PyTorch lazy mode * PyTorch lazy mode
``` :::
```{warning} :::{warning}
In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
``` :::
(gaudi-bucketing-mechanism)= (gaudi-bucketing-mechanism)=
@ -185,9 +187,9 @@ In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and
Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.
In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently this is achieved by "bucketing" the model's forward pass across two dimensions - `batch_size` and `sequence_length`. In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently this is achieved by "bucketing" the model's forward pass across two dimensions - `batch_size` and `sequence_length`.
```{note} :::{note}
Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase.
``` :::
Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for the prompt and decode phases, and for the batch size and sequence length dimensions. These parameters can be observed in logs during vLLM startup: Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for the prompt and decode phases, and for the batch size and sequence length dimensions. These parameters can be observed in logs during vLLM startup:
@ -220,15 +222,15 @@ min = 128, step = 128, max = 512
In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across the batch and sequence length dimensions to the smallest possible bucket. In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across the batch and sequence length dimensions to the smallest possible bucket.
```{warning} :::{warning}
If a request exceeds the maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such a scenario. If a request exceeds the maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such a scenario.
``` :::
As an example, if a request with 3 sequences and a max sequence length of 412 comes in to an idle vLLM server, it will be padded and executed as a `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (the closest batch_size dimension higher than 3), and the max sequence length will be padded to 512 (the closest sequence length dimension higher than 412). After the prefill stage, it will be executed as a `(4, 512)` decode bucket and will continue as that bucket until either the batch dimension changes (due to a request being finished), in which case it will become a `(2, 512)` bucket, or the context length increases above 512 tokens, in which case it will become a `(4, 640)` bucket. As an example, if a request with 3 sequences and a max sequence length of 412 comes in to an idle vLLM server, it will be padded and executed as a `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (the closest batch_size dimension higher than 3), and the max sequence length will be padded to 512 (the closest sequence length dimension higher than 412). After the prefill stage, it will be executed as a `(4, 512)` decode bucket and will continue as that bucket until either the batch dimension changes (due to a request being finished), in which case it will become a `(2, 512)` bucket, or the context length increases above 512 tokens, in which case it will become a `(4, 640)` bucket.
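The round-up behaviour from this example can be sketched as follows; the bucket lists are toy values standing in for the generated `min`/`step`/`max` ranges, not the actual HPU bucket-generation logic.

```python
# Illustrative round-up to the smallest bucket that fits a (batch_size, seq_len) pair.
def next_bucket(value: int, buckets: list[int]) -> int:
    for b in buckets:
        if value <= b:
            return b
    return buckets[-1]  # beyond the max bucket: processed without padding in vLLM

bs_buckets = [1, 2, 4, 8, 16, 32, 64]          # toy batch_size buckets
seq_buckets = [128, 256, 384, 512, 640]        # toy sequence length buckets

batch_size, seq_len = 3, 412
print(next_bucket(batch_size, bs_buckets), next_bucket(seq_len, seq_buckets))  # -> 4 512
```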
```{note} :::{note}
Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
``` :::
### Warmup ### Warmup
@ -250,9 +252,9 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size
This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to the execution of a single bucket. When a bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to the execution of a single bucket. When a bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
```{tip} :::{tip}
Compiling all the buckets might take some time and can be turned off with the `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations when executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. Compiling all the buckets might take some time and can be turned off with the `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations when executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
``` :::
### HPU Graph capture ### HPU Graph capture
@ -267,9 +269,9 @@ With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory wil
Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints. Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints.
A lower value corresponds to less usable graph memory reserved for the prefill stage; e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. A lower value corresponds to less usable graph memory reserved for the prefill stage; e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs.
```{note} :::{note}
`gpu_memory_utilization` does not correspond to the absolute memory usage across the HPU. It specifies the memory margin after loading the model and performing a profile run. If the device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing the profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. `gpu_memory_utilization` does not correspond to the absolute memory usage across the HPU. It specifies the memory margin after loading the model and performing a profile run. If the device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing the profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory.
``` :::
The user can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. The strategy affects the order of capturing graphs. There are two strategies implemented: The user can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. The strategy affects the order of capturing graphs. There are two strategies implemented:
\- `max_bs` - the graph capture queue will be sorted in descending order by batch size. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode \- `max_bs` - the graph capture queue will be sorted in descending order by batch size. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode
@ -277,9 +279,9 @@ User can also configure the strategy for capturing HPU Graphs for prompt and dec
When there is a large number of requests pending, the vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, the decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, the decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by the `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in the `min_tokens` strategy. When there is a large number of requests pending, the vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, the decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, the decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by the `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in the `min_tokens` strategy.
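To make the two orderings concrete, here is a small sketch of how a capture queue could be sorted under each strategy; the buckets are illustrative, and it assumes `min_tokens` orders buckets by total token count (`batch_size * sequence_length`), as its name suggests.

```python
# Toy (batch_size, seq_len) buckets, sorted as described for the two strategies.
buckets = [(64, 128), (32, 256), (1, 128), (64, 256), (32, 128), (1, 256)]

# max_bs: descending batch size, ties broken by ascending sequence length (decode default).
max_bs = sorted(buckets, key=lambda b: (-b[0], b[1]))

# min_tokens (assumed): ascending total tokens per bucket, i.e. batch_size * seq_len (prompt default).
min_tokens = sorted(buckets, key=lambda b: b[0] * b[1])

print(max_bs)      # [(64, 128), (64, 256), (32, 128), (32, 256), (1, 128), (1, 256)]
print(min_tokens)  # [(1, 128), (1, 256), (32, 128), ...]
```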
```{note} :::{note}
`VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up the entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt to do the same for decode graphs and the usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within the usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding the reserved memory pool. The behavior of this mechanism can be observed in the example below. `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up the entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt to do the same for decode graphs and the usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within the usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding the reserved memory pool. The behavior of this mechanism can be observed in the example below.
``` :::
Each described step is logged by the vLLM server, as follows (negative values correspond to memory being released): Each described step is logged by the vLLM server, as follows (negative values correspond to memory being released):
@ -350,13 +352,13 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi
- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism - `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism
- `{phase}` is either `PROMPT` or `DECODE` * `{phase}` is either `PROMPT` or `DECODE`
- `{dim}` is either `BS`, `SEQ` or `BLOCK` * `{dim}` is either `BS`, `SEQ` or `BLOCK`
- `{param}` is either `MIN`, `STEP` or `MAX` * `{param}` is either `MIN`, `STEP` or `MAX`
- Default values: * Default values:
- Prompt: - Prompt:
- batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1`

View File

@ -2,374 +2,374 @@
vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor specific instructions: vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor specific instructions:
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} TPU ::::{tab-item} TPU
:sync: tpu :sync: tpu
```{include} tpu.inc.md :::{include} tpu.inc.md
:start-after: "# Installation" :start-after: "# Installation"
:end-before: "## Requirements" :end-before: "## Requirements"
```
:::
:::{tab-item} Intel Gaudi
:sync: hpu-gaudi
```{include} hpu-gaudi.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
```
:::
:::{tab-item} Neuron
:sync: neuron
```{include} neuron.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
```
:::
:::{tab-item} OpenVINO
:sync: openvino
```{include} openvino.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
```
::: :::
:::: ::::
::::{tab-item} Intel Gaudi
:sync: hpu-gaudi
:::{include} hpu-gaudi.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
:::
::::
::::{tab-item} Neuron
:sync: neuron
:::{include} neuron.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
:::
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} openvino.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
:::
::::
:::::
## Requirements ## Requirements
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} TPU ::::{tab-item} TPU
:sync: tpu :sync: tpu
```{include} tpu.inc.md :::{include} tpu.inc.md
:start-after: "## Requirements" :start-after: "## Requirements"
:end-before: "## Configure a new environment" :end-before: "## Configure a new environment"
```
:::
:::{tab-item} Intel Gaudi
:sync: hpu-gaudi
```{include} hpu-gaudi.inc.md
:start-after: "## Requirements"
:end-before: "## Configure a new environment"
```
:::
:::{tab-item} Neuron
:sync: neuron
```{include} neuron.inc.md
:start-after: "## Requirements"
:end-before: "## Configure a new environment"
```
:::
:::{tab-item} OpenVINO
:sync: openvino
```{include} openvino.inc.md
:start-after: "## Requirements"
:end-before: "## Set up using Python"
```
::: :::
:::: ::::
::::{tab-item} Intel Gaudi
:sync: hpu-gaudi
:::{include} hpu-gaudi.inc.md
:start-after: "## Requirements"
:end-before: "## Configure a new environment"
:::
::::
::::{tab-item} Neuron
:sync: neuron
:::{include} neuron.inc.md
:start-after: "## Requirements"
:end-before: "## Configure a new environment"
:::
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} openvino.inc.md
:start-after: "## Requirements"
:end-before: "## Set up using Python"
:::
::::
:::::
## Configure a new environment ## Configure a new environment
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} TPU ::::{tab-item} TPU
:sync: tpu :sync: tpu
```{include} tpu.inc.md :::{include} tpu.inc.md
:start-after: "## Configure a new environment" :start-after: "## Configure a new environment"
:end-before: "## Set up using Python" :end-before: "## Set up using Python"
```
:::
:::{tab-item} Intel Gaudi
:sync: hpu-gaudi
```{include} hpu-gaudi.inc.md
:start-after: "## Configure a new environment"
:end-before: "## Set up using Python"
```
:::
:::{tab-item} Neuron
:sync: neuron
```{include} neuron.inc.md
:start-after: "## Configure a new environment"
:end-before: "## Set up using Python"
```
:::
:::{tab-item} OpenVINO
:sync: openvino
```{include} ../python_env_setup.inc.md
```
::: :::
:::: ::::
::::{tab-item} Intel Gaudi
:sync: hpu-gaudi
:::{include} hpu-gaudi.inc.md
:start-after: "## Configure a new environment"
:end-before: "## Set up using Python"
:::
::::
::::{tab-item} Neuron
:sync: neuron
:::{include} neuron.inc.md
:start-after: "## Configure a new environment"
:end-before: "## Set up using Python"
:::
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} ../python_env_setup.inc.md
:::
::::
:::::
## Set up using Python ## Set up using Python
### Pre-built wheels ### Pre-built wheels
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} TPU ::::{tab-item} TPU
:sync: tpu :sync: tpu
```{include} tpu.inc.md :::{include} tpu.inc.md
:start-after: "### Pre-built wheels" :start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source" :end-before: "### Build wheel from source"
```
:::
:::{tab-item} Intel Gaudi
:sync: hpu-gaudi
```{include} hpu-gaudi.inc.md
:start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source"
```
:::
:::{tab-item} Neuron
:sync: neuron
```{include} neuron.inc.md
:start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source"
```
:::
:::{tab-item} OpenVINO
:sync: openvino
```{include} openvino.inc.md
:start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source"
```
::: :::
:::: ::::
::::{tab-item} Intel Gaudi
:sync: hpu-gaudi
:::{include} hpu-gaudi.inc.md
:start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source"
:::
::::
::::{tab-item} Neuron
:sync: neuron
:::{include} neuron.inc.md
:start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source"
:::
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} openvino.inc.md
:start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source"
:::
::::
:::::
### Build wheel from source ### Build wheel from source
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} TPU ::::{tab-item} TPU
:sync: tpu :sync: tpu
```{include} tpu.inc.md :::{include} tpu.inc.md
:start-after: "### Build wheel from source" :start-after: "### Build wheel from source"
:end-before: "## Set up using Docker" :end-before: "## Set up using Docker"
```
:::
:::{tab-item} Intel Gaudi
:sync: hpu-gaudi
```{include} hpu-gaudi.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
```
:::
:::{tab-item} Neuron
:sync: neuron
```{include} neuron.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
```
:::
:::{tab-item} OpenVINO
:sync: openvino
```{include} openvino.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
```
::: :::
:::: ::::
::::{tab-item} Intel Gaudi
:sync: hpu-gaudi
:::{include} hpu-gaudi.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
:::
::::
::::{tab-item} Neuron
:sync: neuron
:::{include} neuron.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
:::
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} openvino.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
:::
::::
:::::
## Set up using Docker ## Set up using Docker
### Pre-built images ### Pre-built images
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} TPU ::::{tab-item} TPU
:sync: tpu :sync: tpu
```{include} tpu.inc.md :::{include} tpu.inc.md
:start-after: "### Pre-built images" :start-after: "### Pre-built images"
:end-before: "### Build image from source" :end-before: "### Build image from source"
```
:::
:::{tab-item} Intel Gaudi
:sync: hpu-gaudi
```{include} hpu-gaudi.inc.md
:start-after: "### Pre-built images"
:end-before: "### Build image from source"
```
:::
:::{tab-item} Neuron
:sync: neuron
```{include} neuron.inc.md
:start-after: "### Pre-built images"
:end-before: "### Build image from source"
```
:::
:::{tab-item} OpenVINO
:sync: openvino
```{include} openvino.inc.md
:start-after: "### Pre-built images"
:end-before: "### Build image from source"
```
::: :::
:::: ::::
::::{tab-item} Intel Gaudi
:sync: hpu-gaudi
:::{include} hpu-gaudi.inc.md
:start-after: "### Pre-built images"
:end-before: "### Build image from source"
:::
::::
::::{tab-item} Neuron
:sync: neuron
:::{include} neuron.inc.md
:start-after: "### Pre-built images"
:end-before: "### Build image from source"
:::
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} openvino.inc.md
:start-after: "### Pre-built images"
:end-before: "### Build image from source"
:::
::::
:::::
### Build image from source ### Build image from source
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} TPU ::::{tab-item} TPU
:sync: tpu :sync: tpu
```{include} tpu.inc.md :::{include} tpu.inc.md
:start-after: "### Build image from source" :start-after: "### Build image from source"
:end-before: "## Extra information" :end-before: "## Extra information"
```
:::
:::{tab-item} Intel Gaudi
:sync: hpu-gaudi
```{include} hpu-gaudi.inc.md
:start-after: "### Build image from source"
:end-before: "## Extra information"
```
:::
:::{tab-item} Neuron
:sync: neuron
```{include} neuron.inc.md
:start-after: "### Build image from source"
:end-before: "## Extra information"
```
:::
:::{tab-item} OpenVINO
:sync: openvino
```{include} openvino.inc.md
:start-after: "### Build image from source"
:end-before: "## Extra information"
```
::: :::
:::: ::::
::::{tab-item} Intel Gaudi
:sync: hpu-gaudi
:::{include} hpu-gaudi.inc.md
:start-after: "### Build image from source"
:end-before: "## Extra information"
:::
::::
::::{tab-item} Neuron
:sync: neuron
:::{include} neuron.inc.md
:start-after: "### Build image from source"
:end-before: "## Extra information"
:::
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} openvino.inc.md
:start-after: "### Build image from source"
:end-before: "## Extra information"
:::
::::
:::::
## Extra information ## Extra information
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} TPU ::::{tab-item} TPU
:sync: tpu :sync: tpu
```{include} tpu.inc.md :::{include} tpu.inc.md
:start-after: "## Extra information" :start-after: "## Extra information"
```
:::
:::{tab-item} Intel Gaudi
:sync: hpu-gaudi
```{include} hpu-gaudi.inc.md
:start-after: "## Extra information"
```
:::
:::{tab-item} Neuron
:sync: neuron
```{include} neuron.inc.md
:start-after: "## Extra information"
```
:::
:::{tab-item} OpenVINO
:sync: openvino
```{include} openvino.inc.md
:start-after: "## Extra information"
```
::: :::
:::: ::::
::::{tab-item} Intel Gaudi
:sync: hpu-gaudi
:::{include} hpu-gaudi.inc.md
:start-after: "## Extra information"
:::
::::
::::{tab-item} Neuron
:sync: neuron
:::{include} neuron.inc.md
:start-after: "## Extra information"
:::
::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} openvino.inc.md
:start-after: "## Extra information"
:::
::::
:::::

View File

@ -67,9 +67,9 @@ Currently, there are no pre-built Neuron wheels.
### Build wheel from source ### Build wheel from source
```{note} :::{note}
The currently supported version of PyTorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. The currently supported version of PyTorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
``` :::
The following instructions are applicable to Neuron SDK 2.16 and beyond. The following instructions are applicable to Neuron SDK 2.16 and beyond.

View File

@ -47,10 +47,10 @@ When you request queued resources, the request is added to a queue maintained by
the Cloud TPU service. When the requested resource becomes available, it's the Cloud TPU service. When the requested resource becomes available, it's
assigned to your Google Cloud project for your immediate exclusive use. assigned to your Google Cloud project for your immediate exclusive use.
```{note} :::{note}
In all of the following commands, replace the ALL CAPS parameter names with In all of the following commands, replace the ALL CAPS parameter names with
appropriate values. See the parameter descriptions table for more information. appropriate values. See the parameter descriptions table for more information.
``` :::
### Provision Cloud TPUs with GKE ### Provision Cloud TPUs with GKE
@ -75,33 +75,33 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
--service-account SERVICE_ACCOUNT --service-account SERVICE_ACCOUNT
``` ```
```{list-table} Parameter descriptions :::{list-table} Parameter descriptions
:header-rows: 1 :header-rows: 1
* - Parameter name - * Parameter name
- Description * Description
* - QUEUED_RESOURCE_ID - * QUEUED_RESOURCE_ID
- The user-assigned ID of the queued resource request. * The user-assigned ID of the queued resource request.
* - TPU_NAME - * TPU_NAME
- The user-assigned name of the TPU which is created when the queued * The user-assigned name of the TPU which is created when the queued
resource request is allocated. resource request is allocated.
* - PROJECT_ID - * PROJECT_ID
- Your Google Cloud project * Your Google Cloud project
* - ZONE - * ZONE
- The GCP zone where you want to create your Cloud TPU. The value you use * The GCP zone where you want to create your Cloud TPU. The value you use
depends on the version of TPUs you are using. For more information, see depends on the version of TPUs you are using. For more information, see
`TPU regions and zones <https://cloud.google.com/tpu/docs/regions-zones>`_ `TPU regions and zones <https://cloud.google.com/tpu/docs/regions-zones>`_
* - ACCELERATOR_TYPE - * ACCELERATOR_TYPE
- The TPU version you want to use. Specify the TPU version, for example * The TPU version you want to use. Specify the TPU version, for example
`v5litepod-4` specifies a v5e TPU with 4 cores. For more information, `v5litepod-4` specifies a v5e TPU with 4 cores. For more information,
see `TPU versions <https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions>`_. see `TPU versions <https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions>`_.
* - RUNTIME_VERSION - * RUNTIME_VERSION
- The TPU VM runtime version to use. For more information see `TPU VM images <https://cloud.google.com/tpu/docs/runtimes>`_. * The TPU VM runtime version to use. For more information see `TPU VM images <https://cloud.google.com/tpu/docs/runtimes>`_.
* - SERVICE_ACCOUNT - * SERVICE_ACCOUNT
- The email address for your service account. You can find it in the IAM * The email address for your service account. You can find it in the IAM
Cloud Console under *Service Accounts*. For example: Cloud Console under *Service Accounts*. For example:
`tpu-service-account@<your_project_ID>.iam.gserviceaccount.com` `tpu-service-account@<your_project_ID>.iam.gserviceaccount.com`
``` :::
Connect to your TPU using SSH: Connect to your TPU using SSH:
@ -178,15 +178,15 @@ Run the Docker image with the following command:
docker run --privileged --net host --shm-size=16G -it vllm-tpu docker run --privileged --net host --shm-size=16G -it vllm-tpu
``` ```
```{note} :::{note}
Since TPU relies on XLA which requires static shapes, vLLM bucketizes the Since TPU relies on XLA which requires static shapes, vLLM bucketizes the
possible input shapes and compiles an XLA graph for each shape. The possible input shapes and compiles an XLA graph for each shape. The
compilation time may take 20~30 minutes in the first run. However, the compilation time may take 20~30 minutes in the first run. However, the
compilation time reduces to ~5 minutes afterwards because the XLA graphs are compilation time reduces to ~5 minutes afterwards because the XLA graphs are
cached in the disk (in {code}`VLLM_XLA_CACHE_PATH` or {code}`~/.cache/vllm/xla_cache` by default). cached in the disk (in {code}`VLLM_XLA_CACHE_PATH` or {code}`~/.cache/vllm/xla_cache` by default).
``` :::
````{tip} :::{tip}
If you encounter the following error: If you encounter the following error:
```console ```console
@ -198,9 +198,10 @@ file or directory
Install OpenBLAS with the following command: Install OpenBLAS with the following command:
```console ```console
$ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
``` ```
````
:::
## Extra information ## Extra information

View File

@ -25,9 +25,9 @@ pip install -r requirements-cpu.txt
pip install -e . pip install -e .
``` ```
```{note} :::{note}
On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
``` :::
#### Troubleshooting #### Troubleshooting


@ -2,86 +2,86 @@
vLLM is a Python library that supports the following CPU variants. Select your CPU type to see vendor specific instructions: vLLM is a Python library that supports the following CPU variants. Select your CPU type to see vendor specific instructions:
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} x86 ::::{tab-item} x86
:sync: x86 :sync: x86
```{include} x86.inc.md :::{include} x86.inc.md
:start-after: "# Installation" :start-after: "# Installation"
:end-before: "## Requirements" :end-before: "## Requirements"
```
:::
:::{tab-item} ARM
:sync: arm
```{include} arm.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
```
:::
:::{tab-item} Apple silicon
:sync: apple
```{include} apple.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
```
::: :::
:::: ::::
::::{tab-item} ARM
:sync: arm
:::{include} arm.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
:::
::::
::::{tab-item} Apple silicon
:sync: apple
:::{include} apple.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
:::
::::
:::::
## Requirements ## Requirements
- Python: 3.9 -- 3.12 - Python: 3.9 -- 3.12
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} x86 ::::{tab-item} x86
:sync: x86 :sync: x86
```{include} x86.inc.md :::{include} x86.inc.md
:start-after: "## Requirements" :start-after: "## Requirements"
:end-before: "## Set up using Python" :end-before: "## Set up using Python"
```
:::
:::{tab-item} ARM
:sync: arm
```{include} arm.inc.md
:start-after: "## Requirements"
:end-before: "## Set up using Python"
```
:::
:::{tab-item} Apple silicon
:sync: apple
```{include} apple.inc.md
:start-after: "## Requirements"
:end-before: "## Set up using Python"
```
::: :::
:::: ::::
::::{tab-item} ARM
:sync: arm
:::{include} arm.inc.md
:start-after: "## Requirements"
:end-before: "## Set up using Python"
:::
::::
::::{tab-item} Apple silicon
:sync: apple
:::{include} apple.inc.md
:start-after: "## Requirements"
:end-before: "## Set up using Python"
:::
::::
:::::
## Set up using Python ## Set up using Python
### Create a new Python environment ### Create a new Python environment
```{include} ../python_env_setup.inc.md :::{include} ../python_env_setup.inc.md
``` :::
### Pre-built wheels ### Pre-built wheels
@ -89,41 +89,41 @@ Currently, there are no pre-built CPU wheels.
### Build wheel from source ### Build wheel from source
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} x86 ::::{tab-item} x86
:sync: x86 :sync: x86
```{include} x86.inc.md :::{include} x86.inc.md
:start-after: "### Build wheel from source" :start-after: "### Build wheel from source"
:end-before: "## Set up using Docker" :end-before: "## Set up using Docker"
```
:::
:::{tab-item} ARM
:sync: arm
```{include} arm.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
```
:::
:::{tab-item} Apple silicon
:sync: apple
```{include} apple.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
```
::: :::
:::: ::::
::::{tab-item} ARM
:sync: arm
:::{include} arm.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
:::
::::
::::{tab-item} Apple silicon
:sync: apple
:::{include} apple.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
:::
::::
:::::
## Set up using Docker ## Set up using Docker
### Pre-built images ### Pre-built images
@ -142,9 +142,9 @@ $ docker run -it \
vllm-cpu-env vllm-cpu-env
``` ```
:::{tip} ::::{tip}
For ARM or Apple silicon, use `Dockerfile.arm` For ARM or Apple silicon, use `Dockerfile.arm`
::: ::::
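For example, building the same image on an ARM or Apple-silicon host might look like the sketch below (the `vllm-cpu-env` tag matches the run command above; adjust build flags to your setup):

```console
docker build -f Dockerfile.arm -t vllm-cpu-env .
```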
## Supported features ## Supported features


@ -17,10 +17,10 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform,
:::{include} build.inc.md :::{include} build.inc.md
::: :::
```{note} :::{note}
- AVX512_BF16 is an ISA extension that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script checks the host CPU flags to determine whether to enable AVX512_BF16.
- If you want to force-enable AVX512_BF16 for cross-compilation, set the environment variable `VLLM_CPU_AVX512BF16=1` before building.
``` :::
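For example, a forced cross-compilation build could export the flag before running the build steps included above (a sketch; the actual build command comes from `build.inc.md`):

```console
export VLLM_CPU_AVX512BF16=1
# ...then run the CPU build steps from build.inc.md as usual
```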
## Set up using Docker ## Set up using Docker


@ -10,9 +10,9 @@ vLLM contains pre-compiled C++ and CUDA (12.1) binaries.
### Create a new Python environment ### Create a new Python environment
```{note} :::{note}
PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details. PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
``` :::
In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
@ -100,10 +100,10 @@ pip install --editable .
You can find more information about vLLM's wheels in <project:#install-the-latest-code>. You can find more information about vLLM's wheels in <project:#install-the-latest-code>.
```{note} :::{note}
There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to <project:#install-the-latest-code> for instructions on how to install a specified wheel. It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to <project:#install-the-latest-code> for instructions on how to install a specified wheel.
``` :::
#### Full build (with compilation) #### Full build (with compilation)
@ -115,7 +115,7 @@ cd vllm
pip install -e . pip install -e .
``` ```
```{tip} :::{tip}
Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` .
@ -123,7 +123,7 @@ As long as `which ccache` command can find the `ccache` binary, it will be used
[sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
``` :::
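Putting the pieces of this tip together, a cached build environment might be prepared roughly like this (values copied from above; the remote cache only applies if you have access to that bucket):

```console
# local compiler cache
apt install ccache          # or: conda install ccache

# optional: remote cache via sccache
export SCCACHE_BUCKET=vllm-build-sccache
export SCCACHE_REGION=us-west-2
export SCCACHE_S3_NO_CREDENTIALS=1
export SCCACHE_IDLE_TIMEOUT=0
```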
##### Use an existing PyTorch installation ##### Use an existing PyTorch installation


@ -2,299 +2,299 @@
vLLM is a Python library that supports the following GPU variants. Select your GPU type to see vendor specific instructions: vLLM is a Python library that supports the following GPU variants. Select your GPU type to see vendor specific instructions:
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} CUDA ::::{tab-item} CUDA
:sync: cuda :sync: cuda
```{include} cuda.inc.md :::{include} cuda.inc.md
:start-after: "# Installation" :start-after: "# Installation"
:end-before: "## Requirements" :end-before: "## Requirements"
```
:::
:::{tab-item} ROCm
:sync: rocm
```{include} rocm.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
```
:::
:::{tab-item} XPU
:sync: xpu
```{include} xpu.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
```
::: :::
:::: ::::
::::{tab-item} ROCm
:sync: rocm
:::{include} rocm.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
:::
::::
::::{tab-item} XPU
:sync: xpu
:::{include} xpu.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
:::
::::
:::::
## Requirements ## Requirements
- OS: Linux - OS: Linux
- Python: 3.9 -- 3.12 - Python: 3.9 -- 3.12
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} CUDA ::::{tab-item} CUDA
:sync: cuda :sync: cuda
```{include} cuda.inc.md :::{include} cuda.inc.md
:start-after: "## Requirements" :start-after: "## Requirements"
:end-before: "## Set up using Python" :end-before: "## Set up using Python"
```
:::
:::{tab-item} ROCm
:sync: rocm
```{include} rocm.inc.md
:start-after: "## Requirements"
:end-before: "## Set up using Python"
```
:::
:::{tab-item} XPU
:sync: xpu
```{include} xpu.inc.md
:start-after: "## Requirements"
:end-before: "## Set up using Python"
```
::: :::
:::: ::::
::::{tab-item} ROCm
:sync: rocm
:::{include} rocm.inc.md
:start-after: "## Requirements"
:end-before: "## Set up using Python"
:::
::::
::::{tab-item} XPU
:sync: xpu
:::{include} xpu.inc.md
:start-after: "## Requirements"
:end-before: "## Set up using Python"
:::
::::
:::::
## Set up using Python ## Set up using Python
### Create a new Python environment ### Create a new Python environment
```{include} ../python_env_setup.inc.md :::{include} ../python_env_setup.inc.md
```
::::{tab-set}
:sync-group: device
:::{tab-item} CUDA
:sync: cuda
```{include} cuda.inc.md
:start-after: "## Create a new Python environment"
:end-before: "### Pre-built wheels"
```
::: :::
:::{tab-item} ROCm :::::{tab-set}
:sync-group: device
::::{tab-item} CUDA
:sync: cuda
:::{include} cuda.inc.md
:start-after: "## Create a new Python environment"
:end-before: "### Pre-built wheels"
:::
::::
::::{tab-item} ROCm
:sync: rocm :sync: rocm
There is no extra information on creating a new Python environment for this device. There is no extra information on creating a new Python environment for this device.
::: ::::
:::{tab-item} XPU ::::{tab-item} XPU
:sync: xpu :sync: xpu
There is no extra information on creating a new Python environment for this device. There is no extra information on creating a new Python environment for this device.
:::
:::: ::::
:::::
### Pre-built wheels ### Pre-built wheels
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} CUDA ::::{tab-item} CUDA
:sync: cuda :sync: cuda
```{include} cuda.inc.md :::{include} cuda.inc.md
:start-after: "### Pre-built wheels" :start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source" :end-before: "### Build wheel from source"
```
:::
:::{tab-item} ROCm
:sync: rocm
```{include} rocm.inc.md
:start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source"
```
:::
:::{tab-item} XPU
:sync: xpu
```{include} xpu.inc.md
:start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source"
```
::: :::
:::: ::::
::::{tab-item} ROCm
:sync: rocm
:::{include} rocm.inc.md
:start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source"
:::
::::
::::{tab-item} XPU
:sync: xpu
:::{include} xpu.inc.md
:start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source"
:::
::::
:::::
(build-from-source)= (build-from-source)=
### Build wheel from source ### Build wheel from source
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} CUDA ::::{tab-item} CUDA
:sync: cuda :sync: cuda
```{include} cuda.inc.md :::{include} cuda.inc.md
:start-after: "### Build wheel from source" :start-after: "### Build wheel from source"
:end-before: "## Set up using Docker" :end-before: "## Set up using Docker"
```
:::
:::{tab-item} ROCm
:sync: rocm
```{include} rocm.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
```
:::
:::{tab-item} XPU
:sync: xpu
```{include} xpu.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
```
::: :::
:::: ::::
::::{tab-item} ROCm
:sync: rocm
:::{include} rocm.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
:::
::::
::::{tab-item} XPU
:sync: xpu
:::{include} xpu.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
:::
::::
:::::
## Set up using Docker ## Set up using Docker
### Pre-built images ### Pre-built images
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} CUDA ::::{tab-item} CUDA
:sync: cuda :sync: cuda
```{include} cuda.inc.md :::{include} cuda.inc.md
:start-after: "### Pre-built images" :start-after: "### Pre-built images"
:end-before: "### Build image from source" :end-before: "### Build image from source"
```
:::
:::{tab-item} ROCm
:sync: rocm
```{include} rocm.inc.md
:start-after: "### Pre-built images"
:end-before: "### Build image from source"
```
:::
:::{tab-item} XPU
:sync: xpu
```{include} xpu.inc.md
:start-after: "### Pre-built images"
:end-before: "### Build image from source"
```
::: :::
:::: ::::
::::{tab-item} ROCm
:sync: rocm
:::{include} rocm.inc.md
:start-after: "### Pre-built images"
:end-before: "### Build image from source"
:::
::::
::::{tab-item} XPU
:sync: xpu
:::{include} xpu.inc.md
:start-after: "### Pre-built images"
:end-before: "### Build image from source"
:::
::::
:::::
### Build image from source ### Build image from source
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} CUDA ::::{tab-item} CUDA
:sync: cuda :sync: cuda
```{include} cuda.inc.md :::{include} cuda.inc.md
:start-after: "### Build image from source" :start-after: "### Build image from source"
:end-before: "## Supported features" :end-before: "## Supported features"
```
:::
:::{tab-item} ROCm
:sync: rocm
```{include} rocm.inc.md
:start-after: "### Build image from source"
:end-before: "## Supported features"
```
:::
:::{tab-item} XPU
:sync: xpu
```{include} xpu.inc.md
:start-after: "### Build image from source"
:end-before: "## Supported features"
```
::: :::
:::: ::::
::::{tab-item} ROCm
:sync: rocm
:::{include} rocm.inc.md
:start-after: "### Build image from source"
:end-before: "## Supported features"
:::
::::
::::{tab-item} XPU
:sync: xpu
:::{include} xpu.inc.md
:start-after: "### Build image from source"
:end-before: "## Supported features"
:::
::::
:::::
## Supported features ## Supported features
::::{tab-set} :::::{tab-set}
:sync-group: device :sync-group: device
:::{tab-item} CUDA ::::{tab-item} CUDA
:sync: cuda :sync: cuda
```{include} cuda.inc.md :::{include} cuda.inc.md
:start-after: "## Supported features" :start-after: "## Supported features"
```
:::
:::{tab-item} ROCm
:sync: rocm
```{include} rocm.inc.md
:start-after: "## Supported features"
```
:::
:::{tab-item} XPU
:sync: xpu
```{include} xpu.inc.md
:start-after: "## Supported features"
```
::: :::
:::: ::::
::::{tab-item} ROCm
:sync: rocm
:::{include} rocm.inc.md
:start-after: "## Supported features"
:::
::::
::::{tab-item} XPU
:sync: xpu
:::{include} xpu.inc.md
:start-after: "## Supported features"
:::
::::
:::::


@ -16,10 +16,10 @@ Currently, there are no pre-built ROCm wheels.
However, the [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized However, the [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator. docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
```{tip} :::{tip}
Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html) Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html)
for instructions on how to use this prebuilt docker image. for instructions on how to use this prebuilt docker image.
``` :::
### Build wheel from source ### Build wheel from source
@ -47,9 +47,9 @@ for instructions on how to use this prebuilt docker image.
cd ../.. cd ../..
``` ```
```{note} :::{note}
If you see an HTTP issue related to downloading packages while building Triton, please try again, as the HTTP error is intermittent.
``` :::
2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) 2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile)
@ -67,9 +67,9 @@ for instructions on how to use this prebuilt docker image.
cd .. cd ..
``` ```
```{note} :::{note}
You might need to downgrade the "ninja" version to 1.10, as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`).
``` :::
3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps: 3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps:
@ -95,17 +95,18 @@ for instructions on how to use this prebuilt docker image.
This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
```{tip} <!--- pyml disable-num-lines 5 ul-indent-->
:::{tip}
- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
- The ROCm version of PyTorch, ideally, should match the ROCm driver version. - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
``` :::
```{tip} :::{tip}
- For MI300x (gfx942) users, to achieve optimal performance, please refer to the [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips at the system and workflow level.
For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
``` :::
## Set up using Docker ## Set up using Docker


@ -30,10 +30,10 @@ pip install -v -r requirements-xpu.txt
VLLM_TARGET_DEVICE=xpu python setup.py install VLLM_TARGET_DEVICE=xpu python setup.py install
``` ```
```{note} :::{note}
- FP16 is the default data type in the current XPU backend. The BF16 data - FP16 is the default data type in the current XPU backend. The BF16 data
type will be supported in the future. type will be supported in the future.
``` :::
## Set up using Docker ## Set up using Docker


@ -4,10 +4,10 @@
vLLM supports the following hardware platforms: vLLM supports the following hardware platforms:
```{toctree} :::{toctree}
:maxdepth: 1 :maxdepth: 1
gpu/index gpu/index
cpu/index cpu/index
ai_accelerator/index ai_accelerator/index
``` :::


@ -6,9 +6,9 @@ conda create -n myenv python=3.12 -y
conda activate myenv conda activate myenv
``` ```
```{note} :::{note}
[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create the Python environment rather than to install packages.
``` :::
Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command: Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:


@ -32,9 +32,9 @@ conda activate myenv
pip install vllm pip install vllm
``` ```
```{note} :::{note}
For non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM. For non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM.
``` :::
(quickstart-offline)= (quickstart-offline)=
@ -69,9 +69,9 @@ The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model](
llm = LLM(model="facebook/opt-125m") llm = LLM(model="facebook/opt-125m")
``` ```
```{note} :::{note}
By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine. By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
``` :::
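For example, a minimal way to switch the download source is to export the variable before starting vLLM (assuming the usual truthy value):

```console
export VLLM_USE_MODELSCOPE=True
```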
Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens. Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens.
@ -97,10 +97,10 @@ Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instru
vllm serve Qwen/Qwen2.5-1.5B-Instruct vllm serve Qwen/Qwen2.5-1.5B-Instruct
``` ```
```{note} :::{note}
By default, the server uses a predefined chat template stored in the tokenizer. By default, the server uses a predefined chat template stored in the tokenizer.
You can learn about overriding it [here](#chat-template). You can learn about overriding it [here](#chat-template).
``` :::
This server can be queried in the same format as OpenAI API. For example, to list the models: This server can be queried in the same format as OpenAI API. For example, to list the models:


@ -4,9 +4,9 @@
This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
```{note} :::{note}
Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated.
``` :::
## Hangs downloading a model ## Hangs downloading a model
@ -18,9 +18,9 @@ It's recommended to download the model first using the [huggingface-cli](https:/
If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow.
It is better to store the model on a local disk. Additionally, keep an eye on CPU memory usage: when the model is too large, it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory.
```{note} :::{note}
To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck.
``` :::
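For example, a quick check with dummy weights might look like the following (the model name is just an example):

```console
vllm serve facebook/opt-125m --load-format dummy
```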
## Out of memory ## Out of memory
@ -132,14 +132,14 @@ If the script runs successfully, you should see the message `sanity check is suc
If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully. If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully.
```{note} :::{note}
A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments:
- In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`. - In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`.
- In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`. - In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`.
Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes.
``` :::
(troubleshooting-python-multiprocessing)= (troubleshooting-python-multiprocessing)=


@ -1,13 +1,13 @@
# Welcome to vLLM # Welcome to vLLM
```{figure} ./assets/logos/vllm-logo-text-light.png :::{figure} ./assets/logos/vllm-logo-text-light.png
:align: center :align: center
:alt: vLLM :alt: vLLM
:class: no-scaled-link :class: no-scaled-link
:width: 60% :width: 60%
``` :::
```{raw} html :::{raw} html
<p style="text-align:center"> <p style="text-align:center">
<strong>Easy, fast, and cheap LLM serving for everyone <strong>Easy, fast, and cheap LLM serving for everyone
</strong> </strong>
@ -19,7 +19,7 @@
<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a> <a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a> <a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
</p> </p>
``` :::
vLLM is a fast and easy-to-use library for LLM inference and serving. vLLM is a fast and easy-to-use library for LLM inference and serving.
@ -58,7 +58,7 @@ For more information, check out the following:
% How to start using vLLM? % How to start using vLLM?
```{toctree} :::{toctree}
:caption: Getting Started :caption: Getting Started
:maxdepth: 1 :maxdepth: 1
@ -67,11 +67,11 @@ getting_started/quickstart
getting_started/examples/examples_index getting_started/examples/examples_index
getting_started/troubleshooting getting_started/troubleshooting
getting_started/faq getting_started/faq
``` :::
% What does vLLM support? % What does vLLM support?
```{toctree} :::{toctree}
:caption: Models :caption: Models
:maxdepth: 1 :maxdepth: 1
@ -79,27 +79,28 @@ models/generative_models
models/pooling_models models/pooling_models
models/supported_models models/supported_models
models/extensions/index models/extensions/index
``` :::
% Additional capabilities % Additional capabilities
```{toctree} :::{toctree}
:caption: Features :caption: Features
:maxdepth: 1 :maxdepth: 1
features/quantization/index features/quantization/index
features/lora features/lora
features/tool_calling features/tool_calling
features/reasoning_outputs
features/structured_outputs features/structured_outputs
features/automatic_prefix_caching features/automatic_prefix_caching
features/disagg_prefill features/disagg_prefill
features/spec_decode features/spec_decode
features/compatibility_matrix features/compatibility_matrix
``` :::
% Details about running vLLM % Details about running vLLM
```{toctree} :::{toctree}
:caption: Inference and Serving :caption: Inference and Serving
:maxdepth: 1 :maxdepth: 1
@ -112,11 +113,11 @@ serving/engine_args
serving/env_vars serving/env_vars
serving/usage_stats serving/usage_stats
serving/integrations/index serving/integrations/index
``` :::
% Scaling up vLLM for production % Scaling up vLLM for production
```{toctree} :::{toctree}
:caption: Deployment :caption: Deployment
:maxdepth: 1 :maxdepth: 1
@ -125,21 +126,21 @@ deployment/k8s
deployment/nginx deployment/nginx
deployment/frameworks/index deployment/frameworks/index
deployment/integrations/index deployment/integrations/index
``` :::
% Making the most out of vLLM % Making the most out of vLLM
```{toctree} :::{toctree}
:caption: Performance :caption: Performance
:maxdepth: 1 :maxdepth: 1
performance/optimization performance/optimization
performance/benchmarks performance/benchmarks
``` :::
% Explanation of vLLM internals % Explanation of vLLM internals
```{toctree} :::{toctree}
:caption: Design Documents :caption: Design Documents
:maxdepth: 2 :maxdepth: 2
@ -150,11 +151,11 @@ design/kernel/paged_attention
design/mm_processing design/mm_processing
design/automatic_prefix_caching design/automatic_prefix_caching
design/multiprocessing design/multiprocessing
``` :::
% How to contribute to the vLLM project % How to contribute to the vLLM project
```{toctree} :::{toctree}
:caption: Developer Guide :caption: Developer Guide
:maxdepth: 2 :maxdepth: 2
@ -163,11 +164,11 @@ contributing/profiling/profiling_index
contributing/dockerfile/dockerfile contributing/dockerfile/dockerfile
contributing/model/index contributing/model/index
contributing/vulnerability_management contributing/vulnerability_management
``` :::
% Technical API specifications % Technical API specifications
```{toctree} :::{toctree}
:caption: API Reference :caption: API Reference
:maxdepth: 2 :maxdepth: 2
@ -176,17 +177,18 @@ api/engine/index
api/inference_params api/inference_params
api/multimodal/index api/multimodal/index
api/model/index api/model/index
``` :::
% Latest news and acknowledgements % Latest news and acknowledgements
```{toctree} :::{toctree}
:caption: Community :caption: Community
:maxdepth: 1 :maxdepth: 1
community/blog
community/meetups community/meetups
community/sponsors community/sponsors
``` :::
## Indices and tables ## Indices and tables


@ -1,8 +1,8 @@
# Built-in Extensions # Built-in Extensions
```{toctree} :::{toctree}
:maxdepth: 1 :maxdepth: 1
runai_model_streamer runai_model_streamer
tensorizer tensorizer
``` :::


@ -48,6 +48,6 @@ You can read further about CPU buffer memory limiting [here](https://github.com/
vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}'
``` ```
```{note} :::{note}
For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md). For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md).
``` :::


@ -11,6 +11,6 @@ For more information on CoreWeave's Tensorizer, please refer to
[CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well as a general usage guide to using Tensorizer with vLLM, see
the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html). the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html).
```{note} :::{note}
Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
``` :::


@ -70,10 +70,10 @@ The {class}`~vllm.LLM.chat` method implements chat functionality on top of {clas
In particular, it accepts input similar to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat) In particular, it accepts input similar to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat)
and automatically applies the model's [chat template](https://huggingface.co/docs/transformers/en/chat_templating) to format the prompt. and automatically applies the model's [chat template](https://huggingface.co/docs/transformers/en/chat_templating) to format the prompt.
```{important} :::{important}
In general, only instruction-tuned models have a chat template. In general, only instruction-tuned models have a chat template.
Base models may perform poorly as they are not trained to respond to the chat conversation. Base models may perform poorly as they are not trained to respond to the chat conversation.
``` :::
```python ```python
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")


@ -8,54 +8,54 @@ In vLLM, pooling models implement the {class}`~vllm.model_executor.models.VllmMo
These models use a {class}`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input These models use a {class}`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input
before returning them. before returning them.
```{note} :::{note}
We currently support pooling models primarily as a matter of convenience. We currently support pooling models primarily as a matter of convenience.
As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM features are not applicable to As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM features are not applicable to
pooling models as they only work on the generation or decode stage, so performance may not improve as much. pooling models as they only work on the generation or decode stage, so performance may not improve as much.
``` :::
For pooling models, we support the following `--task` options. For pooling models, we support the following `--task` options.
The selected option sets the default pooler used to extract the final hidden states: The selected option sets the default pooler used to extract the final hidden states:
:::{list-table}
:widths: 50 25 25 25
:header-rows: 1

- * Task
  * Pooling Type
  * Normalization
  * Softmax
- * Embedding (`embed`)
  * `LAST`
  * ✅︎
  *
- * Classification (`classify`)
  * `LAST`
  *
  * ✅︎
- * Sentence Pair Scoring (`score`)
  * \*
  * \*
  * \*
- * Reward Modeling (`reward`)
  * `ALL`
  *
  *
:::
\*The default pooler is always defined by the model. \*The default pooler is always defined by the model.
```{note} :::{note}
If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table. If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table.
``` :::
When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`). we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`).
```{tip} :::{tip}
You can customize the model's pooling method via the `--override-pooler-config` option, You can customize the model's pooling method via the `--override-pooler-config` option,
which takes priority over both the model's and Sentence Transformers's defaults. which takes priority over both the model's and Sentence Transformers's defaults.
``` :::
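As a rough sketch of what such an override can look like on the CLI (the JSON keys mirror the columns in the table above, but treat the exact field names as an assumption and check your vLLM version's `PoolerConfig`):

```console
vllm serve <embedding-model> --task embed \
    --override-pooler-config '{"pooling_type": "MEAN", "normalize": false}'
```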
## Offline Inference ## Offline Inference
@ -111,10 +111,10 @@ The {class}`~vllm.LLM.score` method outputs similarity scores between sentence p
It is primarily designed for [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html). It is primarily designed for [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html).
These types of models serve as rerankers between candidate query-document pairs in RAG systems. These types of models serve as rerankers between candidate query-document pairs in RAG systems.
```{note} :::{note}
vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG.
To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain). To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain).
``` :::
```python ```python
llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")

(File diff suppressed because it is too large.)


@ -14,9 +14,9 @@ In short, you should increase the number of GPUs and the number of nodes until y
After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like `# GPU blocks: 790`. Multiply the number by `16` (the block size) to get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfactory, e.g. you want higher throughput, you can further increase the number of GPUs or nodes until the number of blocks is enough.
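For example, with `# GPU blocks: 790` and the default block size of 16, the KV cache can hold roughly 790 × 16 = 12,640 tokens across all concurrent requests.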
```{note} :::{note}
There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs.
``` :::
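For instance, a model served on a single node with 3 GPUs under this edge case might be launched roughly as follows (the model name is illustrative):

```console
vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
    --tensor-parallel-size 1 \
    --pipeline-parallel-size 3
```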
## Running vLLM on a single node ## Running vLLM on a single node
@ -94,12 +94,12 @@ vllm serve /path/to/the/model/in/the/container \
To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient.
```{warning} :::{warning}
After you start the Ray cluster, you should also check the GPU-to-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](#troubleshooting-incorrect-hardware-driver) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for processes on the same node, not for processes on the other nodes. Setting environment variables when you create the cluster is the recommended way. See <gh-issue:6803> for more information.
``` :::
```{warning} :::{warning}
Please make sure you downloaded the model to all the nodes (with the same path), or that the model is downloaded to some distributed file system that is accessible by all nodes.
When you use a Hugging Face repo ID to refer to the model, you should append your Hugging Face token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model.
``` :::


@ -4,6 +4,7 @@
Below, you can find an explanation of every engine argument for vLLM: Below, you can find an explanation of every engine argument for vLLM:
<!--- pyml disable-num-lines 7 no-space-in-emphasis-->
```{eval-rst} ```{eval-rst}
.. argparse:: .. argparse::
:module: vllm.engine.arg_utils :module: vllm.engine.arg_utils
@ -16,6 +17,7 @@ Below, you can find an explanation of every engine argument for vLLM:
Below are the additional arguments related to the asynchronous engine: Below are the additional arguments related to the asynchronous engine:
<!--- pyml disable-num-lines 7 no-space-in-emphasis-->
```{eval-rst} ```{eval-rst}
.. argparse:: .. argparse::
:module: vllm.engine.arg_utils :module: vllm.engine.arg_utils


@ -2,14 +2,14 @@
vLLM uses the following environment variables to configure the system: vLLM uses the following environment variables to configure the system:
```{warning} :::{warning}
Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and IP for vLLM's **internal usage**. They are not the port and IP of the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work.
All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).
``` :::
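As an illustration, vLLM's internal coordination address could be set like this before launching (the address and port below are placeholders for your own network):

```console
export VLLM_HOST_IP=192.168.0.2   # internal IP used by vLLM itself
export VLLM_PORT=51000            # internal port, not the API server port
```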
```{literalinclude} ../../../vllm/envs.py :::{literalinclude} ../../../vllm/envs.py
:end-before: end-env-vars-definition :end-before: end-env-vars-definition
:language: python :language: python
:start-after: begin-env-vars-definition :start-after: begin-env-vars-definition
``` :::


@ -1,8 +1,8 @@
# External Integrations # External Integrations
```{toctree} :::{toctree}
:maxdepth: 1 :maxdepth: 1
langchain langchain
llamaindex llamaindex
``` :::


@ -31,8 +31,8 @@ vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-I
The following metrics are exposed: The following metrics are exposed:
```{literalinclude} ../../../vllm/engine/metrics.py :::{literalinclude} ../../../vllm/engine/metrics.py
:end-before: end-metrics-definitions :end-before: end-metrics-definitions
:language: python :language: python
:start-after: begin-metrics-definitions :start-after: begin-metrics-definitions
``` :::


@ -4,10 +4,10 @@
This page teaches you how to pass multi-modal inputs to [multi-modal models](#supported-mm-models) in vLLM. This page teaches you how to pass multi-modal inputs to [multi-modal models](#supported-mm-models) in vLLM.
```{note} :::{note}
We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes, We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes,
and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests. and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.
``` :::
## Offline Inference ## Offline Inference
@ -203,13 +203,13 @@ for o in outputs:
Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).
```{important} :::{important}
A chat template is **required** to use Chat Completions API. A chat template is **required** to use Chat Completions API.
Although most models come with a chat template, for others you have to define one yourself. Although most models come with a chat template, for others you have to define one yourself.
The chat template can be inferred based on the documentation on the model's HuggingFace repo. The chat template can be inferred based on the documentation on the model's HuggingFace repo.
For example, LLaVA-1.5 (`llava-hf/llava-1.5-7b-hf`) requires a chat template that can be found here: <gh-file:examples/template_llava.jinja> For example, LLaVA-1.5 (`llava-hf/llava-1.5-7b-hf`) requires a chat template that can be found here: <gh-file:examples/template_llava.jinja>
``` :::
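For instance, launching the server with an explicit template might look like this (a sketch; the model and template path come from the LLaVA-1.5 example above):

```console
vllm serve llava-hf/llava-1.5-7b-hf --chat-template examples/template_llava.jinja
```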
### Image ### Image
@ -273,24 +273,25 @@ print("Chat completion output:", chat_response.choices[0].message.content)
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
```{tip} :::{tip}
Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
and pass the file path as `url` in the API request. and pass the file path as `url` in the API request.
``` :::
```{tip} :::{tip}
There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
In fact, you can place image placeholders in the middle of the text by interleaving text and image content. In fact, you can place image placeholders in the middle of the text by interleaving text and image content.
``` :::
:::{note}
By default, the timeout for fetching images through HTTP URL is `5` seconds.
You can override this by setting the environment variable:

```console
export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
```
:::
### Video

@@ -345,14 +346,15 @@ print("Chat completion output from image url:", result)

Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>

:::{note}
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
You can override this by setting the environment variable:

```console
export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
```
:::
### Audio

@@ -448,24 +450,25 @@ print("Chat completion output from audio url:", result)

Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>

:::{note}
By default, the timeout for fetching audios through HTTP URL is `10` seconds.
You can override this by setting the environment variable:

```console
export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
```
:::
### Embedding

vLLM's Embeddings API is a superset of OpenAI's [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings),
where a list of chat `messages` can be passed instead of batched `inputs`. This enables multi-modal inputs to be passed to embedding models.

:::{tip}
The schema of `messages` is exactly the same as in Chat Completions API.
You can refer to the above tutorials for more details on how to pass each type of multi-modal data.
:::

Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images.
Refer to the examples below for illustration.

@@ -477,13 +480,13 @@ vllm serve TIGER-Lab/VLM2Vec-Full --task embed \
  --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja
```

:::{important}
Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed`
to run this model in embedding mode instead of text generation mode.

The custom chat template is completely different from the original one for this model,
and can be found here: <gh-file:examples/template_vlm2vec.jinja>
:::
Since the request schema is not defined by the OpenAI client, we post a request to the server using the lower-level `requests` library:
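A minimal sketch of such a request, assuming the VLM2Vec server launched above and that the `/v1/embeddings` endpoint accepts chat-style `messages` (the image URL is a placeholder; see the full example linked below for the authoritative version):

```python
import requests

image_url = "https://example.com/image.jpg"  # hypothetical image URL

response = requests.post(
    "http://localhost:8000/v1/embeddings",
    json={
        "model": "TIGER-Lab/VLM2Vec-Full",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": "Represent the given image."},
            ],
        }],
        "encoding_format": "float",
    },
)
response.raise_for_status()
print(response.json()["data"][0]["embedding"][:8])
```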
@@ -518,16 +521,16 @@ vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \
  --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja
```

:::{important}
Like with VLM2Vec, we have to explicitly pass `--task embed`.

Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
by a custom chat template: <gh-file:examples/template_dse_qwen2_vl.jinja>
:::
:::{important}
Also, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
example below for details.
:::
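A hedged sketch of building such a placeholder as a base64 data URL; the image dimensions here are an assumption, since the exact minimum size depends on the model's image processor (refer to the full example below for the authoritative version):

```python
import base64
import io

from PIL import Image

# Assumed placeholder size; adjust to whatever minimum the model's processor expects.
placeholder = Image.new("RGB", (28, 28), color=(255, 255, 255))
buffer = io.BytesIO()
placeholder.save(buffer, format="JPEG")
image_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
image_url = f"data:image/jpeg;base64,{image_b64}"
print(image_url[:60] + "...")
```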
Full example: <gh-file:examples/online_serving/openai_chat_embedding_client_for_multimodal.py>
@@ -22,9 +22,9 @@ The available APIs depend on the type of model that is being run:

Please refer to the above pages for more details about each API.

:::{seealso}
[API Reference](/api/offline_inference/index)
:::

## Configuration Options

@@ -70,12 +70,12 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
          tensor_parallel_size=2)
```

:::{important}
To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. {func}`torch.cuda.set_device`)
before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.

To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.
:::
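For example, a minimal sketch (the device IDs are hypothetical; the model is the one from the snippet above):

```python
import os

# Select GPUs via the environment variable *before* vLLM (or torch) touches CUDA,
# instead of calling torch.cuda.set_device().
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

from vllm import LLM  # noqa: E402

llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
          tensor_parallel_size=2)
```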
#### Quantization
@@ -50,6 +50,11 @@ In addition, we have the following custom APIs:

  - Applicable to all [pooling models](../models/pooling_models.md).
- [Score API](#score-api) (`/score`)
  - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`).
- [Re-rank API](#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
  - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
  - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
  - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response.
  - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`).
(chat-template)=

@@ -156,11 +161,11 @@ print(completion._request_id)

The `vllm serve` command is used to launch the OpenAI-compatible server.

:::{argparse}
:module: vllm.entrypoints.openai.cli_args
:func: create_parser_for_docs
:prog: vllm serve
:::

#### Configuration file

@@ -183,10 +188,10 @@ To use the above config file:
vllm serve SOME_MODEL --config config.yaml
```
:::{note}
If an argument is supplied both via the command line and the config file, the value from the command line takes precedence.
The order of priorities is `command line > config file values > defaults`.
:::
## API Reference

@@ -203,19 +208,19 @@ Code example: <gh-file:examples/online_serving/openai_completion_client.py>

The following [sampling parameters](#sampling-params) are supported.

:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
:start-after: begin-completion-sampling-params
:end-before: end-completion-sampling-params
:::

The following extra parameters are supported:

:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
:start-after: begin-completion-extra-params
:end-before: end-completion-extra-params
:::

(chat-api)=

@@ -235,19 +240,19 @@ Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>

The following [sampling parameters](#sampling-params) are supported.

:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
:start-after: begin-chat-completion-sampling-params
:end-before: end-chat-completion-sampling-params
:::

The following extra parameters are supported:

:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
:start-after: begin-chat-completion-extra-params
:end-before: end-chat-completion-extra-params
:::

(embeddings-api)=
@@ -259,9 +264,9 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai

If the model has a [chat template](#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
which will be treated as a single prompt to the model.

:::{tip}
This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details.
:::

Code example: <gh-file:examples/online_serving/openai_embedding_client.py>

@@ -269,27 +274,27 @@ Code example: <gh-file:examples/online_serving/openai_embedding_client.py>

The following [pooling parameters](#pooling-params) are supported.

:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
:start-after: begin-embedding-pooling-params
:end-before: end-embedding-pooling-params
:::

The following extra parameters are supported by default:

:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
:start-after: begin-embedding-extra-params
:end-before: end-embedding-extra-params
:::

For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead:

:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
:start-after: begin-chat-embedding-extra-params
:end-before: end-chat-embedding-extra-params
:::

(tokenizer-api)=
@@ -460,16 +465,103 @@ Response:

The following [pooling parameters](#pooling-params) are supported.

:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
:start-after: begin-score-pooling-params
:end-before: end-score-pooling-params
:::

The following extra parameters are supported:

:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
:start-after: begin-score-extra-params
:end-before: end-score-extra-params
:::
(rerank-api)=
### Re-rank API
Our Re-rank API applies a cross-encoder model to predict relevance scores between a single query and each document in a list. Usually, the score for a sentence pair refers to the similarity between the two sentences, on a scale of 0 to 1.

You can find the documentation for these kinds of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
The rerank endpoints support popular re-rank models such as `BAAI/bge-reranker-base` and other models that support the
`score` task. Additionally, the `/rerank`, `/v1/rerank`, and `/v2/rerank`
endpoints are compatible with both [Jina AI's re-rank API interface](https://jina.ai/reranker/) and
[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank), so they work out of the box with
popular open-source tools.
Code example: <gh-file:examples/online_serving/jinaai_rerank_client.py>
#### Example Request
Note that the `top_n` request parameter is optional and will default to the length of the `documents` field.
Result documents will be sorted by relevance, and the `index` property can be used to determine original order.
Request:
```bash
curl -X 'POST' \
  'http://127.0.0.1:8000/v1/rerank' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "model": "BAAI/bge-reranker-base",
  "query": "What is the capital of France?",
  "documents": [
    "The capital of Brazil is Brasilia.",
    "The capital of France is Paris.",
    "Horses and cows are both animals"
  ]
}'
```
Response:
```json
{
  "id": "rerank-fae51b2b664d4ed38f5969b612edff77",
  "model": "BAAI/bge-reranker-base",
  "usage": {
    "total_tokens": 56
  },
  "results": [
    {
      "index": 1,
      "document": {
        "text": "The capital of France is Paris."
      },
      "relevance_score": 0.99853515625
    },
    {
      "index": 0,
      "document": {
        "text": "The capital of Brazil is Brasilia."
      },
      "relevance_score": 0.0005860328674316406
    }
  ]
}
```
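For completeness, a minimal Python sketch of the same request that also passes the optional `top_n` parameter and uses the `index` field to restore the original document order (it assumes the same server as the curl example above):

```python
import requests

response = requests.post(
    "http://127.0.0.1:8000/v1/rerank",
    json={
        "model": "BAAI/bge-reranker-base",
        "query": "What is the capital of France?",
        "documents": [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
            "Horses and cows are both animals",
        ],
        "top_n": 2,  # optional; defaults to the number of documents
    },
)
response.raise_for_status()
results = response.json()["results"]

# Results arrive sorted by relevance; sort by `index` to recover the input order.
for item in sorted(results, key=lambda r: r["index"]):
    print(item["index"], item["relevance_score"])
```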
#### Extra parameters
The following [pooling parameters](#pooling-params) are supported.
:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
:start-after: begin-rerank-pooling-params
:end-before: end-rerank-pooling-params
:::
The following extra parameters are supported:
:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
:start-after: begin-rerank-extra-params
:end-before: end-rerank-extra-params
:::
@@ -67,7 +67,37 @@ def run_qwen2_audio(question: str, audio_count: int):
    return llm, prompt, stop_token_ids


def run_minicpmo(question: str, audio_count: int):
    model_name = "openbmb/MiniCPM-o-2_6"
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    llm = LLM(model=model_name,
              trust_remote_code=True,
              max_model_len=4096,
              max_num_seqs=5,
              limit_mm_per_prompt={"audio": audio_count})

    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    audio_placeholder = "(<audio>./</audio>)" * audio_count
    audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}"  # noqa: E501
    messages = [{
        'role': 'user',
        'content': f'{audio_placeholder}\n{question}'
    }]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True,
                                           chat_template=audio_chat_template)
    return llm, prompt, stop_token_ids


model_example_map = {
    "ultravox": run_ultravox,
    "qwen2_audio": run_qwen2_audio,
    "minicpmo": run_minicpmo
}


def main(args):
@@ -8,10 +8,10 @@ prompts = [
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(model="facebook/opt-125m")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
@@ -19,4 +19,4 @@ outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@@ -13,7 +13,7 @@ The OpenAI batch file format consists of a series of json objects on new lines.

Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.

```{note}
We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` endpoints (completions coming soon).
```

## Pre-requisites

@@ -203,3 +203,34 @@ $ cat results.jsonl
{"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null}
...
```
## Example 5: Using score endpoint
### Additional prerequisites
* Ensure you are using `vllm >= 0.7.0`.
### Step 1: Create your batch file
Add score requests to your batch file. The following is an example:
```
{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
```
You can mix chat completion, embedding, and score requests in the batch file, as long as the model you are using supports them all (note that all requests must use the same model).
### Step 2: Run the batch
You can run the batch using the same command as in earlier examples.
### Step 3: Check your results
You can check your results by running `cat results.jsonl`
```
$ cat results.jsonl
{"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
{"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
```
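As an illustration, a minimal sketch for pulling the scores out of each result line (the file name matches the example above):

```python
import json

with open("results.jsonl") as f:
    for line in f:
        result = json.loads(line)
        body = result["response"]["body"]
        scores = [item["score"] for item in body["data"]]
        print(result["custom_id"], scores)
```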
@@ -0,0 +1,67 @@
# vLLM TPU Profiling
This script is used to profile the TPU performance of vLLM for specific prefill or decode token shapes.
Note: an actual running server handles a mix of prefills and decodes, each across many different shapes.
We assume you are on a TPU already (this was tested on TPU v6e) and have installed vLLM according to the [installation guide](https://docs.vllm.ai/en/latest/getting_started/installation/ai_accelerator/index.html).
> In all examples below, we run several warmup iterations first (so `--enforce-eager` is okay)
## Profile Examples
### Generate Prefill Trace
This example runs Qwen/Qwen2.5-7B-Instruct with a single request of 1024 input tokens. This is set up in an attempt to profile just the prefill time and operations.
```bash
export XLA_HLO_DEBUG=1
export MODEL=Qwen/Qwen2.5-7B-Instruct
export VLLM_TPU_PROFILE_DURATION_MS=3000
export VLLM_TPU_PROFILE_DELAY_MS=0
python3 profiling.py \
--model $MODEL \
--input-len 1024 --output-len 1 \
--batch-size 1 --enforce-eager \
--max-model-len 2048 \
--tensor-parallel-size 1 \
--profile-result-dir profiles
```
### Generate Decode Trace
This example runs Llama 3.1 70B with a batch of 32 requests where each has 1 input token and 128 output tokens. This is set up in an attempt to profile just the 32 decodes running in parallel by having an extremely small prefill of 1 token and setting `VLLM_TPU_PROFILE_DELAY_MS=1000` to skip the first second of inference (hopefully prefill).
```bash
export XLA_HLO_DEBUG=1
export MODEL=meta-llama/Llama-3.1-70B-Instruct
export VLLM_TPU_PROFILE_DURATION_MS=2000
export VLLM_TPU_PROFILE_DELAY_MS=1000
rm -rf ~/.cache/vllm/xla_cache
python3 profiling.py \
--model $MODEL \
--input-len 1 \
--output-len 128 \
--batch-size 32 \
--enforce-eager \
--profile-result-dir profiles \
--max-model-len 2048 --tensor-parallel-size 8
```
## Visualizing the profiles
Once you have collected your profiles with this script, you can visualize them using [TensorBoard](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm).
These are the dependencies you will most likely need to install:
```bash
pip install tensorflow-cpu tensorboard-plugin-profile etils importlib_resources
```
Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser:
```bash
tensorboard --logdir profiles/ --port 6006
```
@@ -0,0 +1,101 @@
import argparse
import dataclasses
import os
import time
from typing import List

import numpy as np
import torch_xla.debug.profiler as xp
from tqdm import tqdm

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.utils import FlexibleArgumentParser

DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000))
DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0))


def main(args: argparse.Namespace):
    print(args)

    engine_args = EngineArgs.from_cli_args(args)
    llm = LLM(**dataclasses.asdict(engine_args))
    _ = xp.start_server(9012)

    sampling_params = SamplingParams(
        temperature=0.0,
        ignore_eos=True,
        max_tokens=args.output_len,
    )
    print(sampling_params)
    dummy_prompt_token_ids = np.random.randint(10000,
                                               size=(args.batch_size,
                                                     args.input_len))
    dummy_prompts: List[PromptType] = [{
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]

    def run_to_completion():
        start_time = time.perf_counter()
        llm.generate(dummy_prompts,
                     sampling_params=sampling_params,
                     use_tqdm=False)
        end_time = time.perf_counter()
        latency = end_time - start_time
        return latency

    # Warmup
    print("Warming up...")
    warmup_latencies = []
    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
        warmup_latencies.append(run_to_completion())
    print(f"Average warmup latency: {np.mean(warmup_latencies):.4f}s")

    # Profile
    profile_dir = args.profile_result_dir
    print(f"Profiling (results will be saved to '{profile_dir}')...")
    # Enable tracing on server
    xp.trace_detached("localhost:9012",
                      profile_dir,
                      delay_ms=DELAY_MS,
                      duration_ms=DURATION_MS)
    if DELAY_MS == 0:
        time.sleep(1.0)
    profile_latencies = []
    for _ in tqdm(range(args.num_iters), desc="Profile iterations"):
        profile_latencies.append(run_to_completion())
    print(f"Average profile latency: {np.mean(profile_latencies):.4f}s")

    return


if __name__ == '__main__':
    parser = FlexibleArgumentParser(
        description='Benchmark the latency of processing a single batch of '
        'requests till completion.')
    parser.add_argument('--input-len', type=int, default=32)
    parser.add_argument('--output-len', type=int, default=128)
    parser.add_argument('--batch-size', type=int, default=8)
    parser.add_argument('--num-iters-warmup',
                        type=int,
                        default=5,
                        help='Number of iterations to run for warmup.')
    parser.add_argument('--num-iters',
                        type=int,
                        default=1,
                        help='Number of iterations to run for profiling.')
    parser.add_argument(
        '--profile-result-dir',
        type=str,
        default="profiles",
        help=
        ('path to save the pytorch profiler output. Can be visualized '
         'with ui.perfetto.dev or Tensorboard '
         '(https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm).'
         ))
    parser = EngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    main(args)
@@ -265,8 +265,9 @@ def run_mantis(question: str, modality: str):

# MiniCPM-V
def run_minicpmv_base(question: str, modality: str, model_name):
    assert modality in ["image", "video"]
    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py`  # noqa

    # 2.0
    # The official repo doesn't work yet, so we need to use a fork for now
@@ -277,7 +278,15 @@ def run_minicpmv(question: str, modality: str):
    # model_name = "openbmb/MiniCPM-Llama3-V-2_5"

    # 2.6
    # model_name = "openbmb/MiniCPM-V-2_6"
    # o2.6

    # modality supports
    # 2.0: image
    # 2.5: image
    # 2.6: image, video
    # o2.6: image, video, audio
    # model_name = "openbmb/MiniCPM-o-2_6"

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    llm = LLM(
@@ -294,13 +303,18 @@ def run_minicpmv(question: str, modality: str):

    # 2.5
    # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]

    # 2.6 / o2.6
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    modality_placeholder = {
        "image": "(<image>./</image>)",
        "video": "(<video>./</video>)",
    }

    messages = [{
        'role': 'user',
        'content': f'{modality_placeholder[modality]}\n{question}'
    }]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
@@ -308,6 +322,14 @@ def run_minicpmv(question: str, modality: str):
    return llm, prompt, stop_token_ids


def run_minicpmo(question: str, modality: str):
    return run_minicpmv_base(question, modality, "openbmb/MiniCPM-o-2_6")


def run_minicpmv(question: str, modality: str):
    return run_minicpmv_base(question, modality, "openbmb/MiniCPM-V-2_6")


# LLama 3.2
def run_mllama(question: str, modality: str):
    assert modality == "image"
@@ -523,6 +545,7 @@ model_example_map = {
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    "mantis": run_mantis,
    "minicpmo": run_minicpmo,
    "minicpmv": run_minicpmv,
    "mllama": run_mllama,
    "molmo": run_molmo,
@@ -0,0 +1,32 @@
"""
Example of using the OpenAI entrypoint's rerank API which is compatible with
the Cohere SDK: https://github.com/cohere-ai/cohere-python

run: vllm serve BAAI/bge-reranker-base
"""
import cohere

# cohere v1 client
co = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key")

rerank_v1_result = co.rerank(
    model="BAAI/bge-reranker-base",
    query="What is the capital of France?",
    documents=[
        "The capital of France is Paris", "Reranking is fun!",
        "vLLM is an open-source framework for fast AI serving"
    ])

print(rerank_v1_result)

# or the v2
co2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000")

v2_rerank_result = co2.rerank(
    model="BAAI/bge-reranker-base",
    query="What is the capital of France?",
    documents=[
        "The capital of France is Paris", "Reranking is fun!",
        "vLLM is an open-source framework for fast AI serving"
    ])

print(v2_rerank_result)