Migrate docs from Sphinx to MkDocs (#18145)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-05-23 11:09:53 +02:00
Committed by: GitHub
Parent: d0bc2f810b
Commit: a1fe24d961
218 changed files with 4126 additions and 6790 deletions


@@ -33,14 +33,13 @@ steps:
 - label: Documentation Build # 2min
   mirror_hardwares: [amdexperimental]
-  working_dir: "/vllm-workspace/test_docs/docs"
+  working_dir: "/vllm-workspace/test_docs"
   fast_check: true
   no_gpu: True
   commands:
-  - pip install -r ../../requirements/docs.txt
-  - SPHINXOPTS="-W" make html
-  # Check API reference (if it fails, you may have missing mock imports)
-  - grep "sig sig-object py" build/html/api/vllm/vllm.sampling_params.html
+  - pip install -r ../requirements/docs.txt
+  # TODO: add `--strict` once warnings in docstrings are fixed
+  - mkdocs build
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
   mirror_hardwares: [amdexperimental]
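For reference, the new documentation check can be approximated locally. This is a sketch run from the repository root (the CI job runs from `/vllm-workspace/test_docs`, hence its different relative path):

```bash
# Local approximation of the new docs CI step (not part of the diff).
pip install -r requirements/docs.txt
mkdocs build  # add --strict once the docstring warnings noted in the TODO are fixed
```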

.gitignore (6 changed lines)

@@ -77,11 +77,6 @@ instance/
 # Scrapy stuff:
 .scrapy
 
-# Sphinx documentation
-docs/_build/
-docs/source/getting_started/examples/
-docs/source/api/vllm
-
 # PyBuilder
 .pybuilder/
 target/

@@ -151,6 +146,7 @@ venv.bak/
 # mkdocs documentation
 /site
+docs/getting_started/examples
 
 # mypy
 .mypy_cache/


@@ -39,6 +39,7 @@ repos:
   rev: v0.9.29
   hooks:
   - id: pymarkdown
+    exclude: '.*\.inc\.md'
     args: [fix]
 - repo: https://github.com/rhysd/actionlint
   rev: v1.7.7
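To check locally that the new `exclude` pattern behaves as intended, the hook can be run on its own through pre-commit (a sketch; the hook id is taken from the config above):

```bash
# Runs only the pymarkdown hook; files matching '.*\.inc\.md' are now skipped.
pre-commit run pymarkdown --all-files
```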


@@ -8,12 +8,8 @@ build:
   tools:
     python: "3.12"
 
-sphinx:
-  configuration: docs/source/conf.py
-  fail_on_warning: true
-
-# If using Sphinx, optionally build your docs in additional formats such as PDF
-formats: []
+mkdocs:
+  configuration: mkdocs.yaml
 
 # Optionally declare the Python requirements required to build your docs
 python:


@@ -329,7 +329,9 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
 # will not be imported by other tests
 RUN mkdir test_docs
 RUN mv docs test_docs/
+RUN cp -r examples test_docs/
 RUN mv vllm test_docs/
+RUN mv mkdocs.yaml test_docs/
 #################### TEST IMAGE ####################
 
 #################### OPENAI API SERVER ####################

docs/.nav.yml (new file, 51 lines)

@@ -0,0 +1,51 @@
nav:
  - Home:
    - vLLM: README.md
    - Getting Started:
      - getting_started/quickstart.md
      - getting_started/installation
      - Examples:
        - LMCache: getting_started/examples/lmcache
        - getting_started/examples/offline_inference
        - getting_started/examples/online_serving
        - getting_started/examples/other
    - Roadmap: https://roadmap.vllm.ai
    - Releases: https://github.com/vllm-project/vllm/releases
  - User Guide:
    - Inference and Serving:
      - serving/offline_inference.md
      - serving/openai_compatible_server.md
      - serving/*
      - serving/integrations
    - Training: training
    - Deployment:
      - deployment/*
      - deployment/frameworks
      - deployment/integrations
    - Performance: performance
    - Models:
      - models/supported_models.md
      - models/generative_models.md
      - models/pooling_models.md
      - models/extensions
    - Features:
      - features/compatibility_matrix.md
      - features/*
      - features/quantization
    - Other:
      - getting_started/*
  - Developer Guide:
    - contributing/overview.md
    - glob: contributing/*
      flatten_single_child_sections: true
    - contributing/model
    - Design Documents:
      - V0: design
      - V1: design/v1
  - API Reference:
    - api/README.md
    - glob: api/vllm/*
      preserve_directory_names: true
  - Community:
    - community/*
    - vLLM Blog: https://blog.vllm.ai
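The `glob:`, `flatten_single_child_sections:` and `preserve_directory_names:` keys are options of a nav-building MkDocs plugin rather than core MkDocs. A hedged sketch of the corresponding `mkdocs.yaml` entry (the plugin name is an assumption; it is defined in `mkdocs.yaml`, which is not shown in this hunk):

```yaml
# Assumed plugin registration in mkdocs.yaml; docs/.nav.yml is read by this
# plugin, not by MkDocs itself.
plugins:
  - awesome-nav
```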


@@ -1,25 +0,0 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
    @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
    @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

clean:
    @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
    rm -rf "$(SOURCEDIR)/getting_started/examples"
    rm -rf "$(SOURCEDIR)/api/vllm"
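For readers used to the removed Makefile, a rough mapping of the old Sphinx targets to their MkDocs equivalents (a sketch, not part of the diff):

```bash
mkdocs build   # replaces `make html`; output goes to ./site (see the /site entry in .gitignore)
mkdocs serve   # replaces `make html` plus `python -m http.server`
rm -rf site/   # replaces `make clean`
```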


@@ -1,43 +1,50 @@
-# vLLM documents
-
-## Build the docs
-
-- Make sure in `docs` directory
-
-```bash
-cd docs
-```
-
-- Install the dependencies:
-
-```bash
-pip install -r ../requirements/docs.txt
-```
-
-- Clean the previous build (optional but recommended):
-
-```bash
-make clean
-```
-
-- Generate the HTML documentation:
-
-```bash
-make html
-```
-
-## Open the docs with your browser
-
-- Serve the documentation locally:
-
-```bash
-python -m http.server -d build/html/
-```
-
-This will start a local server at http://localhost:8000. You can now open your browser and view the documentation.
-
-If port 8000 is already in use, you can specify a different port, for example:
-
-```bash
-python -m http.server 3000 -d build/html/
-```
+# Welcome to vLLM
+
+<figure markdown="span">
+  ![](./assets/logos/vllm-logo-text-light.png){ align="center" alt="vLLM" class="no-scaled-link" width="60%" }
+</figure>
+
+<p style="text-align:center">
+<strong>Easy, fast, and cheap LLM serving for everyone
+</strong>
+</p>
+
+<p style="text-align:center">
+<script async defer src="https://buttons.github.io/buttons.js"></script>
+<a class="github-button" href="https://github.com/vllm-project/vllm" data-show-count="true" data-size="large" aria-label="Star">Star</a>
+<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
+<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
+</p>
+
+vLLM is a fast and easy-to-use library for LLM inference and serving.
+
+Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
+
+vLLM is fast with:
+
+- State-of-the-art serving throughput
+- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
+- Continuous batching of incoming requests
+- Fast model execution with CUDA/HIP graph
+- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8
+- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
+- Speculative decoding
+- Chunked prefill
+
+vLLM is flexible and easy to use with:
+
+- Seamless integration with popular HuggingFace models
+- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
+- Tensor parallelism and pipeline parallelism support for distributed inference
+- Streaming outputs
+- OpenAI-compatible API server
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
+- Prefix caching support
+- Multi-lora support
+
+For more information, check out the following:
+
+- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention)
+- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023)
+- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al.
+- [vLLM Meetups][meetups]
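The new home page leans on Markdown-in-HTML (`<figure markdown="span">`) and attribute lists (`{ align="center" ... }`). A hedged sketch of the `mkdocs.yaml` extensions this assumes (the actual extension list lives in `mkdocs.yaml`, which is not part of this hunk):

```yaml
markdown_extensions:
  - attr_list   # enables the `{ ... }` attribute syntax on images and links
  - md_in_html  # enables markdown="span" inside raw HTML blocks
```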

docs/api/README.md (new file, 107 lines)

@@ -0,0 +1,107 @@
# Summary
[](){ #configuration }
## Configuration
API documentation for vLLM's configuration classes.
- [vllm.config.ModelConfig][]
- [vllm.config.CacheConfig][]
- [vllm.config.TokenizerPoolConfig][]
- [vllm.config.LoadConfig][]
- [vllm.config.ParallelConfig][]
- [vllm.config.SchedulerConfig][]
- [vllm.config.DeviceConfig][]
- [vllm.config.SpeculativeConfig][]
- [vllm.config.LoRAConfig][]
- [vllm.config.PromptAdapterConfig][]
- [vllm.config.MultiModalConfig][]
- [vllm.config.PoolerConfig][]
- [vllm.config.DecodingConfig][]
- [vllm.config.ObservabilityConfig][]
- [vllm.config.KVTransferConfig][]
- [vllm.config.CompilationConfig][]
- [vllm.config.VllmConfig][]
[](){ #offline-inference-api }
## Offline Inference
LLM Class.
- [vllm.LLM][]
LLM Inputs.
- [vllm.inputs.PromptType][]
- [vllm.inputs.TextPrompt][]
- [vllm.inputs.TokensPrompt][]
## vLLM Engines
Engine classes for offline and online inference.
- [vllm.LLMEngine][]
- [vllm.AsyncLLMEngine][]
## Inference Parameters
Inference parameters for vLLM APIs.
[](){ #sampling-params }
[](){ #pooling-params }
- [vllm.SamplingParams][]
- [vllm.PoolingParams][]
[](){ #multi-modality }
## Multi-Modality
vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models]
via the `multi_modal_data` field in [vllm.inputs.PromptType][].
Looking to add your own multi-modal model? Please follow the instructions listed [here][supports-multimodal].
- [vllm.multimodal.MULTIMODAL_REGISTRY][]
### Inputs
User-facing inputs.
- [vllm.multimodal.inputs.MultiModalDataDict][]
Internal data structures.
- [vllm.multimodal.inputs.PlaceholderRange][]
- [vllm.multimodal.inputs.NestedTensors][]
- [vllm.multimodal.inputs.MultiModalFieldElem][]
- [vllm.multimodal.inputs.MultiModalFieldConfig][]
- [vllm.multimodal.inputs.MultiModalKwargsItem][]
- [vllm.multimodal.inputs.MultiModalKwargs][]
- [vllm.multimodal.inputs.MultiModalInputs][]
### Data Parsing
- [vllm.multimodal.parse][]
### Data Processing
- [vllm.multimodal.processing][]
### Memory Profiling
- [vllm.multimodal.profiling][]
### Registry
- [vllm.multimodal.registry][]
## Model Development
- [vllm.model_executor.models.interfaces_base][]
- [vllm.model_executor.models.interfaces][]
- [vllm.model_executor.models.adapters][]

docs/api/vllm/.meta.yml (new file, 2 lines)

@@ -0,0 +1,2 @@
search:
  boost: 0.5

(Binary image assets are not shown by the diff viewer. One file appears with only an "after" size of 119 KiB; the remaining images show identical sizes before and after, ranging from 17 KiB to 968 KiB, consistent with the asset files being relocated rather than edited.)


@@ -1,6 +1,7 @@
-(meetups)=
-
-# vLLM Meetups
+---
+title: vLLM Meetups
+---
+
+[](){ #meetups }
 
 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
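The `[](){ #meetups }` line is an attr_list-style anchor that replaces the old MyST `(meetups)=` target; elsewhere in this commit it is referenced by name, as on the new home page above:

```markdown
<!-- On any other page, the anchor defined above is linked by its name: -->
- [vLLM Meetups][meetups]
```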


@@ -1,7 +1,7 @@
 # Dockerfile
 
 We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
-More information about deploying with Docker can be found [here](#deployment-docker).
+More information about deploying with Docker can be found [here][deployment-docker].
 
 Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:

@@ -17,11 +17,9 @@ The edges of the build graph represent:
 - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head)
 
-> :::{figure} /assets/contributing/dockerfile-stages-dependency.png
-> :align: center
-> :alt: query
-> :width: 100%
-> :::
+> <figure markdown="span">
+>   ![](../../assets/contributing/dockerfile-stages-dependency.png){ align="center" alt="query" width="100%" }
+> </figure>
 >
 > Made using: <https://github.com/patrickhoefler/dockerfilegraph>
 >


@@ -0,0 +1,23 @@
---
title: Adding a New Model
---
[](){ #new-model }

This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM.

Contents:

- [Basic](basic.md)
- [Registration](registration.md)
- [Tests](tests.md)
- [Multimodal](multimodal.md)

!!! note
    The complexity of adding a new model depends heavily on the model's architecture.
    The process is considerably more straightforward if the model shares a similar architecture with an existing model in vLLM.
    However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.

!!! tip
    If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
    or ask on our [developer slack](https://slack.vllm.ai).
    We will be happy to help you out!


@@ -1,6 +1,7 @@
-(new-model-basic)=
-
-# Implementing a Basic Model
+---
+title: Implementing a Basic Model
+---
+
+[](){ #new-model-basic }
 
 This guide walks you through the steps to implement a basic vLLM model.

@@ -10,9 +11,8 @@ First, clone the PyTorch model code from the source repository.
 For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
 HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
 
-:::{warning}
-Make sure to review and adhere to the original code's copyright and licensing terms!
-:::
+!!! warning
+    Make sure to review and adhere to the original code's copyright and licensing terms!
 
 ## 2. Make your code compatible with vLLM

@@ -67,7 +67,7 @@ class MyModel(nn.Module):
     ...
 ```
 
-- Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
+- Rewrite the [forward][torch.nn.Module.forward] method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
 
 ```python
 def forward(

@@ -78,10 +78,9 @@ def forward(
     ...
 ```
 
-:::{note}
-Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
-If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
-:::
+!!! note
+    Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
+    If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
 
 For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.

@@ -89,7 +88,7 @@ For reference, check out our [Llama implementation](gh-file:vllm/model_executor/
 If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
 To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
-For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`.
+For the embedding layer, you can simply replace [torch.nn.Embedding][] with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`.
 When it comes to the linear layers, we provide the following options to parallelize them:
 
 - `ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.

@@ -107,7 +106,7 @@ This method should load the weights from the HuggingFace's checkpoint file and a
 ## 5. Register your model
 
-See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM.
+See [this page][new-model-registration] for instructions on how to register your new model to be used by vLLM.
 
 ## Frequently Asked Questions
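To make the tensor-parallel substitution described above concrete, here is a hedged sketch (not part of the diff) of swapping `torch.nn.Embedding` and the LM head for vLLM's parallel layers; it assumes vLLM is installed, its model-parallel groups are already initialized, and the sizes shown are placeholders:

```python
import torch.nn as nn

from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)


class MyModel(nn.Module):

    def __init__(self, vocab_size: int = 32000, hidden_size: int = 4096):
        super().__init__()
        # nn.Embedding -> VocabParallelEmbedding: the vocabulary dimension is
        # sharded across tensor-parallel ranks.
        self.embed_tokens = VocabParallelEmbedding(vocab_size, hidden_size)
        # Output projection -> ParallelLMHead (its weight may optionally be
        # tied to embed_tokens).
        self.lm_head = ParallelLMHead(vocab_size, hidden_size)
```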


@@ -0,0 +1,803 @@
---
title: Multi-Modal Support
---
[](){ #supports-multimodal }
This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs][multimodal-inputs].
## 1. Update the base vLLM model
It is assumed that you have already implemented the model in vLLM according to [these steps][new-model-basic].
Further update the model as follows:
- Reserve a keyword parameter in [forward][torch.nn.Module.forward] for each input tensor that corresponds to a multi-modal input, as shown in the following example:
```diff
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
+ pixel_values: torch.Tensor,
) -> SamplerOutput:
```
More conveniently, you can simply pass `**kwargs` to the [forward][torch.nn.Module.forward] method and retrieve the keyword parameters for multimodal inputs from it.
- Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
```python
class YourModelForImage2Seq(nn.Module):
...
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
assert self.vision_encoder is not None
image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features)
def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
# Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
# Run multimodal inputs through encoder and projector
vision_embeddings = self._process_image_input(image_input)
return vision_embeddings
```
!!! warning
The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request.
- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
```python
from .utils import merge_multimodal_embeddings
class YourModelForImage2Seq(nn.Module):
...
def get_input_embeddings(
self,
input_ids: torch.Tensor,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
# `get_input_embeddings` should already be implemented for the language
# model as one of the requirements of basic vLLM model implementation.
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
inputs_embeds = merge_multimodal_embeddings(
input_ids=input_ids,
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
placeholder_token_id=self.config.image_token_index)
return inputs_embeds
```
- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
```python
class YourModelForImage2Seq(nn.Module):
...
def get_language_model(self) -> torch.nn.Module:
# Change `language_model` according to your implementation.
return self.language_model
```
- Once the above steps are done, update the model class with the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface.
```diff
+ from vllm.model_executor.models.interfaces import SupportsMultiModal
- class YourModelForImage2Seq(nn.Module):
+ class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
```
!!! note
The model class does not have to be named `*ForCausalLM`.
Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples.
## 2. Specify processing information
Next, create a subclass of [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo]
to provide basic information related to HF processing.
### Maximum number of input items
You need to override the abstract method [get_supported_mm_limits][vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits]
to return the maximum number of input items for each modality supported by the model.
For example, if the model supports any number of images but only one video per prompt:
```python
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": 1}
```
## 3. Specify dummy inputs
Then, inherit [BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] to construct dummy inputs for
HF processing as well as memory profiling.
### For memory profiling
Override the abstract methods [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text] and [get_dummy_mm_data][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data] to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it.
Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens.
=== "Basic example: LLaVA"
Looking at the code of HF's `LlavaForConditionalGeneration`:
```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
n_image_features = image_features.shape[0] * image_features.shape[1]
if n_image_tokens != n_image_features:
raise ValueError(
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
)
special_image_mask = (
(input_ids == self.config.image_token_index)
.unsqueeze(-1)
.expand_as(inputs_embeds)
.to(inputs_embeds.device)
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
```
The number of placeholder feature tokens per image is `image_features.shape[1]`.
`image_features` is calculated inside the `get_image_features` method:
```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
if vision_feature_select_strategy == "default":
selected_image_feature = selected_image_feature[:, 1:]
elif vision_feature_select_strategy == "full":
selected_image_feature = selected_image_feature
else:
raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
image_features = self.multi_modal_projector(selected_image_feature)
return image_features
```
We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`.
The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention
mechanism doesn't change the sequence length of the output hidden states.
```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102
hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
hidden_states = self.pre_layrnorm(hidden_states)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
```
To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:
```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
if interpolate_pos_encoding:
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
else:
embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
```
We can infer that `embeddings.shape[1] == self.num_positions`, where
```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
```
Overall, the number of placeholder feature tokens for an image can be calculated as:
```python
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
) -> int:
hf_config = self.get_hf_config()
hf_processor = self.get_hf_processor()
image_size = hf_config.vision_config.image_size
patch_size = hf_config.vision_config.patch_size
num_image_tokens = (image_size // patch_size) ** 2 + 1
if hf_processor.vision_feature_select_strategy == "default":
num_image_tokens -= 1
return num_image_tokens
```
Notice that the number of image tokens doesn't depend on the image width and height.
We can simply use a dummy `image_size` to calculate the multimodal profiling data:
```python
# NOTE: In actuality, this is usually implemented as part of the
# model's subclass of `BaseProcessingInfo`, but we show it as is
# here for simplicity.
def get_image_size_with_most_features(self) -> ImageSize:
hf_config = self.get_hf_config()
width = height = hf_config.image_size
return ImageSize(width=width, height=height)
def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
target_width, target_height = \
self.info.get_image_size_with_most_features()
return {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images)
}
```
For the text, we simply expand the multimodal image token from the model config to match the desired number of images.
```python
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
num_images = mm_counts.get("image", 0)
processor = self.info.get_hf_processor()
image_token = processor.image_token
return image_token * num_images
```
=== "No input placeholders: Fuyu"
Looking at the code of HF's `FuyuForCausalLM`:
```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
if image_patches is not None and past_key_values is None:
patch_embeddings = [
self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
.squeeze(0)
.to(inputs_embeds.device)
for patch in image_patches
]
inputs_embeds = self.gather_continuous_embeddings(
word_embeddings=inputs_embeds,
continuous_embeddings=patch_embeddings,
image_patch_input_indices=image_patches_indices,
)
```
The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`,
which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`.
Unlike LLaVA, Fuyu does not define the number of patches inside the modeling file. Where can we get more information?
Considering that the model input comes from the output of `FuyuProcessor`, let's **look at the preprocessing files**.
The image outputs are obtained by calling `FuyuImageProcessor.preprocess` and then
`FuyuImageProcessor.preprocess_with_tokenizer_info` inside `FuyuProcessor`.
In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`,
returning the dimensions after resizing (but before padding) as metadata.
```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
batch_images = image_encoding["images"]
image_unpadded_heights = image_encoding["image_unpadded_heights"]
image_unpadded_widths = image_encoding["image_unpadded_widths"]
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
if do_resize:
batch_images = [
[self.resize(image, size=size, input_data_format=input_data_format) for image in images]
for images in batch_images
]
image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]
if do_pad:
batch_images = [
[
self.pad_image(
image,
size=size,
mode=padding_mode,
constant_values=padding_value,
input_data_format=input_data_format,
)
for image in images
]
for images in batch_images
]
```
In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata:
```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
image_input=tensor_batch_images,
image_present=image_present,
image_unpadded_h=image_unpadded_heights,
image_unpadded_w=image_unpadded_widths,
image_placeholder_id=image_placeholder_id,
image_newline_id=image_newline_id,
variable_sized=True,
)
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
image_height, image_width = image.shape[1], image.shape[2]
if variable_sized: # variable_sized=True
new_h = min(
image_height,
math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
)
new_w = min(
image_width,
math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
)
image = image[:, :new_h, :new_w]
image_height, image_width = new_h, new_w
num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
tensor_of_image_ids = torch.full(
[num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
)
patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
assert num_patches == patches.shape[0]
```
The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`:
```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
patch_size = patch_size if patch_size is not None else self.patch_size
patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
if image_height % patch_height != 0:
raise ValueError(f"{image_height=} must be divisible by {patch_height}")
if image_width % patch_width != 0:
raise ValueError(f"{image_width=} must be divisible by {patch_width}")
num_patches_per_dim_h = image_height // patch_height
num_patches_per_dim_w = image_width // patch_width
num_patches = num_patches_per_dim_h * num_patches_per_dim_w
```
These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
```python
def get_image_size_with_most_features(self) -> ImageSize:
image_processor = self.get_image_processor()
return ImageSize(width=image_processor.size["width"],
height=image_processor.size["height"])
```
Fuyu does not expect image placeholders in the inputs to HF processor, so
the dummy prompt text is empty regardless of the number of images.
```python
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
return ""
```
For the multimodal image profiling data, the logic is very similar to LLaVA:
```python
def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
target_width, target_height = \
self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0)
return {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images)
}
```
## 4. Specify processing details
Afterwards, create a subclass of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]
to fill in the missing details about HF processing.
!!! info
[Multi-Modal Data Processing][mm-processing]
### Multi-modal fields
Override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] to
return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items.
=== "Basic example: LLaVA"
The output of `CLIPImageProcessor` is a simple tensor with shape
`(num_images, num_channels, image_height, image_width)`:
```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
for image in all_images
]
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
```
So, we override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows:
```python
def _get_mm_fields_config(
self,
hf_inputs: BatchFeature,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
return dict(
pixel_values=MultiModalFieldConfig.batched("image"),
)
```
!!! note
Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
=== "With postprocessing: Fuyu"
The `image_patches` output of `FuyuImageProcessor.preprocess_with_tokenizer_info` concatenates
the patches from each image belonging to an item in the batch:
```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L673-L679
image_input_ids.append(tensor_of_image_ids)
image_patches.append(patches)
else:
image_input_ids.append(torch.tensor([], dtype=torch.int32, device=image_input.device))
batch_image_input_ids.append(image_input_ids)
batch_image_patches.append(image_patches)
```
The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore
`(1, num_images, num_patches, patch_width * patch_height * num_channels)`.
In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA,
we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]:
```python
def _call_hf_processor(
self,
prompt: str,
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
) -> BatchFeature:
processed_outputs = super()._call_hf_processor(
prompt=prompt,
mm_data=mm_data,
mm_kwargs=mm_kwargs,
)
image_patches = processed_outputs.get("image_patches")
if image_patches is not None:
images = mm_data["images"]
assert isinstance(images, list)
# Original output: (1, num_images, Pn, Px * Py * C)
# New output: (num_images, Pn, Px * Py * C)
assert (isinstance(image_patches, list)
and len(image_patches) == 1)
assert (isinstance(image_patches[0], torch.Tensor)
and len(image_patches[0]) == len(images))
processed_outputs["image_patches"] = image_patches[0]
return processed_outputs
```
!!! note
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
for text-only inputs to prevent unnecessary warnings from HF processor.
This lets us override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows:
```python
def _get_mm_fields_config(
self,
hf_inputs: BatchFeature,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
return dict(image_patches=MultiModalFieldConfig.batched("image"))
```
### Prompt updates
Override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] to
return a list of [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instances.
Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies an update operation
(e.g.: insertion, replacement) performed by the HF processor.
=== "Basic example: LLaVA"
Looking at HF's `LlavaProcessor`:
```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170
prompt_strings = []
for sample in text:
sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
prompt_strings.append(sample)
```
It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows:
```python
def _get_prompt_updates(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index
def get_replacement(item_idx: int):
images = mm_items.get_items("image", ImageProcessorItems)
image_size = images.get_image_size(item_idx)
num_image_tokens = self.info.get_num_image_tokens(
image_width=image_size.width,
image_height=image_size.height,
)
return [image_token_id] * num_image_tokens
return [
PromptReplacement(
modality="image",
target=[image_token_id],
replacement=get_replacement,
),
]
```
=== "Handling additional tokens: Fuyu"
Recall the layout of feature tokens from Step 2:
```
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
...
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
```
We define a helper function to return `ncols` and `nrows` directly:
```python
def get_image_feature_grid_size(
self,
*,
image_width: int,
image_height: int,
) -> tuple[int, int]:
image_processor = self.get_image_processor()
target_width = image_processor.size["width"]
target_height = image_processor.size["height"]
patch_width = image_processor.patch_size["width"]
patch_height = image_processor.patch_size["height"]
if not (image_width <= target_width and image_height <= target_height):
height_scale_factor = target_height / image_height
width_scale_factor = target_width / image_width
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
image_height = int(image_height * optimal_scale_factor)
image_width = int(image_width * optimal_scale_factor)
ncols = math.ceil(image_width / patch_width)
nrows = math.ceil(image_height / patch_height)
return ncols, nrows
```
Based on this, we can initially define our replacement tokens as:
```python
def get_replacement(item_idx: int):
images = mm_items.get_items("image", ImageProcessorItems)
image_size = images.get_image_size(item_idx)
ncols, nrows = self.info.get_image_feature_grid_size(
image_width=image_size.width,
image_height=image_size.height,
)
# `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
# `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
```
However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
a BOS token (`<s>`) is also added to the prompt:
```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
image_input=tensor_batch_images,
image_present=image_present,
image_unpadded_h=image_unpadded_heights,
image_unpadded_w=image_unpadded_widths,
image_placeholder_id=image_placeholder_id,
image_newline_id=image_newline_id,
variable_sized=True,
)
prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
tokenizer=self.tokenizer,
prompts=prompts,
scale_factors=scale_factors,
max_tokens_to_generate=self.max_tokens_to_generate,
max_position_embeddings=self.max_position_embeddings,
add_BOS=True,
add_beginning_of_answer_token=True,
)
```
To assign the vision embeddings to only the image tokens, instead of a string
you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]:
```python
hf_config = self.info.get_hf_config()
bos_token_id = hf_config.bos_token_id # `<s>`
assert isinstance(bos_token_id, int)
def get_replacement_fuyu(item_idx: int):
images = mm_items.get_items("image", ImageProcessorItems)
image_size = images.get_image_size(item_idx)
ncols, nrows = self.info.get_image_feature_grid_size(
image_width=image_size.width,
image_height=image_size.height,
)
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id],
embed_token_id=_IMAGE_TOKEN_ID,
)
```
Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
we can search for it to conduct the replacement at the start of the string:
```python
def _get_prompt_updates(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config()
bos_token_id = hf_config.bos_token_id
assert isinstance(bos_token_id, int)
tokenizer = self.info.get_tokenizer()
eot_token_id = tokenizer.bos_token_id
assert isinstance(eot_token_id, int)
def get_replacement_fuyu(item_idx: int):
images = mm_items.get_items("image", ImageProcessorItems)
image_size = images.get_image_size(item_idx)
ncols, nrows = self.info.get_image_feature_grid_size(
image_width=image_size.width,
image_height=image_size.height,
)
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id],
embed_token_id=_IMAGE_TOKEN_ID,
)
return [
PromptReplacement(
modality="image",
target=[eot_token_id],
replacement=get_replacement_fuyu,
)
]
```
## 5. Register processor-related classes
After you have defined [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo] (Step 2),
[BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] (Step 3),
and [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] (Step 4),
decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor <vllm.multimodal.registry.MultiModalRegistry.register_processor>`
to register them to the multi-modal registry:
```diff
from vllm.model_executor.models.interfaces import SupportsMultiModal
+ from vllm.multimodal import MULTIMODAL_REGISTRY
+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
+ info=YourProcessingInfo,
+ dummy_inputs=YourDummyInputsBuilder)
class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
```
## Notes
### Inserting feature tokens without replacement
Some HF processors directly insert feature tokens without replacing anything in the original prompt. In that case, you can use [PromptInsertion][vllm.multimodal.processing.PromptInsertion] instead of [PromptReplacement][vllm.multimodal.processing.PromptReplacement] inside [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates].
Examples:
- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
- Florence2 (insert at start of prompt): <gh-file:vllm/model_executor/models/florence2.py>
- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
### Handling prompt updates unrelated to multi-modal data
[_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] assumes that each application of prompt update corresponds to one multi-modal item. If the HF processor performs additional processing regardless of how many multi-modal items there are, you should override [_apply_hf_processor_tokens_only][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_tokens_only] so that the processed token inputs are consistent with the result of applying the HF processor on text inputs. This is because token inputs bypass the HF processor according to [our design][mm-processing].
Examples:
- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
### Custom HF processor
Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to [_call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor].
Examples:
- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
- InternVL: <gh-file:vllm/model_executor/models/internvl.py>
- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>


@@ -1,33 +1,32 @@
-(new-model-registration)=
-
-# Registering a Model to vLLM
+---
+title: Registering a Model to vLLM
+---
+
+[](){ #new-model-registration }
 
 vLLM relies on a model registry to determine how to run each model.
-A list of pre-registered architectures can be found [here](#supported-models).
+A list of pre-registered architectures can be found [here][supported-models].
 If your model is not on this list, you must register it to vLLM.
 This page provides detailed instructions on how to do so.
 
 ## Built-in models
 
-To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](#build-from-source).
+To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source].
 This gives you the ability to modify the codebase and test your model.
 
-After you have implemented your model (see [tutorial](#new-model-basic)), put it into the <gh-dir:vllm/model_executor/models> directory.
+After you have implemented your model (see [tutorial][new-model-basic]), put it into the <gh-dir:vllm/model_executor/models> directory.
 Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
-Finally, update our [list of supported models](#supported-models) to promote your model!
+Finally, update our [list of supported models][supported-models] to promote your model!
 
-:::{important}
-The list of models in each section should be maintained in alphabetical order.
-:::
+!!! warning
+    The list of models in each section should be maintained in alphabetical order.
 
 ## Out-of-tree models
 
 You can load an external model using a plugin without modifying the vLLM codebase.
 
-:::{seealso}
-[vLLM's Plugin System](#plugin-system)
-:::
+!!! info
+    [vLLM's Plugin System][plugin-system]
 
 To register the model, use the following code:

@@ -45,11 +44,9 @@ from vllm import ModelRegistry
 ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
 ```
 
-:::{important}
-If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
-Read more about that [here](#supports-multimodal).
-:::
+!!! warning
+    If your model is a multimodal model, ensure the model class implements the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface.
+    Read more about that [here][supports-multimodal].
 
-:::{note}
-Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
-:::
+!!! note
+    Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
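As a companion to the out-of-tree registration snippet above, a hedged sketch of packaging it as a vLLM plugin, assuming the `vllm.general_plugins` entry-point group used by vLLM's plugin system; the package and module names are placeholders:

```python
# setup.py of your plugin package (illustrative sketch only).
from setuptools import setup

setup(
    name="vllm-your-model-plugin",
    version="0.1",
    packages=["your_code"],
    entry_points={
        "vllm.general_plugins": [
            # `your_code:register` would call ModelRegistry.register_model(...)
            # as shown in the snippet above.
            "register_your_model = your_code:register",
        ]
    },
)
```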


@ -1,6 +1,7 @@
(new-model-tests)= ---
title: Writing Unit Tests
# Writing Unit Tests ---
[](){ #new-model-tests }
This page explains how to write unit tests to verify the implementation of your model. This page explains how to write unit tests to verify the implementation of your model.
@ -14,14 +15,12 @@ Without them, the CI for your PR will fail.
Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>. Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM. This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
:::{important} !!! warning
The list of models in each section should be maintained in alphabetical order. The list of models in each section should be maintained in alphabetical order.
:::
:::{tip} !!! tip
If your model requires a development version of HF Transformers, you can set If your model requires a development version of HF Transformers, you can set
`min_transformers_version` to skip the test in CI until the model is released. `min_transformers_version` to skip the test in CI until the model is released.
:::
## Optional Tests ## Optional Tests
@ -34,16 +33,16 @@ These tests compare the model outputs of vLLM against [HF Transformers](https://
#### Generative models #### Generative models
For [generative models](#generative-models), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>: For [generative models][generative-models], there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>:
- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF. - Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa. - Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
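A logprobs-similarity test usually follows the pattern below. This is only a sketch: the fixture names (`hf_runner`, `vllm_runner`, `example_prompts`) and helper methods mirror existing model tests, but check <gh-file:tests/models/utils.py> and a neighbouring test file for the exact signatures before copying it.

```python
# Rough sketch of a logprobs-similarity test; names mirror existing tests but may differ.
import pytest

from ..utils import check_logprobs_close  # helper from tests/models/utils.py

MODELS = ["your-org/your-model-7b"]  # hypothetical example repository


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, example_prompts,
                model, max_tokens, num_logprobs):
    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
```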
#### Pooling models #### Pooling models
For [pooling models](#pooling-models), we simply check the cosine similarity, as defined in <gh-file:tests/models/embedding/utils.py>. For [pooling models][pooling-models], we simply check the cosine similarity, as defined in <gh-file:tests/models/embedding/utils.py>.
(mm-processing-tests)= [](){ #mm-processing-tests }
### Multi-modal processing ### Multi-modal processing


@ -27,7 +27,21 @@ See <gh-file:LICENSE>.
## Developing ## Developing
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
Check out the [building from source](#build-from-source) documentation for details. Check out the [building from source][build-from-source] documentation for details.
### Building the docs
Install the dependencies:
```bash
pip install -r requirements/docs.txt
```
Start the autoreloading MkDocs server:
```bash
mkdocs serve
```
## Testing ## Testing
@ -48,29 +62,25 @@ pre-commit run mypy-3.9 --hook-stage manual --all-files
pytest tests/ pytest tests/
``` ```
:::{tip} !!! tip
Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12. Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment. Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
:::
:::{note} !!! note
Currently, the repository is not fully checked by `mypy`. Currently, the repository is not fully checked by `mypy`.
:::
:::{note} !!! note
Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU
platform to run unit tests locally, rely on the continuous integration system to run the tests for platform to run unit tests locally, rely on the continuous integration system to run the tests for
now. now.
:::
## Issues ## Issues
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
:::{important} !!! warning
If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability). If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
:::
## Pull Requests & Code Reviews ## Pull Requests & Code Reviews
@ -106,9 +116,8 @@ appropriately to indicate the type of change. Please use one of the following:
- `[Misc]` for PRs that do not fit the above categories. Please use this - `[Misc]` for PRs that do not fit the above categories. Please use this
sparingly. sparingly.
:::{note} !!! note
If the PR spans more than one category, please include all relevant prefixes. If the PR spans more than one category, please include all relevant prefixes.
:::
### Code Quality ### Code Quality


@ -1,8 +1,7 @@
# Profiling vLLM # Profiling vLLM
:::{warning} !!! warning
Profiling is only intended for vLLM developers and maintainers to understand the proportion of time spent in different parts of the codebase. **vLLM end-users should never turn on profiling** as it will significantly slow down the inference. Profiling is only intended for vLLM developers and maintainers to understand the proportion of time spent in different parts of the codebase. **vLLM end-users should never turn on profiling** as it will significantly slow down the inference.
:::
## Profile with PyTorch Profiler ## Profile with PyTorch Profiler
@ -14,15 +13,13 @@ When using `benchmarks/benchmark_serving.py`, you can enable profiling by passin
Traces can be visualized using <https://ui.perfetto.dev/>. Traces can be visualized using <https://ui.perfetto.dev/>.
:::{tip} !!! tip
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
:::
:::{tip} !!! tip
To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
`export VLLM_RPC_TIMEOUT=1800000` `export VLLM_RPC_TIMEOUT=1800000`
:::
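As an offline illustration, the profiler can also be driven directly from the `LLM` entrypoint. This is a sketch, not part of the original guide: it assumes the `VLLM_TORCH_PROFILER_DIR` environment variable read by the PyTorch profiler integration, and that the `start_profile`/`stop_profile` methods are available in your vLLM version.

```python
# Minimal offline profiling sketch (assumes a vLLM version with start/stop_profile).
import os

# Must be set before vLLM reads its environment; shown inline for illustration only.
os.environ.setdefault("VLLM_TORCH_PROFILER_DIR", "/tmp/vllm_profile")

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # a small model keeps the traces manageable

llm.start_profile()
llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
llm.stop_profile()  # traces are flushed to VLLM_TORCH_PROFILER_DIR
```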
### Example commands and usage ### Example commands and usage

docs/deployment/docker.md Normal file

@ -0,0 +1,126 @@
---
title: Using Docker
---
[](){ #deployment-docker }
[](){ #deployment-docker-pre-built-image }
## Use vLLM's Official Docker Image
vLLM offers an official Docker image for deployment.
The image can be used to run an OpenAI-compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags).
```console
$ docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--model mistralai/Mistral-7B-v0.1
```
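Once the server is up, it can be queried with any OpenAI-compatible client; for example (adjust the host, port, and model name to match your deployment):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="mistralai/Mistral-7B-v0.1",
    prompt="San Francisco is a",
    max_tokens=32,
)
print(completion.choices[0].text)
```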
This image can also be used with other container engines such as [Podman](https://podman.io/).
```console
$ podman run --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--model mistralai/Mistral-7B-v0.1
```
You can add any other [engine-args][engine-args] you need after the image tag (`vllm/vllm-openai:latest`).
!!! note
You can either use the `ipc=host` flag or `--shm-size` flag to allow the
container to access the host's shared memory. vLLM uses PyTorch, which uses shared
memory to share data between processes under the hood, particularly for tensor parallel inference.
!!! note
Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>).
If you need to use those dependencies (having accepted the license terms),
create a custom Dockerfile on top of the base image with an extra layer that installs them:
```Dockerfile
FROM vllm/vllm-openai:v0.8.3
# e.g. install the `audio` optional dependencies
# NOTE: Make sure the version of vLLM matches the base image!
RUN uv pip install --system vllm[audio]==0.8.3
```
!!! tip
Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers).
To use the development version of `transformers`, create a custom Dockerfile on top of the base image
with an extra layer that installs their code from source:
```Dockerfile
FROM vllm/vllm-openai:latest
RUN uv pip install --system git+https://github.com/huggingface/transformers.git
```
[](){ #deployment-docker-build-image-from-source }
## Building vLLM's Docker Image from Source
You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
```console
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --file docker/Dockerfile
```
!!! note
By default vLLM will build for all GPU types for widest distribution. If you are just building for the
current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
for vLLM to find the current GPU type and build for that.
If you are using Podman instead of Docker, you might need to disable SELinux labeling by
adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
## Building for Arm64/aarch64
A Docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At the time of writing, this requires the use
of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
!!! note
Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits.
Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
```console
# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
$ python3 use_existing_torch.py
$ DOCKER_BUILDKIT=1 docker build . \
--file docker/Dockerfile \
--target vllm-openai \
--platform "linux/arm64" \
-t vllm/vllm-gh200-openai:latest \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
--build-arg torch_cuda_arch_list="9.0+PTX" \
--build-arg vllm_fa_cmake_gpu_arches="90-real"
```
## Use the custom-built vLLM Docker image
To run vLLM with the custom-built Docker image:
```console
$ docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
-p 8000:8000 \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
vllm/vllm-openai <args...>
```
The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
!!! note
**For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` .


@ -1,6 +1,7 @@
(deployment-anything-llm)= ---
title: Anything LLM
# Anything LLM ---
[](){ #deployment-anything-llm }
[Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting. [Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting.
@ -25,23 +26,19 @@ vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
- Base URL: http://{vllm server host}:{vllm server port}/v1 - Base URL: http://{vllm server host}:{vllm server port}/v1
- Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ` - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ`
:::{image} /assets/deployment/anything-llm-provider.png ![](../../assets/deployment/anything-llm-provider.png)
:::
- Back to home page, New Workspace --> create `vllm` workspace, and start to chat: - Back to home page, New Workspace --> create `vllm` workspace, and start to chat:
:::{image} /assets/deployment/anything-llm-chat-without-doc.png ![](../../assets/deployment/anything-llm-chat-without-doc.png)
:::
- Click the upload button: - Click the upload button:
- upload the doc - upload the doc
- select the doc and move to the workspace - select the doc and move to the workspace
- save and embed - save and embed
:::{image} /assets/deployment/anything-llm-upload-doc.png ![](../../assets/deployment/anything-llm-upload-doc.png)
:::
- Chat again: - Chat again:
:::{image} /assets/deployment/anything-llm-chat-with-doc.png ![](../../assets/deployment/anything-llm-chat-with-doc.png)
:::


@ -1,6 +1,7 @@
(deployment-bentoml)= ---
title: BentoML
# BentoML ---
[](){ #deployment-bentoml }
[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes. [BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.


@ -1,12 +1,11 @@
(deployment-cerebrium)= ---
title: Cerebrium
---
[](){ #deployment-cerebrium }
# Cerebrium
:::{raw} html
<p align="center"> <p align="center">
<img src="https://i.ibb.co/hHcScTT/Screenshot-2024-06-13-at-10-14-54.png" alt="vLLM_plus_cerebrium"/> <img src="https://i.ibb.co/hHcScTT/Screenshot-2024-06-13-at-10-14-54.png" alt="vLLM_plus_cerebrium"/>
</p> </p>
:::
vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications.


@ -1,6 +1,7 @@
(deployment-chatbox)= ---
title: Chatbox
# Chatbox ---
[](){ #deployment-chatbox }
[Chatbox](https://github.com/chatboxai/chatbox) is a desktop client for LLMs, available on Windows, Mac, Linux. [Chatbox](https://github.com/chatboxai/chatbox) is a desktop client for LLMs, available on Windows, Mac, Linux.
@ -27,10 +28,8 @@ vllm serve qwen/Qwen1.5-0.5B-Chat
- API Path: `/chat/completions` - API Path: `/chat/completions`
- Model: `qwen/Qwen1.5-0.5B-Chat` - Model: `qwen/Qwen1.5-0.5B-Chat`
:::{image} /assets/deployment/chatbox-settings.png ![](../../assets/deployment/chatbox-settings.png)
:::
- Go to `Just chat`, and start to chat: - Go to `Just chat`, and start to chat:
:::{image} /assets/deployment/chatbox-chat.png ![](../../assets/deployment/chatbox-chat.png)
:::

View File

@ -1,6 +1,7 @@
(deployment-dify)= ---
title: Dify
# Dify ---
[](){ #deployment-dify }
[Dify](https://github.com/langgenius/dify) is an open-source LLM app development platform. Its intuitive interface combines agentic AI workflow, RAG pipeline, agent capabilities, model management, observability features, and more, allowing you to quickly move from prototype to production. [Dify](https://github.com/langgenius/dify) is an open-source LLM app development platform. Its intuitive interface combines agentic AI workflow, RAG pipeline, agent capabilities, model management, observability features, and more, allowing you to quickly move from prototype to production.
@ -42,15 +43,12 @@ docker compose up -d
- **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat` - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat`
- **Completion Mode**: `Completion` - **Completion Mode**: `Completion`
:::{image} /assets/deployment/dify-settings.png ![](../../assets/deployment/dify-settings.png)
:::
- To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type: - To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type:
:::{image} /assets/deployment/dify-create-chatbot.png ![](../../assets/deployment/dify-create-chatbot.png)
:::
- Click the chatbot you just created to open the chat interface and start interacting with the model: - Click the chatbot you just created to open the chat interface and start interacting with the model:
:::{image} /assets/deployment/dify-chat.png ![](../../assets/deployment/dify-chat.png)
:::


@ -1,12 +1,11 @@
(deployment-dstack)= ---
title: dstack
---
[](){ #deployment-dstack }
# dstack
:::{raw} html
<p align="center"> <p align="center">
<img src="https://i.ibb.co/71kx6hW/vllm-dstack.png" alt="vLLM_plus_dstack"/> <img src="https://i.ibb.co/71kx6hW/vllm-dstack.png" alt="vLLM_plus_dstack"/>
</p> </p>
:::
vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment. vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment.
@ -97,6 +96,5 @@ completion = client.chat.completions.create(
print(completion.choices[0].message.content) print(completion.choices[0].message.content)
``` ```
:::{note} !!! note
dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm) dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm)
:::


@ -0,0 +1,95 @@
---
title: Helm
---
[](){ #deployment-helm }
A Helm chart to deploy vLLM for Kubernetes
Helm is a package manager for Kubernetes. It helps you deploy vLLM on Kubernetes and automate the deployment of vLLM applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.
This guide walks you through the process of deploying vLLM with Helm, including the necessary prerequisites, the steps for installing the chart, and documentation of the architecture and the values file.
## Prerequisites
Before you begin, ensure that you have the following:
- A running Kubernetes cluster
- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
- Available GPU resources in your cluster
- S3 with the model which will be deployed
## Installing the chart
To install the chart with the release name `test-vllm`:
```console
helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY
```
## Uninstalling the Chart
To uninstall the `test-vllm` deployment:
```console
helm uninstall test-vllm --namespace=ns-vllm
```
The command removes all the Kubernetes components associated with the
chart **including persistent volumes** and deletes the release.
## Architecture
![](../../assets/deployment/architecture_helm_deployment.png)
## Values
| Key | Type | Default | Description |
|--------------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------|
| autoscaling | object | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} | Autoscaling configuration |
| autoscaling.enabled | bool | false | Enable autoscaling |
| autoscaling.maxReplicas | int | 100 | Maximum replicas |
| autoscaling.minReplicas | int | 1 | Minimum replicas |
| autoscaling.targetCPUUtilizationPercentage | int | 80 | Target CPU utilization for autoscaling |
| configs | object | {} | Configmap |
| containerPort | int | 8000 | Container port |
| customObjects | list | [] | Custom Objects configuration |
| deploymentStrategy | object | {} | Deployment strategy configuration |
| externalConfigs | list | [] | External configuration |
| extraContainers | list | [] | Additional containers configuration |
| extraInit | object | {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} | Additional configuration for the init container |
| extraInit.pvcStorage | string | "50Gi" | Storage size of the s3 |
| extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | Path of the model on the s3 which hosts model weights and config files |
| extraInit.awsEc2MetadataDisabled | boolean | true | Disables the use of the Amazon EC2 instance metadata service |
| extraPorts | list | [] | Additional ports configuration |
| gpuModels | list | ["TYPE_GPU_USED"] | Type of gpu used |
| image | object | {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} | Image configuration |
| image.command | list | ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] | Container launch command |
| image.repository | string | "vllm/vllm-openai" | Image repository |
| image.tag | string | "latest" | Image tag |
| livenessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} | Liveness probe configuration |
| livenessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive |
| livenessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the Kubelet http request on the server |
| livenessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server |
| livenessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening |
| livenessProbe.initialDelaySeconds | int | 15 | Number of seconds after the container has started before liveness probe is initiated |
| livenessProbe.periodSeconds | int | 10 | How often (in seconds) to perform the liveness probe |
| maxUnavailablePodDisruptionBudget | string | "" | Disruption Budget Configuration |
| readinessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} | Readiness probe configuration |
| readinessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready |
| readinessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the Kubelet http request on the server |
| readinessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server |
| readinessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening |
| readinessProbe.initialDelaySeconds | int | 5 | Number of seconds after the container has started before readiness probe is initiated |
| readinessProbe.periodSeconds | int | 5 | How often (in seconds) to perform the readiness probe |
| replicaCount | int | 1 | Number of replicas |
| resources | object | {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} | Resource configuration |
| resources.limits."nvidia.com/gpu" | int | 1 | Number of gpus used |
| resources.limits.cpu | int | 4 | Number of CPUs |
| resources.limits.memory | string | "16Gi" | CPU memory configuration |
| resources.requests."nvidia.com/gpu" | int | 1 | Number of gpus used |
| resources.requests.cpu | int | 4 | Number of CPUs |
| resources.requests.memory | string | "16Gi" | CPU memory configuration |
| secrets | object | {} | Secrets configuration |
| serviceName | string | Service name | |
| servicePort | int | 80 | Service port |
| labels.environment | string | test | Environment name |


@ -1,6 +1,7 @@
(deployment-litellm)= ---
title: LiteLLM
# LiteLLM ---
[](){ #deployment-litellm }
[LiteLLM](https://github.com/BerriAI/litellm) call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.] [LiteLLM](https://github.com/BerriAI/litellm) call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.]


@ -1,6 +1,7 @@
(deployment-lobe-chat)= ---
title: Lobe Chat
# Lobe Chat ---
[](){ #deployment-lobe-chat }
[Lobe Chat](https://github.com/lobehub/lobe-chat) is an open-source, modern-design ChatGPT/LLMs UI/Framework. [Lobe Chat](https://github.com/lobehub/lobe-chat) is an open-source, modern-design ChatGPT/LLMs UI/Framework.


@ -1,6 +1,7 @@
(deployment-lws)= ---
title: LWS
# LWS ---
[](){ #deployment-lws }
LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads.
A major use case is for multi-host/multi-node distributed inference. A major use case is for multi-host/multi-node distributed inference.


@ -1,6 +1,7 @@
(deployment-modal)= ---
title: Modal
# Modal ---
[](){ #deployment-modal }
vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling. vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling.


@ -1,6 +1,7 @@
(deployment-open-webui)= ---
title: Open WebUI
# Open WebUI ---
[](){ #deployment-open-webui }
1. Install the [Docker](https://docs.docker.com/engine/install/) 1. Install the [Docker](https://docs.docker.com/engine/install/)
@ -25,5 +26,4 @@ ghcr.io/open-webui/open-webui:main
On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`. On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`.
:::{image} /assets/deployment/open_webui.png ![](../../assets/deployment/open_webui.png)
:::


@ -1,6 +1,7 @@
(deployment-retrieval-augmented-generation)= ---
title: Retrieval-Augmented Generation
# Retrieval-Augmented Generation ---
[](){ #deployment-retrieval-augmented-generation }
[Retrieval-augmented generation (RAG)](https://en.wikipedia.org/wiki/Retrieval-augmented_generation) is a technique that enables generative artificial intelligence (Gen AI) models to retrieve and incorporate new information. It modifies interactions with a large language model (LLM) so that the model responds to user queries with reference to a specified set of documents, using this information to supplement information from its pre-existing training data. This allows LLMs to use domain-specific and/or updated information. Use cases include providing chatbot access to internal company data or generating responses based on authoritative sources. [Retrieval-augmented generation (RAG)](https://en.wikipedia.org/wiki/Retrieval-augmented_generation) is a technique that enables generative artificial intelligence (Gen AI) models to retrieve and incorporate new information. It modifies interactions with a large language model (LLM) so that the model responds to user queries with reference to a specified set of documents, using this information to supplement information from its pre-existing training data. This allows LLMs to use domain-specific and/or updated information. Use cases include providing chatbot access to internal company data or generating responses based on authoritative sources.


@ -1,12 +1,11 @@
(deployment-skypilot)= ---
title: SkyPilot
---
[](){ #deployment-skypilot }
# SkyPilot
:::{raw} html
<p align="center"> <p align="center">
<img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/> <img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/>
</p> </p>
:::
vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html).
@ -104,10 +103,8 @@ service:
max_completion_tokens: 1 max_completion_tokens: 1
``` ```
:::{raw} html
<details> <details>
<summary>Click to see the full recipe YAML</summary> <summary>Click to see the full recipe YAML</summary>
:::
```yaml ```yaml
service: service:
@ -153,9 +150,7 @@ run: |
2>&1 | tee api_server.log 2>&1 | tee api_server.log
``` ```
:::{raw} html
</details> </details>
:::
Start the serving the Llama-3 8B model on multiple replicas: Start the serving the Llama-3 8B model on multiple replicas:
@ -169,10 +164,8 @@ Wait until the service is ready:
watch -n10 sky serve status vllm watch -n10 sky serve status vllm
``` ```
:::{raw} html
<details> <details>
<summary>Example outputs:</summary> <summary>Example outputs:</summary>
:::
```console ```console
Services Services
@ -185,9 +178,7 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R
vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4
``` ```
:::{raw} html
</details> </details>
:::
After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
@ -223,10 +214,8 @@ service:
This will scale the service up to when the QPS exceeds 2 for each replica. This will scale the service up to when the QPS exceeds 2 for each replica.
:::{raw} html
<details> <details>
<summary>Click to see the full recipe YAML</summary> <summary>Click to see the full recipe YAML</summary>
:::
```yaml ```yaml
service: service:
@ -275,9 +264,7 @@ run: |
2>&1 | tee api_server.log 2>&1 | tee api_server.log
``` ```
:::{raw} html
</details> </details>
:::
To update the service with the new config: To update the service with the new config:
@ -295,10 +282,8 @@ sky serve down vllm
It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas. It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas.
:::{raw} html
<details> <details>
<summary>Click to see the full GUI YAML</summary> <summary>Click to see the full GUI YAML</summary>
:::
```yaml ```yaml
envs: envs:
@ -328,9 +313,7 @@ run: |
--stop-token-ids 128009,128001 | tee ~/gradio.log --stop-token-ids 128009,128001 | tee ~/gradio.log
``` ```
:::{raw} html
</details> </details>
:::
1. Start the chat web UI: 1. Start the chat web UI:


@ -1,6 +1,7 @@
(deployment-streamlit)= ---
title: Streamlit
# Streamlit ---
[](){ #deployment-streamlit }
[Streamlit](https://github.com/streamlit/streamlit) lets you transform Python scripts into interactive web apps in minutes, instead of weeks. Build dashboards, generate reports, or create chat apps. [Streamlit](https://github.com/streamlit/streamlit) lets you transform Python scripts into interactive web apps in minutes, instead of weeks. Build dashboards, generate reports, or create chat apps.
@ -38,5 +39,4 @@ VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" streamlit run stream
streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug
``` ```
:::{image} /assets/deployment/streamlit-chat.png ![](../../assets/deployment/streamlit-chat.png)
:::

View File

@ -1,5 +1,6 @@
(deployment-triton)= ---
title: NVIDIA Triton
# NVIDIA Triton ---
[](){ #deployment-triton }
The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details. The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details.


@ -1,6 +1,7 @@
(deployment-kserve)= ---
title: KServe
# KServe ---
[](){ #deployment-kserve }
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.


@ -1,6 +1,7 @@
(deployment-kubeai)= ---
title: KubeAI
# KubeAI ---
[](){ #deployment-kubeai }
[KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies. [KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.


@ -1,6 +1,7 @@
(deployment-llamastack)= ---
title: Llama Stack
# Llama Stack ---
[](){ #deployment-llamastack }
vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) . vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) .


@ -1,6 +1,7 @@
(deployment-llmaz)= ---
title: llmaz
# llmaz ---
[](){ #deployment-llmaz }
[llmaz](https://github.com/InftyAI/llmaz) is an easy-to-use and advanced inference platform for large language models on Kubernetes, aimed for production use. It uses vLLM as the default model serving backend. [llmaz](https://github.com/InftyAI/llmaz) is an easy-to-use and advanced inference platform for large language models on Kubernetes, aimed for production use. It uses vLLM as the default model serving backend.


@ -1,6 +1,7 @@
(deployment-production-stack)= ---
title: Production stack
# Production stack ---
[](){ #deployment-production-stack }
Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using the [vLLM production stack](https://github.com/vllm-project/production-stack). Born out of a Berkeley-UChicago collaboration, [vLLM production stack](https://github.com/vllm-project/production-stack) is an officially released, production-optimized codebase under the [vLLM project](https://github.com/vllm-project), designed for LLM deployment with: Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using the [vLLM production stack](https://github.com/vllm-project/production-stack). Born out of a Berkeley-UChicago collaboration, [vLLM production stack](https://github.com/vllm-project/production-stack) is an officially released, production-optimized codebase under the [vLLM project](https://github.com/vllm-project), designed for LLM deployment with:
@ -114,7 +115,7 @@ To remove the deployment, run:
sudo helm uninstall vllm sudo helm uninstall vllm
``` ```
------ ---
### (Advanced) Configuring vLLM production stack ### (Advanced) Configuring vLLM production stack


@ -1,6 +1,7 @@
(deployment-k8s)= ---
title: Using Kubernetes
# Using Kubernetes ---
[](){ #deployment-k8s }
Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes. Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.
@ -19,9 +20,8 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
## Deployment with CPUs ## Deployment with CPUs
:::{note} !!! note
The use of CPUs here is for demonstration and testing purposes only and its performance will not be on par with GPUs. The use of CPUs here is for demonstration and testing purposes only and its performance will not be on par with GPUs.
:::
First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model: First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model:


@ -1,20 +1,21 @@
(nginxloadbalancer)= ---
title: Using Nginx
# Using Nginx ---
[](){ #nginxloadbalancer }
This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
Table of contents: Table of contents:
1. [Build Nginx Container](#nginxloadbalancer-nginx-build) 1. [Build Nginx Container][nginxloadbalancer-nginx-build]
2. [Create Simple Nginx Config file](#nginxloadbalancer-nginx-conf) 2. [Create Simple Nginx Config file][nginxloadbalancer-nginx-conf]
3. [Build vLLM Container](#nginxloadbalancer-nginx-vllm-container) 3. [Build vLLM Container][nginxloadbalancer-nginx-vllm-container]
4. [Create Docker Network](#nginxloadbalancer-nginx-docker-network) 4. [Create Docker Network][nginxloadbalancer-nginx-docker-network]
5. [Launch vLLM Containers](#nginxloadbalancer-nginx-launch-container) 5. [Launch vLLM Containers][nginxloadbalancer-nginx-launch-container]
6. [Launch Nginx](#nginxloadbalancer-nginx-launch-nginx) 6. [Launch Nginx][nginxloadbalancer-nginx-launch-nginx]
7. [Verify That vLLM Servers Are Ready](#nginxloadbalancer-nginx-verify-nginx) 7. [Verify That vLLM Servers Are Ready][nginxloadbalancer-nginx-verify-nginx]
(nginxloadbalancer-nginx-build)= [](){ #nginxloadbalancer-nginx-build }
## Build Nginx Container ## Build Nginx Container
@ -39,7 +40,7 @@ Build the container:
docker build . -f Dockerfile.nginx --tag nginx-lb docker build . -f Dockerfile.nginx --tag nginx-lb
``` ```
(nginxloadbalancer-nginx-conf)= [](){ #nginxloadbalancer-nginx-conf }
## Create Simple Nginx Config file ## Create Simple Nginx Config file
@ -63,7 +64,7 @@ server {
} }
``` ```
(nginxloadbalancer-nginx-vllm-container)= [](){ #nginxloadbalancer-nginx-vllm-container }
## Build vLLM Container ## Build vLLM Container
@ -79,7 +80,7 @@ cd $vllm_root
docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
``` ```
(nginxloadbalancer-nginx-docker-network)= [](){ #nginxloadbalancer-nginx-docker-network }
## Create Docker Network ## Create Docker Network
@ -87,7 +88,7 @@ docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_prox
docker network create vllm_nginx docker network create vllm_nginx
``` ```
(nginxloadbalancer-nginx-launch-container)= [](){ #nginxloadbalancer-nginx-launch-container }
## Launch vLLM Containers ## Launch vLLM Containers
@ -105,11 +106,10 @@ docker run -itd --ipc host --network vllm_nginx --gpus device=0 --shm-size=10.24
docker run -itd --ipc host --network vllm_nginx --gpus device=1 --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf docker run -itd --ipc host --network vllm_nginx --gpus device=1 --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf
``` ```
:::{note} !!! note
If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
:::
(nginxloadbalancer-nginx-launch-nginx)= [](){ #nginxloadbalancer-nginx-launch-nginx }
## Launch Nginx ## Launch Nginx
@ -117,7 +117,7 @@ If you are behind proxy, you can pass the proxy settings to the docker run comma
docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest
``` ```
(nginxloadbalancer-nginx-verify-nginx)= [](){ #nginxloadbalancer-nginx-verify-nginx }
## Verify That vLLM Servers Are Ready ## Verify That vLLM Servers Are Ready


@ -1,22 +1,18 @@
(arch-overview)= ---
title: Architecture Overview
# Architecture Overview ---
[](){ #arch-overview }
This document provides an overview of the vLLM architecture. This document provides an overview of the vLLM architecture.
:::{contents} Table of Contents [TOC]
:depth: 2
:local: true
:::
## Entrypoints ## Entrypoints
vLLM provides a number of entrypoints for interacting with the system. The vLLM provides a number of entrypoints for interacting with the system. The
following diagram shows the relationship between them. following diagram shows the relationship between them.
:::{image} /assets/design/arch_overview/entrypoints.excalidraw.png ![Entrypoints Diagram](../assets/design/arch_overview/entrypoints.excalidraw.png)
:alt: Entrypoints Diagram
:::
### LLM Class ### LLM Class
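As a quick illustration, offline inference through the `LLM` class looks roughly like this (the model name is only an example):

```python
from vllm import LLM, SamplingParams

prompts = ["The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, max_tokens=32)

llm = LLM(model="facebook/opt-125m")  # wraps the LLMEngine described below
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    print(output.outputs[0].text)
```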
@ -77,16 +73,14 @@ python -m vllm.entrypoints.openai.api_server --model <model>
That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>. That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
More details on the API server can be found in the [OpenAI-Compatible Server](#openai-compatible-server) document. More details on the API server can be found in the [OpenAI-Compatible Server][openai-compatible-server] document.
## LLM Engine ## LLM Engine
The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of
the vLLM system, handling model inference and asynchronous request processing. the vLLM system, handling model inference and asynchronous request processing.
:::{image} /assets/design/arch_overview/llm_engine.excalidraw.png ![LLMEngine Diagram](../assets/design/arch_overview/llm_engine.excalidraw.png)
:alt: LLMEngine Diagram
:::
### LLMEngine ### LLMEngine
@ -137,18 +131,16 @@ input tensors and capturing cudagraphs.
## Model ## Model
Every model runner object has one model object, which is the actual Every model runner object has one model object, which is the actual
`torch.nn.Module` instance. See [huggingface_integration](#huggingface-integration) for how various `torch.nn.Module` instance. See [huggingface_integration][huggingface-integration] for how various
configurations affect the class we ultimately get. configurations affect the class we ultimately get.
## Class Hierarchy ## Class Hierarchy
The following figure shows the class hierarchy of vLLM: The following figure shows the class hierarchy of vLLM:
> :::{figure} /assets/design/hierarchy.png > <figure markdown="span">
> :align: center > ![](../assets/design/hierarchy.png){ align="center" alt="query" width="100%" }
> :alt: query > </figure>
> :width: 100%
> :::
There are several important design choices behind this class hierarchy: There are several important design choices behind this class hierarchy:
@ -178,44 +170,43 @@ of a vision model and a language model. By making the constructor uniform, we
can easily create a vision model and a language model and compose them into a can easily create a vision model and a language model and compose them into a
vision-language model. vision-language model.
:::{note} !!! note
To support this change, all vLLM models' signatures have been updated to: To support this change, all vLLM models' signatures have been updated to:
```python ```python
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
```
To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
```python
class MyOldModel(nn.Module):
def __init__(
self,
config,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
lora_config: Optional[LoRAConfig] = None,
prefix: str = "",
) -> None:
...
from vllm.config import VllmConfig
class MyNewModel(MyOldModel):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config = vllm_config.model_config.hf_config ```
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
super().__init__(config, cache_config, quant_config, lora_config, prefix)
if __version__ >= "0.6.4": To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
MyModel = MyNewModel
else:
MyModel = MyOldModel
```
This way, the model can work with both old and new versions of vLLM. ```python
::: class MyOldModel(nn.Module):
def __init__(
self,
config,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
lora_config: Optional[LoRAConfig] = None,
prefix: str = "",
) -> None:
...
from vllm.config import VllmConfig
class MyNewModel(MyOldModel):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config = vllm_config.model_config.hf_config
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
super().__init__(config, cache_config, quant_config, lora_config, prefix)
if __version__ >= "0.6.4":
MyModel = MyNewModel
else:
MyModel = MyOldModel
```
This way, the model can work with both old and new versions of vLLM.
3\. **Sharding and Quantization at Initialization**: Certain features require 3\. **Sharding and Quantization at Initialization**: Certain features require
changing the model weights. For example, tensor parallelism needs to shard the changing the model weights. For example, tensor parallelism needs to shard the


@ -1,6 +1,7 @@
(design-automatic-prefix-caching)= ---
title: Automatic Prefix Caching
# Automatic Prefix Caching ---
[](){ #design-automatic-prefix-caching }
The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
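To experiment with the feature, it can be enabled directly on the `LLM` entrypoint; a minimal sketch (model name illustrative):

```python
from vllm import LLM, SamplingParams

# Requests that share a long common prefix (e.g. the same system prompt or document)
# can reuse previously computed KV blocks when prefix caching is enabled.
llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)

shared_prefix = "You are a helpful assistant. Context: " + "lorem ipsum " * 100
prompts = [shared_prefix + q for q in ["First question?", "Second question?"]]

outputs = llm.generate(prompts, SamplingParams(max_tokens=16))
```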


@ -1,6 +1,7 @@
(huggingface-integration)= ---
title: Integration with HuggingFace
# Integration with HuggingFace ---
[](){ #huggingface-integration }
This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`. This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`.


@ -1,6 +1,7 @@
(design-paged-attention)= ---
title: vLLM Paged Attention
# vLLM Paged Attention ---
[](){ #design-paged-attention }
- Currently, vLLM utilizes its own implementation of a multi-head query - Currently, vLLM utilizes its own implementation of a multi-head query
attention kernel (`csrc/attention/attention_kernels.cu`). attention kernel (`csrc/attention/attention_kernels.cu`).
@ -139,26 +140,22 @@
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
``` ```
:::{figure} ../../assets/kernel/query.png <figure markdown="span">
:align: center ![](../../assets/kernel/query.png){ align="center" alt="query" width="70%" }
:alt: query <figcaption>
:width: 70% </figcaption>
</figure>
Query data of one token at one head
:::
- Each thread defines its own `q_ptr` which points to the assigned - Each thread defines its own `q_ptr` which points to the assigned
query token data on global memory. For example, if `VEC_SIZE` is 4 query token data on global memory. For example, if `VEC_SIZE` is 4
and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains
total of 128 elements divided into 128 / 4 = 32 vecs. total of 128 elements divided into 128 / 4 = 32 vecs.
:::{figure} ../../assets/kernel/q_vecs.png <figure markdown="span">
:align: center ![](../../assets/kernel/q_vecs.png){ align="center" alt="q_vecs" width="70%" }
:alt: q_vecs <figcaption>
:width: 70% </figcaption>
</figure>
`q_vecs` for one thread group
:::
```cpp ```cpp
__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
@ -195,13 +192,11 @@
points to key token data based on `k_cache` at assigned block, points to key token data based on `k_cache` at assigned block,
assigned head and assigned token. assigned head and assigned token.
:::{figure} ../../assets/kernel/key.png <figure markdown="span">
:align: center ![](../../assets/kernel/key.png){ align="center" alt="key" width="70%" }
:alt: key <figcaption>
:width: 70% </figcaption>
</figure>
Key data of all context tokens at one head
:::
- The diagram above illustrates the memory layout for key data. It - The diagram above illustrates the memory layout for key data. It
assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
@ -214,13 +209,11 @@
elements for one token) that will be processed by 2 threads (one elements for one token) that will be processed by 2 threads (one
thread group) separately. thread group) separately.
:::{figure} ../../assets/kernel/k_vecs.png <figure markdown="span">
:align: center ![](../../assets/kernel/k_vecs.png){ align="center" alt="k_vecs" width="70%" }
:alt: k_vecs <figcaption>
:width: 70% </figcaption>
</figure>
`k_vecs` for one thread
:::
```cpp ```cpp
K_vec k_vecs[NUM_VECS_PER_THREAD] K_vec k_vecs[NUM_VECS_PER_THREAD]
@ -289,14 +282,12 @@
should be performed across the entire thread block, encompassing should be performed across the entire thread block, encompassing
results between the query token and all context key tokens. results between the query token and all context key tokens.
:::{math} $$
:nowrap: true
\begin{gather*} \begin{gather*}
m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\
\quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)}
\end{gather*} \end{gather*}
::: $$
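For intuition, plugging a small vector into these definitions (a worked example added here, values rounded):

$$
\begin{gather*}
x = (1, 2, 3), \quad m(x) = 3, \quad f(x) = \left(e^{-2},\ e^{-1},\ e^{0}\right) \approx (0.135,\ 0.368,\ 1.000) \\
\ell(x) \approx 1.503, \qquad \operatorname{softmax}(x) \approx (0.090,\ 0.245,\ 0.665)
\end{gather*}
$$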
### `qk_max` and `logits` ### `qk_max` and `logits`
@ -379,29 +370,23 @@
## Value ## Value
:::{figure} ../../assets/kernel/value.png <figure markdown="span">
:align: center ![](../../assets/kernel/value.png){ align="center" alt="value" width="70%" }
:alt: value <figcaption>
:width: 70% </figcaption>
</figure>
Value data of all context tokens at one head <figure markdown="span">
::: ![](../../assets/kernel/logits_vec.png){ align="center" alt="logits_vec" width="50%" }
<figcaption>
</figcaption>
</figure>
:::{figure} ../../assets/kernel/logits_vec.png <figure markdown="span">
:align: center ![](../../assets/kernel/v_vec.png){ align="center" alt="v_vec" width="70%" }
:alt: logits_vec <figcaption>
:width: 50% </figcaption>
</figure>
`logits_vec` for one thread
:::
:::{figure} ../../assets/kernel/v_vec.png
:align: center
:alt: v_vec
:width: 70%
List of `v_vec` for one thread
:::
- Now we need to retrieve the value data and perform dot multiplication - Now we need to retrieve the value data and perform dot multiplication
with `logits`. Unlike query and key, there is no thread group with `logits`. Unlike query and key, there is no thread group


@ -1,10 +1,11 @@
(mm-processing)= ---
title: Multi-Modal Data Processing
---
[](){ #mm-processing }
# Multi-Modal Data Processing To enable various optimizations in vLLM such as [chunked prefill][chunked-prefill] and [prefix caching][automatic-prefix-caching], we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
To enable various optimizations in vLLM such as [chunked prefill](#chunked-prefill) and [prefix caching](#automatic-prefix-caching), we use {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor. Here are the main features of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]:
Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`:
## Prompt Update Detection
@ -15,7 +16,7 @@ One of the main responsibilities of HF processor is to update the prompt with pl
The information about which tokens have been updated is key to finding the correspondence between placeholder feature tokens and multi-modal inputs.

In vLLM, this information is specified using [PromptUpdate][vllm.multimodal.processing.PromptUpdate] in [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates]. We can automatically detect whether HF has updated the prompt by checking the existence of the updated tokens.
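As a rough illustration of this detection idea (a toy sketch with made-up token ids, not the actual vLLM helper), one can check whether the expanded placeholder tokens already appear in the tokenized prompt:

```python
def contains_run(haystack: list[int], needle: list[int]) -> bool:
    """Return True if `needle` occurs as a contiguous run inside `haystack`."""
    n = len(needle)
    return n > 0 and any(haystack[i:i + n] == needle for i in range(len(haystack) - n + 1))

# Hypothetical ids: assume one image expands to 576 copies of placeholder id 32000.
placeholder_ids = [32000] * 576
prompt_token_ids = [1, 15043, *placeholder_ids, 29889]

print(contains_run(prompt_token_ids, placeholder_ids))  # True -> HF already updated the prompt
```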
## Tokenized Prompt Inputs
@ -43,22 +44,22 @@ While HF processors support text + multi-modal inputs natively, this is not so f
Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other.

[](){ #mm-dummy-text }

### Dummy text

We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text]. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
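For intuition only (the real method lives on each model's dummy-inputs builder and its exact signature may differ), the idea is roughly "one placeholder per multi-modal item":

```python
def get_dummy_text_sketch(mm_counts: dict[str, int], placeholder: str = "<image>") -> str:
    """Toy version: repeat the placeholder once per image, e.g. 2 images -> '<image><image>'."""
    return placeholder * mm_counts.get("image", 0)

print(get_dummy_text_sketch({"image": 2}))  # <image><image>
```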
[](){ #mm-automatic-prompt-updating }

### Automatic prompt updating

We address the second issue by implementing model-agnostic code in
[_apply_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_updates] to automatically update the prompt with feature placeholder tokens based on the specification outputted by [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates].

### Summary

With the help of dummy text and automatic prompt updating, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in [_apply_hf_processor_main][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main].
## Processor Output Caching
@ -66,4 +67,4 @@ Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238)
When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.

Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text][mm-dummy-text] to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating][mm-automatic-prompt-updating] afterwards to keep the output tokens and multi-modal data consistent with each other.
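A toy sketch of this check-then-merge pattern (not vLLM's actual cache; `process_batch` stands in for the HF processor call, and items are keyed by a precomputed hash):

```python
from typing import Any, Callable

cache: dict[str, Any] = {}  # item hash -> processed output

def process_with_cache(
    items: dict[str, Any],
    process_batch: Callable[[list[Any]], list[Any]],
) -> dict[str, Any]:
    """Process only the cache misses in one batch, then merge with cached results."""
    missing = {k: v for k, v in items.items() if k not in cache}
    if missing:
        outputs = process_batch(list(missing.values()))  # single batched call for the misses
        cache.update(dict(zip(missing.keys(), outputs)))
    return {k: cache[k] for k in items}  # results in the caller's original order

# Example "processor" that just upper-cases its inputs.
print(process_with_cache({"a1": "img_a", "b2": "img_b"}, lambda xs: [x.upper() for x in xs]))
```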
View File
@ -2,14 +2,13 @@
## Debugging

Please see the [Troubleshooting][troubleshooting-python-multiprocessing]
page for information on known issues and how to solve them.

## Introduction
!!! warning
    The source code references are to the state of the code at the time of writing in December, 2024.

The use of Python multiprocessing in vLLM is complicated by:
View File
@ -1,12 +1,13 @@
---
title: vLLM's Plugin System
---
[](){ #plugin-system }

The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM.

## How Plugins Work in vLLM

Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [Arch Overview][arch-overview]), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work.
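For intuition, a loader along these lines discovers installed plugins through Python entry points and invokes each one in the current process (a sketch only; the `vllm.general_plugins` group name and the Python 3.10+ `importlib.metadata` selection API are assumptions here, not a copy of the real function):

```python
from importlib.metadata import entry_points

def load_general_plugins_sketch(group: str = "vllm.general_plugins") -> None:
    """Find every entry point registered under `group` and run it once in this process."""
    for ep in entry_points(group=group):  # plugins advertise themselves via package entry points
        plugin_fn = ep.load()             # import the registered callable
        plugin_fn()                       # let the plugin register its custom components

load_general_plugins_sketch()  # would be called in every vLLM process before any work starts
```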
## How vLLM Discovers Plugins
View File
@ -57,7 +57,7 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics`
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)

These are documented under [Inferencing and Serving -> Production Metrics](../../serving/metrics.md).

### Grafana Dashboard
@ -222,9 +222,7 @@ And the calculated intervals are:
Put another way:

![Interval calculations - common case](../../assets/design/v1/metrics/intervals-1.png)
We explored the possibility of having the frontend calculate these
intervals using the timing of events visible by the frontend. However,
@ -239,17 +237,13 @@ When a preemption occurs during decode, since any already generated
tokens are reused, we consider the preemption as affecting the
inter-token, decode, and inference intervals.

![Interval calculations - preempted decode](../../assets/design/v1/metrics/intervals-2.png)
When a preemption occurs during prefill (assuming such an event
is possible), we consider the preemption as affecting the
time-to-first-token and prefill intervals.

![Interval calculations - preempted prefill](../../assets/design/v1/metrics/intervals-3.png)
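To make the interval definitions concrete, here is a back-of-the-envelope sketch of how they fall out of per-request event timestamps (illustrative field names and interval set; the actual stats code tracks more events than this):

```python
from dataclasses import dataclass

@dataclass
class RequestEvents:
    arrival: float          # request received by the frontend
    first_scheduled: float  # first time the scheduler runs the request
    first_token: float      # first output token emitted
    last_token: float       # final output token emitted

def intervals(ev: RequestEvents) -> dict[str, float]:
    return {
        "queue": ev.first_scheduled - ev.arrival,
        "prefill": ev.first_token - ev.first_scheduled,
        "time_to_first_token": ev.first_token - ev.arrival,
        "decode": ev.last_token - ev.first_token,
        "inference": ev.last_token - ev.first_scheduled,
    }

print(intervals(RequestEvents(arrival=0.0, first_scheduled=0.1, first_token=0.4, last_token=2.0)))
```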
### Frontend Stats Collection
@ -467,7 +461,7 @@ In general:
hatch](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/#show-hidden-metrics)
for some time before deleting them.

See the [deprecation policy](../../contributing/deprecation_policy.md) for
the project-wide deprecation policy.

### Unimplemented - `vllm:tokens_total`
View File
@ -122,9 +122,7 @@ There are two design points to highlight:
As a result, we will have the following components when the KV cache manager is initialized:

![Component Overview](../../assets/design/v1/prefix_caching/overview.png)

* Block Pool: A list of KVCacheBlock.
* Free Block Queue: Only stores the pointers of the head and tail blocks for manipulation.
@ -194,9 +192,7 @@ As can be seen, block 3 is a new full block and is cached. However, it is redund
When a request is finished, we free all its blocks if no other requests are using them (reference count = 0). In this example, we free request 1 and blocks 2, 3, 4, 8 associated with it. We can see that the freed blocks are added to the tail of the free queue in the *reverse* order. This is because the last block of a request must hash more tokens and is less likely to be reused by other requests. As a result, it should be evicted first.

![Free queue after a request is freed](../../assets/design/v1/prefix_caching/free.png)
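A toy model of this free-queue behavior (an ordinary Python `deque` standing in for the doubly linked list; block contents and hashing are omitted):

```python
from collections import deque

free_queue: deque[int] = deque()  # head = evicted first, tail = evicted last

def free_request_blocks(block_ids: list[int]) -> None:
    """Freed blocks go to the tail in reverse order, so a request's last block is evicted first."""
    free_queue.extend(reversed(block_ids))

def touch(block_id: int) -> None:
    """A cache hit pulls the block out of the free queue so it cannot be evicted."""
    free_queue.remove(block_id)

def evict() -> int:
    """Eviction always takes the block at the head of the queue."""
    return free_queue.popleft()

free_request_blocks([2, 3, 4])  # queue is now: 4, 3, 2
print(evict())                  # 4 -- the last block of the freed request is evicted first
```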
### Eviction (LRU)
@ -212,36 +208,24 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens.

![Example Time 1](../../assets/design/v1/prefix_caching/example-time-1.png)
**Time 3: Request 0 makes the block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.

![Example Time 3](../../assets/design/v1/prefix_caching/example-time-3.png)
**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.

![Example Time 4](../../assets/design/v1/prefix_caching/example-time-4.png)
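A sketch of why only full blocks can produce cache hits (toy content matching on raw token tuples; the real design hashes blocks rather than comparing tokens directly, so treat this purely as an illustration):

```python
BLOCK_SIZE = 4

def cached_prefix_blocks(prompt: list[int], cached: set[tuple[int, ...]]) -> int:
    """Count how many leading *full* blocks of `prompt` are already cached."""
    hits = 0
    for start in range(0, len(prompt) - BLOCK_SIZE + 1, BLOCK_SIZE):
        block = tuple(prompt[start:start + BLOCK_SIZE])
        if block not in cached:
            break
        hits += 1
    return hits

request_0 = list(range(15))                        # 3 full blocks + 1 partial block
cache = {tuple(request_0[i:i + BLOCK_SIZE]) for i in range(0, 12, BLOCK_SIZE)}
request_1 = request_0[:10] + [100, 101, 102, 103]  # 14 tokens, first 10 shared with request 0
print(cached_prefix_blocks(request_1, cache))      # 2 -- the 3rd block only matches 2 of 4 tokens
```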
**Time 5: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in the reverse order (but blocks 2 and 3 are still cached). Blocks 0 and 1 are not added to the free queue because they are being used by Request 1.

![Example Time 5](../../assets/design/v1/prefix_caching/example-time-5.png)
**Time 6: Request 1 is finished and free.**

![Example Time 6](../../assets/design/v1/prefix_caching/example-time-6.png)
**Time 7: Request 2 comes in with the 29 prompt tokens, where the first 12 tokens are the same as request 0.** Note that even though the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).

![Example Time 7](../../assets/design/v1/prefix_caching/example-time-7.png)
View File
@ -1,14 +1,14 @@
---
title: Automatic Prefix Caching
---
[](){ #automatic-prefix-caching }

## Introduction

Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.

!!! note
    Technical details on how vLLM implements APC can be found [here][design-automatic-prefix-caching].

## Enabling APC in vLLM
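For example, in offline inference APC is a one-flag change (a minimal sketch; `facebook/opt-125m` is just a stand-in model, and `enable_prefix_caching` is the `vllm.LLM` constructor flag assumed here):

```python
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)

long_document = "..."  # a long shared prefix reused across queries
params = SamplingParams(temperature=0.0, max_tokens=64)

# The second prompt reuses the cached KV of the shared prefix.
outputs = llm.generate(
    [long_document + "\n\nQ: Summarize the key points.",
     long_document + "\n\nQ: List the named entities."],
    params,
)
for out in outputs:
    print(out.outputs[0].text)
```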
View File
@ -0,0 +1,77 @@
---
title: Compatibility Matrix
---
[](){ #compatibility-matrix }
The tables below show which features are mutually exclusive and their support on some hardware.
The symbols used have the following meanings:
- ✅ = Full compatibility
- 🟠 = Partial compatibility
- ❌ = No compatibility
!!! note
    Check the ❌ or 🟠 with links to see the tracking issue for the unsupported feature/hardware combination.
## Feature x Feature
<style>
td:not(:first-child) {
text-align: center !important;
}
td {
padding: 0.5rem !important;
white-space: nowrap;
}
th {
padding: 0.5rem !important;
min-width: 0 !important;
}
th:not(:first-child) {
writing-mode: vertical-lr;
transform: rotate(180deg)
}
</style>
| Feature | [CP][chunked-prefill] | [APC][automatic-prefix-caching] | [LoRA][lora-adapter] | <abbr title="Prompt Adapter">prmpt adptr</abbr> | [SD][spec-decode] | CUDA graph | <abbr title="Pooling Models">pooling</abbr> | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search |
|-----------------------------------------------------------|-------------------------|-----------------------------------|------------------------|---------------------------------------------------|---------------------|--------------|-----------------------------------------------|-------------------------------------------------------|--------------------------------------|---------------------------------------------------|-------------------------------------------------------------|--------------------|---------------------------------------------|-----------|---------------|
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
| [APC][automatic-prefix-caching] | ✅ | ✅ | | | | | | | | | | | | | |
| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | | | | | | | | | | | | |
| <abbr title="Prompt Adapter">prmpt adptr</abbr> | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | |
| [SD][spec-decode] | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | |
| <abbr title="Pooling Models">pooling</abbr> | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [](gh-issue:7366) | ❌ | ❌ | [](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | |
| multi-step | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | |
| <abbr title="Multimodal Inputs">mm</abbr> | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | |
| best-of | ✅ | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ✅ | ✅ | |
| beam-search | ✅ | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ❔ | ✅ | ✅ |
[](){ #feature-x-hardware }
## Feature x Hardware
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD |
|-----------------------------------------------------------|--------------------|----------|----------|-------|----------|--------------------|-------|
| [CP][chunked-prefill] | [](gh-issue:2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [APC][automatic-prefix-caching] | [](gh-issue:3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Prompt Adapter">prmpt adptr</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | [](gh-issue:8475) | ✅ |
| [SD][spec-decode] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| <abbr title="Pooling Models">pooling</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| <abbr title="Multimodal Inputs">mm</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [](gh-issue:8477) | ✅ |
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
View File
@ -1,12 +1,12 @@
---
title: Disaggregated Prefilling (experimental)
---
[](){ #disagg-prefill }

This page introduces the disaggregated prefilling feature in vLLM.

!!! note
    This feature is experimental and subject to change.

## Why disaggregated prefilling?
@ -15,9 +15,8 @@ Two main reasons:
- **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling puts the prefill and decode phases of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. `tp` and `pp`) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT.
- **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size can also achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL.

!!! note
    Disaggregated prefill DOES NOT improve throughput.

## Usage example
@ -39,21 +38,16 @@ Key abstractions for disaggregated prefilling:
- **LookupBuffer**: LookupBuffer provides two APIs: `insert` KV cache and `drop_select` KV cache. The semantics of `insert` and `drop_select` are similar to SQL, where `insert` inserts a KV cache into the buffer, and `drop_select` returns the KV cache that matches the given condition and drops it from the buffer.
- **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports `send_tensor` and `recv_tensor`.

!!! note
    `insert` is a non-blocking operation but `drop_select` is a blocking operation.

Here is a figure illustrating how the above 3 abstractions are organized:
![Disaggregated prefilling abstractions](../assets/features/disagg_prefill/abstraction.jpg)
The workflow of disaggregated prefilling is as follows:

![Disaggregated prefilling workflow](../assets/features/disagg_prefill/overview.jpg)
The `buffer` corresponds to the `insert` API in LookupBuffer, and the `drop_select` corresponds to the `drop_select` API in LookupBuffer.
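To make the two abstractions concrete, here is a schematic of their interfaces as described above (a sketch only; the real classes live in vLLM's KV-transfer code and their exact signatures differ):

```python
from abc import ABC, abstractmethod

import torch

class Pipe(ABC):
    """Single-direction FIFO pipe for tensor transmission."""

    @abstractmethod
    def send_tensor(self, tensor: torch.Tensor) -> None: ...

    @abstractmethod
    def recv_tensor(self) -> torch.Tensor: ...

class LookupBuffer(ABC):
    """KV cache buffer with SQL-like insert / drop_select semantics."""

    @abstractmethod
    def insert(self, input_tokens: torch.Tensor, kv_cache: torch.Tensor) -> None:
        """Non-blocking: store the KV cache for the given prefix of tokens."""

    @abstractmethod
    def drop_select(self, input_tokens: torch.Tensor) -> torch.Tensor:
        """Blocking: return the KV cache matching the given tokens and drop it from the buffer."""
```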
Some files were not shown because too many files have changed in this diff