From 8c853050e7da6a868d7b583a51b94592c86f2d19 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 24 Sep 2025 20:30:33 +0100 Subject: [PATCH] [Docs] Enable `fail_on_warning` for the docs build in CI (#25580) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .readthedocs.yaml | 1 + docs/features/nixl_connector_usage.md | 8 ++-- docs/mkdocs/hooks/generate_argparse.py | 5 +- docs/models/generative_models.md | 2 +- docs/models/supported_models.md | 2 +- docs/usage/README.md | 2 +- .../dashboards/grafana/README.md | 4 +- .../dashboards/perses/README.md | 4 +- vllm/attention/ops/common.py | 36 +++++++------- vllm/inputs/data.py | 4 +- vllm/model_executor/layers/fused_moe/layer.py | 12 ++--- .../model_loader/weight_utils.py | 4 +- vllm/model_executor/models/qwen3_vl.py | 20 ++++---- vllm/model_executor/models/zamba2.py | 5 +- vllm/reasoning/granite_reasoning_parser.py | 10 ++-- vllm/transformers_utils/configs/radio.py | 47 +++++++------------ vllm/transformers_utils/dynamic_module.py | 2 +- vllm/v1/kv_offload/__init__.py | 0 vllm/v1/kv_offload/backends/__init__.py | 0 vllm/v1/kv_offload/worker/__init__.py | 0 20 files changed, 81 insertions(+), 87 deletions(-) create mode 100644 vllm/v1/kv_offload/__init__.py create mode 100644 vllm/v1/kv_offload/backends/__init__.py create mode 100644 vllm/v1/kv_offload/worker/__init__.py diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 4329750090..d83d6df35e 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -13,6 +13,7 @@ build: mkdocs: configuration: mkdocs.yaml + fail_on_warning: true # Optionally declare the Python requirements required to build your docs python: diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md index de50f091df..afecbc8294 100644 --- a/docs/features/nixl_connector_usage.md +++ b/docs/features/nixl_connector_usage.md @@ -9,7 +9,7 @@ NixlConnector is a high-performance KV cache transfer connector for vLLM's disag Install the NIXL library: `uv pip install nixl`, as a quick start. 
- Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
-- The specified required NIXL version can be found in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files
+- The specified required NIXL version can be found in [requirements/kv_connectors.txt](gh-file:requirements/kv_connectors.txt) and other relevant config files

### Transport Configuration

@@ -154,6 +154,6 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \

Refer to these example scripts in the vLLM repository:

-- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
-- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
-- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)
+- [run_accuracy_test.sh](gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
+- [toy_proxy_server.py](gh-file:tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
+- [test_accuracy.py](gh-file:tests/v1/kv_connector/nixl_integration/test_accuracy.py)
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
index 91454ec272..ac70980ac9 100644
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -32,8 +32,9 @@ def auto_mock(module, attr, max_mocks=50):
     for _ in range(max_mocks):
         try:
             # First treat attr as an attr, then as a submodule
-            return getattr(importlib.import_module(module), attr,
-                           importlib.import_module(f"{module}.{attr}"))
+            with patch("importlib.metadata.version", return_value="0.0.0"):
+                return getattr(importlib.import_module(module), attr,
+                               importlib.import_module(f"{module}.{attr}"))
         except importlib.metadata.PackageNotFoundError as e:
             raise e
         except ModuleNotFoundError as e:
diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md
index d02522a665..05f8d16cc4 100644
--- a/docs/models/generative_models.md
+++ b/docs/models/generative_models.md
@@ -4,7 +4,7 @@ vLLM provides first-class support for generative models, which covers most of LL
 In vLLM, generative models implement the [VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface.
 Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
-which are then passed through [Sampler][vllm.model_executor.layers.sampler.Sampler] to obtain the final text.
+which are then passed through [Sampler][vllm.v1.sample.sampler.Sampler] to obtain the final text.

 ## Configuration
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 9d288667a3..81bd12f9a2 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -29,7 +29,7 @@ _*Vision-language models currently accept only image inputs.
Support for video i If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM: -- All the features listed in the [compatibility matrix](../features/compatibility_matrix.md#feature-x-feature) +- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature) - Any combination of the following vLLM parallelisation schemes: - Pipeline parallel - Tensor parallel diff --git a/docs/usage/README.md b/docs/usage/README.md index 83aea12181..0c63d01f0f 100644 --- a/docs/usage/README.md +++ b/docs/usage/README.md @@ -1,6 +1,6 @@ # Using vLLM -First, vLLM must be [installed](../getting_started/installation) for your chosen device in either a Python or Docker environment. +First, vLLM must be [installed](../getting_started/installation/) for your chosen device in either a Python or Docker environment. Then, vLLM supports the following usage patterns: diff --git a/examples/online_serving/dashboards/grafana/README.md b/examples/online_serving/dashboards/grafana/README.md index e42b0f8143..abe5f8cf23 100644 --- a/examples/online_serving/dashboards/grafana/README.md +++ b/examples/online_serving/dashboards/grafana/README.md @@ -11,9 +11,9 @@ vLLM performance and metrics. ## Dashboard Descriptions -- **[performance_statistics.json](./performance_statistics.json)**: Tracks performance metrics including latency and +- **performance_statistics.json**: Tracks performance metrics including latency and throughput for your vLLM service. -- **[query_statistics.json](./query_statistics.json)**: Tracks query performance, request volume, and key +- **query_statistics.json**: Tracks query performance, request volume, and key performance indicators for your vLLM service. ## Deployment Options diff --git a/examples/online_serving/dashboards/perses/README.md b/examples/online_serving/dashboards/perses/README.md index ae04fd17b1..780a6ef13a 100644 --- a/examples/online_serving/dashboards/perses/README.md +++ b/examples/online_serving/dashboards/perses/README.md @@ -21,9 +21,9 @@ deployment methods: ## Dashboard Descriptions -- **[performance_statistics.yaml](./performance_statistics.yaml)**: Performance metrics with aggregated latency +- **performance_statistics.yaml**: Performance metrics with aggregated latency statistics -- **[query_statistics.yaml](./query_statistics.yaml)**: Query performance and deployment metrics +- **query_statistics.yaml**: Query performance and deployment metrics ## Deployment Options diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py index 6253e1e56b..c8efa6e63a 100644 --- a/vllm/attention/ops/common.py +++ b/vllm/attention/ops/common.py @@ -18,12 +18,14 @@ def _correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr, final attention output. 
Args: - output: [ B, H, D ] - lses : [ N, B, H ] - cp, batch, q_heads, v_head_dim - Return: - output: [ B, H, D ] - lse : [ B, H ] + outputs_ptr (triton.PointerType): + Pointer to input tensor of shape [ B, H, D ] + lses_ptr (triton.PointerType): + Pointer to input tensor of shape [ N, B, H ] + new_output_ptr (triton.PointerType): + Pointer to output tensor of shape [ B, H, D ] + vlse_ptr (triton.PointerType): + Pointer to output tensor of shape [ B, H ] """ batch_idx = tl.program_id(axis=0).to(tl.int64) head_idx = tl.program_id(axis=1).to(tl.int64) @@ -81,19 +83,19 @@ class CPTritonContext: self.inner_kernel[grid](*regular_args) -def correct_attn_out(out: torch.Tensor, lses: torch.Tensor, cp_rank: int, - ctx: CPTritonContext): - """ - Apply the all-gathered lses to correct each local rank's attention - output. we still need perform a cross-rank reduction to obtain the - final attention output. +def correct_attn_out( + out: torch.Tensor, lses: torch.Tensor, cp_rank: int, + ctx: CPTritonContext) -> tuple[torch.Tensor, torch.Tensor]: + """Correct the attention output using the all-gathered lses. Args: - output: [ B, H, D ] - lses : [ N, B, H ] - Return: - output: [ B, H, D ] - lse : [ B, H ] + out: Tensor of shape [ B, H, D ] + lses: Tensor of shape [ N, B, H ] + cp_rank: Current rank in the context-parallel group + ctx: Triton context to avoid recompilation + + Returns: + Tuple of (out, lse) with corrected attention and final log-sum-exp. """ if ctx is None: ctx = CPTritonContext() diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 6a005aa634..1718c0767a 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -287,8 +287,8 @@ class EncoderDecoderInputs(TypedDict): SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"] """ -A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be -passed to [`vllm.sequence.Sequence`][]. +A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be +passed to [`Sequence`][collections.abc.Sequence]. """ ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 89e0cee081..b68190e5d1 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -57,7 +57,7 @@ else: FusedMoEPermuteExpertsUnpermute = None # type: ignore FusedMoEPrepareAndFinalize = None # type: ignore - def eplb_map_to_physical_and_record( + def _eplb_map_to_physical_and_record( topk_ids: torch.Tensor, expert_load_view: torch.Tensor, logical_to_physical_map: torch.Tensor, logical_replica_count: torch.Tensor, @@ -65,6 +65,7 @@ else: # CPU fallback: no EPLB so just return as is return topk_ids + eplb_map_to_physical_and_record = _eplb_map_to_physical_and_record if is_rocm_aiter_moe_enabled(): from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 @@ -807,12 +808,11 @@ def maybe_roundup_hidden_size( if necessary. Args: - hidden_size(int): Layer hidden-size + hidden_size: Layer hidden-size act_dtype: Data type of the layer activations. - quant_config(FusedMoEQuantConfig): Fused MoE quantization configuration. - moe_parallel_config(FusedMoEParallelConfig): Fused MoE parallelization - strategy configuration. - + quant_config: Fused MoE quantization configuration. + moe_parallel_config: Fused MoE parallelization strategy configuration. + Return: Rounded up hidden_size if rounding up is required based on the configs. 
Original hidden size otherwise. diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index a72086da18..cad32fee1d 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -13,7 +13,7 @@ from collections import defaultdict from collections.abc import Generator from contextlib import contextmanager from pathlib import Path -from typing import Any, Callable, Optional, Union +from typing import IO, Any, Callable, Optional, Union import filelock import huggingface_hub.constants @@ -102,7 +102,7 @@ def get_lock(model_name_or_path: Union[str, Path], @contextmanager def atomic_writer(filepath: Union[str, Path], mode: str = 'w', - encoding: Optional[str] = None): + encoding: Optional[str] = None) -> Generator[IO]: """ Context manager that provides an atomic file writing routine. diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index ee6703f722..d4f1547fd8 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1445,14 +1445,18 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, **NOTE**: If mrope is enabled (default setting for Qwen3VL opensource models), the shape will be `(3, seq_len)`, otherwise it will be `(seq_len,). - pixel_values: Pixel values to be fed to a model. - `None` if no images are passed. - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. - `None` if no images are passed. - pixel_values_videos: Pixel values of videos to be fed to a model. - `None` if no videos are passed. - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. - `None` if no videos are passed. + intermediate_tensors: Intermediate tensors from previous pipeline + stages. + inputs_embeds: Pre-computed input embeddings. + **kwargs: Additional keyword arguments including: + - pixel_values: Pixel values to be fed to a model. + `None` if no images are passed. + - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in + LLM. `None` if no images are passed. + - pixel_values_videos: Pixel values of videos to be fed to a + model. `None` if no videos are passed. + - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in + LLM. `None` if no videos are passed. """ if intermediate_tensors is not None: diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index a0d93045b7..1d68320bd9 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -944,11 +944,10 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): hidden_states: torch.Tensor, ) -> Optional[torch.Tensor]: """Compute logits for next token prediction. - + Args: hidden_states: Hidden states from model forward pass - sampling_metadata: Metadata for sampling process - + Returns: Logits for next token prediction """ diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py index 5820001b91..212e14b092 100644 --- a/vllm/reasoning/granite_reasoning_parser.py +++ b/vllm/reasoning/granite_reasoning_parser.py @@ -278,11 +278,11 @@ class GraniteReasoningParser(ReasoningParser): content and normal (response) content. Args: - delta_text (str): Text to consider and parse content from. - reasoning_content (str): reasoning content from current_text. - response_content (str): response content from current_text. - current_text (str): The full previous + delta text. 
-            response_seq_len(str): Len of the complete response sequence used.
+            delta_text: Text to consider and parse content from.
+            reasoning_content: reasoning content from current_text.
+            response_content: response content from current_text.
+            current_text: The full previous + delta text.
+            response_seq_len: Length of the complete response sequence used.

         Returns:
             DeltaMessage: Message containing the parsed content.
diff --git a/vllm/transformers_utils/configs/radio.py b/vllm/transformers_utils/configs/radio.py
index 58ad7b8187..e1d96294d6 100644
--- a/vllm/transformers_utils/configs/radio.py
+++ b/vllm/transformers_utils/configs/radio.py
@@ -27,36 +27,23 @@ class RadioConfig(PretrainedConfig):
     specified arguments, defining the model architecture.

     Args:
-        model_name (`str`, *optional*, defaults to "vit_base_patch16_224"):
-            Name of the vision transformer model (e.g., "vit_base_patch16_224").
-            Used to determine architecture dimensions from
-            `VIT_TIMM_DIM_BY_NAME`.
-        image_size (`int`, *optional*, defaults to 224):
-            The size (resolution) of each image.
-        patch_size (`int`, *optional*, defaults to 16):
-            The size (resolution) of each patch.
-        qkv_bias (`bool`, *optional*, defaults to True):
-            Whether to add a bias to the queries, keys and values.
-        qk_normalization (`bool`, *optional*, defaults to False):
-            Whether to apply normalization to queries and keys.
-        norm_type (`str`, *optional*, defaults to "layer_norm"):
-            The normalization type to use.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
-            The epsilon used by the layer normalization layers.
-        initializer_factor (`float`, *optional*, defaults to 1.0):
-            A factor for initializing all weight matrices.
-        hidden_act (`str`, *optional*, defaults to "gelu"):
-            The non-linear activation function in the encoder.
-        max_img_size (`int`, *optional*, defaults to 2048):
-            Maximum image size for position embeddings.
-        norm_mean (`tuple` or `list`, *optional*,
-            defaults to (0.48145466, 0.4578275, 0.40821073)):
-            Mean values for image normalization (RGB channels).
-        norm_std (`tuple` or `list`, *optional*,
-            defaults to (0.26862954, 0.26130258, 0.27577711)):
-            Standard deviation values for image normalization (RGB channels).
-        reg_tokens (`int`, *optional*):
-            Number of register tokens to use.
+        model_name: Name of the vision transformer model
+            (e.g., "vit_base_patch16_224"). Used to determine architecture
+            dimensions from `VIT_TIMM_DIM_BY_NAME`.
+        image_size: The size (resolution) of each image.
+        patch_size: The size (resolution) of each patch.
+        qkv_bias: Whether to add a bias to the queries, keys and values.
+        qk_normalization: Whether to apply normalization to queries and keys.
+        norm_type: The normalization type to use.
+        layer_norm_eps: The epsilon used by the layer normalization layers.
+        initializer_factor: A factor for initializing all weight matrices.
+        hidden_act: The non-linear activation function in the encoder.
+        max_img_size: Maximum image size for position embeddings.
+        norm_mean: Mean values for image normalization (RGB channels).
+            Defaults to (0.48145466, 0.4578275, 0.40821073).
+        norm_std: Standard deviation values for image normalization
+            (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711).
+        reg_tokens: Number of register tokens to use.
""" model_type = "radio" diff --git a/vllm/transformers_utils/dynamic_module.py b/vllm/transformers_utils/dynamic_module.py index 05191f9521..3c273ad41d 100644 --- a/vllm/transformers_utils/dynamic_module.py +++ b/vllm/transformers_utils/dynamic_module.py @@ -27,7 +27,7 @@ def try_get_class_from_dynamic_module( **kwargs, ) -> Optional[type]: """ - As [transformers.dynamic_module_utils.get_class_from_dynamic_module][], + As `transformers.dynamic_module_utils.get_class_from_dynamic_module`, but ignoring any errors. """ try: diff --git a/vllm/v1/kv_offload/__init__.py b/vllm/v1/kv_offload/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/vllm/v1/kv_offload/backends/__init__.py b/vllm/v1/kv_offload/backends/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/vllm/v1/kv_offload/worker/__init__.py b/vllm/v1/kv_offload/worker/__init__.py new file mode 100644 index 0000000000..e69de29bb2