From 8c853050e7da6a868d7b583a51b94592c86f2d19 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 24 Sep 2025 20:30:33 +0100 Subject: [PATCH] [Docs] Enable `fail_on_warning` for the docs build in CI (#25580) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .readthedocs.yaml | 1 + docs/features/nixl_connector_usage.md | 8 ++-- docs/mkdocs/hooks/generate_argparse.py | 5 +- docs/models/generative_models.md | 2 +- docs/models/supported_models.md | 2 +- docs/usage/README.md | 2 +- .../dashboards/grafana/README.md | 4 +- .../dashboards/perses/README.md | 4 +- vllm/attention/ops/common.py | 36 +++++++------- vllm/inputs/data.py | 4 +- vllm/model_executor/layers/fused_moe/layer.py | 12 ++--- .../model_loader/weight_utils.py | 4 +- vllm/model_executor/models/qwen3_vl.py | 20 ++++---- vllm/model_executor/models/zamba2.py | 5 +- vllm/reasoning/granite_reasoning_parser.py | 10 ++-- vllm/transformers_utils/configs/radio.py | 47 +++++++------------ vllm/transformers_utils/dynamic_module.py | 2 +- vllm/v1/kv_offload/__init__.py | 0 vllm/v1/kv_offload/backends/__init__.py | 0 vllm/v1/kv_offload/worker/__init__.py | 0 20 files changed, 81 insertions(+), 87 deletions(-) create mode 100644 vllm/v1/kv_offload/__init__.py create mode 100644 vllm/v1/kv_offload/backends/__init__.py create mode 100644 vllm/v1/kv_offload/worker/__init__.py diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 4329750090..d83d6df35e 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -13,6 +13,7 @@ build: mkdocs: configuration: mkdocs.yaml + fail_on_warning: true # Optionally declare the Python requirements required to build your docs python: diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md index de50f091df..afecbc8294 100644 --- a/docs/features/nixl_connector_usage.md +++ b/docs/features/nixl_connector_usage.md @@ -9,7 +9,7 @@ NixlConnector is a high-performance KV cache transfer connector for vLLM's disag Install the NIXL library: `uv pip install nixl`, as a quick start. 
- Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
-- The specified required NIXL version can be found in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files
+- The specified required NIXL version can be found in [requirements/kv_connectors.txt](gh-file:requirements/kv_connectors.txt) and other relevant config files

### Transport Configuration

@@ -154,6 +154,6 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \

Refer to these example scripts in the vLLM repository:

-- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
-- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
-- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)
+- [run_accuracy_test.sh](gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
+- [toy_proxy_server.py](gh-file:tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
+- [test_accuracy.py](gh-file:tests/v1/kv_connector/nixl_integration/test_accuracy.py)
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
index 91454ec272..ac70980ac9 100644
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -32,8 +32,9 @@ def auto_mock(module, attr, max_mocks=50):
     for _ in range(max_mocks):
         try:
             # First treat attr as an attr, then as a submodule
-            return getattr(importlib.import_module(module), attr,
-                           importlib.import_module(f"{module}.{attr}"))
+            with patch("importlib.metadata.version", return_value="0.0.0"):
+                return getattr(importlib.import_module(module), attr,
+                               importlib.import_module(f"{module}.{attr}"))
         except importlib.metadata.PackageNotFoundError as e:
             raise e
         except ModuleNotFoundError as e:
diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md
index d02522a665..05f8d16cc4 100644
--- a/docs/models/generative_models.md
+++ b/docs/models/generative_models.md
@@ -4,7 +4,7 @@ vLLM provides first-class support for generative models, which covers most of LL
 In vLLM, generative models implement the [VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface.
 Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
-which are then passed through [Sampler][vllm.model_executor.layers.sampler.Sampler] to obtain the final text.
+which are then passed through [Sampler][vllm.v1.sample.sampler.Sampler] to obtain the final text.

 ## Configuration
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 9d288667a3..81bd12f9a2 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -29,7 +29,7 @@ _*Vision-language models currently accept only image inputs.
Support for video i If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM: -- All the features listed in the [compatibility matrix](../features/compatibility_matrix.md#feature-x-feature) +- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature) - Any combination of the following vLLM parallelisation schemes: - Pipeline parallel - Tensor parallel diff --git a/docs/usage/README.md b/docs/usage/README.md index 83aea12181..0c63d01f0f 100644 --- a/docs/usage/README.md +++ b/docs/usage/README.md @@ -1,6 +1,6 @@ # Using vLLM -First, vLLM must be [installed](../getting_started/installation) for your chosen device in either a Python or Docker environment. +First, vLLM must be [installed](../getting_started/installation/) for your chosen device in either a Python or Docker environment. Then, vLLM supports the following usage patterns: diff --git a/examples/online_serving/dashboards/grafana/README.md b/examples/online_serving/dashboards/grafana/README.md index e42b0f8143..abe5f8cf23 100644 --- a/examples/online_serving/dashboards/grafana/README.md +++ b/examples/online_serving/dashboards/grafana/README.md @@ -11,9 +11,9 @@ vLLM performance and metrics. ## Dashboard Descriptions -- **[performance_statistics.json](./performance_statistics.json)**: Tracks performance metrics including latency and +- **performance_statistics.json**: Tracks performance metrics including latency and throughput for your vLLM service. -- **[query_statistics.json](./query_statistics.json)**: Tracks query performance, request volume, and key +- **query_statistics.json**: Tracks query performance, request volume, and key performance indicators for your vLLM service. ## Deployment Options diff --git a/examples/online_serving/dashboards/perses/README.md b/examples/online_serving/dashboards/perses/README.md index ae04fd17b1..780a6ef13a 100644 --- a/examples/online_serving/dashboards/perses/README.md +++ b/examples/online_serving/dashboards/perses/README.md @@ -21,9 +21,9 @@ deployment methods: ## Dashboard Descriptions -- **[performance_statistics.yaml](./performance_statistics.yaml)**: Performance metrics with aggregated latency +- **performance_statistics.yaml**: Performance metrics with aggregated latency statistics -- **[query_statistics.yaml](./query_statistics.yaml)**: Query performance and deployment metrics +- **query_statistics.yaml**: Query performance and deployment metrics ## Deployment Options diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py index 6253e1e56b..c8efa6e63a 100644 --- a/vllm/attention/ops/common.py +++ b/vllm/attention/ops/common.py @@ -18,12 +18,14 @@ def _correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr, final attention output. 
Args: - output: [ B, H, D ] - lses : [ N, B, H ] - cp, batch, q_heads, v_head_dim - Return: - output: [ B, H, D ] - lse : [ B, H ] + outputs_ptr (triton.PointerType): + Pointer to input tensor of shape [ B, H, D ] + lses_ptr (triton.PointerType): + Pointer to input tensor of shape [ N, B, H ] + new_output_ptr (triton.PointerType): + Pointer to output tensor of shape [ B, H, D ] + vlse_ptr (triton.PointerType): + Pointer to output tensor of shape [ B, H ] """ batch_idx = tl.program_id(axis=0).to(tl.int64) head_idx = tl.program_id(axis=1).to(tl.int64) @@ -81,19 +83,19 @@ class CPTritonContext: self.inner_kernel[grid](*regular_args) -def correct_attn_out(out: torch.Tensor, lses: torch.Tensor, cp_rank: int, - ctx: CPTritonContext): - """ - Apply the all-gathered lses to correct each local rank's attention - output. we still need perform a cross-rank reduction to obtain the - final attention output. +def correct_attn_out( + out: torch.Tensor, lses: torch.Tensor, cp_rank: int, + ctx: CPTritonContext) -> tuple[torch.Tensor, torch.Tensor]: + """Correct the attention output using the all-gathered lses. Args: - output: [ B, H, D ] - lses : [ N, B, H ] - Return: - output: [ B, H, D ] - lse : [ B, H ] + out: Tensor of shape [ B, H, D ] + lses: Tensor of shape [ N, B, H ] + cp_rank: Current rank in the context-parallel group + ctx: Triton context to avoid recompilation + + Returns: + Tuple of (out, lse) with corrected attention and final log-sum-exp. """ if ctx is None: ctx = CPTritonContext() diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 6a005aa634..1718c0767a 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -287,8 +287,8 @@ class EncoderDecoderInputs(TypedDict): SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"] """ -A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be -passed to [`vllm.sequence.Sequence`][]. +A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be +passed to [`Sequence`][collections.abc.Sequence]. """ ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 89e0cee081..b68190e5d1 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -57,7 +57,7 @@ else: FusedMoEPermuteExpertsUnpermute = None # type: ignore FusedMoEPrepareAndFinalize = None # type: ignore - def eplb_map_to_physical_and_record( + def _eplb_map_to_physical_and_record( topk_ids: torch.Tensor, expert_load_view: torch.Tensor, logical_to_physical_map: torch.Tensor, logical_replica_count: torch.Tensor, @@ -65,6 +65,7 @@ else: # CPU fallback: no EPLB so just return as is return topk_ids + eplb_map_to_physical_and_record = _eplb_map_to_physical_and_record if is_rocm_aiter_moe_enabled(): from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 @@ -807,12 +808,11 @@ def maybe_roundup_hidden_size( if necessary. Args: - hidden_size(int): Layer hidden-size + hidden_size: Layer hidden-size act_dtype: Data type of the layer activations. - quant_config(FusedMoEQuantConfig): Fused MoE quantization configuration. - moe_parallel_config(FusedMoEParallelConfig): Fused MoE parallelization - strategy configuration. - + quant_config: Fused MoE quantization configuration. + moe_parallel_config: Fused MoE parallelization strategy configuration. + Return: Rounded up hidden_size if rounding up is required based on the configs. 
Original hidden size otherwise. diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index a72086da18..cad32fee1d 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -13,7 +13,7 @@ from collections import defaultdict from collections.abc import Generator from contextlib import contextmanager from pathlib import Path -from typing import Any, Callable, Optional, Union +from typing import IO, Any, Callable, Optional, Union import filelock import huggingface_hub.constants @@ -102,7 +102,7 @@ def get_lock(model_name_or_path: Union[str, Path], @contextmanager def atomic_writer(filepath: Union[str, Path], mode: str = 'w', - encoding: Optional[str] = None): + encoding: Optional[str] = None) -> Generator[IO]: """ Context manager that provides an atomic file writing routine. diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index ee6703f722..d4f1547fd8 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1445,14 +1445,18 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, **NOTE**: If mrope is enabled (default setting for Qwen3VL opensource models), the shape will be `(3, seq_len)`, otherwise it will be `(seq_len,). - pixel_values: Pixel values to be fed to a model. - `None` if no images are passed. - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. - `None` if no images are passed. - pixel_values_videos: Pixel values of videos to be fed to a model. - `None` if no videos are passed. - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. - `None` if no videos are passed. + intermediate_tensors: Intermediate tensors from previous pipeline + stages. + inputs_embeds: Pre-computed input embeddings. + **kwargs: Additional keyword arguments including: + - pixel_values: Pixel values to be fed to a model. + `None` if no images are passed. + - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in + LLM. `None` if no images are passed. + - pixel_values_videos: Pixel values of videos to be fed to a + model. `None` if no videos are passed. + - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in + LLM. `None` if no videos are passed. """ if intermediate_tensors is not None: diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index a0d93045b7..1d68320bd9 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -944,11 +944,10 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): hidden_states: torch.Tensor, ) -> Optional[torch.Tensor]: """Compute logits for next token prediction. - + Args: hidden_states: Hidden states from model forward pass - sampling_metadata: Metadata for sampling process - + Returns: Logits for next token prediction """ diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py index 5820001b91..212e14b092 100644 --- a/vllm/reasoning/granite_reasoning_parser.py +++ b/vllm/reasoning/granite_reasoning_parser.py @@ -278,11 +278,11 @@ class GraniteReasoningParser(ReasoningParser): content and normal (response) content. Args: - delta_text (str): Text to consider and parse content from. - reasoning_content (str): reasoning content from current_text. - response_content (str): response content from current_text. - current_text (str): The full previous + delta text. 
-            response_seq_len(str): Len of the complete response sequence used.
+            delta_text: Text to consider and parse content from.
+            reasoning_content: reasoning content from current_text.
+            response_content: response content from current_text.
+            current_text: The full previous + delta text.
+            response_seq_len: Length of the complete response sequence used.

         Returns:
             DeltaMessage: Message containing the parsed content.
diff --git a/vllm/transformers_utils/configs/radio.py b/vllm/transformers_utils/configs/radio.py
index 58ad7b8187..e1d96294d6 100644
--- a/vllm/transformers_utils/configs/radio.py
+++ b/vllm/transformers_utils/configs/radio.py
@@ -27,36 +27,23 @@ class RadioConfig(PretrainedConfig):
     specified arguments, defining the model architecture.

     Args:
-        model_name (`str`, *optional*, defaults to "vit_base_patch16_224"):
-            Name of the vision transformer model (e.g., "vit_base_patch16_224").
-            Used to determine architecture dimensions from
-            `VIT_TIMM_DIM_BY_NAME`.
-        image_size (`int`, *optional*, defaults to 224):
-            The size (resolution) of each image.
-        patch_size (`int`, *optional*, defaults to 16):
-            The size (resolution) of each patch.
-        qkv_bias (`bool`, *optional*, defaults to True):
-            Whether to add a bias to the queries, keys and values.
-        qk_normalization (`bool`, *optional*, defaults to False):
-            Whether to apply normalization to queries and keys.
-        norm_type (`str`, *optional*, defaults to "layer_norm"):
-            The normalization type to use.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
-            The epsilon used by the layer normalization layers.
-        initializer_factor (`float`, *optional*, defaults to 1.0):
-            A factor for initializing all weight matrices.
-        hidden_act (`str`, *optional*, defaults to "gelu"):
-            The non-linear activation function in the encoder.
-        max_img_size (`int`, *optional*, defaults to 2048):
-            Maximum image size for position embeddings.
-        norm_mean (`tuple` or `list`, *optional*,
-            defaults to (0.48145466, 0.4578275, 0.40821073)):
-            Mean values for image normalization (RGB channels).
-        norm_std (`tuple` or `list`, *optional*,
-            defaults to (0.26862954, 0.26130258, 0.27577711)):
-            Standard deviation values for image normalization (RGB channels).
-        reg_tokens (`int`, *optional*):
-            Number of register tokens to use.
+        model_name: Name of the vision transformer model
+            (e.g., "vit_base_patch16_224"). Used to determine architecture
+            dimensions from `VIT_TIMM_DIM_BY_NAME`.
+        image_size: The size (resolution) of each image.
+        patch_size: The size (resolution) of each patch.
+        qkv_bias: Whether to add a bias to the queries, keys and values.
+        qk_normalization: Whether to apply normalization to queries and keys.
+        norm_type: The normalization type to use.
+        layer_norm_eps: The epsilon used by the layer normalization layers.
+        initializer_factor: A factor for initializing all weight matrices.
+        hidden_act: The non-linear activation function in the encoder.
+        max_img_size: Maximum image size for position embeddings.
+        norm_mean: Mean values for image normalization (RGB channels).
+            Defaults to (0.48145466, 0.4578275, 0.40821073).
+        norm_std: Standard deviation values for image normalization
+            (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711).
+        reg_tokens: Number of register tokens to use.
""" model_type = "radio" diff --git a/vllm/transformers_utils/dynamic_module.py b/vllm/transformers_utils/dynamic_module.py index 05191f9521..3c273ad41d 100644 --- a/vllm/transformers_utils/dynamic_module.py +++ b/vllm/transformers_utils/dynamic_module.py @@ -27,7 +27,7 @@ def try_get_class_from_dynamic_module( **kwargs, ) -> Optional[type]: """ - As [transformers.dynamic_module_utils.get_class_from_dynamic_module][], + As `transformers.dynamic_module_utils.get_class_from_dynamic_module`, but ignoring any errors. """ try: diff --git a/vllm/v1/kv_offload/__init__.py b/vllm/v1/kv_offload/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/vllm/v1/kv_offload/backends/__init__.py b/vllm/v1/kv_offload/backends/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/vllm/v1/kv_offload/worker/__init__.py b/vllm/v1/kv_offload/worker/__init__.py new file mode 100644 index 0000000000..e69de29bb2