Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)

[Docs] Enable fail_on_warning for the docs build in CI (#25580)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
@@ -13,6 +13,7 @@ build:
 
 mkdocs:
   configuration: mkdocs.yaml
+  fail_on_warning: true
 
 # Optionally declare the Python requirements required to build your docs
 python:
@@ -9,7 +9,7 @@ NixlConnector is a high-performance KV cache transfer connector for vLLM's disag
 Install the NIXL library: `uv pip install nixl`, as a quick start.
 
 - Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
-- The specified required NIXL version can be found in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files
+- The specified required NIXL version can be found in [requirements/kv_connectors.txt](gh-file:requirements/kv_connectors.txt) and other relevant config files
 
 ### Transport Configuration
@@ -154,6 +154,6 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
 
 Refer to these example scripts in the vLLM repository:
 
-- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
-- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
-- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)
+- [run_accuracy_test.sh](gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
+- [toy_proxy_server.py](gh-file:tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
+- [test_accuracy.py](gh-file:tests/v1/kv_connector/nixl_integration/test_accuracy.py)
@@ -32,8 +32,9 @@ def auto_mock(module, attr, max_mocks=50):
     for _ in range(max_mocks):
         try:
             # First treat attr as an attr, then as a submodule
-            return getattr(importlib.import_module(module), attr,
-                           importlib.import_module(f"{module}.{attr}"))
+            with patch("importlib.metadata.version", return_value="0.0.0"):
+                return getattr(importlib.import_module(module), attr,
+                               importlib.import_module(f"{module}.{attr}"))
         except importlib.metadata.PackageNotFoundError as e:
             raise e
         except ModuleNotFoundError as e:
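For context, the `patch` added above guards against packages that are only mocked during the docs build: `importlib.metadata.version()` raises `PackageNotFoundError` for anything that is not actually installed. A minimal, self-contained sketch of the same trick (the helper name and the dummy version string are illustrative, not part of the hook):

```python
# Illustrative only: patching importlib.metadata.version avoids
# PackageNotFoundError for packages that are mocked rather than installed.
import importlib.metadata
from unittest.mock import patch


def version_or_dummy(dist_name: str) -> str:
    """Hypothetical helper: real version if installed, else a dummy one."""
    try:
        return importlib.metadata.version(dist_name)
    except importlib.metadata.PackageNotFoundError:
        # Same fallback idea as the docs hook: pretend a version exists.
        with patch("importlib.metadata.version", return_value="0.0.0"):
            return importlib.metadata.version(dist_name)


print(version_or_dummy("definitely-not-installed-package"))  # -> 0.0.0
```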
@@ -4,7 +4,7 @@ vLLM provides first-class support for generative models, which covers most of LL
 
 In vLLM, generative models implement the [VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface.
 Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
-which are then passed through [Sampler][vllm.model_executor.layers.sampler.Sampler] to obtain the final text.
+which are then passed through [Sampler][vllm.v1.sample.sampler.Sampler] to obtain the final text.
 
 ## Configuration
@@ -29,7 +29,7 @@ _*Vision-language models currently accept only image inputs. Support for video i
 
 If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM:
 
-- All the features listed in the [compatibility matrix](../features/compatibility_matrix.md#feature-x-feature)
+- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
 - Any combination of the following vLLM parallelisation schemes:
   - Pipeline parallel
   - Tensor parallel
@@ -1,6 +1,6 @@
 # Using vLLM
 
-First, vLLM must be [installed](../getting_started/installation) for your chosen device in either a Python or Docker environment.
+First, vLLM must be [installed](../getting_started/installation/) for your chosen device in either a Python or Docker environment.
 
 Then, vLLM supports the following usage patterns:
@@ -11,9 +11,9 @@ vLLM performance and metrics.
 
 ## Dashboard Descriptions
 
-- **[performance_statistics.json](./performance_statistics.json)**: Tracks performance metrics including latency and
+- **performance_statistics.json**: Tracks performance metrics including latency and
   throughput for your vLLM service.
-- **[query_statistics.json](./query_statistics.json)**: Tracks query performance, request volume, and key
+- **query_statistics.json**: Tracks query performance, request volume, and key
   performance indicators for your vLLM service.
 
 ## Deployment Options
@@ -21,9 +21,9 @@ deployment methods:
 
 ## Dashboard Descriptions
 
-- **[performance_statistics.yaml](./performance_statistics.yaml)**: Performance metrics with aggregated latency
+- **performance_statistics.yaml**: Performance metrics with aggregated latency
   statistics
-- **[query_statistics.yaml](./query_statistics.yaml)**: Query performance and deployment metrics
+- **query_statistics.yaml**: Query performance and deployment metrics
 
 ## Deployment Options
@@ -18,12 +18,14 @@ def _correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr,
     final attention output.
 
     Args:
-        output: [ B, H, D ]
-        lses : [ N, B, H ]
-        cp, batch, q_heads, v_head_dim
-    Return:
-        output: [ B, H, D ]
-        lse : [ B, H ]
+        outputs_ptr (triton.PointerType):
+            Pointer to input tensor of shape [ B, H, D ]
+        lses_ptr (triton.PointerType):
+            Pointer to input tensor of shape [ N, B, H ]
+        new_output_ptr (triton.PointerType):
+            Pointer to output tensor of shape [ B, H, D ]
+        vlse_ptr (triton.PointerType):
+            Pointer to output tensor of shape [ B, H ]
     """
     batch_idx = tl.program_id(axis=0).to(tl.int64)
     head_idx = tl.program_id(axis=1).to(tl.int64)
@@ -81,19 +83,19 @@ class CPTritonContext:
         self.inner_kernel[grid](*regular_args)
 
 
-def correct_attn_out(out: torch.Tensor, lses: torch.Tensor, cp_rank: int,
-                     ctx: CPTritonContext):
-    """
-    Apply the all-gathered lses to correct each local rank's attention
-    output. we still need perform a cross-rank reduction to obtain the
-    final attention output.
+def correct_attn_out(
+        out: torch.Tensor, lses: torch.Tensor, cp_rank: int,
+        ctx: CPTritonContext) -> tuple[torch.Tensor, torch.Tensor]:
+    """Correct the attention output using the all-gathered lses.
 
     Args:
-        output: [ B, H, D ]
-        lses : [ N, B, H ]
-    Return:
-        output: [ B, H, D ]
-        lse : [ B, H ]
+        out: Tensor of shape [ B, H, D ]
+        lses: Tensor of shape [ N, B, H ]
+        cp_rank: Current rank in the context-parallel group
+        ctx: Triton context to avoid recompilation
+
+    Returns:
+        Tuple of (out, lse) with corrected attention and final log-sum-exp.
     """
     if ctx is None:
         ctx = CPTritonContext()
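The rewritten docstring describes the standard log-sum-exp recombination used for context-parallel attention. As a reference for readers, here is a plain-PyTorch sketch of that weighting (shapes follow the docstring; this is a numerical illustration, not the Triton kernel, and it performs the full cross-rank sum, whereas the function above only rescales the local rank's output before the cross-rank reduction):

```python
# Reference sketch (not the Triton kernel): combine per-rank partial attention
# outputs using their log-sum-exp values, following the docstring's shapes.
import torch


def correct_attn_out_reference(outs: torch.Tensor,  # [N, B, H, D] stacked per-rank outputs
                               lses: torch.Tensor,  # [N, B, H] per-rank log-sum-exps
                               ) -> tuple[torch.Tensor, torch.Tensor]:
    lse = torch.logsumexp(lses, dim=0)            # [B, H] combined log-sum-exp
    weights = torch.exp(lses - lse.unsqueeze(0))  # [N, B, H] per-rank weights
    out = (outs * weights.unsqueeze(-1)).sum(0)   # [B, H, D] corrected output
    return out, lse


outs = torch.randn(2, 4, 8, 16)   # N=2 ranks, B=4, H=8, D=16
lses = torch.randn(2, 4, 8)
out, lse = correct_attn_out_reference(outs, lses)
print(out.shape, lse.shape)       # torch.Size([4, 8, 16]) torch.Size([4, 8])
```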
@@ -288,7 +288,7 @@ class EncoderDecoderInputs(TypedDict):
 SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
 """
 A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be
-passed to [`vllm.sequence.Sequence`][].
+passed to [`Sequence`][collections.abc.Sequence].
 """
 
 ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
@@ -57,7 +57,7 @@ else:
     FusedMoEPermuteExpertsUnpermute = None  # type: ignore
     FusedMoEPrepareAndFinalize = None  # type: ignore
 
-    def eplb_map_to_physical_and_record(
+    def _eplb_map_to_physical_and_record(
             topk_ids: torch.Tensor, expert_load_view: torch.Tensor,
             logical_to_physical_map: torch.Tensor,
             logical_replica_count: torch.Tensor,
@@ -65,6 +65,7 @@ else:
         # CPU fallback: no EPLB so just return as is
         return topk_ids
 
+    eplb_map_to_physical_and_record = _eplb_map_to_physical_and_record
 
 if is_rocm_aiter_moe_enabled():
     from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
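For readers unfamiliar with EPLB: the function aliased above maps each selected logical expert to one of its physical replicas and records load. A purely conceptual sketch of that mapping follows; the round-robin replica choice and tensor shapes are assumptions made for illustration and are not vLLM's actual implementation:

```python
# Conceptual sketch only: map logical expert ids to physical replica ids.
import torch


def map_to_physical_sketch(topk_ids: torch.Tensor,                 # [T, K] logical ids
                           logical_to_physical_map: torch.Tensor,  # [E, R] physical ids
                           logical_replica_count: torch.Tensor,    # [E] replicas per expert
                           ) -> torch.Tensor:
    token_idx = torch.arange(topk_ids.shape[0]).unsqueeze(1)       # [T, 1]
    replica = token_idx % logical_replica_count[topk_ids]          # [T, K] replica choice
    return logical_to_physical_map[topk_ids, replica]              # [T, K] physical ids


topk_ids = torch.tensor([[0, 2], [1, 2]])
l2p = torch.tensor([[10, 11], [20, 20], [30, 31]])   # expert 1 has a single replica
replicas = torch.tensor([2, 1, 2])
print(map_to_physical_sketch(topk_ids, l2p, replicas))  # tensor([[10, 30], [20, 31]])
```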
@@ -807,11 +808,10 @@ def maybe_roundup_hidden_size(
     if necessary.
 
     Args:
-        hidden_size(int): Layer hidden-size
+        hidden_size: Layer hidden-size
         act_dtype: Data type of the layer activations.
-        quant_config(FusedMoEQuantConfig): Fused MoE quantization configuration.
-        moe_parallel_config(FusedMoEParallelConfig): Fused MoE parallelization
-            strategy configuration.
+        quant_config: Fused MoE quantization configuration.
+        moe_parallel_config: Fused MoE parallelization strategy configuration.
 
     Return:
         Rounded up hidden_size if rounding up is required based on the configs.
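To make the docstring concrete: "rounding up the hidden size" is the usual align-to-a-multiple computation. The multiple itself depends on `act_dtype`, `quant_config` and `moe_parallel_config`, so the value used below is purely illustrative:

```python
# Illustrative helper: round a hidden size up to a required multiple. The
# multiple (128 here) is an arbitrary example; vLLM derives the real value
# from the activation dtype, quantization config and MoE parallel config.
def round_up_hidden_size(hidden_size: int, multiple: int) -> int:
    return (hidden_size + multiple - 1) // multiple * multiple


assert round_up_hidden_size(4096, 128) == 4096  # already aligned
assert round_up_hidden_size(1000, 128) == 1024  # rounded up to the next multiple
```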
@@ -13,7 +13,7 @@ from collections import defaultdict
 from collections.abc import Generator
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Any, Callable, Optional, Union
+from typing import IO, Any, Callable, Optional, Union
 
 import filelock
 import huggingface_hub.constants
@@ -102,7 +102,7 @@ def get_lock(model_name_or_path: Union[str, Path],
 @contextmanager
 def atomic_writer(filepath: Union[str, Path],
                   mode: str = 'w',
-                  encoding: Optional[str] = None):
+                  encoding: Optional[str] = None) -> Generator[IO]:
     """
     Context manager that provides an atomic file writing routine.
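The new `Generator[IO]` annotation documents that the context manager yields an open file object. For reference, here is a self-contained sketch of the atomic-write pattern such a helper implements (the generic write-to-temp-then-replace idiom, not a copy of vLLM's function):

```python
# Illustrative sketch of an atomic writer (not copied from vLLM): write to a
# temporary file in the same directory, then atomically replace the target.
import os
import tempfile
from collections.abc import Generator
from contextlib import contextmanager
from pathlib import Path
from typing import IO, Optional, Union


@contextmanager
def atomic_writer_sketch(filepath: Union[str, Path],
                         mode: str = "w",
                         encoding: Optional[str] = None) -> Generator[IO]:
    target = Path(filepath)
    fd, tmp_path = tempfile.mkstemp(dir=target.parent, suffix=".tmp")
    try:
        with os.fdopen(fd, mode, encoding=encoding) as f:
            yield f
        os.replace(tmp_path, target)  # atomic rename within the same filesystem
    except BaseException:
        os.unlink(tmp_path)           # drop the partial file on any failure
        raise


with atomic_writer_sketch("example.json") as f:
    f.write('{"ok": true}\n')
```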
@@ -1445,14 +1445,18 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                 **NOTE**: If mrope is enabled (default setting for Qwen3VL
                 opensource models), the shape will be `(3, seq_len)`,
                 otherwise it will be `(seq_len,).
-            pixel_values: Pixel values to be fed to a model.
-                `None` if no images are passed.
-            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
-                `None` if no images are passed.
-            pixel_values_videos: Pixel values of videos to be fed to a model.
-                `None` if no videos are passed.
-            video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
-                `None` if no videos are passed.
+            intermediate_tensors: Intermediate tensors from previous pipeline
+                stages.
+            inputs_embeds: Pre-computed input embeddings.
+            **kwargs: Additional keyword arguments including:
+                - pixel_values: Pixel values to be fed to a model.
+                  `None` if no images are passed.
+                - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in
+                  LLM. `None` if no images are passed.
+                - pixel_values_videos: Pixel values of videos to be fed to a
+                  model. `None` if no videos are passed.
+                - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in
+                  LLM. `None` if no videos are passed.
         """
 
         if intermediate_tensors is not None:
@@ -947,7 +947,6 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
 
         Args:
             hidden_states: Hidden states from model forward pass
-            sampling_metadata: Metadata for sampling process
 
         Returns:
            Logits for next token prediction
@@ -278,11 +278,11 @@ class GraniteReasoningParser(ReasoningParser):
         content and normal (response) content.
 
         Args:
-            delta_text (str): Text to consider and parse content from.
-            reasoning_content (str): reasoning content from current_text.
-            response_content (str): response content from current_text.
-            current_text (str): The full previous + delta text.
-            response_seq_len(str): Len of the complete response sequence used.
+            delta_text: Text to consider and parse content from.
+            reasoning_content: reasoning content from current_text.
+            response_content: response content from current_text.
+            current_text: The full previous + delta text.
+            response_seq_len: Len of the complete response sequence used.
 
         Returns:
             DeltaMessage: Message containing the parsed content.
@@ -27,36 +27,23 @@ class RadioConfig(PretrainedConfig):
     specified arguments, defining the model architecture.
 
     Args:
-        model_name (`str`, *optional*, defaults to "vit_base_patch16_224"):
-            Name of the vision transformer model (e.g., "vit_base_patch16_224").
-            Used to determine architecture dimensions from
-            `VIT_TIMM_DIM_BY_NAME`.
-        image_size (`int`, *optional*, defaults to 224):
-            The size (resolution) of each image.
-        patch_size (`int`, *optional*, defaults to 16):
-            The size (resolution) of each patch.
-        qkv_bias (`bool`, *optional*, defaults to True):
-            Whether to add a bias to the queries, keys and values.
-        qk_normalization (`bool`, *optional*, defaults to False):
-            Whether to apply normalization to queries and keys.
-        norm_type (`str`, *optional*, defaults to "layer_norm"):
-            The normalization type to use.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
-            The epsilon used by the layer normalization layers.
-        initializer_factor (`float`, *optional*, defaults to 1.0):
-            A factor for initializing all weight matrices.
-        hidden_act (`str`, *optional*, defaults to "gelu"):
-            The non-linear activation function in the encoder.
-        max_img_size (`int`, *optional*, defaults to 2048):
-            Maximum image size for position embeddings.
-        norm_mean (`tuple` or `list`, *optional*,
-            defaults to (0.48145466, 0.4578275, 0.40821073)):
-            Mean values for image normalization (RGB channels).
-        norm_std (`tuple` or `list`, *optional*,
-            defaults to (0.26862954, 0.26130258, 0.27577711)):
-            Standard deviation values for image normalization (RGB channels).
-        reg_tokens (`int`, *optional*):
-            Number of register tokens to use.
+        model_name: Name of the vision transformer model
+            (e.g., "vit_base_patch16_224"). Used to determine architecture
+            dimensions from `VIT_TIMM_DIM_BY_NAME`.
+        image_size: The size (resolution) of each image.
+        patch_size: The size (resolution) of each patch.
+        qkv_bias: Whether to add a bias to the queries, keys and values.
+        qk_normalization: Whether to apply normalization to queries and keys.
+        norm_type: The normalization type to use.
+        layer_norm_eps: The epsilon used by the layer normalization layers.
+        initializer_factor: A factor for initializing all weight matrices.
+        hidden_act: The non-linear activation function in the encoder.
+        max_img_size: Maximum image size for position embeddings.
+        norm_mean: Mean values for image normalization (RGB channels).
+            Defaults to (0.48145466, 0.4578275, 0.40821073)).
+        norm_std: Standard deviation values for image normalization
+            (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711)).
+        reg_tokens: Number of register tokens to use.
     """
 
     model_type = "radio"
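A hedged usage sketch for the cleaned-up docstring: the keyword arguments mirror the documented fields, but the import path below is an assumption about where `RadioConfig` lives and should be checked against the tree:

```python
# Usage sketch. The import path is assumed for illustration; the kwargs
# correspond to the arguments documented in the docstring above.
from vllm.transformers_utils.configs.radio import RadioConfig  # path assumed

config = RadioConfig(
    model_name="vit_base_patch16_224",
    image_size=224,
    patch_size=16,
    qkv_bias=True,
    norm_type="layer_norm",
    layer_norm_eps=1e-6,
    max_img_size=2048,
)
print(config.model_type)  # "radio"
```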
@@ -27,7 +27,7 @@ def try_get_class_from_dynamic_module(
     **kwargs,
 ) -> Optional[type]:
     """
-    As [transformers.dynamic_module_utils.get_class_from_dynamic_module][],
+    As `transformers.dynamic_module_utils.get_class_from_dynamic_module`,
     but ignoring any errors.
     """
     try:
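The docstring now refers to transformers' helper with plain backticks (the mkdocstrings cross-reference could not be resolved, which triggered a warning). As a sketch of what such a wrapper does, assuming transformers' public `get_class_from_dynamic_module` API and a hypothetical function name:

```python
# Hedged sketch of the "ignore any errors" wrapper described above; argument
# names follow transformers' public API, but treat this as illustrative.
from typing import Optional


def try_get_class_from_dynamic_module_sketch(class_reference: str,
                                             pretrained_model_name_or_path,
                                             **kwargs) -> Optional[type]:
    try:
        from transformers.dynamic_module_utils import (
            get_class_from_dynamic_module)
        return get_class_from_dynamic_module(
            class_reference, pretrained_model_name_or_path, **kwargs)
    except Exception:
        # Swallow any failure (missing remote code, network issues, ...) and
        # let the caller fall back to a default implementation.
        return None
```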
New empty files added in this commit (0 lines each):

- vllm/v1/kv_offload/__init__.py
- vllm/v1/kv_offload/backends/__init__.py
- vllm/v1/kv_offload/worker/__init__.py