Mirror of https://github.com/vllm-project/vllm.git
[Docs] Replace rst style double-backtick with md single-backtick (#27091)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
@@ -1251,7 +1251,7 @@ async def main() -> None:
         default=None,
         help="The model name used in the API. "
         "If not specified, the model name will be the "
-        "same as the ``--model`` argument. ",
+        "same as the `--model` argument. ",
     )

     parser.add_argument(
@@ -3,4 +3,4 @@ Loading Model weights with fastsafetensors

 Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details.

-To enable this feature, use the ``--load-format fastsafetensors`` command-line argument
+To enable this feature, use the `--load-format fastsafetensors` command-line argument
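For context, the flag documented in this hunk selects vLLM's weight-loading backend. A minimal, hedged sketch of the equivalent Python usage (the model name is a placeholder and assumes the `fastsafetensors` package is installed):

```python
# Hedged sketch: select the fastsafetensors weight loader from Python,
# mirroring the --load-format fastsafetensors CLI flag documented above.
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",       # placeholder model
    load_format="fastsafetensors",   # weight-loading backend
)
```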
@@ -67,17 +67,17 @@ class _HfExamplesInfo:

     is_available_online: bool = True
     """
-    Set this to ``False`` if the name of this architecture no longer exists on
+    Set this to `False` if the name of this architecture no longer exists on
     the HF repo. To maintain backwards compatibility, we have not removed them
     from the main model registry, so without this flag the registry tests will
     fail.
     """

     trust_remote_code: bool = False
-    """The ``trust_remote_code`` level required to load the model."""
+    """The `trust_remote_code` level required to load the model."""

     hf_overrides: dict[str, Any] = field(default_factory=dict)
-    """The ``hf_overrides`` required to load the model."""
+    """The `hf_overrides` required to load the model."""

     max_model_len: int | None = None
     """
@@ -162,7 +162,7 @@ def check_logprobs_close(

     # Test prompt logprobs closeness
     if prompt_logprobs_0 is not None and prompt_logprobs_1 is not None:
-        # Both sequences' prompt logprobs lists are not `None``
+        # Both sequences' prompt logprobs lists are not `None`
         # (although individual list elements may be `None`);
         # for each token's logprobs:
         for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate(
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Ensure we perform lazy loading in vllm/__init__.py.
-i.e: appears only within the ``if typing.TYPE_CHECKING:`` guard,
+i.e: appears only within the `if typing.TYPE_CHECKING:` guard,
 **except** for a short whitelist.
 """

@@ -21,7 +21,7 @@ def get_cache_dir() -> Path:
 @lru_cache
 def get_vllm_public_assets(filename: str, s3_prefix: str | None = None) -> Path:
     """
-    Download an asset file from ``s3://vllm-public-assets``
+    Download an asset file from `s3://vllm-public-assets`
     and return the path to the downloaded file.
     """
     asset_directory = get_cache_dir() / "vllm_public_assets"
@@ -1231,7 +1231,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         default=None,
         help="The model name used in the API. "
         "If not specified, the model name will be the "
-        "same as the ``--model`` argument. ",
+        "same as the `--model` argument. ",
     )

     parser.add_argument(
@@ -138,8 +138,8 @@ def support_torch_compile(
     """

     def cls_decorator_helper(cls: _T) -> _T:
-        # helper to pass `dynamic_arg_dims`` to `_support_torch_compile``
-        # to avoid too much indentation for `_support_torch_compile``
+        # helper to pass `dynamic_arg_dims` to `_support_torch_compile`
+        # to avoid too much indentation for `_support_torch_compile`
         if not hasattr(cls, "forward"):
             raise TypeError("decorated class should have a forward method.")
         sig = inspect.signature(cls.forward)
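The helper touched here only validates the decorated class before passing `dynamic_arg_dims` along; a self-contained sketch of that validation step (toy class, not vLLM's decorator itself):

```python
# Standalone sketch of the check performed by cls_decorator_helper:
# require a forward() method, then inspect its signature.
import inspect

from torch import nn


def check_forward(cls):
    if not hasattr(cls, "forward"):
        raise TypeError("decorated class should have a forward method.")
    return inspect.signature(cls.forward)


class ToyModel(nn.Module):
    def forward(self, input_ids, positions):
        return input_ids + positions


print(check_forward(ToyModel))  # (self, input_ids, positions)
```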
@@ -66,15 +66,15 @@ class PoolerConfig:
     """
     step_tag_id: int | None = None
     """
-    If set, only the score corresponding to the ``step_tag_id`` in the
+    If set, only the score corresponding to the `step_tag_id` in the
     generated sentence should be returned. Otherwise, the scores for all tokens
     are returned.
     """
     returned_token_ids: list[int] | None = None
     """
     A list of indices for the vocabulary dimensions to be extracted,
-    such as the token IDs of ``good_token`` and ``bad_token`` in the
-    ``math-shepherd-mistral-7b-prm`` model.
+    such as the token IDs of `good_token` and `bad_token` in the
+    `math-shepherd-mistral-7b-prm` model.
     """

     def compute_hash(self) -> str:
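A usage sketch of the two fields documented above (the token IDs are placeholders, not values taken from the `math-shepherd-mistral-7b-prm` tokenizer):

```python
# Hedged sketch: configure a process-reward-style pooler. The IDs below are
# placeholders; look up the real step-tag / good / bad token IDs for your model.
from vllm.config import PoolerConfig

pooler_config = PoolerConfig(
    step_tag_id=12345,               # score is read only at this token
    returned_token_ids=[648, 387],   # e.g. IDs of `good_token` and `bad_token`
)
```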
@@ -117,7 +117,7 @@ class ZmqEventPublisher(EventPublisher):
     Parameters
     ----------
     endpoint:
-        PUB address. Use ``tcp://*:5557`` to bind or ``tcp://host:5557`` to
+        PUB address. Use `tcp://*:5557` to bind or `tcp://host:5557` to
         connect.
     replay_endpoint:
         Optional ROUTER address for replay requests. When given, subscribers can
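The bind-versus-connect distinction in that docstring is standard ZeroMQ behavior; a small pyzmq sketch with illustrative endpoints:

```python
# Standard pyzmq pattern: a wildcard address binds locally, a concrete host
# address connects to an existing publisher. Endpoints are illustrative.
import zmq

ctx = zmq.Context.instance()

pub = ctx.socket(zmq.PUB)
pub.bind("tcp://*:5557")                  # publisher side binds

sub = ctx.socket(zmq.SUB)
sub.connect("tcp://localhost:5557")       # subscriber side connects
sub.setsockopt_string(zmq.SUBSCRIBE, "")  # receive all topics
```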
@@ -515,7 +515,7 @@ class StreamingHarmonyContext(HarmonyContext):

     def render_for_completion(self) -> list[int]:
         # now this list of tokens as next turn's starting tokens
-        # `<|start|>assistant``,
+        # `<|start|>assistant`,
         # we need to process them in parser.
         rendered_tokens = super().render_for_completion()

@@ -1504,7 +1504,7 @@ class LLM:
         """Return a snapshot of aggregated metrics from Prometheus.

         Returns:
-            A ``MetricSnapshot`` instance capturing the current state
+            A `MetricSnapshot` instance capturing the current state
             of all aggregated metrics from Prometheus.

         Note:
@@ -26,12 +26,12 @@ class RenderConfig:

     max_length: int | None = None
     """Maximum allowable total input token length. If provided,
-    token inputs longer than this raise ``ValueError``."""
+    token inputs longer than this raise `ValueError`."""

     truncate_prompt_tokens: int | None = None
-    """Number of tokens to keep. ``None`` means no truncation.
-    ``0`` yields an empty list (and skips embeds).
-    ``-1`` maps to ``model_config.max_model_len``."""
+    """Number of tokens to keep. `None` means no truncation.
+    `0` yields an empty list (and skips embeds).
+    `-1` maps to `model_config.max_model_len`."""

     add_special_tokens: bool | None = True
     """Whether to add model-specific special tokens during tokenization."""
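The `truncate_prompt_tokens` semantics above are concrete enough to sketch standalone. This is an illustration of the documented contract, not vLLM's implementation; keeping the trailing tokens (left-truncation) is an assumption:

```python
# Illustrative resolution of truncate_prompt_tokens; `max_model_len` stands in
# for model_config.max_model_len. Keeping the trailing tokens is an assumption.
def resolve_truncation(
    tokens: list[int], truncate_prompt_tokens: int | None, max_model_len: int
) -> list[int]:
    if truncate_prompt_tokens is None:   # no truncation
        return tokens
    if truncate_prompt_tokens == 0:      # empty list (skips embeds)
        return []
    if truncate_prompt_tokens == -1:     # clamp to the model context length
        truncate_prompt_tokens = max_model_len
    return tokens[-truncate_prompt_tokens:]


print(resolve_truncation(list(range(10)), -1, 4))  # [6, 7, 8, 9]
```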
@@ -107,10 +107,10 @@ class BaseRenderer(ABC):

         Args:
             prompt_or_prompts: One of:
-                - ``str``: Single text prompt.
-                - ``list[str]``: Batch of text prompts.
-                - ``list[int]``: Single pre-tokenized sequence.
-                - ``list[list[int]]``: Batch of pre-tokenized sequences.
+                - `str`: Single text prompt.
+                - `list[str]`: Batch of text prompts.
+                - `list[int]`: Single pre-tokenized sequence.
+                - `list[list[int]]`: Batch of pre-tokenized sequences.
             config: Render configuration controlling how prompts are prepared
                 (e.g., tokenization and length handling).

@@ -134,9 +134,9 @@ class BaseRenderer(ABC):
         Convert text/token and/or base64-encoded embeddings inputs into
         engine-ready prompt objects using a unified RenderConfig.

-        At least one of ``prompt_or_prompts`` or ``prompt_embeds`` must be
+        At least one of `prompt_or_prompts` or `prompt_embeds` must be
         provided and non-empty. If both are omitted or empty (e.g., empty
-        string and empty list), a ``ValueError`` is raised.
+        string and empty list), a `ValueError` is raised.

         Args:
             prompt_or_prompts: Text or token inputs to include.
@@ -150,7 +150,7 @@ class BaseRenderer(ABC):
             Engine-ready prompt objects.

         Raises:
-            ValueError: If both ``prompt_or_prompts`` and ``prompt_embeds``
+            ValueError: If both `prompt_or_prompts` and `prompt_embeds`
                 are omitted or empty (decoder prompt cannot be empty), or if
                 length limits are exceeded.
         """
@@ -327,7 +327,7 @@ def zip_enc_dec_prompts(
     [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
     instances.

-    ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same
+    `mm_processor_kwargs` may also be provided; if a dict is passed, the same
     dictionary will be used for every encoder/decoder prompt. If an iterable is
     provided, it will be zipped with the encoder/decoder prompts.
     """
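A hedged usage sketch of the helper documented above (the keyword name comes from the docstring; the kwargs and prompts are illustrative):

```python
# Hedged sketch: zip encoder and decoder prompts pairwise; a single dict of
# mm_processor_kwargs is reused for every pair. Values are illustrative.
from vllm.inputs import zip_enc_dec_prompts

enc_dec_prompts = zip_enc_dec_prompts(
    ["What is in this image?", "Summarize the audio."],   # encoder prompts
    ["", ""],                                             # decoder prompts
    mm_processor_kwargs={"num_crops": 4},
)
```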
@@ -27,7 +27,7 @@ __all__ = [


 def is_flashinfer_fp4_cutlass_moe_available() -> bool:
-    """Return ``True`` when FlashInfer CUTLASS NV-FP4 kernels can be used."""
+    """Return `True` when FlashInfer CUTLASS NV-FP4 kernels can be used."""
     return (
         envs.VLLM_USE_FLASHINFER_MOE_FP4
         and has_flashinfer_cutlass_fused_moe()
@@ -887,11 +887,11 @@ def requant_weight_ue8m0_inplace(
     UE8M0 (power-of-two) format expected by the new DeepGEMM kernels inplace.

     Args:
-        weight: Block-quantised weight tensor stored in ``torch.float8_e4m3fn``.
-            Expected shape ``(..., M, K)``.
-        weight_scale: Corresponding per-block scale tensor (``torch.float32``)
-            with shape ``(..., M // block_size[0], K // block_size[1])``.
-        block_size: 2-element iterable ``[block_m, block_k]`` describing the
+        weight: Block-quantised weight tensor stored in `torch.float8_e4m3fn`.
+            Expected shape `(..., M, K)`.
+        weight_scale: Corresponding per-block scale tensor (`torch.float32`)
+            with shape `(..., M // block_size[0], K // block_size[1])`.
+        block_size: 2-element iterable `[block_m, block_k]` describing the
             block quantisation granularity.
     """
     if weight.numel() == 0:
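A small shape check consistent with the documented contract (sizes and block granularity are arbitrary):

```python
# Shape sketch only: dummy tensors satisfying the documented (M, K) /
# (M // block_m, K // block_k) relationship. Sizes are arbitrary.
import torch

block_size = [128, 128]          # [block_m, block_k]
M, K = 512, 1024

weight = torch.zeros(M, K, dtype=torch.float8_e4m3fn)
weight_scale = torch.ones(M // block_size[0], K // block_size[1], dtype=torch.float32)

print(weight.shape, weight_scale.shape)  # (512, 1024) and (4, 8)
```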
@@ -64,7 +64,7 @@ from .utils import (
 class OlmoAttention(nn.Module):
     """
     This is the attention block where the output is computed as
-    ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    `Attention(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
     (plus another skip connection).
     """

@@ -144,7 +144,7 @@ class OlmoAttention(nn.Module):
 class OlmoMLP(nn.Module):
     """
     This is the MLP block where the output is computed as
-    ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    `MLP(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
     (plus another skip connection).
     """

@@ -193,7 +193,7 @@ class OlmoMLP(nn.Module):
 class OlmoDecoderLayer(nn.Module):
     """
     This is a typical transformer block where the output is
-    computed as ``MLP(LN(x + Attention(LN(x))))``
+    computed as `MLP(LN(x + Attention(LN(x))))`
     (plus another skip connection).
     """

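The expression in these docstrings is the usual pre-norm transformer block; a compact, generic PyTorch sketch of that structure (toy sizes, not the Olmo implementation):

```python
# Generic pre-norm block matching MLP(LN(x + Attention(LN(x)))) plus the two
# skip connections; not the actual Olmo code.
import torch
from torch import nn


class PreNormBlock(nn.Module):
    def __init__(self, d: int = 64, n_heads: int = 4):
        super().__init__()
        self.ln1 = nn.LayerNorm(d)
        self.attn = nn.MultiheadAttention(d, n_heads, batch_first=True)
        self.ln2 = nn.LayerNorm(d)
        self.mlp = nn.Sequential(nn.Linear(d, 4 * d), nn.SiLU(), nn.Linear(4 * d, d))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = x + self.attn(self.ln1(x), self.ln1(x), self.ln1(x))[0]  # x + Attention(LN(x))
        return h + self.mlp(self.ln2(h))                             # h + MLP(LN(h))


print(PreNormBlock()(torch.randn(1, 8, 64)).shape)  # torch.Size([1, 8, 64])
```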
@@ -69,7 +69,7 @@ from vllm.transformers_utils.configs import Olmo3Config
 class Olmo2Attention(nn.Module):
     """
     This is the attention block where the output is computed as
-    ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    `Attention(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
     (plus another skip connection).
     """

@@ -190,7 +190,7 @@ class Olmo2Attention(nn.Module):
 class Olmo2MLP(nn.Module):
     """
     This is the MLP block where the output is computed as
-    ``MLP(x)`` in ``LN(MLP(x + LN(Attention(x))))``
+    `MLP(x)` in `LN(MLP(x + LN(Attention(x))))`
     (plus another skip connection).
     """

@@ -235,7 +235,7 @@ class Olmo2MLP(nn.Module):
 class Olmo2DecoderLayer(nn.Module):
     """
     This is a typical transformer block where the output is
-    computed as ``MLP(LN(x + Attention(LN(x))))``
+    computed as `MLP(LN(x + Attention(LN(x))))`
     (plus another skip connection).
     """

@@ -166,7 +166,7 @@ class VisualTokenizer(torch.nn.Module):
         # e.g., for hidden_stride=2, this leads to a token length reduction:
         # 1024 -> 256 for aimv2
         if self.config.hidden_stride > 1:
-            # this `d` maybe different from the above `d``
+            # this `d` maybe different from the above `d`
             n, L, d = features.shape
             sqrt_l = int(L**0.5)
             assert sqrt_l**2 == L, (
@@ -99,13 +99,13 @@ class AutoWeightsLoader:
     the weights only once.

     The weight loading logic for individual modules can be overridden
-    by defining a ``load_weights`` method.
+    by defining a `load_weights` method.

     Similarly, the weight loading logic for individual parameters can be
-    overridden by defining a ``weight_loader`` method.
+    overridden by defining a `weight_loader` method.

     Detailed weight loading information can be viewed by setting the
-    environment variable ``VLLM_LOGGING_LEVEL=DEBUG``.
+    environment variable `VLLM_LOGGING_LEVEL=DEBUG`.
     """

     # Models trained using early version ColossalAI
@@ -372,9 +372,9 @@ def flatten_bn(
     concat: bool = False,
 ) -> list[torch.Tensor] | torch.Tensor:
     """
-    Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs.
+    Flatten the `B` and `N` dimensions of batched multimodal inputs.

-    The input tensor should have shape ``(B, N, ...)```.
+    The input tensor should have shape `(B, N, ...)`.
     """
     if isinstance(x, torch.Tensor):
         return x.flatten(0, 1)
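The tensor branch above is a plain `flatten(0, 1)`; a quick shape check:

```python
# Quick shape check: (B, N, ...) -> (B * N, ...).
import torch

x = torch.randn(2, 3, 7)      # B=2, N=3, feature dim 7
print(x.flatten(0, 1).shape)  # torch.Size([6, 7])
```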
@@ -424,12 +424,12 @@ def _merge_multimodal_embeddings(
     is_multimodal: torch.Tensor,
 ) -> torch.Tensor:
     """
-    Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
-    positions in ``inputs_embeds`` corresponding to placeholder tokens in
-    ``input_ids``.
+    Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the
+    positions in `inputs_embeds` corresponding to placeholder tokens in
+    `input_ids`.

     Note:
-        This updates ``inputs_embeds`` in place.
+        This updates `inputs_embeds` in place.
     """
     if len(multimodal_embeddings) == 0:
         return inputs_embeds
@@ -475,14 +475,14 @@ def merge_multimodal_embeddings(
     placeholder_token_id: int | list[int],
 ) -> torch.Tensor:
     """
-    Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
-    positions in ``inputs_embeds`` corresponding to placeholder tokens in
-    ``input_ids``.
+    Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the
+    positions in `inputs_embeds` corresponding to placeholder tokens in
+    `input_ids`.

-    ``placeholder_token_id`` can be a list of token ids (e.g, token ids
+    `placeholder_token_id` can be a list of token ids (e.g, token ids
     of img_start, img_break, and img_end tokens) when needed: This means
-    the order of these tokens in the ``input_ids`` MUST MATCH the order of
-    their embeddings in ``multimodal_embeddings`` since we need to
+    the order of these tokens in the `input_ids` MUST MATCH the order of
+    their embeddings in `multimodal_embeddings` since we need to
     slice-merge instead of individually scattering.

     For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where
@@ -497,7 +497,7 @@ def merge_multimodal_embeddings(
     input_ids for a correct embedding merge.

     Note:
-        This updates ``inputs_embeds`` in place.
+        This updates `inputs_embeds` in place.
     """
     if isinstance(placeholder_token_id, list):
         is_multimodal = isin_list(input_ids, placeholder_token_id)
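A self-contained toy of the overwrite-by-placeholder-mask idea described here (not the vLLM helper itself; the placeholder ID and sizes are made up):

```python
# Toy sketch: rows of inputs_embeds whose token is a placeholder are
# overwritten, in order, with the multimodal embeddings.
import torch

placeholder_id = 32000
input_ids = torch.tensor([1, 32000, 32000, 2])
inputs_embeds = torch.zeros(4, 8)
multimodal_embeddings = torch.ones(2, 8)   # one row per placeholder token

is_multimodal = input_ids == placeholder_id
inputs_embeds[is_multimodal] = multimodal_embeddings  # in-place overwrite
print(inputs_embeds.sum(dim=-1))           # tensor([0., 8., 8., 0.])
```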
@@ -70,7 +70,7 @@ class BasevLLMParameter(Parameter):
         # NOTE(@ksayers) some models such as mamba_mixer2 override the
         # weight loader to support custom loading. In the future, model-specific
         # weight loading should be implemented via Model.load_weights. In the
-        # meantime, support deleting and overriding `weight_loader`` attribute
+        # meantime, support deleting and overriding `weight_loader` attribute
         if self._weight_loader is None:
             raise AttributeError(
                 f"{self.__class__.__name__} weight_loader attribute has been deleted"
@@ -332,8 +332,8 @@ class PromptInsertion(PromptUpdate):

     Example:

-        For each image, insert a number of ``<image>`` feature placeholders
-        equal to the feature size of the vision encoder after the ``<s>`` token:
+        For each image, insert a number of `<image>` feature placeholders
+        equal to the feature size of the vision encoder after the `<s>` token:

         ```python
         PromptInsertion(
@@ -353,7 +353,7 @@ class PromptInsertion(PromptUpdate):
         )
         ```

-        Insert these tokens after a prefix ``Images:``:
+        Insert these tokens after a prefix `Images:`:

         ```python
         PromptInsertion(
@@ -401,8 +401,8 @@ class PromptReplacement(PromptUpdate):

     Example:

-        For each image, replace one ``<image>`` input placeholder in the prompt
-        with a number of ``<image>`` feature placeholders
+        For each image, replace one `<image>` input placeholder in the prompt
+        with a number of `<image>` feature placeholders
         equal to the feature size of the vision encoder:

         ```python
@@ -413,8 +413,8 @@ class PromptReplacement(PromptUpdate):
         )
         ```

-        As above, but further pad the feature placeholders with ``<image_bos>``
-        and `<image_eos>``, which are not supposed to be passed to the vision
+        As above, but further pad the feature placeholders with `<image_bos>`
+        and `<image_eos>`, which are not supposed to be passed to the vision
         encoder:

         ```python
@@ -307,7 +307,7 @@ class MultiModalRegistry:
         """
         Create dummy data for profiling the memory usage of a model.

-        The model is identified by ``model_config``.
+        The model is identified by `model_config`.
         """
         processor = self.create_processor(model_config, cache=cache)
         profiler: MultiModalProfiler = MultiModalProfiler(processor)
@@ -340,7 +340,7 @@ class MultiModalRegistry:
         """
         Create dummy data for profiling the memory usage of a model.

-        The model is identified by ``model_config``.
+        The model is identified by `model_config`.
         """
         processor = self.create_processor(model_config, cache=cache)
         profiler: MultiModalProfiler = MultiModalProfiler(processor)
@@ -75,7 +75,7 @@ _ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = {
     "0x74bd": "AMD_Instinct_MI300X_HF",
 }

-# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES``
+# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES`
 if "HIP_VISIBLE_DEVICES" in os.environ:
     val = os.environ["HIP_VISIBLE_DEVICES"]
     if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None):
@@ -168,7 +168,7 @@ class XPUPlatform(Platform):
             parallel_config.distributed_executor_backend = "uni"
         elif parallel_config.distributed_executor_backend == "mp":
             # FIXME(kunshang):
-            # spawn needs calling `if __name__ == '__main__':``
+            # spawn needs calling `if __name__ == '__main__':`
             # fork is not supported for xpu start new process.
             if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn":
                 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@@ -306,10 +306,10 @@ class SamplingParams(
         )

     def __post_init__(self) -> None:
-        # how we deal with `best_of``:
-        # if `best_of`` is not set, we default to `n`;
-        # if `best_of`` is set, we set `n`` to `best_of`,
-        # and set `_real_n`` to the original `n`.
+        # how we deal with `best_of`:
+        # if `best_of` is not set, we default to `n`;
+        # if `best_of` is set, we set `n` to `best_of`,
+        # and set `_real_n` to the original `n`.
         # when we return the result, we will check
         # if we need to return `n` or `_real_n` results
         if self.best_of:
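A short sketch of the observable behavior those comments describe (values illustrative; assumes this version of `SamplingParams` still accepts `best_of`):

```python
# Hedged sketch: with best_of set, best_of candidates are sampled internally
# and only the originally requested n results are returned.
from vllm import SamplingParams

params = SamplingParams(n=1, best_of=4, temperature=0.8)
# Per the comments above, __post_init__ sets n to best_of (4) and keeps the
# requested 1 in _real_n for trimming the returned results.
```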
@@ -21,7 +21,7 @@ from vllm.utils import cdiv, has_deep_gemm

 @functools.cache
 def is_deep_gemm_supported() -> bool:
-    """Return ``True`` if DeepGEMM is supported on the current platform.
+    """Return `True` if DeepGEMM is supported on the current platform.
     Currently, only Hopper and Blackwell GPUs are supported.
     """
     is_supported_arch = current_platform.is_cuda() and (
@@ -33,7 +33,7 @@ def is_deep_gemm_supported() -> bool:

 @functools.cache
 def is_deep_gemm_e8m0_used() -> bool:
-    """Return ``True`` if vLLM is configured to use DeepGEMM "
+    """Return `True` if vLLM is configured to use DeepGEMM "
     "E8M0 scale on a Hopper or Blackwell-class GPU.
     """
     if not is_deep_gemm_supported():
@@ -311,9 +311,9 @@ def calc_diff(x: torch.Tensor, y: torch.Tensor):
     """Return a global difference metric for unit tests.

     DeepGEMM kernels on Blackwell/B200 currently exhibit noticeable per-element
-    error, causing ``torch.testing.assert_close`` to fail. Instead of checking
+    error, causing `torch.testing.assert_close` to fail. Instead of checking
     every element, we compute a cosine-style similarity over the whole tensor
-    and report ``1 - sim``. Once kernel accuracy improves this helper can be
+    and report `1 - sim`. Once kernel accuracy improves this helper can be
     removed.
     """

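A standalone sketch of a cosine-style `1 - sim` metric of the kind described (the exact formula in the vLLM helper may differ):

```python
# Standalone sketch of a global "1 - similarity" metric; the exact formula in
# calc_diff may differ.
import torch


def global_diff(x: torch.Tensor, y: torch.Tensor) -> float:
    x, y = x.double(), y.double()
    sim = 2 * (x * y).sum() / ((x * x).sum() + (y * y).sum())
    return float(1 - sim)


a = torch.randn(128, 128)
print(global_diff(a, a))                               # ~0.0 for identical tensors
print(global_diff(a, a + 0.01 * torch.randn_like(a)))  # small but nonzero
```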
@@ -34,7 +34,7 @@ FLASHINFER_CUBINS_REPOSITORY = os.environ.get(

 @functools.cache
 def has_flashinfer() -> bool:
-    """Return ``True`` if FlashInfer is available."""
+    """Return `True` if FlashInfer is available."""
     # Use find_spec to check if the module exists without importing it
     # This avoids potential CUDA initialization side effects
     if importlib.util.find_spec("flashinfer") is None:
@@ -114,13 +114,13 @@ autotune = _lazy_import_wrapper(

 @functools.cache
 def has_flashinfer_comm() -> bool:
-    """Return ``True`` if FlashInfer comm module is available."""
+    """Return `True` if FlashInfer comm module is available."""
     return has_flashinfer() and importlib.util.find_spec("flashinfer.comm") is not None


 @functools.cache
 def has_flashinfer_all2all() -> bool:
-    """Return ``True`` if FlashInfer mnnvl all2all is available."""
+    """Return `True` if FlashInfer mnnvl all2all is available."""
     if not has_flashinfer_comm():
         return False

@@ -141,7 +141,7 @@ def has_flashinfer_all2all() -> bool:

 @functools.cache
 def has_flashinfer_moe() -> bool:
-    """Return ``True`` if FlashInfer MoE module is available."""
+    """Return `True` if FlashInfer MoE module is available."""
     return (
         has_flashinfer()
         and importlib.util.find_spec("flashinfer.fused_moe") is not None
@@ -150,7 +150,7 @@ def has_flashinfer_moe() -> bool:

 @functools.cache
 def has_flashinfer_cutlass_fused_moe() -> bool:
-    """Return ``True`` if FlashInfer CUTLASS fused MoE is available."""
+    """Return `True` if FlashInfer CUTLASS fused MoE is available."""
     if not has_flashinfer_moe():
         return False

@@ -171,7 +171,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool:

 @functools.cache
 def has_nvidia_artifactory() -> bool:
-    """Return ``True`` if NVIDIA's artifactory is accessible.
+    """Return `True` if NVIDIA's artifactory is accessible.

     This checks connectivity to the kernel inference library artifactory
     which is required for downloading certain cubin kernels like TRTLLM FHMA.
@@ -218,9 +218,9 @@ def _force_use_trtllm_attention(env_value: bool | None) -> bool | None:

 def force_use_trtllm_attention() -> bool | None:
     """
-    Return ``None`` if VLLM_USE_TRTLLM_ATTENTION is not set,
-    return ``True`` if TRTLLM attention is forced to be used,
-    return ``False`` if TRTLLM attention is forced to be not used.
+    Return `None` if VLLM_USE_TRTLLM_ATTENTION is not set,
+    return `True` if TRTLLM attention is forced to be used,
+    return `False` if TRTLLM attention is forced to be not used.
     """
     return _force_use_trtllm_attention(envs.VLLM_USE_TRTLLM_ATTENTION)

@@ -244,7 +244,7 @@ def use_trtllm_attention(
     has_sinks: bool = False,
     has_spec: bool = False,
 ) -> bool:
-    """Return ``True`` if TRTLLM attention is used."""
+    """Return `True` if TRTLLM attention is used."""
     force_use_trtllm = force_use_trtllm_attention()

     # Environment variable is set to 0 - respect it
@@ -26,17 +26,17 @@ from vllm.v1.kv_cache_interface import (
 from vllm.v1.request import Request

 # BlockHash represents the hash of a single KV-cache block used for
-# prefix caching. Treating it as a distinct type from ``bytes`` helps
+# prefix caching. Treating it as a distinct type from `bytes` helps
 # catch accidental misuse when passing around raw byte strings.
 BlockHash = NewType("BlockHash", bytes)

-# ``BlockHashWithGroupId`` combines a ``BlockHash`` with its KV cache group ID.
+# `BlockHashWithGroupId` combines a `BlockHash` with its KV cache group ID.
 # It is represented as raw bytes for compactness and efficiency. The helper
-# functions below pack/unpack the ``BlockHash`` and group id into/from the key.
+# functions below pack/unpack the `BlockHash` and group id into/from the key.
 BlockHashWithGroupId = NewType("BlockHashWithGroupId", bytes)

 # ExternalBlockHash is used for reproducible prefix-cache block hashing.
-# It's a union of ``bytes`` and ``int`` to keep backward compatibility
+# It's a union of `bytes` and `int` to keep backward compatibility
 # after we default block hashing to use sha256 bytes.
 ExternalBlockHash: TypeAlias = bytes | int

@@ -44,7 +44,7 @@ ExternalBlockHash: TypeAlias = bytes | int
 def make_block_hash_with_group_id(
     block_hash: BlockHash, group_id: int
 ) -> BlockHashWithGroupId:
-    """Pack a ``BlockHash`` and group id into a ``BlockHashWithGroupId``.
+    """Pack a `BlockHash` and group id into a `BlockHashWithGroupId`.

     The group id is encoded using 4 bytes in big-endian order and appended to
     the block hash bytes. This representation avoids creating tuples while
@@ -54,12 +54,12 @@ def make_block_hash_with_group_id(


 def get_block_hash(key: BlockHashWithGroupId) -> BlockHash:
-    """Extract the ``BlockHash`` from a ``BlockHashWithGroupId``."""
+    """Extract the `BlockHash` from a `BlockHashWithGroupId`."""
     return BlockHash(key[:-4])


 def get_group_id(key: BlockHashWithGroupId) -> int:
-    """Extract the group id from a ``BlockHashWithGroupId``."""
+    """Extract the group id from a `BlockHashWithGroupId`."""
     return int.from_bytes(key[-4:], "big", signed=False)


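The pack/unpack contract here is concrete enough to check in isolation: the group id occupies the trailing four big-endian bytes. A standalone round-trip sketch using plain bytes, without the NewType wrappers:

```python
# Round-trip sketch of the packing scheme described above, using plain bytes.
import hashlib

block_hash = hashlib.sha256(b"tokens-in-this-block").digest()
group_id = 7

key = block_hash + group_id.to_bytes(4, "big")                     # pack
assert key[:-4] == block_hash                                      # get_block_hash
assert int.from_bytes(key[-4:], "big", signed=False) == group_id   # get_group_id
```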
@@ -128,7 +128,7 @@ class CPUWorker(Worker):
                 "Please try to bind threads manually."
             )

-        # Get CPUs on NUMA node `allowed_numa_nodes[local_rank]``
+        # Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`
         selected_numa_node = allowed_numa_nodes[self.local_rank]  # type: ignore
         logical_cpu_list = [
             x for x in logical_cpu_list if x.numa_node == selected_numa_node
@@ -182,8 +182,8 @@ class TPUWorker:
             if isinstance(layer_spec, AttentionSpec):
                 dtype = layer_spec.dtype

-                # Use an empty tensor instead of `None`` to force Dynamo to pass
-                # it by reference, rather by specializing on the value ``None``.
+                # Use an empty tensor instead of `None` to force Dynamo to pass
+                # it by reference, rather by specializing on the value `None`.
                 tpu_kv_cache = torch.tensor([], dtype=dtype).to(self.device)
                 kv_caches[layer_name] = tpu_kv_cache
             else: