[FEATURE]: Use pydantic validation in multimodal.py config (#26629)

Signed-off-by: Anand Roy <86306690+andycandy@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Anand Roy
2025-10-13 20:26:59 +05:30
committed by GitHub
parent 4a61950f4d
commit 10214b6935

View File

@ -3,10 +3,9 @@
import hashlib
from collections.abc import Mapping
from dataclasses import field
from typing import Any, Literal, TypeAlias
from pydantic import ConfigDict, Field, field_validator
from pydantic import ConfigDict, Field, field_validator, model_validator
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
@ -55,7 +54,7 @@ DummyOptions: TypeAlias = (
class MultiModalConfig:
"""Controls the behavior of multimodal models."""
limit_per_prompt: dict[str, DummyOptions] = field(default_factory=dict)
limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict)
"""The maximum number of input items and options allowed per
prompt for each modality.
Defaults to 999 for each modality.
@ -71,7 +70,7 @@ class MultiModalConfig:
{"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512,
"height": 512}}
"""
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
@ -84,7 +83,7 @@ class MultiModalConfig:
For example, for Phi-3-Vision:
`{"num_crops": 4}`."""
mm_processor_cache_gb: float = 4
mm_processor_cache_gb: float = Field(default=4, ge=0)
"""The size (in GiB) of the multi-modal processor cache, which is used to
avoid re-processing past multi-modal inputs.
@ -96,7 +95,7 @@ class MultiModalConfig:
mm_processor_cache_type: MMCacheType = "lru"
"""Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
mm_shm_cache_max_object_size_mb: int = 128
mm_shm_cache_max_object_size_mb: int = Field(default=128, ge=0)
"""Size limit (in MiB) for each object stored in the multi-modal processor
shared memory cache. Only effective when `mm_processor_cache_type` is
`"shm"`."""
@ -123,7 +122,7 @@ class MultiModalConfig:
This reduces engine startup time but shifts the responsibility to users for
estimating the peak memory usage of the activation of multimodal encoder and
embedding cache."""
video_pruning_rate: float | None = None
video_pruning_rate: float | None = Field(default=None, ge=0.0, lt=1.0)
"""Sets pruning rate for video pruning via Efficient Video Sampling.
Value sits in range [0;1) and determines fraction of media tokens
from each video to be pruned.
@ -149,6 +148,18 @@ class MultiModalConfig:
value[k] = BaseDummyOptions(**v)
return value
@model_validator(mode="after")
def _validate_multimodal_config(self):
if self.mm_processor_cache_type != "shm" and (
self.mm_shm_cache_max_object_size_mb
!= MultiModalConfig.mm_shm_cache_max_object_size_mb
):
raise ValueError(
"'mm_shm_cache_max_object_size_mb' should only be set when "
"'mm_processor_cache_type' is 'shm'."
)
return self
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,