Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-20 14:53:52 +08:00
[Multimodal] Always enable hashing mm data (#23308)
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
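The diff below removes the return_mm_hashes flag from the input preprocessor and the multimodal processors and computes multimodal hashes unconditionally instead. As a quick orientation, here is a minimal, self-contained sketch of that behavioral change; the helper names are illustrative only and are not the actual vLLM classes or functions.

# Hypothetical, simplified sketch of the change in this commit: multimodal item
# hashes used to be computed only when return_mm_hashes was set, and are now
# always computed so downstream caches can rely on them unconditionally.
import hashlib
from typing import Any, Mapping, Optional


def hash_mm_item(modality: str, data: bytes, kwargs: Mapping[str, Any]) -> str:
    # Hash the raw bytes together with the processor kwargs that affect output.
    h = hashlib.blake2b()
    h.update(modality.encode())
    h.update(data)
    h.update(repr(sorted(kwargs.items())).encode())
    return h.hexdigest()


def apply_before(items: Mapping[str, bytes], kwargs: Mapping[str, Any],
                 return_mm_hashes: bool = False) -> Optional[dict[str, str]]:
    # Old behavior: hashes were optional and gated by a flag.
    if not return_mm_hashes:
        return None
    return {m: hash_mm_item(m, d, kwargs) for m, d in items.items()}


def apply_after(items: Mapping[str, bytes],
                kwargs: Mapping[str, Any]) -> dict[str, str]:
    # New behavior: hashes are always computed; the flag is gone.
    return {m: hash_mm_item(m, d, kwargs) for m, d in items.items()}


if __name__ == "__main__":
    items = {"image": b"\x00\x01\x02"}
    print(apply_before(items, {"size": 224}))              # None
    print(apply_after(items, {"size": 224})["image"][:16])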
@@ -1685,15 +1685,6 @@ class ModelConfig:
def is_multimodal_model(self) -> bool:
return self.multimodal_config is not None

@property
def processor_return_mm_hashes(self) -> bool:
"""Whether the multi-modal processor should output hashes."""
mm_config = self.multimodal_config
if mm_config is None:
return False

return mm_config.mm_processor_cache_gb > 0

@property
def enable_mm_processor_cache(self) -> bool:
"""Whether the multi-modal processor cache should be enabled."""
@@ -254,7 +254,6 @@ class InputPreprocessor:
mm_processor_kwargs: Optional[Mapping[str, object]],
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> MultiModalInputs:
"""
Apply the model's multi-modal processor to a multi-modal prompt,

@@ -271,8 +270,7 @@ class InputPreprocessor:
return mm_processor.apply(prompt,
mm_data,
hf_processor_mm_kwargs=mm_processor_kwargs,
tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes)
tokenization_kwargs=tokenization_kwargs)

async def _process_multimodal_async(
self,

@@ -281,7 +279,6 @@ class InputPreprocessor:
mm_processor_kwargs: Optional[Mapping[str, object]],
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> MultiModalInputs:
"""
Async version of

@@ -297,8 +294,7 @@ class InputPreprocessor:
return mm_processor.apply(prompt,
mm_data,
hf_processor_mm_kwargs=mm_processor_kwargs,
tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes)
tokenization_kwargs=tokenization_kwargs)

def _process_embeds(
self,

@@ -335,7 +331,6 @@ class InputPreprocessor:
parsed_content: TokensPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> Union[TokenInputs, MultiModalInputs]:
prompt_token_ids = parsed_content["prompt_token_ids"]
token_type_ids = parsed_content.get("token_type_ids")

@@ -348,7 +343,6 @@ class InputPreprocessor:
parsed_content.get("mm_processor_kwargs"),
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)
else:
inputs = token_inputs(

@@ -366,7 +360,6 @@ class InputPreprocessor:
parsed_content: TokensPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> Union[TokenInputs, MultiModalInputs]:
prompt_token_ids = parsed_content["prompt_token_ids"]
token_type_ids = parsed_content.get("token_type_ids")

@@ -379,7 +372,6 @@ class InputPreprocessor:
parsed_content.get("mm_processor_kwargs"),
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)
else:
inputs = token_inputs(

@@ -397,7 +389,6 @@ class InputPreprocessor:
parsed_content: TextPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> Union[TokenInputs, MultiModalInputs]:
prompt_text = parsed_content["prompt"]

@@ -409,7 +400,6 @@ class InputPreprocessor:
parsed_content.get("mm_processor_kwargs"),
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)
else:
prompt_token_ids = self._tokenize_prompt(

@@ -432,7 +422,6 @@ class InputPreprocessor:
parsed_content: TextPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> Union[TokenInputs, MultiModalInputs]:
prompt_text = parsed_content["prompt"]

@@ -444,7 +433,6 @@ class InputPreprocessor:
parsed_content.get("mm_processor_kwargs"),
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)
else:
prompt_token_ids = await self._tokenize_prompt_async(

@@ -467,7 +455,6 @@ class InputPreprocessor:
prompt: SingletonPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> SingletonInputs:
"""
Extract the singleton inputs from a prompt.

@@ -476,7 +463,6 @@ class InputPreprocessor:

* prompt: single encoder or decoder input prompt
* lora_request: this is only valid for decoder prompts
* return_mm_hashes: whether to return multimodal hashes

Returns:

@@ -490,21 +476,18 @@ class InputPreprocessor:
return self._process_tokens(
parsed["content"],
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)
if parsed["type"] == "text":
return self._process_text(
parsed["content"],
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)
if parsed["type"] == "str":
return self._process_text(
TextPrompt(prompt=parsed["content"]),
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)

assert_never(parsed)

@@ -514,7 +497,6 @@ class InputPreprocessor:
prompt: SingletonPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> SingletonInputs:
"""
Async version of

@@ -528,21 +510,18 @@ class InputPreprocessor:
return await self._process_tokens_async(
parsed["content"],
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)
if parsed["type"] == "text":
return await self._process_text_async(
parsed["content"],
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)
if parsed["type"] == "str":
return await self._process_text_async(
TextPrompt(prompt=parsed["content"]),
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)

assert_never(parsed)

@@ -785,7 +764,6 @@ class InputPreprocessor:
prompt: SingletonPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> DecoderOnlyInputs:
"""
For decoder-only models:

@@ -796,7 +774,6 @@ class InputPreprocessor:

* prompt: input prompt
* lora_request
* return_mm_hashes

Returns:

@@ -807,7 +784,6 @@ class InputPreprocessor:
prompt,
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)

return self._build_decoder_only_llm_inputs(prompt_comps)

@@ -817,7 +793,6 @@ class InputPreprocessor:
prompt: SingletonPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> DecoderOnlyInputs:
"""
Async version of

@@ -827,7 +802,6 @@ class InputPreprocessor:
prompt,
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)

return self._build_decoder_only_llm_inputs(prompt_comps)

@@ -837,17 +811,15 @@ class InputPreprocessor:
prompt: PromptType,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> ProcessorInputs:
"""Preprocess the input prompt."""
if self.model_config.is_encoder_decoder:
assert not return_mm_hashes, (
"Multimodal hashes for encoder-decoder models should not be ",
"returned until they are supported on vLLM V1.")
# Encoder-decoder model requires special mapping of
# input prompts to encoder & decoder
# input prompts to encoder & decoder.
return self._process_encoder_decoder_prompt(
prompt, tokenization_kwargs)
prompt,
tokenization_kwargs,
)

if is_explicit_encoder_decoder_prompt(prompt):
raise ValueError("Cannot pass encoder-decoder prompt "

@@ -858,7 +830,6 @@ class InputPreprocessor:
prompt,
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)

async def preprocess_async(

@@ -866,19 +837,18 @@ class InputPreprocessor:
prompt: PromptType,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> ProcessorInputs:
"""
Async version of
[`preprocess`][vllm.inputs.preprocess.InputPreprocessor.preprocess].
"""
if self.model_config.is_encoder_decoder:
assert not return_mm_hashes, (
"Multimodal hashes for encoder-decoder models should not be ",
"returned until they are supported on vLLM V1.")
# Encoder-decoder model requires special mapping of
# input prompts to encoder & decoder
return await self._process_encoder_decoder_prompt_async(prompt)
# input prompts to encoder & decoder.
return await self._process_encoder_decoder_prompt_async(
prompt,
tokenization_kwargs,
)

if is_explicit_encoder_decoder_prompt(prompt):
raise ValueError("Cannot pass encoder-decoder prompt "

@@ -889,5 +859,4 @@ class InputPreprocessor:
prompt,
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)
@@ -290,8 +290,6 @@ class DeepseekVL2MultiModalProcessor(
mm_data_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object],
*,
return_mm_hashes: bool,
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
# The processor logic is different for len(images) <= 2 vs > 2
# Since the processing cache assumes that the processor output is

@@ -303,7 +301,6 @@ class DeepseekVL2MultiModalProcessor(
mm_data_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
)

return super()._cached_apply_hf_processor(

@@ -311,7 +308,6 @@ class DeepseekVL2MultiModalProcessor(
mm_data_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
)

@@ -479,8 +479,6 @@ class H2OVLMultiModalProcessor(
mm_data_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object],
*,
return_mm_hashes: bool,
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
# The processor logic is different for len(images) <= 1 vs > 1
# Since the processing cache assumes that the processor output is

@@ -492,7 +490,6 @@ class H2OVLMultiModalProcessor(
mm_data_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
)

return super()._cached_apply_hf_processor(

@@ -500,7 +497,6 @@ class H2OVLMultiModalProcessor(
mm_data_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
)

@@ -795,7 +795,6 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Optional[Mapping[str, object]] = None,
return_mm_hashes: bool = False,
) -> MultiModalInputs:
hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index

@@ -807,7 +806,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
)

result = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
tokenization_kwargs, return_mm_hashes)
tokenization_kwargs)

mm_items = self._to_mm_items(mm_data)
mm_item_counts = mm_items.get_all_counts()

@@ -168,10 +168,9 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Optional[Mapping[str, object]] = None,
return_mm_hashes: bool = False,
) -> MultiModalEncDecInputs:
mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
tokenization_kwargs, return_mm_hashes)
tokenization_kwargs)

image_token_id = self.info.get_hf_config().image_token_index
# Check that the number of image tokens in the decoder prompt matches

@@ -194,10 +194,9 @@ class PaliGemmaMultiModalProcessor(
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Optional[Mapping[str, object]] = None,
return_mm_hashes: bool = False,
) -> MultiModalInputs:
mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
tokenization_kwargs, return_mm_hashes)
tokenization_kwargs)
prompt_token_ids = mm_inputs["prompt_token_ids"]

tokenizer = self.info.get_tokenizer()

@@ -308,15 +308,12 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
mm_data_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object],
*,
return_mm_hashes: bool,
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
prompt=prompt,
mm_data_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
)

# NOTE: The tokens are already inserted by the chat template
@@ -18,7 +18,7 @@
"""Inference-only IBM/NASA Prithvi Geospatial model."""

from collections.abc import Iterable, Mapping, Sequence
from typing import Optional, Union
from typing import Any, Optional, Union

import torch
import torch.nn as nn

@@ -32,18 +32,56 @@ from vllm.model_executor.models.interfaces import (
default_pooling_type)
from vllm.model_executor.models.utils import AutoWeightsLoader
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalFieldElem, MultiModalInputs,
MultiModalKwargsItem,
MultiModalKwargsItems,
MultiModalSharedField, PlaceholderRange)
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.inputs import (ImageItem, ModalityData,
MultiModalDataDict, MultiModalFieldConfig,
MultiModalInputs, MultiModalKwargsItems,
PlaceholderRange)
from vllm.multimodal.parse import (DictEmbeddingItems, ModalityDataItems,
MultiModalDataItems, MultiModalDataParser)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptUpdate)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors


def _prithvi_field_config(hf_inputs: Mapping[str, torch.Tensor]):
# This model receives in input a multi-dimensional tensor representing
# a single image patch and therefore it is not to be split
# into multiple elements, but rather to be considered a single one.
# Hence, the decision of using a MultiModalSharedField.
# The expected shape is (num_channels, width, height).

# This model however allows the user to also submit multiple image
# patches as a batch, adding a further dimension to the above shape.
# At this stage we only support submitting one patch per request and
# batching is achieved via vLLM batching.
# TODO (christian-pinto): enable support for multi patch requests
# in tandem with vLLM batching.
return dict(
pixel_values=MultiModalFieldConfig.shared(batch_size=1,
modality="image"),
location_coords=MultiModalFieldConfig.shared(batch_size=1,
modality="image"),
)


class PrithviGeoSpatialMAEMultiModalDataParser(MultiModalDataParser):

def _parse_image_data(
self,
data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
) -> Optional[ModalityDataItems[Any, Any]]:
if isinstance(data, dict):
return DictEmbeddingItems(
data,
modality="image",
required_fields={"pixel_values", "location_coords"},
fields_factory=_prithvi_field_config,
)

return super()._parse_image_data(data)


class PrithviGeoSpatialMAEProcessingInfo(BaseProcessingInfo):

def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:

@@ -64,26 +102,26 @@ class PrithviGeoSpatialMAEInputBuilder(
# This model input is fixed and is in the form of a torch Tensor.
# The size of pixel_values might change in the cases where we resize
# the input but never exceeds the dimensions below.
return {
image_data = {
"pixel_values": torch.full((6, 512, 512), 1.0,
dtype=torch.float16),
"location_coords": torch.full((1, 2), 1.0, dtype=torch.float16),
}

return {"image": image_data}


class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):

def _get_data_parser(self) -> MultiModalDataParser:
return PrithviGeoSpatialMAEMultiModalDataParser()

def _get_mm_fields_config(
self,
hf_inputs: BatchFeature,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
return dict(
pixel_values=MultiModalFieldConfig.shared(batch_size=1,
modality="image"),
location_coords=MultiModalFieldConfig.shared(batch_size=1,
modality="image"),
)
return _prithvi_field_config(hf_inputs)

def _get_prompt_updates(
self,

@@ -99,46 +137,32 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Optional[Mapping[str, object]] = None,
return_mm_hashes: bool = False,
) -> MultiModalInputs:
mm_kwargs = {}
if "image" in mm_data:
image_data = mm_data["image"]
else:
image_data = mm_data
mm_data = {"image": mm_data}

for k, v in mm_data.items():
if isinstance(v, dict) and k == "image":
mm_kwargs.update(v)
else:
mm_kwargs[k] = v
mm_items = self._to_mm_items(mm_data)
mm_hashes = self._hash_mm_items(mm_items, hf_processor_mm_kwargs,
tokenization_kwargs or {})
mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}

# This model receives in input a multi-dimensional tensor representing
# a single image patch and therefore it is not to be split
# into multiple elements, but rather to be considered a single one.
# Hence, the decision of using a MultiModalSharedField.
# The expected shape is (num_channels, width, height).
mm_processed_data = BatchFeature(image_data)

# This model however allows the user to also submit multiple image
# patches as a batch, adding a further dimension to the above shape.
# At this stage we only support submitting one patch per request and
# batching is achieved via vLLM batching.
# TODO (christian-pinto): enable support for multi patch requests
# in tandem with vLLM batching.
multimodal_kwargs_items = [
MultiModalKwargsItem.from_elems([
MultiModalFieldElem(
modality="image",
key=key,
data=data,
field=MultiModalSharedField(1),
) for key, data in mm_kwargs.items()
])
]
mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
mm_processed_data,
self._get_mm_fields_config(mm_processed_data,
hf_processor_mm_kwargs),
)

return MultiModalInputs(
type="multimodal",
prompt=prompt,
prompt_token_ids=[1],
mm_kwargs=MultiModalKwargsItems.from_seq(multimodal_kwargs_items),
mm_hashes=None,
mm_kwargs=mm_kwargs,
mm_hashes=mm_hashes,
mm_placeholders=mm_placeholders,
)
@@ -310,7 +310,6 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Optional[Mapping[str, object]] = None,
return_mm_hashes: bool = False,
) -> MultiModalInputs:
"""
Process multi-modal inputs to be used in vLLM.

@@ -288,15 +288,12 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]
mm_data_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object],
*,
return_mm_hashes: bool,
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
prompt=prompt,
mm_data_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
)

# NOTE: The tokens are already inserted by the chat template
@@ -43,7 +43,7 @@ class MultiModalHasher:
return cls.item_to_bytes(
"image", np.asarray(convert_image_mode(obj, "RGBA")))
if isinstance(obj, torch.Tensor):
return cls.item_to_bytes("tensor", obj.numpy())
return cls.item_to_bytes("tensor", obj.cpu().numpy())
if isinstance(obj, np.ndarray):
# If the array is non-contiguous, we need to copy it first
arr_data = obj.data if obj.flags.c_contiguous else obj.tobytes()
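The MultiModalHasher hunk above replaces obj.numpy() with obj.cpu().numpy(): a torch tensor on an accelerator device cannot be converted to a NumPy array directly, while .cpu() is a no-op for tensors already in host memory. A small self-contained sketch of hashing a tensor's bytes in that style (hypothetical helper, not the vLLM hasher API):

import hashlib

import numpy as np
import torch


def tensor_digest(t: torch.Tensor) -> str:
    # .cpu() is a no-op for CPU tensors but required for CUDA (or other
    # device) tensors, since .numpy() only works on host memory.
    arr: np.ndarray = t.cpu().numpy()
    # A non-contiguous array must be copied before its bytes can be read.
    data = arr.data if arr.flags.c_contiguous else arr.tobytes()
    return hashlib.blake2b(bytes(data)).hexdigest()


print(tensor_digest(torch.arange(8, dtype=torch.int32)))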
@@ -901,7 +901,7 @@ class MultiModalInputs(TypedDict):
mm_kwargs: MultiModalKwargsItems
"""Keyword arguments to be directly passed to the model after batching."""

mm_hashes: Optional["MultiModalHashDict"]
mm_hashes: "MultiModalHashDict"
"""The hashes of the multi-modal data."""

mm_placeholders: "MultiModalPlaceholderDict"

@@ -998,7 +998,7 @@ A collection of prompt updates with a similar structure as

class MultiModalProcessingInfo(NamedTuple):
kwargs: MultiModalKwargsItems
hashes: Optional[MultiModalHashes]
hashes: MultiModalHashes
prompt_updates: MultiModalPromptUpdates

@@ -1399,8 +1399,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_data_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object],
*,
return_mm_hashes: bool,
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
(
prompt_ids,

@@ -1420,9 +1418,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
hf_processor_mm_kwargs),
)

mm_hashes = (self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs,
tokenization_kwargs)
if return_mm_hashes else None)
mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs,
tokenization_kwargs)

unbound_prompt_updates = self._get_prompt_updates(
mm_data_items,

@@ -1446,8 +1443,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_data_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object],
*,
return_mm_hashes: bool,
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
"""
Apply the HF processor on the full prompt text,

@@ -1462,7 +1457,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_data_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
)

mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs,

@@ -1476,8 +1470,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_hashes=mm_hashes,
)

mm_hashes_to_return = mm_hashes if return_mm_hashes else None

# NOTE: `prompt` does not correspond to `mm_missing_data_items`,
# so we can't apply prompt updates until the new multimodal
# items are combined with the cached multimodal items

@@ -1515,7 +1507,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):

mm_info = MultiModalProcessingInfo(
kwargs=mm_kwargs,
hashes=mm_hashes_to_return,
hashes=mm_hashes,
prompt_updates=mm_prompt_updates,
)

@@ -1697,7 +1689,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Optional[Mapping[str, object]] = None,
return_mm_hashes: bool = False,
) -> MultiModalInputs:
"""
Process multi-modal inputs to be used in vLLM.

@@ -1726,7 +1717,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_items,
hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
)

# NOTE: tokenization_kwargs are not required to init processor

@@ -1811,7 +1801,6 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Optional[Mapping[str, object]] = None,
return_mm_hashes: bool = False,
) -> MultiModalEncDecInputs:
"""
Process multi-modal inputs to be used in vLLM.

@@ -1826,7 +1815,6 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
mm_data,
hf_processor_mm_kwargs,
tokenization_kwargs,
return_mm_hashes,
)

return self._get_enc_dec_inputs(
@@ -17,7 +17,6 @@ from vllm.multimodal.utils import argsort_mm_positions
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
from vllm.utils import is_list_of
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient
from vllm.v1.structured_output.backend_guidance import (

@@ -253,13 +252,10 @@ class Processor:
# 1. Tokenize text prompt, with LoRA request if one exists.
# 2. For multimodal models with a merged preprocessor, preprocess
# multimodal data and expand prompt token ids accordingly.
return_mm_hashes = (self.model_config.processor_return_mm_hashes
or bool(self.cache_config.enable_prefix_caching))
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
prompt,
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)
from vllm.platforms import current_platform
current_platform.validate_request(

@@ -302,7 +298,7 @@ class Processor:
if decoder_inputs["type"] == "multimodal":
decoder_mm_inputs = decoder_inputs["mm_kwargs"]
decoder_mm_positions = decoder_inputs["mm_placeholders"]
decoder_mm_hashes = decoder_inputs.get("mm_hashes")
decoder_mm_hashes = decoder_inputs["mm_hashes"]

# Merge and flatten multimodal placeholders, hashes and inputs
# from dictionaries to lists, and sort them by each item's position

@@ -317,19 +313,15 @@ class Processor:
decoder_mm_positions[modality][idx]
for modality, idx in sorted_mm_idxs
]
sorted_mm_hashes = None if decoder_mm_hashes is None else [
sorted_mm_hashes = [
decoder_mm_hashes[modality][idx]
for modality, idx in sorted_mm_idxs
]

if sorted_mm_hashes is not None:
sorted_mm_inputs = self.mm_input_cache_client.get_and_update(
orig_sorted_mm_inputs,
sorted_mm_hashes,
)
else:
assert is_list_of(orig_sorted_mm_inputs, MultiModalKwargsItem)
sorted_mm_inputs = orig_sorted_mm_inputs
sorted_mm_inputs = self.mm_input_cache_client.get_and_update(
orig_sorted_mm_inputs,
sorted_mm_hashes,
)

return decoder_inputs.get("prompt"), EngineCoreRequest(
request_id=request_id,