# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from collections.abc import Iterable, Mapping, Sequence
from typing import Any, Literal, Optional, TypedDict, Union
import numpy as np
import torch
import torch.nn as nn
from transformers import (BatchFeature, PretrainedConfig, ProcessorMixin,
SequenceFeatureExtractor, SiglipVisionConfig)
from vllm.config import VllmConfig
from vllm.distributed import get_pp_group
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead)
from vllm.model_executor.models.llama import LlamaModel
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import (AudioProcessorItems, ImageEmbeddingItems,
ImageProcessorItems, ImageSize,
MultiModalDataItems, MultiModalDataParser)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement,
PromptUpdate, ResolvedPromptUpdate)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils import is_list_of
from .idefics2_vision_model import Idefics2VisionTransformer
from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal
from .phi4mm_audio import AudioEmbedding
from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix,
merge_multimodal_embeddings)
# <|endoftext10|> (see vocab.json in hf model)
_IMAGE_PLACEHOLDER_TOKEN_ID = 200010
# <|endoftext11|>
_AUDIO_PLACEHOLDER_TOKEN_ID = 200011
_AUDIO_MAX_SOUNDFILE_SIZE = 241_000
SIGLIP_NAME = "siglip-so400m-patch14-448"
VISION_ENCODER_TO_PROCESSING_CONFIG = {
'siglip-so400m-patch14-448': {
'vit_image_size': 448,
'vit_patch_size': 14,
'token_compression_factor': 2,
},
}
def _get_padding_size(orig_width: int, orig_height: int, target_height: int,
target_width: int):
ratio_width = target_width / orig_width
ratio_height = target_height / orig_height
if ratio_width < ratio_height:
padding_width = 0
padding_height = target_height - int(orig_height * ratio_width)
else:
padding_width = target_width - int(orig_width * ratio_height)
padding_height = 0
return padding_height, padding_width
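# Worked example for _get_padding_size (illustrative only, not executed):
# fitting a 1000x600 (WxH) image into a 1344x896 (WxH) target keeps the
# width ratio (1344/1000 = 1.344 < 896/600 ~= 1.493), so only the height
# is padded: padding_height = 896 - int(600 * 1.344) = 90, padding_width = 0.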
def get_navit_vision_model(layer_idx: int = -1, **kwargs):
vision_config = {
"hidden_size": 1152,
"image_size": 448,
"intermediate_size": 4304,
"model_type": "siglip_vision_model",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"patch_size": 14,
}
model_config = SiglipVisionConfig(**vision_config, **kwargs)
if layer_idx < 0:
num_hidden_layers = model_config.num_hidden_layers \
+ layer_idx + 1
else:
num_hidden_layers = layer_idx + 1
vision_model = Idefics2VisionTransformer(
config=model_config,
require_post_norm=False,
num_hidden_layers_override=num_hidden_layers,
)
return vision_model
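# Note on layer_idx in get_navit_vision_model: a negative index counts back
# from the end of the 27-layer SigLIP encoder, so layer_idx=-2 keeps
# 27 + (-2) + 1 = 26 hidden layers, while layer_idx=3 would keep the first
# 4 layers. Phi4MMImageEncoder below relies on this with its default
# layer_idx of -2.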
class Phi4MMImageEncoder(nn.Module):
"""Image embedding."""
def __init__(self,
config: PretrainedConfig,
quant_config: Optional[QuantizationConfig],
prefix: str = "",
model_dir: str = "") -> None:
super().__init__()
# n_embed or hidden_size
hidden_size = config.n_embd if hasattr(
config, 'n_embd') else config.hidden_size
# layer_idx to output the img features
if isinstance(config.img_processor, dict):
self.layer_idx = config.img_processor.get('layer_idx', -2)
self.type_feature = config.img_processor.get(
'type_feature', 'patch')
else:
self.layer_idx = -2
self.type_feature = 'patch'
self.img_processor = get_navit_vision_model(layer_idx=self.layer_idx)
pe_weight = self.img_processor.embeddings.position_embedding.weight
L, D = pe_weight.size()
H = int(math.sqrt(L))
assert H**2 == L, f'position embedding size {L} is not square'
if H % 2 != 0:
self.img_processor_padding = nn.ReflectionPad2d((0, 1, 0, 1))
H += 1
image_dim_out = D
# ((448/14)//2)**2
self.num_img_tokens = (H // 2)**2
self.base_feat_height_target = H
self.image_dim_out = image_dim_out
self.img_sizes = None
self.image_attention_mask = None
        # glb_GN and sub_GN for the hd transform serve as separator tokens
self.use_hd_transform = True
self.with_learnable_separator = True
self.hd_transform_order = "sub_glb"
self.freeze_img_processor = False
self.crop_size = 448
# image token compression
self.image_token_compression_cls = 'avg_pool_2d'
self.image_token_compression = nn.AvgPool2d(kernel_size=2, stride=2)
self.base_feat_height_reduction = 1
self.base_feat_height_target = self.base_feat_height_target // 2
# with_hd_transform and with_learnable_separator should have same value
assert self.use_hd_transform == self.with_learnable_separator, \
'use_hd_transform and with_learnable_separator should have same value'
assert self.use_hd_transform, \
'learnable separator is only for hd transform'
        # learnable separators: glb_GN sits between the global and sub-image
        # token groups, sub_GN terminates each row of sub-image tokens
self.glb_GN = nn.Parameter(
torch.zeros([
1, 1, self.image_dim_out * self.base_feat_height_reduction**2
]))
self.sub_GN = nn.Parameter(
torch.zeros([
1, 1, 1,
self.image_dim_out * self.base_feat_height_reduction**2
]))
dim_projection = hidden_size
depth = 2
layers = [
nn.Linear(image_dim_out * self.base_feat_height_reduction**2,
dim_projection)
]
for _ in range(1, depth):
layers.extend(
[nn.GELU(),
nn.Linear(dim_projection, dim_projection)])
self.img_projection = nn.Sequential(*layers)
self.vocab_size = config.vocab_size
self.img_features = None
self.use_out_place_operations = False
def get_img_features(self,
img_embeds: torch.FloatTensor,
attention_mask=None) -> torch.FloatTensor:
img_feature = self.img_processor(img_embeds,
patch_attention_mask=attention_mask)
if self.type_feature == "patch":
patch_feature = img_feature
use_token_compression = self.image_token_compression is not None
use_padding = getattr(self, 'img_processor_padding',
None) is not None
if use_token_compression or use_padding:
# reshape to 2D tensor
width = int(math.sqrt(patch_feature.size(1)))
patch_feature = patch_feature.view(-1, width, width,
patch_feature.size(-1))
# convert to NCHW
patch_feature = patch_feature.permute(0, 3, 1, 2)
if use_padding:
patch_feature = self.img_processor_padding(patch_feature)
if use_token_compression:
patch_feature = self.image_token_compression(patch_feature)
# convert to NHWC
patch_feature = patch_feature.permute(0, 2, 3, 1)
patch_feature = patch_feature.view(
-1,
patch_feature.size(1) * patch_feature.size(2),
patch_feature.size(-1))
return patch_feature
raise NotImplementedError
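    # Shape sketch for get_img_features under the default SigLIP config
    # (448px crops, 14px patches, 2x2 average pooling); exact sizes depend
    # on the runtime config:
    #   img_embeds:      (N, 3, 448, 448)
    #   ViT patch feats: (N, 1024, 1152)   # 32x32 patches, hidden size 1152
    #   reshaped NCHW:   (N, 1152, 32, 32)
    #   after pooling:   (N, 1152, 16, 16)
    #   returned:        (N, 256, 1152)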
def forward(self, pixel_values: torch.FloatTensor,
image_sizes: torch.Tensor,
image_attention_mask: torch.Tensor) -> list[torch.FloatTensor]:
"""
process image and return vision embeddings.
pixel_values: (num_images, num_crops, c, h, w)
image_sizes: [[h1, w1], [h2, w2]]
image_attention_mask: num_images x num_crops x 32 x 32
output: (num_images, num_img_tokens, hidden_size)
"""
# eg
# pixel_values: torch.Size([1, 7, 3, 448, 448])
# image_sizes: tensor([[ 896, 1344]], device='cuda:0')
# output: torch.Size([1, 1841, 3072])
if isinstance(self.img_projection, nn.Sequential):
target_device = self.img_projection[0].bias.device
target_dtype = self.img_projection[0].bias.dtype
else: # It's a single nn.Linear layer
target_device = self.img_projection.bias.device
target_dtype = self.img_projection.bias.dtype
img_sizes = image_sizes
num_images, num_crops, c, h, w = pixel_values.shape
bs = num_images
pixel_values = pixel_values.flatten(0, 1)
img_features = self.get_img_features(
pixel_values,
image_attention_mask.type(torch.BoolTensor).flatten(
0, 1).to(target_device))
base_feat_height_target = self.base_feat_height_target
base_resolution = self.crop_size
base_feat_height_reduction = self.base_feat_height_reduction
base_feat_height = base_feat_width = int(np.sqrt(
img_features.shape[1]))
assert base_feat_height == base_feat_height_target \
and base_feat_width == base_feat_height_target, \
(f"base_feat_height: {base_feat_height}, "
f"base_feat_width: {base_feat_width}, "
f"expect {base_feat_height_target} features for hd transform")
# bs x max_num_crops x (24x24) x C
img_features = img_features.view(bs, -1,
base_feat_height * base_feat_width,
self.image_dim_out)
C = self.image_dim_out
H = base_feat_height
output_imgs = []
output_len = []
# training is tensor, inference is list
if isinstance(img_sizes, torch.Tensor):
img_sizes = img_sizes.view(-1, 2)
for _bs in range(bs):
h, w = img_sizes[_bs]
h = h // base_resolution
w = w // base_resolution
B_ = h * w
# 1 x (24x24) x 1024
global_img_feature = img_features[_bs, :1]
# 1 x 12 x 12 x 4096
glb_img = global_img_feature.reshape(1, H, H, C).reshape(
1, H // base_feat_height_reduction, base_feat_height_reduction,
H // base_feat_height_reduction, base_feat_height_reduction,
C).contiguous().permute(0, 1, 3, 2, 4, 5).reshape(
1, H // base_feat_height_reduction,
H // base_feat_height_reduction,
base_feat_height_reduction * base_feat_height_reduction *
C).contiguous()
temp_glb_GN = self.sub_GN.repeat(1,
H // base_feat_height_reduction,
1, 1)
# 1 x 156 x 4096
glb_img = torch.cat([glb_img, temp_glb_GN], dim=2).reshape(
1, -1,
base_feat_height_reduction * base_feat_height_reduction * C)
# (max_num_crops-1) x (12x12) x C
sub_img = img_features[_bs, 1:]
# 16x574x1024
# get rid of padding sub_img
sub_img = sub_img[:B_]
# (num_crops, 12, 2, 12, 2, 1024) ->
# (num_crops, 12, 12, 2, 2, 1024) -> (num_crops, 12*12, 4*1024)
sub_img = sub_img.reshape(B_, H, H, C).reshape(
B_, H // base_feat_height_reduction,
base_feat_height_reduction, H // base_feat_height_reduction,
base_feat_height_reduction,
C).contiguous().permute(0, 1, 3, 2, 4, 5).reshape(
B_, -1, base_feat_height_reduction *
base_feat_height_reduction * C).contiguous()
sub_img = sub_img.reshape(
1, h, w, base_feat_height // base_feat_height_reduction,
base_feat_width // base_feat_height_reduction,
-1).permute(0, 1, 3, 2, 4, 5).reshape(
1, h * base_feat_height // base_feat_height_reduction,
w * base_feat_width // base_feat_height_reduction,
base_feat_height_reduction * base_feat_height_reduction *
C)
if image_attention_mask is not None and len(
image_attention_mask) > 0:
reshaped_image_attention_mask = image_attention_mask[
_bs, 1:B_ + 1, 0::2, 0::2].reshape(
1, h, w,
base_feat_height // base_feat_height_reduction,
base_feat_width // base_feat_height_reduction).permute(
0, 1, 3, 2, 4).reshape(
1, h * base_feat_height //
base_feat_height_reduction, w *
base_feat_width // base_feat_height_reduction)
useful_height = int(
reshaped_image_attention_mask[0, :, 0].sum().item())
useful_width = int(
reshaped_image_attention_mask[0, 0, :].sum().item())
sub_img = sub_img[:, :useful_height, :useful_width]
temp_sub_GN = self.sub_GN.repeat(1, useful_height, 1, 1)
temp_len = int(
image_attention_mask[_bs, :B_ + 1, 0::2, 0::2].sum().item(
)) + (useful_height +
1) + base_feat_height // base_feat_height_reduction
else:
temp_sub_GN = self.sub_GN.repeat(
1, h * base_feat_height // base_feat_height_reduction, 1,
1)
temp_len = int((h * w + 1) * self.num_img_tokens + 1 +
(h + 1) * base_feat_height //
base_feat_height_reduction)
sub_img = torch.cat([sub_img, temp_sub_GN], dim=2).reshape(
1, -1,
base_feat_height_reduction * base_feat_height_reduction * C)
# (1, num_img_tokens, 1024*4)
# glb + sub
if self.hd_transform_order == 'glb_sub':
output_imgs.append(
torch.cat([glb_img, self.glb_GN, sub_img], dim=1))
elif self.hd_transform_order == 'sub_glb':
output_imgs.append(
torch.cat([sub_img, self.glb_GN, glb_img], dim=1))
else:
                raise NotImplementedError(
                    f'hd_transform_order = {self.hd_transform_order}, '
                    'not implemented')
            # temp_len = int((h*w+1)*144 + 1 + (h+1)*12)
            assert temp_len == output_imgs[-1].shape[1], (
                f'temp_len: {temp_len}, '
                f'output_imgs[-1].shape[1]: {output_imgs[-1].shape[1]}')
output_len.append(temp_len)
img_set_tensor = []
for _output_img in output_imgs:
img_feature_proj = self.img_projection(
_output_img.to(target_device).to(target_dtype))
img_set_tensor.append(img_feature_proj.squeeze(0))
return img_set_tensor
class Phi4MMImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
data: Union[torch.Tensor, list[torch.Tensor]]
"""
Shape:
`(batch_size * num_images, 1 + num_patches, num_channels, height, width)`
Note that `num_patches` may be different per batch and image,
in which case the data is passed as a list instead of a batched tensor.
"""
image_sizes: torch.Tensor
"""
Shape: `(batch_size * num_images, 2)`
This should be in `(height, width)` format.
"""
num_img_tokens: list[int]
"""Shape: `(batch_size * num_images)`"""
image_attention_mask: torch.Tensor
"""Shape: `(batch_size * num_images, H_mask, W_mask)`"""
class Phi4MMAudioFeatureInputs(TypedDict):
type: Literal["audio_features"]
data: Union[torch.Tensor, list[torch.Tensor]]
"""Shape: `(batch_size * num_audios, 80, M)"""
class Phi4MMAudioEmbeddingInputs(TypedDict):
type: Literal["audio_embeds"]
data: NestedTensors
"""Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)"""
Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs]
def cat_with_pad(tensors, dim, padding_value=0):
"""
cat along dim, while pad to max for all other dims
"""
ndim = tensors[0].dim()
assert all(
t.dim() == ndim for t in
tensors[1:]), "All tensors must have the same number of dimensions"
out_size = [max(t.shape[i] for t in tensors) for i in range(ndim)]
out_size[dim] = sum(t.shape[dim] for t in tensors)
output = tensors[0].new_full(out_size, padding_value)
index = 0
for t in tensors:
# Create a slice list where every dimension except dim is full slice
slices = [slice(0, t.shape[d]) for d in range(ndim)]
# Update only the concat dimension slice
slices[dim] = slice(index, index + t.shape[dim])
output[slices] = t
index += t.shape[dim]
return output
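# Minimal sketch of cat_with_pad (illustrative shapes only): concatenating
# tensors of shape (2, 5, 8) and (3, 7, 8) along dim=0 yields a (5, 7, 8)
# output, with the first tensor zero-padded from 5 to 7 along dim=1.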
class Phi4MMProcessingInfo(BaseProcessingInfo):
@property
def image_tokens(self) -> list[str]:
return [f"<|image_{i+1}|>" for i in range(100)]
@property
def audio_tokens(self) -> list[str]:
return [f"<|audio_{i+1}|>" for i in range(100)]
def get_dynamic_hd(
self,
processor: Optional[ProcessorMixin] = None,
) -> int:
if processor is None:
processor = self.get_hf_processor()
image_processor = processor.image_processor
return image_processor.dynamic_hd
def get_feature_extractor(self,
**kwargs: object) -> SequenceFeatureExtractor:
return self.get_hf_processor(**kwargs).audio_processor
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"audio": None, "image": None}
def _find_target_aspect_ratio(
self,
orig_width: int,
orig_height: int,
image_size: int,
max_num: int,
min_num: int,
):
w_crop_num = math.ceil(orig_width / float(image_size))
h_crop_num = math.ceil(orig_height / float(image_size))
if w_crop_num * h_crop_num > max_num:
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set((i, j) for i in range(1, max_num + 1)
for j in range(1, max_num + 1)
if i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
image_processor = self.get_hf_processor().image_processor
target_aspect_ratio = image_processor.find_closest_aspect_ratio(
aspect_ratio,
target_ratios,
orig_width,
orig_height,
image_size,
)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
else:
target_width = image_size * w_crop_num
target_height = image_size * h_crop_num
target_aspect_ratio = (w_crop_num, h_crop_num)
return target_aspect_ratio, target_height, target_width
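    # Illustrative walk-through of _find_target_aspect_ratio, assuming
    # image_size=448 and a sufficiently large max_num (e.g. dynamic_hd=36):
    # a 1344x896 (WxH) image needs ceil(1344/448)=3 by ceil(896/448)=2
    # crops; since 3*2=6 <= max_num, it returns target_aspect_ratio=(3, 2),
    # target_height=896 and target_width=1344 without consulting the HF
    # image processor.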
def _compute_num_image_tokens(
self,
orig_width: int,
orig_height: int,
dynamic_hd_size: int,
vit_image_size: int,
vit_patch_size: int,
token_compression_factor: int = 2,
):
"""
compute the number of tokens an image is expected to take up considering
the image encoder architecture and exclude output features containing
only padding pixels
for siglip, vit_image_size=448, vit_patch_size=14, so output will be
32x32 feature map
NOTE right now, Phi4MM uses hard-coded token_compression_factor=2
"""
assert vit_image_size % vit_patch_size == 0, (
"vit_image_size must be divisible by vit_patch_size")
assert (vit_image_size // vit_patch_size %
token_compression_factor == 0), (
"vit_image_size // vit_patch_size must be divisible by "
"token_compression_factor")
target_aspect_ratio, target_height, target_width = (
self._find_target_aspect_ratio(orig_width,
orig_height,
vit_image_size,
dynamic_hd_size,
min_num=1))
assert target_aspect_ratio[0] * vit_image_size == target_width, (
f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}")
assert target_aspect_ratio[1] * vit_image_size == target_height, (
f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}")
assert (target_height % vit_image_size == 0
and target_width % vit_image_size == 0)
padding_height, padding_width = _get_padding_size(
orig_width, orig_height, target_height, target_width)
assert padding_width == 0 or padding_height == 0, \
"padding_width or padding_height must be 0"
target_feat_width = target_width // vit_patch_size
target_feat_height = target_height // vit_patch_size
if padding_width >= vit_patch_size:
assert padding_height == 0, "padding_height not 0"
non_pad_feat_width = target_feat_width - math.floor(
padding_width / vit_patch_size)
non_pad_feat_height = target_feat_height
elif padding_height >= vit_patch_size:
assert padding_width == 0, "padding_width not 0"
non_pad_feat_height = target_feat_height - math.floor(
padding_height / vit_patch_size)
non_pad_feat_width = target_feat_width
else:
# small padding shorter than a vit patch
non_pad_feat_width = target_feat_width
non_pad_feat_height = target_feat_height
feat_width = non_pad_feat_width // token_compression_factor
feat_height = non_pad_feat_height // token_compression_factor
# NOTE it's possible that the non-padding feature is not divisible
if non_pad_feat_width % token_compression_factor != 0:
feat_width += 1
if non_pad_feat_height % token_compression_factor != 0:
feat_height += 1
num_hd_patch_tokens = feat_width * feat_height
num_hd_newline_tokens = feat_height
vit_feature_size = vit_image_size // vit_patch_size
num_global_image_tokens = (vit_feature_size //
token_compression_factor)**2
num_sep_tokens = 1
num_global_image_newline_tokens = \
vit_feature_size // token_compression_factor
return (num_global_image_tokens + num_sep_tokens +
num_hd_patch_tokens + num_hd_newline_tokens +
num_global_image_newline_tokens)
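    # Worked example for the count above (a sketch under the default SigLIP
    # settings: vit_image_size=448, vit_patch_size=14,
    # token_compression_factor=2): a 448x448 image needs a single crop and
    # no padding, so
    #   num_global_image_tokens         = (32 // 2) ** 2 = 256
    #   num_sep_tokens                  = 1
    #   num_hd_patch_tokens             = 16 * 16 = 256
    #   num_hd_newline_tokens           = 16
    #   num_global_image_newline_tokens = 16
    # for a total of 545 image placeholder tokens.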
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
processor: Optional[ProcessorMixin] = None,
) -> int:
hf_config = self.get_hf_config()
vision_encoder_name = hf_config.img_processor
if vision_encoder_name is None:
vision_encoder_name = SIGLIP_NAME
prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[
vision_encoder_name]
vit_image_size = prepro_config['vit_image_size']
vit_patch_size = prepro_config['vit_patch_size']
token_compression_factor = prepro_config['token_compression_factor']
dynamic_hd_size = self.get_dynamic_hd(processor=processor)
image_num_tokens = self._compute_num_image_tokens(
image_width,
image_height,
dynamic_hd_size=dynamic_hd_size,
vit_image_size=vit_image_size,
vit_patch_size=vit_patch_size,
token_compression_factor=token_compression_factor,
)
return image_num_tokens
def get_image_size_with_most_features(
self,
processor: Optional[ProcessorMixin] = None,
) -> ImageSize:
hf_config = self.get_hf_config()
vision_encoder_name = hf_config.img_processor
if vision_encoder_name is None:
vision_encoder_name = SIGLIP_NAME
prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[
vision_encoder_name]
vit_image_size = prepro_config['vit_image_size']
max_side = vit_image_size * self.get_dynamic_hd(processor=processor)
return ImageSize(height=max_side, width=vit_image_size)
def get_audio_num_frames(self, audio_len: int, sr: float) -> int:
"""
Compute the output size of the `extract_features` method.
Args:
audio_len (int): Length of the input waveform in samples.
sr (float): Sampling rate of the waveform, either 16000 or 8000.
Returns:
            int: Number of time frames (T).
        """
# Resample to 16000 or 8000 if needed
if sr > 16000:
audio_len //= sr // 16000
elif 8000 <= sr < 16000:
# We'll resample to 16K from 8K
audio_len *= 2
elif sr < 8000:
raise RuntimeError(f"Unsupported sample rate {sr}")
# Spectrogram parameters for 16 kHz
win_length = 400 # Frame length in samples
hop_length = 160 # Frame shift in samples
# Calculate number of frames (T)
num_frames = (audio_len - win_length) // hop_length + 1
if num_frames < 1:
raise ValueError("Waveform too short for given parameters.")
# Return time frames (T)
return num_frames
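    # Quick sanity check of the frame formula (illustrative, 16 kHz input):
    # for the profiling length _AUDIO_MAX_SOUNDFILE_SIZE = 241_000 samples,
    # num_frames = (241_000 - 400) // 160 + 1 = 1504.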
def _compute_audio_embed_size(self, audio_frames: int) -> int:
"""
Compute the audio embedding size based on the audio frames and
compression rate.
"""
hf_config = self.get_hf_config()
compression_rate = hf_config.embd_layer['audio_embd_layer'][
'compression_rate']
# NOTE: this is a hard-coded value but might be configurable
# in the future
qformer_compression_rate = 1
integer = audio_frames // compression_rate
remainder = audio_frames % compression_rate
result = integer if remainder == 0 else integer + 1
integer = result // qformer_compression_rate
remainder = result % qformer_compression_rate
# qformer compression
result = integer if remainder == 0 else integer + 1
return result
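    # Example of the two-stage rounding above, assuming a hypothetical
    # compression_rate of 8 from the HF config: 1504 audio frames ->
    # ceil(1504 / 8) = 188, and with qformer_compression_rate = 1 the
    # embed size stays at 188 placeholder tokens.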
class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
num_audios = mm_counts.get("audio", 0)
num_images = mm_counts.get("image", 0)
image_tokens: list[str] = self.info.image_tokens[:num_images]
audio_tokens: list[str] = self.info.audio_tokens[:num_audios]
return "".join(image_tokens + audio_tokens)
def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
num_audios = mm_counts.get("audio", 0)
num_images = mm_counts.get("image", 0)
target_width, target_height = \
self.info.get_image_size_with_most_features()
mm_data = {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images),
"audio":
self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE,
num_audios=num_audios),
}
return mm_data
class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
def _get_data_parser(self) -> MultiModalDataParser:
feature_extractor = self.info.get_feature_extractor()
return MultiModalDataParser(target_sr=feature_extractor.sampling_rate,
audio_resample_method="scipy")
def _call_hf_processor(
self,
prompt: str,
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
tok_kwargs: Mapping[str, object],
) -> BatchFeature:
if not mm_data:
prompt_ids = self.info.get_tokenizer().encode(prompt)
prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
sr = self.info.get_feature_extractor(**mm_kwargs).sampling_rate
if (audio_data := mm_data.get("audios", [])):
mm_data['audios'] = [(data, sr) for data in audio_data]
processed_outputs = super()._call_hf_processor(prompt, mm_data,
mm_kwargs, tok_kwargs)
num_img_tokens = [
self.info.get_num_image_tokens(image_width=img_size[0],
image_height=img_size[1])
for img_size in processed_outputs["image_sizes"]
]
processed_outputs["num_img_tokens"] = num_img_tokens
audio_features = processed_outputs['input_audio_embeds']
feature_sizes = [
self.info.get_audio_num_frames(len(audio), sr)
for audio in audio_data
]
processed_outputs['input_audio_embeds'] = [
audio_features[idx, :size]
for idx, size in enumerate(feature_sizes)
]
return processed_outputs
def _get_mm_fields_config(
self,
hf_inputs: BatchFeature,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
return dict(
input_image_embeds=MultiModalFieldConfig.batched("image"),
image_attention_mask=MultiModalFieldConfig.batched("image"),
image_sizes=MultiModalFieldConfig.batched("image"),
num_img_tokens=MultiModalFieldConfig.batched("image"),
input_audio_embeds=MultiModalFieldConfig.batched("audio"),
)
def _get_prompt_updates(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
image_tokens: list[str] = self.info.image_tokens # type: ignore
audio_tokens: list[str] = self.info.audio_tokens # type: ignore
feature_extractor = self.info.get_feature_extractor(
**hf_processor_mm_kwargs)
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
def get_image_replacement_phi4mm(item_idx: int):
images = mm_items.get_items(
"image", (ImageEmbeddingItems, ImageProcessorItems))
if isinstance(images, ImageEmbeddingItems):
num_image_tokens = images.get_feature_size(item_idx)
else:
image_size = images.get_image_size(item_idx)
num_image_tokens = self.info.get_num_image_tokens(
image_width=image_size.width,
image_height=image_size.height,
processor=hf_processor,
)
return [_IMAGE_PLACEHOLDER_TOKEN_ID] * num_image_tokens
def get_audio_replacement_phi4mm(item_idx: int):
audios = mm_items.get_items("audio", AudioProcessorItems)
# TODO(Isotr0py): support embedding inputs
audio_len = audios.get_audio_length(item_idx)
audio_frames = self.info.get_audio_num_frames(
audio_len, feature_extractor.sampling_rate)
audio_embed_size = self.info._compute_audio_embed_size(
audio_frames)
return [_AUDIO_PLACEHOLDER_TOKEN_ID] * audio_embed_size
return [
PromptReplacement(
modality="image",
target=image_tokens.__getitem__,
replacement=get_image_replacement_phi4mm,
),
PromptReplacement(
modality="audio",
target=audio_tokens.__getitem__,
replacement=get_audio_replacement_phi4mm,
),
]
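    # Sketch of the prompt updates above (exact counts depend on the
    # processor config): the literal "<|image_1|>" in the prompt is replaced
    # by N copies of _IMAGE_PLACEHOLDER_TOKEN_ID, where N is
    # get_num_image_tokens() for that image (545 for a single 448x448 crop),
    # and "<|audio_1|>" is replaced by _compute_audio_embed_size(num_frames)
    # copies of _AUDIO_PLACEHOLDER_TOKEN_ID.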
def _recompute_cached_prompt_update(
self,
cached_update: ResolvedPromptUpdate,
new_item_idx: int,
) -> ResolvedPromptUpdate:
new_update = super()._recompute_cached_prompt_update(
cached_update,
new_item_idx,
)
if cached_update.modality == "image":
image_tokens: list[str] = self.info.image_tokens # type: ignore
new_update = new_update.with_target(image_tokens[new_item_idx])
elif cached_update.modality == "audio":
audio_tokens: list[str] = self.info.audio_tokens # type: ignore
new_update = new_update.with_target(audio_tokens[new_item_idx])
return new_update
@MULTIMODAL_REGISTRY.register_processor(
Phi4MMMultiModalProcessor,
info=Phi4MMProcessingInfo,
dummy_inputs=Phi4MMDummyInputsBuilder,
)
class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
"""
Implements the Phi-4-multimodal-instruct model in vLLM.
"""
packed_modules_mapping = {
"qkv_proj": [
"qkv_proj",
],
"gate_up_proj": [
"gate_up_proj",
],
}
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_substr={
"base_layer.": "",
},
orig_to_new_prefix={
"model.embed_tokens_extend.audio_embed.audio_projection.vision.":
"embed_tokens_extend.audio_projection_for_vision.",
"model.embed_tokens_extend.audio_embed.audio_projection.speech.":
"embed_tokens_extend.audio_projection.",
"model.embed_tokens_extend.audio_embed.": "embed_tokens_extend.",
"model.embed_tokens_extend.image_embed.": "vision_encoder.",
},
)
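    # Illustrative effect of the mapper above on a checkpoint name
    # (hypothetical weight, shown only to document the prefix rewrite):
    #   "model.embed_tokens_extend.image_embed.img_projection.0.weight"
    #   -> "vision_encoder.img_projection.0.weight"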
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return f"<|image_{i}|>"
if modality.startswith("audio"):
return f"<|audio_{i}|>"
raise ValueError("Only image or audio modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
multimodal_config = vllm_config.model_config.multimodal_config
assert multimodal_config, "multimodal_config is required"
quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config
self.multimodal_config = multimodal_config
self.quant_config = quant_config
self.lora_config = lora_config
# Tensor/Pipeline parallel not supported for now.
assert get_pp_group(
).world_size == 1, "pipeline parallel is not supported"
self.vision_encoder = Phi4MMImageEncoder(
config,
quant_config,
prefix="model.vision_embed_tokens",
model_dir=config._name_or_path)
if isinstance(config.embd_layer["audio_embd_layer"], dict):
embedding_config = {
"embedding_cls":
config.embd_layer["audio_embd_layer"]["embedding_cls"],
**config.embd_layer["audio_embd_layer"],
}
else:
embedding_config = {
"embedding_cls": self.config.embd_layer["embedding_cls"]
}
self.embed_tokens_extend = AudioEmbedding(config, **embedding_config)
self.model = LlamaModel(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "model"))
self.unpadded_vocab_size = config.vocab_size
if lora_config:
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
self.lm_head = ParallelLMHead(
self.unpadded_vocab_size,
config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE,
quant_config=quant_config,
)
if config.tie_word_embeddings:
self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
logit_scale = getattr(config, "logit_scale", 1.0)
self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
config.vocab_size, logit_scale)
def _parse_and_validate_audio_input(
self, **kwargs: object) -> Optional[Phi4MMAudioInputs]:
"""
Parse and validate the audio input to the model. This handles both
audio features and audio embeddings, but only the former is used for
now.
Args:
kwargs (object): Keyword arguments.
Returns:
Optional[Phi4MMAudioInputs]: Parsed and validated audio inputs.
"""
audio_features = kwargs.pop("input_audio_embeds", None)
audio_embeds = kwargs.pop("audio_embeds", None)
if audio_features is None and audio_embeds is None:
return None
if audio_features is not None:
if not isinstance(audio_features, (torch.Tensor, list)):
raise ValueError("Incorrect type of audio features. "
f"Got type: {type(audio_features)}")
return Phi4MMAudioFeatureInputs(type="audio_features",
data=flatten_bn(audio_features))
if audio_embeds is not None:
if not isinstance(audio_embeds, (torch.Tensor, list)):
raise ValueError("Incorrect type of audio embeds. "
f"Got type: {type(audio_embeds)}")
return Phi4MMAudioEmbeddingInputs(type="audio_embeds",
data=audio_embeds)
raise AssertionError("This line should be unreachable.")
def _process_audio_input(self, audio_input: Phi4MMAudioInputs,
audio_projection_mode: str) -> NestedTensors:
"""
        Create the audio embeddings from the audio input, where the audio
        input contains the audio features produced by the multimodal
        processor.
Args:
audio_input (Phi4MMAudioInputs): Audio input.
Returns:
NestedTensors: Audio embeddings
"""
if audio_input["type"] == "audio_embeds":
return audio_input["data"]
audio_features = audio_input["data"]
        # The audio input has the batch dim as the first dim
        # (e.g. multiple examples) and the second dim is the multi-audio dim
        # (e.g. multiple audios in the same example)
dtype = next(self.embed_tokens_extend.parameters()).dtype
audio_embeds = [
self.embed_tokens_extend(
features.to(dtype),
audio_projection_mode=audio_projection_mode,
) for features in audio_features
]
return audio_embeds
def _parse_and_validate_image_input(self,
**kwargs: object) -> Optional[dict]:
input_image_embeds: NestedTensors = kwargs.get("input_image_embeds")
if input_image_embeds is None:
return None
image_sizes = kwargs.get("image_sizes")
image_attention_mask = kwargs.get("image_attention_mask")
num_img_tokens = kwargs.get("num_img_tokens")
assert image_sizes is not None and image_attention_mask is not None\
and num_img_tokens is not None, "Missing image inputs"
if is_list_of(input_image_embeds, torch.Tensor):
assert all(p.dim() == 5
for p in input_image_embeds), "Incorrect image inputs"
# list len is batch_size.
# each tensor has dimension: num_img_per_example, num_hd_patches,
# channels, height, width.
# need to pad along num_hd_patches.
            # mask size num_img_per_prompt, num_hd_patches, feat_h, feat_w.
input_image_embeds = cat_with_pad(input_image_embeds, dim=0)
elif isinstance(input_image_embeds, torch.Tensor):
# dimension: batch_size, num_img_per_example, num_hd_patches,
# channels, height, width.
# we flatten first 2 dims to make it a single large batch for
# SigLIP Encoder.
assert input_image_embeds.dim() == 6, "Incorrect image inputs"
input_image_embeds = input_image_embeds.flatten(0, 1)
else:
raise ValueError("Incorrect input_image_embeds inputs")
if isinstance(image_attention_mask, list):
image_attention_mask = cat_with_pad(image_attention_mask, dim=0)
elif isinstance(image_attention_mask, torch.Tensor):
image_attention_mask = image_attention_mask.flatten(0, 1)
else:
raise ValueError("Incorrect image_attention_mask inputs")
if isinstance(image_sizes, list):
image_sizes = torch.cat(image_sizes, dim=0)
elif isinstance(image_sizes, torch.Tensor):
image_sizes = image_sizes.flatten(0, 1)
else:
raise ValueError("Incorrect image_attention_mask inputs")
if isinstance(num_img_tokens, list):
num_img_tokens = [
n for num_tensor in num_img_tokens
for n in num_tensor.tolist()
]
elif isinstance(num_img_tokens, torch.Tensor):
num_img_tokens = num_img_tokens.flatten(0, 1).tolist()
else:
raise ValueError("Incorrect image_attention_mask inputs")
return Phi4MMImagePixelInputs(
type="pixel_values",
data=input_image_embeds,
image_sizes=image_sizes,
image_attention_mask=image_attention_mask,
num_img_tokens=num_img_tokens,
)
def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
modalities = {}
# Preserve the order of modalities if there are multiple of them
# from the order of kwargs.
for input_key in kwargs:
if input_key in ("input_image_embeds",
"image_embeds") and "images" not in modalities:
modalities["images"] = self._parse_and_validate_image_input(
**kwargs)
if input_key in ("input_audio_embeds",
"audio_embeds") and "audios" not in modalities:
modalities["audios"] = self._parse_and_validate_audio_input(
**kwargs)
return modalities
def _process_image_input(
self, image_input: Phi4MMImagePixelInputs) -> list[torch.Tensor]:
dtype = next(self.vision_encoder.parameters()).dtype
pixel_values = image_input['data'].to(dtype)
image_sizes = image_input['image_sizes']
image_attention_mask = image_input['image_attention_mask']
image_embeds = self.vision_encoder(pixel_values, image_sizes,
image_attention_mask)
return image_embeds
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
if not modalities:
return []
        # The result multimodal_embeddings is a tuple of tensors, with each
        # tensor corresponding to a multimodal data item (image or audio).
multimodal_embeddings: tuple[torch.Tensor, ...] = ()
# NOTE: It is important to iterate over the keys in this dictionary
# to preserve the order of the modalities.
audio_projection_mode = 'speech'
for modality in modalities:
# make sure process images first
if modality == "images":
audio_projection_mode = "vision"
image_input = modalities["images"]
vision_embeddings = self._process_image_input(image_input)
multimodal_embeddings += tuple(vision_embeddings)
if modality == "audios":
audio_input = modalities["audios"]
audio_embeddings = self._process_audio_input(
audio_input, audio_projection_mode=audio_projection_mode)
multimodal_embeddings += tuple(audio_embeddings)
return multimodal_embeddings
def get_input_embeddings(
self,
input_ids: torch.Tensor,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.model.embed_tokens(input_ids)
if multimodal_embeddings is not None and len(
multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
[_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID])
return inputs_embeds
def get_input_embeddings_v0(
self,
input_ids: torch.Tensor,
image_input: Optional[Phi4MMImagePixelInputs] = None,
audio_input: Optional[Phi4MMAudioFeatureInputs] = None,
) -> torch.Tensor:
audio_projection_mode = 'speech'
inputs_embeds = self.get_input_embeddings(input_ids)
if image_input is not None:
image_embeds = self._process_image_input(image_input)
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
image_embeds,
placeholder_token_id=_IMAGE_PLACEHOLDER_TOKEN_ID,
)
audio_projection_mode = 'vision'
if audio_input is not None:
audio_embeds = self._process_audio_input(
audio_input, audio_projection_mode=audio_projection_mode)
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
audio_embeds,
placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN_ID,
)
return inputs_embeds
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object,
) -> torch.Tensor:
if intermediate_tensors is not None:
inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner from
# `get_multimodal_embeddings` and `get_input_embeddings`, this
# condition is only for v0 compatibility.
elif inputs_embeds is None:
image_input = self._parse_and_validate_image_input(**kwargs)
audio_input = self._parse_and_validate_audio_input(**kwargs)
if image_input is None and audio_input is None:
inputs_embeds = None
else:
inputs_embeds = self.get_input_embeddings_v0(
input_ids,
image_input=image_input,
audio_input=audio_input)
input_ids = None
hidden_states = self.model(
input_ids,
positions,
intermediate_tensors,
inputs_embeds=inputs_embeds,
)
return hidden_states
def compute_logits(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
logits = self.logits_processor(self.lm_head, hidden_states,
sampling_metadata)
return logits
def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> None:
loader = AutoWeightsLoader(self, skip_substrs=["lora"])
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
def get_mm_mapping(self) -> MultiModelKeys:
"""
Get the module prefix in multimodal models
"""
return MultiModelKeys.from_string_field(
language_model="model.",
connector=["audio_projection_for_vision", "audio_projection"],
tower_model=["vision_encoder", "embed_tokens_extend"],
)
def get_language_model(self) -> torch.nn.Module:
return self.model