mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 14:53:52 +08:00
[Model] Remove unnecessary CUDA sync of Qwen2VL image and video preprocess (#24334)
Signed-off-by: Win <chatcharinsang@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
This commit is contained in:
committed by
GitHub
parent
8a46602606
commit
60f0843ef8
@@ -1218,6 +1218,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
|
||||
grid_thw = image_input["image_grid_thw"]
|
||||
assert grid_thw.ndim == 2
|
||||
+ grid_thw_list = grid_thw.tolist()
|
||||
|
||||
if image_input["type"] == "image_embeds":
|
||||
image_embeds = image_input["image_embeds"]
|
||||
@@ -1227,15 +1228,17 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
|
||||
# Split concatenated embeddings for each image item.
|
||||
merge_size = self.visual.spatial_merge_size
|
||||
- sizes = grid_thw.prod(-1) // merge_size // merge_size
|
||||
+ sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
|
||||
+ (merge_size * merge_size)).tolist()
|
||||
|
||||
- return image_embeds.split(sizes.tolist())
|
||||
+ return image_embeds.split(sizes)
|
||||
|
||||
def _process_video_input(
|
||||
self, video_input: Qwen2VLVideoInputs) -> tuple[torch.Tensor, ...]:
|
||||
|
||||
grid_thw = video_input["video_grid_thw"]
|
||||
assert grid_thw.ndim == 2
|
||||
+ grid_thw_list = grid_thw.tolist()
|
||||
|
||||
if video_input["type"] == "video_embeds":
|
||||
video_embeds = video_input["video_embeds"]
|
||||
@@ -1245,9 +1248,10 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
|
||||
# Split concatenated embeddings for each video item.
|
||||
merge_size = self.visual.spatial_merge_size
|
||||
- sizes = grid_thw.prod(-1) // merge_size // merge_size
|
||||
+ sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
|
||||
+ (merge_size * merge_size)).tolist()
|
||||
|
||||
- return video_embeds.split(sizes.tolist())
|
||||
+ return video_embeds.split(sizes)
|
||||
|
||||
def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
|
||||
modalities = {}
|
||||
|
Reference in New Issue
Block a user