mirror of
				https://github.com/vllm-project/vllm.git
				synced 2025-10-20 23:03:52 +08:00 
			
		
		
		
	Compare commits
	
		
			2 Commits
		
	
	
		
			v0.11.0rc3
			...
			pil_image
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| f96a3cc713 | |||
| 32c0155774 | 
| @ -2,6 +2,7 @@ | ||||
| import weakref | ||||
|  | ||||
| import pytest | ||||
| from PIL import Image | ||||
|  | ||||
| from vllm import LLM | ||||
| from vllm.distributed import cleanup_dist_env_and_memory | ||||
| @ -118,6 +119,29 @@ def test_chat_multi_image(vision_llm, image_urls: list[str]): | ||||
|     assert len(outputs) >= 0 | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("image_urls", | ||||
|                          [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) | ||||
| def test_chat_multi_pil_image(vision_llm, image_urls: list[str]): | ||||
|     images = [Image.open(image_url) for image_url in image_urls] | ||||
|  | ||||
|     messages = [{ | ||||
|         "role": | ||||
|         "user", | ||||
|         "content": [ | ||||
|             *({ | ||||
|                 "type": "image", | ||||
|                 "image": image | ||||
|             } for image in images), | ||||
|             { | ||||
|                 "type": "text", | ||||
|                 "text": "What's in this image?" | ||||
|             }, | ||||
|         ], | ||||
|     }] | ||||
|     outputs = vision_llm.chat(messages) | ||||
|     assert len(outputs) >= 0 | ||||
|  | ||||
|  | ||||
| def test_llm_chat_tokenization_no_double_bos(text_llm): | ||||
|     """ | ||||
|     LLM.chat() should not add special tokens when using chat templates. | ||||
|  | ||||
| @ -27,6 +27,7 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam, | ||||
|                                ChatCompletionToolMessageParam) | ||||
| from openai.types.chat.chat_completion_content_part_input_audio_param import ( | ||||
|     InputAudio) | ||||
| from PIL import Image | ||||
| from pydantic import TypeAdapter | ||||
| # yapf: enable | ||||
| from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast, | ||||
| @ -87,6 +88,20 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False): | ||||
|     """The type of the content part.""" | ||||
|  | ||||
|  | ||||
| class PILImage(TypedDict, total=False): | ||||
|     """Typed dict holding one PIL image under the required ``image`` key.""" | ||||
|     # NOTE(review): the "image" content-part handler casts the part's | ||||
|     # ``image`` value directly to ``Image.Image`` rather than to this | ||||
|     # wrapper dict -- confirm which shape callers are expected to send. | ||||
|     image: Required[Image.Image] | ||||
|     """ | ||||
|     A PIL.Image.Image object. | ||||
|     """ | ||||
|  | ||||
|  | ||||
| class ChatCompletionContentPartPILImageParam(TypedDict, total=False): | ||||
|     image: Required[PILImage] | ||||
|  | ||||
|     type: Required[Literal["image"]] | ||||
|     """The type of the content part.""" | ||||
|  | ||||
|  | ||||
| class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): | ||||
|     """A simpler version of the param that only accepts a plain image_url. | ||||
|     This is supported by OpenAI API, although it is not documented. | ||||
| @ -124,6 +139,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): | ||||
| ChatCompletionContentPartParam: TypeAlias = Union[ | ||||
|     OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, | ||||
|     ChatCompletionContentPartInputAudioParam, | ||||
|     ChatCompletionContentPartPILImageParam, | ||||
|     ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam, | ||||
|     CustomChatCompletionContentSimpleImageParam, | ||||
|     ChatCompletionContentPartImageEmbedsParam, | ||||
| @ -680,6 +696,10 @@ class BaseMultiModalContentParser(ABC): | ||||
|                            image_embeds: Union[str, dict[str, str]]) -> None: | ||||
|         raise NotImplementedError | ||||
|  | ||||
|     @abstractmethod | ||||
|     def parse_pil_image(self, image: Image.Image) -> None: | ||||
|         """Handle a content part that carries a PIL image object. | ||||
|  | ||||
|         Abstract: this base implementation only raises; subclasses | ||||
|         provide the actual handling. | ||||
|         """ | ||||
|         raise NotImplementedError | ||||
|  | ||||
|     @abstractmethod | ||||
|     def parse_audio(self, audio_url: str) -> None: | ||||
|         raise NotImplementedError | ||||
| @ -710,6 +730,10 @@ class MultiModalContentParser(BaseMultiModalContentParser): | ||||
|         placeholder = self._tracker.add("image", image) | ||||
|         self._add_placeholder(placeholder) | ||||
|  | ||||
|     def parse_pil_image(self, image: Image.Image) -> None: | ||||
|         placeholder = self._tracker.add("image", image) | ||||
|         self._add_placeholder(placeholder) | ||||
|  | ||||
|     def parse_image_embeds(self, | ||||
|                            image_embeds: Union[str, dict[str, str]]) -> None: | ||||
|         if isinstance(image_embeds, dict): | ||||
| @ -761,6 +785,10 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): | ||||
|         placeholder = self._tracker.add("image", image_coro) | ||||
|         self._add_placeholder(placeholder) | ||||
|  | ||||
|     def parse_pil_image(self, image: Image.Image) -> None: | ||||
|         placeholder = self._tracker.add("image", image) | ||||
|         self._add_placeholder(placeholder) | ||||
|  | ||||
|     def parse_image_embeds(self, | ||||
|                            image_embeds: Union[str, dict[str, str]]) -> None: | ||||
|         future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future() | ||||
| @ -902,6 +930,8 @@ _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) | ||||
| _ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python | ||||
| _AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python | ||||
| _VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python | ||||
| # Parser for supporting raw multimodal data format | ||||
| _PILImageParser = TypeAdapter(ChatCompletionContentPartPILImageParam).validate_python # noqa: E501 | ||||
|  | ||||
| _ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio] | ||||
|  | ||||
| @ -912,6 +942,8 @@ MM_PARSER_MAP: dict[ | ||||
| ] = { | ||||
|     "text": | ||||
|     lambda part: _TextParser(part).get("text", None), | ||||
|     "image": | ||||
|     lambda part: _PILImageParser(part).get("image", None), | ||||
|     "image_url": | ||||
|     lambda part: _ImageParser(part).get("image_url", {}).get("url", None), | ||||
|     "image_embeds": | ||||
| @ -985,7 +1017,7 @@ def _parse_chat_message_content_mm_part( | ||||
|  | ||||
|  | ||||
| # Tuple of content-part "type" strings; this change adds "image" to it. | ||||
| # NOTE(review): "image_embeds" appears on two consecutive rows below; this | ||||
| # looks like the removed and the added line of the diff hunk rendered | ||||
| # together -- confirm that only the second row exists in the real file. | ||||
| VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url", | ||||
|                                        "image_embeds", | ||||
|                                        "image_embeds", "image", | ||||
|                                        "audio_url", "input_audio", "video_url") | ||||
|  | ||||
|  | ||||
| @ -1056,6 +1088,10 @@ def _parse_chat_message_content_part( | ||||
|         else: | ||||
|             return str_content | ||||
|  | ||||
|     if part_type == "image": | ||||
|         image = cast(Image.Image, content) | ||||
|         mm_parser.parse_pil_image(image) | ||||
|         return {'type': 'image'} if wrap_dicts else None | ||||
|     if part_type == "image_url": | ||||
|         str_content = cast(str, content) | ||||
|         mm_parser.parse_image(str_content) | ||||
|  | ||||
		Reference in New Issue
	
	Block a user
	