[Doc] Improve MM Pooling model documentation (#25966)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung
2025-10-01 02:58:29 +08:00
committed by GitHub
parent e6a226efba
commit 2f652e6cdf
9 changed files with 292 additions and 100 deletions

View File

@ -428,7 +428,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
For certain models, we provide alternative chat templates inside <gh-dir:examples>.
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec_phi3v.jinja> which is different from the default one for Phi-3-Vision.
### Image Inputs

View File

@ -626,7 +626,29 @@ See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inp
For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g., `--limit-mm-per-prompt '{"image":0}'`) so that their multimodal modules are not loaded, freeing up more GPU memory for the KV cache.
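The same limit can be applied offline, assuming the `LLM` constructor forwards `limit_mm_per_prompt` to the engine arguments as the example scripts in this commit do; a minimal sketch (the model name is only illustrative):

```python
from vllm import LLM

# Text-only mode: setting the image limit to 0 means the vision modules
# are not loaded, leaving more GPU memory for the KV cache.
llm = LLM(
    model="meta-llama/Llama-4-Scout-17B-16E-Instruct",  # illustrative
    limit_mm_per_prompt={"image": 0},
)
```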
!!! note
vLLM currently only supports adding LoRA to the language backbone of multimodal models.
vLLM currently only supports dynamic LoRA adapters on the language backbone of multimodal models.
If you wish to use a model whose LoRA adapter covers the multi-modal encoder,
please merge the LoRA weights into the base model first, then run the merged model in vLLM like a regular model.
```python
from peft import PeftConfig, PeftModel
from transformers import AutoModelForImageTextToText, AutoProcessor
def merge_and_save(model_id: str, output_dir: str):
base_model = AutoModelForImageTextToText.from_pretrained(model_id)
lora_model = PeftModel.from_pretrained(
base_model,
model_id,
config=PeftConfig.from_pretrained(model_id),
)
model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
model._hf_peft_config_loaded = False # Needed to save the merged model
processor = AutoProcessor.from_pretrained(model_id)
model.save_pretrained(output_dir)
processor.save_pretrained(output_dir)
```
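As a usage sketch, the helper above can be applied to an adapter such as `TIGER-Lab/VLM2Vec-Qwen2VL-2B` (used later in this commit); the output directory name below is arbitrary:

```python
# Sketch only: merge the LoRA adapter into its base model, then point vLLM
# at the merged directory like any regular checkpoint.
merge_and_save("TIGER-Lab/VLM2Vec-Qwen2VL-2B", output_dir="./vlm2vec-qwen2vl-merged")

# Afterwards, e.g.: vllm serve ./vlm2vec-qwen2vl-merged --runner pooling
```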
### Generative Models
@ -805,8 +827,8 @@ The following table lists those that are tested in vLLM.
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | | |
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | |
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | ✅︎ |
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | ✅︎ |
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* | \* |
<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))
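As a rough offline sketch of the conversion, assuming the Python `LLM` constructor accepts the same `runner`/`convert` options as the CLI flags (the model name comes from the table above; the exact prompt format each checkpoint expects is shown in the offline example script):

```python
from vllm import LLM

# Sketch: load a converted embedding model and embed a text prompt.
llm = LLM(model="royokong/e5-v", runner="pooling", convert="embed")
(output,) = llm.embed("A photo of a cat and a dog")
print(len(output.outputs.embedding))
```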

View File

@ -236,11 +236,33 @@ The following extra parameters are supported:
Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api])
which will be treated as a single prompt to the model.
Code example: <gh-file:examples/online_serving/pooling/openai_embedding_client.py>
If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api])
which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
??? code
```python
from typing import Literal, Union

from openai import OpenAI
from openai._types import NOT_GIVEN, NotGiven
from openai.types.chat import ChatCompletionMessageParam
from openai.types.create_embedding_response import CreateEmbeddingResponse
def create_chat_embeddings(
client: OpenAI,
*,
messages: list[ChatCompletionMessageParam],
model: str,
encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
) -> CreateEmbeddingResponse:
return client.post(
"/embeddings",
cast_to=CreateEmbeddingResponse,
body={"messages": messages, "model": model, "encoding_format": encoding_format},
)
```
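Building on the snippet above, a text-only request against a locally running server might look like this (the server address and model name are placeholders for whatever you are serving):

```python
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = create_chat_embeddings(
    client,
    model="TIGER-Lab/VLM2Vec-Full",
    messages=[{"role": "user", "content": [{"type": "text", "text": "A cat and a dog"}]}],
    encoding_format="float",
)
print("Text embedding output:", response.data[0].embedding)
```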
#### Multi-modal inputs
You can pass multi-modal inputs to embedding models by defining a custom chat template for the server
@ -254,7 +276,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
--trust-remote-code \
--max-model-len 4096 \
--chat-template examples/template_vlm2vec.jinja
--chat-template examples/template_vlm2vec_phi3v.jinja
```
!!! important
@ -262,34 +284,36 @@ and passing a list of `messages` in the request. Refer to the examples below for
to run this model in embedding mode instead of text generation mode.
The custom chat template is completely different from the original one for this model,
and can be found here: <gh-file:examples/template_vlm2vec.jinja>
and can be found here: <gh-file:examples/template_vlm2vec_phi3v.jinja>
Since the request schema is not defined by the OpenAI client, we post the request through its lower-level `client.post` API using the `create_chat_embeddings` convenience function defined above:
??? code
```python
import requests
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:8000/v1",
api_key="EMPTY",
)
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
response = requests.post(
"http://localhost:8000/v1/embeddings",
json={
"model": "TIGER-Lab/VLM2Vec-Full",
"messages": [{
response = create_chat_embeddings(
client,
model="TIGER-Lab/VLM2Vec-Full",
messages=[
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": "Represent the given image."},
],
}],
"encoding_format": "float",
},
}
],
encoding_format="float",
)
response.raise_for_status()
response_json = response.json()
print("Embedding output:", response_json["data"][0]["embedding"])
print("Image embedding output:", response.data[0].embedding)
```
=== "DSE-Qwen2-MRL"

View File

@ -10,6 +10,7 @@ on HuggingFace model repository.
from argparse import Namespace
from dataclasses import asdict
from pathlib import Path
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
from PIL.Image import Image
@ -19,6 +20,9 @@ from vllm.entrypoints.score_utils import ScoreMultiModalParam
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser
ROOT_DIR = Path(__file__).parent.parent.parent
EXAMPLES_DIR = ROOT_DIR / "examples"
class TextQuery(TypedDict):
modality: Literal["text"]
@ -82,23 +86,27 @@ def run_e5_v(query: Query) -> ModelRequestData:
)
def run_vlm2vec(query: Query) -> ModelRequestData:
def _get_vlm2vec_prompt_image(query: Query, image_token: str):
if query["modality"] == "text":
text = query["text"]
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501
image = None
elif query["modality"] == "image":
prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image." # noqa: E501
prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
image = query["image"]
elif query["modality"] == "text+image":
text = query["text"]
prompt = (
f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501
)
prompt = f"{image_token} Represent the given image with the following question: {text}" # noqa: E501
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
raise ValueError(f"Unsupported query modality: {modality!r}")
return prompt, image
def run_vlm2vec_phi3v(query: Query) -> ModelRequestData:
prompt, image = _get_vlm2vec_prompt_image(query, "<|image_1|>")
engine_args = EngineArgs(
model="TIGER-Lab/VLM2Vec-Full",
@ -116,6 +124,66 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
)
def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
# vLLM does not support LoRA adapters on the multi-modal encoder,
# so we merge the LoRA weights into the base model first
from huggingface_hub.constants import HF_HUB_CACHE
from peft import PeftConfig, PeftModel
from transformers import AutoModelForImageTextToText, AutoProcessor
from vllm.entrypoints.chat_utils import load_chat_template
model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"
base_model = AutoModelForImageTextToText.from_pretrained(model_id)
lora_model = PeftModel.from_pretrained(
base_model,
model_id,
config=PeftConfig.from_pretrained(model_id),
)
model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
model._hf_peft_config_loaded = False # Needed to save the merged model
processor = AutoProcessor.from_pretrained(
model_id,
# `min_pixels` and `max_pixels` are deprecated
size={"shortest_edge": 3136, "longest_edge": 12845056},
)
processor.chat_template = load_chat_template(
# The original chat template is not correct
EXAMPLES_DIR / "template_vlm2vec_qwen2vl.jinja",
)
merged_path = str(
Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
)
print(f"Saving merged model to {merged_path}...")
print(
"NOTE: This directory is not tracked by `huggingface_hub` "
"so you have to delete this manually if you don't want it anymore."
)
model.save_pretrained(merged_path)
processor.save_pretrained(merged_path)
print("Done!")
prompt, image = _get_vlm2vec_prompt_image(query, "<|image_pad|>")
engine_args = EngineArgs(
model=merged_path,
runner="pooling",
max_model_len=4096,
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def run_jinavl_reranker(query: Query) -> ModelRequestData:
if query["modality"] != "text+images":
raise ValueError(f"Unsupported query modality: '{query['modality']}'")
@ -232,7 +300,8 @@ def run_score(model: str, modality: QueryModality, seed: Optional[int]):
model_example_map = {
"e5_v": run_e5_v,
"vlm2vec": run_vlm2vec,
"vlm2vec_phi3v": run_vlm2vec_phi3v,
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
"jinavl_reranker": run_jinavl_reranker,
}
@ -246,7 +315,7 @@ def parse_args():
"--model-name",
"-m",
type=str,
default="vlm2vec",
default="vlm2vec_phi3v",
choices=model_example_map.keys(),
help="The name of the embedding model.",
)

View File

@ -4,69 +4,137 @@
"""Example Python client for multimodal embedding API using vLLM API server
NOTE:
start a supported multimodal embeddings model server with `vllm serve`, e.g.
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling --trust_remote_code --max_model_len=1024
vllm serve TIGER-Lab/VLM2Vec-Full \
--runner pooling \
--trust-remote-code \
--max-model-len 4096 \
--chat-template examples/template_vlm2vec_phi3v.jinja
"""
import argparse
import base64
import io
from typing import Literal, Union
import requests
from openai import OpenAI
from openai._types import NOT_GIVEN, NotGiven
from openai.types.chat import ChatCompletionMessageParam
from openai.types.create_embedding_response import CreateEmbeddingResponse
from PIL import Image
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
def vlm2vec():
response = requests.post(
"http://localhost:8000/v1/embeddings",
json={
"model": "TIGER-Lab/VLM2Vec-Full",
"messages": [
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": "Represent the given image."},
],
}
],
"encoding_format": "float",
},
def create_chat_embeddings(
client: OpenAI,
*,
messages: list[ChatCompletionMessageParam],
model: str,
encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
) -> CreateEmbeddingResponse:
"""
Convenience function for accessing vLLM's Chat Embeddings API,
which is an extension of OpenAI's existing Embeddings API.
"""
return client.post(
"/embeddings",
cast_to=CreateEmbeddingResponse,
body={"messages": messages, "model": model, "encoding_format": encoding_format},
)
response.raise_for_status()
response_json = response.json()
print("Embedding output:", response_json["data"][0]["embedding"])
def dse_qwen2_vl(inp: dict):
# Embedding an Image
if inp["type"] == "image":
messages = [
def run_vlm2vec(client: OpenAI, model: str):
response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": "Represent the given image."},
],
}
],
model=model,
encoding_format="float",
)
print("Image embedding output:", response.data[0].embedding)
response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{
"type": "text",
"text": "Represent the given image with the following question: What is in the image.",
},
],
}
],
model=model,
encoding_format="float",
)
print("Image+Text embedding output:", response.data[0].embedding)
response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "A cat and a dog"},
],
}
],
model=model,
encoding_format="float",
)
print("Text embedding output:", response.data[0].embedding)
def run_dse_qwen2_vl(client: OpenAI, model: str):
response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": inp["image_url"],
"url": image_url,
},
},
{"type": "text", "text": "What is shown in this image?"},
],
}
]
# Embedding a Text Query
else:
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
# of the minimum input size
buffer = io.BytesIO()
image_placeholder = Image.new("RGB", (56, 56))
image_placeholder.save(buffer, "png")
buffer.seek(0)
image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
messages = [
],
model=model,
encoding_format="float",
)
print("Image embedding output:", response.data[0].embedding)
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
# of the minimum input size
buffer = io.BytesIO()
image_placeholder = Image.new("RGB", (56, 56))
image_placeholder.save(buffer, "png")
buffer.seek(0)
image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
@ -76,23 +144,21 @@ def dse_qwen2_vl(inp: dict):
"url": f"data:image/jpeg;base64,{image_placeholder}",
},
},
{"type": "text", "text": f"Query: {inp['content']}"},
{"type": "text", "text": "Query: What is the weather like today?"},
],
}
]
response = requests.post(
"http://localhost:8000/v1/embeddings",
json={
"model": "MrLight/dse-qwen2-2b-mrl-v1",
"messages": messages,
"encoding_format": "float",
},
],
model=model,
encoding_format="float",
)
response.raise_for_status()
response_json = response.json()
print("Embedding output:", response_json["data"][0]["embedding"])
print("Text embedding output:", response.data[0].embedding)
model_example_map = {
"vlm2vec": run_vlm2vec,
"dse_qwen2_vl": run_dse_qwen2_vl,
}
def parse_args():
@ -103,29 +169,24 @@ def parse_args():
parser.add_argument(
"--model",
type=str,
choices=["vlm2vec", "dse_qwen2_vl"],
choices=model_example_map.keys(),
required=True,
help="Which model to call.",
help="The name of the embedding model.",
)
return parser.parse_args()
def main(args):
if args.model == "vlm2vec":
vlm2vec()
elif args.model == "dse_qwen2_vl":
dse_qwen2_vl(
{
"type": "image",
"image_url": image_url,
}
)
dse_qwen2_vl(
{
"type": "text",
"content": "What is the weather like today?",
}
)
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model_id = models.data[0].id
model_example_map[args.model](client, model_id)
if __name__ == "__main__":

View File

@ -0,0 +1,15 @@
{%- if messages | length > 1 -%}
{{ raise_exception('Embedding models should only embed one message at a time') }}
{%- endif -%}
{% set vars = namespace(parts=[]) %}
{%- for message in messages -%}
{%- for content in message['content'] -%}
{%- if content['type'] == 'text' -%}
{%- set vars.parts = vars.parts + [content['text']] %}
{%- elif content['type'] == 'image' -%}
{%- set vars.parts = vars.parts + ['<|image_pad|>'] %}
{%- endif -%}
{%- endfor -%}
{%- endfor -%}
{{ vars.parts | join(' ') }}
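For reference, a rough standalone sketch of how this template renders a single image+text message, using plain `jinja2` (the file path is relative to the repository root):

```python
import jinja2

def raise_exception(message: str):
    # The template calls this when more than one message is passed.
    raise ValueError(message)

with open("examples/template_vlm2vec_qwen2vl.jinja") as f:
    template = jinja2.Environment().from_string(f.read())

rendered = template.render(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Represent the given image."},
            ],
        }
    ],
    raise_exception=raise_exception,
)
print(rendered)  # roughly: "<|image_pad|> Represent the given image."
```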

View File

@ -14,7 +14,7 @@ from vllm.multimodal.utils import encode_image_base64, fetch_image
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
MAXIMUM_IMAGES = 2
vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja"
vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec_phi3v.jinja"
assert vlm2vec_jinja_path.exists()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)

View File

@ -2468,7 +2468,8 @@ def test_resolve_content_format_fallbacks(model, expected_format):
("template_falcon.jinja", "string"),
("template_inkbot.jinja", "string"),
("template_teleflm.jinja", "string"),
("template_vlm2vec.jinja", "openai"),
("template_vlm2vec_phi3v.jinja", "openai"),
("template_vlm2vec_qwen2vl.jinja", "openai"),
("tool_chat_template_granite_20b_fc.jinja", "string"),
("tool_chat_template_hermes.jinja", "string"),
("tool_chat_template_internlm2_tool.jinja", "string"),