Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)

Compare commits (17 commits):
37e3806132, c0efdd655b, aaaec52ad9, e1eb45d397, 89fca671fb, d20b0c139c,
166a168b0f, 2bb0e1a799, 6eaf1e5c52, 868a8c5b2c, b4ad56c1bd, 69698f257e,
cd0cd85102, 0a74bfce9c, dd3b865854, 9b87a579aa, b539222d4e
@@ -200,6 +200,7 @@ steps:
  - pytest -v -s v1/core
  - pytest -v -s v1/entrypoints
  - pytest -v -s v1/engine
  - pytest -v -s v1/entrypoints
  - pytest -v -s v1/sample
  - pytest -v -s v1/worker
  - pytest -v -s v1/structured_output
@@ -226,10 +227,13 @@ steps:
   - python3 offline_inference/basic/chat.py
   - python3 offline_inference/prefix_caching.py
   - python3 offline_inference/llm_engine_example.py
-  - python3 offline_inference/vision_language.py
-  - python3 offline_inference/vision_language_multi_image.py
+  - python3 offline_inference/audio_language.py --seed 0
+  - python3 offline_inference/vision_language.py --seed 0
+  - python3 offline_inference/vision_language_embedding.py --seed 0
+  - python3 offline_inference/vision_language_multi_image.py --seed 0
   - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/encoder_decoder.py
+  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
   - python3 offline_inference/basic/classify.py
   - python3 offline_inference/basic/embed.py
   - python3 offline_inference/basic/score.py
README.md (11 lines changed)
@@ -13,18 +13,9 @@ Easy, fast, and cheap LLM serving for everyone
 | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>

----
-
-We’re excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**!
-
-Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend).
-
-👉 **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion!
-
----
-
 *Latest News* 🔥

+- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29).
 - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
@@ -127,7 +127,7 @@ __device__ __forceinline__ T from_float(const float& inp) {

 template <typename T>
 __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
-  union tmpcvt {
+  [[maybe_unused]] union tmpcvt {
     uint16_t u;
     _Float16 f;
     __hip_bfloat16 b;
@@ -160,7 +160,7 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
 template <typename T>
 __device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1,
                                         const _B16x4& inp2) {
-  union tmpcvt {
+  [[maybe_unused]] union tmpcvt {
     uint16_t u;
     _Float16 f;
     __hip_bfloat16 b;
@@ -1273,9 +1273,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
   const int seq_idx = blockIdx.y;
   const int context_len = context_lens[seq_idx];
   const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
-  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
   const int warpid = threadIdx.x / WARP_SIZE;
-  const int laneid = threadIdx.x % WARP_SIZE;
+  [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;

   __shared__ float shared_global_exp_sum;
   // max num partitions supported is warp_size * NPAR_LOOPS
@@ -763,7 +763,7 @@ See [this page](#generative-models) for more information on how to use generativ
  * `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
  * ✅︎
  * ✅︎
  * ⚠️
  *
- * `GLM4VForCausalLM`<sup>^</sup>
  * GLM-4V
  * T + I
@@ -948,8 +948,11 @@ V1 currently uses a simplified attention pattern:
- Uses causal attention for all tokens, including image tokens
- Generates reasonable outputs but does not match the original model's attention for text + image inputs
- Will be updated in the future to support the correct behavior
- Does not support `"do_pan_and_scan": True`

This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.

For these reasons, `Gemma3ForConditionalGeneration` is supported only on V0 at the moment.
:::

:::{note}
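The note above describes the attention-pattern gap concretely: Gemma 3 expects bidirectional attention among the tokens of each image while staying causal elsewhere, whereas V1 currently applies a purely causal mask everywhere. As a rough illustration (not vLLM's implementation; `image_spans` is a hypothetical bookkeeping structure), the mask the model expects looks like this:

```python
# Illustrative sketch only: build the mixed attention mask described above,
# assuming `image_spans` marks which token positions belong to each image.
import numpy as np


def mixed_attention_mask(seq_len: int,
                         image_spans: list[tuple[int, int]]) -> np.ndarray:
    """Return a boolean mask where mask[i, j] == True means token i may attend to token j."""
    # Start from a causal mask: each token attends to itself and earlier tokens.
    mask = np.tril(np.ones((seq_len, seq_len), dtype=bool))
    # Within each image span, allow bidirectional attention between image tokens.
    for start, end in image_spans:
        mask[start:end, start:end] = True
    return mask


# Example: a 10-token prompt whose tokens 2..5 come from one image.
print(mixed_attention_mask(10, [(2, 6)]).astype(int))
```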
@@ -7,11 +7,13 @@ For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
 import os
+from dataclasses import asdict
+from typing import NamedTuple, Optional

 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer

-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.lora.request import LoRARequest
 from vllm.utils import FlexibleArgumentParser
@@ -23,21 +25,31 @@ question_per_audio_count = {
     2: "What sport and what nursery rhyme are referenced?"
 }


+class ModelRequestData(NamedTuple):
+    engine_args: EngineArgs
+    prompt: str
+    stop_token_ids: Optional[list[int]] = None
+    lora_requests: Optional[list[LoRARequest]] = None
+
+
+# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
+# lower-end GPUs.
+# Unless specified, these settings have been tested to work on a single L4.
+
+
 # MiniCPM-O
-def run_minicpmo(question: str, audio_count: int):
+def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
     model_name = "openbmb/MiniCPM-o-2_6"
     tokenizer = AutoTokenizer.from_pretrained(model_name,
                                               trust_remote_code=True)
-    llm = LLM(model=model_name,
-              trust_remote_code=True,
-              max_model_len=4096,
-              max_num_seqs=5,
-              limit_mm_per_prompt={"audio": audio_count})
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"audio": audio_count},
+    )

     stop_tokens = ['<|im_end|>', '<|endoftext|>']
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
@@ -52,11 +64,16 @@ def run_minicpmo(question: str, audio_count: int):
                                            tokenize=False,
                                            add_generation_prompt=True,
                                            chat_template=audio_chat_template)
-    return llm, prompt, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+    )
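MiniCPM-O terminates generation on chat-control tokens rather than a single EOS token, which is why the helper converts the stop strings to ids with the tokenizer and returns them in `ModelRequestData.stop_token_ids`. A minimal sketch of how those ids end up in `SamplingParams` (same model and stop tokens as above, shown only for illustration):

```python
# Sketch of wiring tokenizer-derived stop tokens into SamplingParams.
from transformers import AutoTokenizer

from vllm import SamplingParams

tokenizer = AutoTokenizer.from_pretrained("openbmb/MiniCPM-o-2_6",
                                          trust_remote_code=True)
stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in stop_tokens]

# Generation stops as soon as any of these token ids is produced.
sampling_params = SamplingParams(temperature=0.2,
                                 max_tokens=64,
                                 stop_token_ids=stop_token_ids)
```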
 # Phi-4-multimodal-instruct
-def run_phi4mm(questions: str, audio_count: int):
+def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
     """
     Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
     show how to process audio inputs.
@@ -67,9 +84,9 @@ def run_phi4mm(questions: str, audio_count: int):
     speech_lora_path = os.path.join(model_path, "speech-lora")
     placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])

-    prompts = f"<|user|>{placeholders}{questions}<|end|><|assistant|>"
+    prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"

-    llm = LLM(
+    engine_args = EngineArgs(
         model=model_path,
         trust_remote_code=True,
         max_model_len=4096,
@@ -79,24 +96,24 @@ def run_phi4mm(questions: str, audio_count: int):
         lora_extra_vocab_size=0,
         limit_mm_per_prompt={"audio": audio_count},
     )
-    lora_request = LoRARequest("speech", 1, speech_lora_path)
-    # To maintain code compatibility in this script, we add LoRA here.
-    llm.llm_engine.add_lora(lora_request=lora_request)
-    # You can also add LoRA using:
-    # llm.generate(prompts, lora_request=lora_request,...)
-
-    stop_token_ids = None
-    return llm, prompts, stop_token_ids
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompts,
+        lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
+    )
 # Qwen2-Audio
-def run_qwen2_audio(question: str, audio_count: int):
+def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
     model_name = "Qwen/Qwen2-Audio-7B-Instruct"

-    llm = LLM(model=model_name,
-              max_model_len=4096,
-              max_num_seqs=5,
-              limit_mm_per_prompt={"audio": audio_count})
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"audio": audio_count},
+    )

     audio_in_prompt = "".join([
         f"Audio {idx+1}: "
@@ -107,12 +124,15 @@ def run_qwen2_audio(question: str, audio_count: int):
               "<|im_start|>user\n"
               f"{audio_in_prompt}{question}<|im_end|>\n"
               "<|im_start|>assistant\n")
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+    )
 # Ultravox 0.5-1B
-def run_ultravox(question: str, audio_count: int):
+def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
     model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"

     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -124,29 +144,39 @@ def run_ultravox(question: str, audio_count: int):
                                            tokenize=False,
                                            add_generation_prompt=True)

-    llm = LLM(model=model_name,
-              max_model_len=4096,
-              max_num_seqs=5,
-              trust_remote_code=True,
-              limit_mm_per_prompt={"audio": audio_count})
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=5,
+        trust_remote_code=True,
+        limit_mm_per_prompt={"audio": audio_count},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+    )
 # Whisper
-def run_whisper(question: str, audio_count: int):
+def run_whisper(question: str, audio_count: int) -> ModelRequestData:
     assert audio_count == 1, (
         "Whisper only support single audio input per prompt")
     model_name = "openai/whisper-large-v3-turbo"

     prompt = "<|startoftranscript|>"

-    llm = LLM(model=model_name,
-              max_model_len=448,
-              max_num_seqs=5,
-              limit_mm_per_prompt={"audio": audio_count})
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=448,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"audio": audio_count},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+    )
 model_example_map = {
@@ -164,14 +194,24 @@ def main(args):
         raise ValueError(f"Model type {model} is not supported.")

     audio_count = args.num_audios
-    llm, prompt, stop_token_ids = model_example_map[model](
-        question_per_audio_count[audio_count], audio_count)
+    req_data = model_example_map[model](question_per_audio_count[audio_count],
+                                        audio_count)
+
+    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    llm = LLM(**engine_args)
+
+    # To maintain code compatibility in this script, we add LoRA here.
+    # You can also add LoRA using:
+    # llm.generate(prompts, lora_request=lora_request,...)
+    if req_data.lora_requests:
+        for lora_request in req_data.lora_requests:
+            llm.llm_engine.add_lora(lora_request=lora_request)

     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
     sampling_params = SamplingParams(temperature=0.2,
                                      max_tokens=64,
-                                     stop_token_ids=stop_token_ids)
+                                     stop_token_ids=req_data.stop_token_ids)

     mm_data = {}
     if audio_count > 0:
@@ -183,7 +223,7 @@ def main(args):
         }

     assert args.num_prompts > 0
-    inputs = {"prompt": prompt, "multi_modal_data": mm_data}
+    inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
     if args.num_prompts > 1:
         # Batch inference
         inputs = [inputs] * args.num_prompts
@@ -214,6 +254,10 @@ if __name__ == "__main__":
                         default=1,
                         choices=[0, 1, 2],
                         help="Number of audio items per prompt.")
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")

     args = parser.parse_args()
     main(args)
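The refactor above replaces direct `LLM(...)` construction inside each `run_*` helper with an `EngineArgs` value that `main()` turns into constructor kwargs via `dataclasses.asdict`, merges with the CLI seed, and passes to `LLM`; any LoRA adapters returned by the helper are then registered on the engine. A minimal sketch of that pattern, using a placeholder model and LoRA path rather than anything taken from this diff:

```python
# Minimal sketch of the EngineArgs -> asdict -> LLM pattern introduced above.
# The model name and LoRA path are placeholders.
from dataclasses import asdict

from vllm import LLM, EngineArgs, SamplingParams
from vllm.lora.request import LoRARequest

engine_args = EngineArgs(
    model="facebook/opt-125m",   # placeholder model
    max_model_len=4096,
    max_num_seqs=5,
)

# Merge the dataclass fields with runtime overrides (here, a seed) and build the LLM.
llm = LLM(**(asdict(engine_args) | {"seed": 0}))

# If a helper returned LoRA adapters, they could be registered up front ...
# llm.llm_engine.add_lora(lora_request=LoRARequest("speech", 1, "/path/to/lora"))
# ... or passed per call:
# llm.generate(prompt, sampling_params,
#              lora_request=LoRARequest("speech", 1, "/path/to/lora"))

outputs = llm.generate("Hello, my name is", SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```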
@@ -4,16 +4,23 @@ This example shows how to use vLLM for running offline inference with
 the explicit/implicit prompt format on enc-dec LMMs for text generation.
 """
 import time
+from collections.abc import Sequence
+from dataclasses import asdict
+from typing import NamedTuple

-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, PromptType, SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 from vllm.utils import FlexibleArgumentParser


+class ModelRequestData(NamedTuple):
+    engine_args: EngineArgs
+    prompts: Sequence[PromptType]
+
+
 def run_florence2():
     # Create a Florence-2 encoder/decoder model instance
-    llm = LLM(
+    engine_args = EngineArgs(
         model="microsoft/Florence-2-large",
         tokenizer="facebook/bart-large",
         max_num_seqs=8,
@@ -39,12 +46,15 @@ def run_florence2():
             "decoder_prompt": "",
         },
     ]
-    return llm, prompts
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 def run_mllama():
     # Create a Mllama encoder/decoder model instance
-    llm = LLM(
+    engine_args = EngineArgs(
         model="meta-llama/Llama-3.2-11B-Vision-Instruct",
         max_model_len=4096,
         max_num_seqs=2,
@@ -69,12 +79,15 @@ def run_mllama():
             "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
         },
     ]
-    return llm, prompts
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 def run_whisper():
     # Create a Whisper encoder/decoder model instance
-    llm = LLM(
+    engine_args = EngineArgs(
         model="openai/whisper-large-v3-turbo",
         max_model_len=448,
         max_num_seqs=16,
@@ -99,7 +112,11 @@ def run_whisper():
             "decoder_prompt": "<|startoftranscript|>",
         }
     ]
-    return llm, prompts
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 model_example_map = {
@@ -114,7 +131,12 @@ def main(args):
     if model not in model_example_map:
         raise ValueError(f"Model type {model} is not supported.")

-    llm, prompts = model_example_map[model]()
+    req_data = model_example_map[model]()
+
+    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    llm = LLM(**engine_args)
+
+    prompts = req_data.prompts

     # Create a sampling params object.
     sampling_params = SamplingParams(
@@ -153,6 +175,10 @@ if __name__ == "__main__":
                         default="mllama",
                         choices=model_example_map.keys(),
                         help='Huggingface "model_type".')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")

     args = parser.parse_args()
     main(args)
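The encoder/decoder examples above pass explicit prompts as dictionaries with `encoder_prompt` and `decoder_prompt` keys. A hedged sketch of running a single such prompt through the Whisper configuration shown in the hunk (illustrative only; the audio asset name assumes vLLM's bundled example assets):

```python
# Sketch of one explicit encoder/decoder prompt, following the Whisper
# configuration in the diff above; treat it as illustrative, not canonical.
from dataclasses import asdict

from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.audio import AudioAsset

engine_args = EngineArgs(
    model="openai/whisper-large-v3-turbo",
    max_model_len=448,
    max_num_seqs=16,
)
llm = LLM(**(asdict(engine_args) | {"seed": 0}))

# Explicit prompt format: the encoder receives the audio, the decoder is
# primed with the transcription start token.
prompt = {
    "encoder_prompt": {
        "prompt": "",
        "multi_modal_data": {
            "audio": AudioAsset("winning_call").audio_and_sample_rate,
        },
    },
    "decoder_prompt": "<|startoftranscript|>",
}

outputs = llm.generate(prompt, SamplingParams(temperature=0, max_tokens=64))
print(outputs[0].outputs[0].text)
```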
@ -8,122 +8,164 @@ on HuggingFace model repository.
|
||||
"""
|
||||
import os
|
||||
import random
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple, Optional
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
engine_args: EngineArgs
|
||||
prompts: list[str]
|
||||
stop_token_ids: Optional[list[int]] = None
|
||||
lora_requests: Optional[list[LoRARequest]] = None
|
||||
|
||||
|
||||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||||
# lower-end GPUs.
|
||||
# Unless specified, these settings have been tested to work on a single L4.
|
||||
|
||||
|
||||
# Aria
|
||||
def run_aria(questions: list[str], modality: str):
|
||||
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "rhymes-ai/Aria"
|
||||
|
||||
# NOTE: Need L40 (or equivalent) to avoid OOM
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
dtype="bfloat16",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
dtype="bfloat16",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
|
||||
"<|im_end|>\n<|im_start|>assistant\n")
|
||||
for question in questions]
|
||||
|
||||
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# BLIP-2
|
||||
def run_blip2(questions: list[str], modality: str):
|
||||
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
|
||||
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
|
||||
prompts = [f"Question: {question} Answer:" for question in questions]
|
||||
llm = LLM(model="Salesforce/blip2-opt-2.7b",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model="Salesforce/blip2-opt-2.7b",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Chameleon
|
||||
def run_chameleon(questions: list[str], modality: str):
|
||||
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
prompts = [f"{question}<image>" for question in questions]
|
||||
llm = LLM(model="facebook/chameleon-7b",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model="facebook/chameleon-7b",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Deepseek-VL2
|
||||
def run_deepseek_vl2(questions: list[str], modality: str):
|
||||
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "deepseek-ai/deepseek-vl2-tiny"
|
||||
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
|
||||
)
|
||||
|
||||
prompts = [
|
||||
f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
|
||||
for question in questions
|
||||
]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Florence2
|
||||
def run_florence2(question: str, modality: str):
|
||||
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
llm = LLM(model="microsoft/Florence-2-large",
|
||||
tokenizer="facebook/bart-large",
|
||||
max_num_seqs=8,
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
engine_args = EngineArgs(
|
||||
model="microsoft/Florence-2-large",
|
||||
tokenizer="facebook/bart-large",
|
||||
max_num_seqs=8,
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
prompt = "<MORE_DETAILED_CAPTION>"
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Fuyu
|
||||
def run_fuyu(questions: list[str], modality: str):
|
||||
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
prompts = [f"{question}\n" for question in questions]
|
||||
llm = LLM(model="adept/fuyu-8b",
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model="adept/fuyu-8b",
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Gemma 3
|
||||
def run_gemma3(questions: list[str], modality: str):
|
||||
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "google/gemma-3-4b-it"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
@ -135,22 +177,27 @@ def run_gemma3(questions: list[str], modality: str):
|
||||
prompts = [("<bos><start_of_turn>user\n"
|
||||
f"<start_of_image>{question}<end_of_turn>\n"
|
||||
"<start_of_turn>model\n") for question in questions]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# GLM-4v
|
||||
def run_glm4v(questions: list[str], modality: str):
|
||||
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "THUDM/glm-4v-9b"
|
||||
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True,
|
||||
hf_overrides={"architectures": ["GLM4VForCausalLM"]},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True,
|
||||
hf_overrides={"architectures": ["GLM4VForCausalLM"]},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
prompts = [
|
||||
f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
|
||||
@ -158,16 +205,21 @@ def run_glm4v(questions: list[str], modality: str):
|
||||
]
|
||||
|
||||
stop_token_ids = [151329, 151336, 151338]
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# H2OVL-Mississippi
|
||||
def run_h2ovl(questions: list[str], modality: str):
|
||||
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "h2oai/h2ovl-mississippi-800m"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
@ -187,15 +239,20 @@ def run_h2ovl(questions: list[str], modality: str):
|
||||
# Stop tokens for H2OVL-Mississippi
|
||||
# https://huggingface.co/h2oai/h2ovl-mississippi-800m
|
||||
stop_token_ids = [tokenizer.eos_token_id]
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# Idefics3-8B-Llama3
|
||||
def run_idefics3(questions: list[str], modality: str):
|
||||
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
@ -212,17 +269,20 @@ def run_idefics3(questions: list[str], modality: str):
|
||||
prompts = [(
|
||||
f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
|
||||
) for question in questions]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# InternVL
|
||||
def run_internvl(questions: list[str], modality: str):
|
||||
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "OpenGVLab/InternVL2-2B"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
@ -245,53 +305,75 @@ def run_internvl(questions: list[str], modality: str):
|
||||
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
|
||||
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
|
||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# LLaVA-1.5
|
||||
def run_llava(questions: list[str], modality: str):
|
||||
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
prompts = [
|
||||
f"USER: <image>\n{question}\nASSISTANT:" for question in questions
|
||||
]
|
||||
|
||||
llm = LLM(model="llava-hf/llava-1.5-7b-hf",
|
||||
max_model_len=4096,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model="llava-hf/llava-1.5-7b-hf",
|
||||
max_model_len=4096,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# LLaVA-1.6/LLaVA-NeXT
|
||||
def run_llava_next(questions: list[str], modality: str):
|
||||
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions]
|
||||
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
|
||||
max_model_len=8192,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model="llava-hf/llava-v1.6-mistral-7b-hf",
|
||||
max_model_len=8192,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# LlaVA-NeXT-Video
|
||||
# Currently only support for video input
|
||||
def run_llava_next_video(questions: list[str], modality: str):
|
||||
def run_llava_next_video(questions: list[str],
|
||||
modality: str) -> ModelRequestData:
|
||||
assert modality == "video"
|
||||
|
||||
prompts = [
|
||||
f"USER: <video>\n{question} ASSISTANT:" for question in questions
|
||||
]
|
||||
llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf",
|
||||
max_model_len=8192,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model="llava-hf/LLaVA-NeXT-Video-7B-hf",
|
||||
max_model_len=8192,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# LLaVA-OneVision
|
||||
def run_llava_onevision(questions: list[str], modality: str):
|
||||
def run_llava_onevision(questions: list[str],
|
||||
modality: str) -> ModelRequestData:
|
||||
|
||||
if modality == "video":
|
||||
prompts = [
|
||||
@ -305,15 +387,20 @@ def run_llava_onevision(questions: list[str], modality: str):
|
||||
<|im_start|>assistant\n" for question in questions
|
||||
]
|
||||
|
||||
llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
|
||||
max_model_len=16384,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
|
||||
max_model_len=16384,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Mantis
|
||||
def run_mantis(questions: list[str], modality: str):
|
||||
def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501
|
||||
@ -322,14 +409,19 @@ def run_mantis(questions: list[str], modality: str):
|
||||
for question in questions
|
||||
]
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model="TIGER-Lab/Mantis-8B-siglip-llama3",
|
||||
max_model_len=4096,
|
||||
hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
stop_token_ids = [128009]
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# MiniCPM-V
|
||||
@ -357,7 +449,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
|
||||
# model_name = "openbmb/MiniCPM-o-2_6"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
trust_remote_code=True)
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
@ -389,19 +481,24 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
|
||||
tokenize=False,
|
||||
add_generation_prompt=True) for question in questions
|
||||
]
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
def run_minicpmo(questions: list[str], modality: str):
|
||||
def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
|
||||
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-o-2_6")
|
||||
|
||||
|
||||
def run_minicpmv(questions: list[str], modality: str):
|
||||
def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
|
||||
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")
|
||||
|
||||
|
||||
# LLama 3.2
|
||||
def run_mllama(questions: list[str], modality: str):
|
||||
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||
@ -411,7 +508,7 @@ def run_mllama(questions: list[str], modality: str):
|
||||
# You may lower either to run this example on lower-end GPUs.
|
||||
|
||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=16,
|
||||
@ -432,17 +529,20 @@ def run_mllama(questions: list[str], modality: str):
|
||||
prompts = tokenizer.apply_chat_template(messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=False)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Molmo
|
||||
def run_molmo(questions: list[str], modality: str):
|
||||
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "allenai/Molmo-7B-D-0924"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
@ -453,18 +553,21 @@ def run_molmo(questions: list[str], modality: str):
|
||||
f"<|im_start|>user <image>\n{question}<|im_end|> \
|
||||
<|im_start|>assistant\n" for question in questions
|
||||
]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# NVLM-D
|
||||
def run_nvlm_d(questions: list[str], modality: str):
|
||||
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "nvidia/NVLM-D-72B"
|
||||
|
||||
# Adjust this as necessary to fit in GPU
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
@ -481,36 +584,47 @@ def run_nvlm_d(questions: list[str], modality: str):
|
||||
prompts = tokenizer.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# PaliGemma
|
||||
def run_paligemma(question: str, modality: str):
|
||||
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
# PaliGemma has special prompt format for VQA
|
||||
prompt = ["caption en"]
|
||||
llm = LLM(model="google/paligemma-3b-mix-224",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
prompts = ["caption en" for _ in questions]
|
||||
engine_args = EngineArgs(
|
||||
model="google/paligemma-3b-mix-224",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# PaliGemma 2
|
||||
def run_paligemma2(question: str, modality: str):
|
||||
def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
# PaliGemma 2 has special prompt format for VQA
|
||||
prompt = ["caption en"]
|
||||
llm = LLM(model="google/paligemma2-3b-ft-docci-448",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
prompts = ["caption en" for _ in questions]
|
||||
engine_args = EngineArgs(
|
||||
model="google/paligemma2-3b-ft-docci-448",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Phi-3-Vision
|
||||
def run_phi3v(questions: list[str], modality: str):
|
||||
def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
prompts = [
|
||||
@ -530,7 +644,7 @@ def run_phi3v(questions: list[str], modality: str):
|
||||
#
|
||||
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
|
||||
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
@ -539,12 +653,15 @@ def run_phi3v(questions: list[str], modality: str):
|
||||
mm_processor_kwargs={"num_crops": 16},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Phi-4-multimodal-instruct
|
||||
def run_phi4mm(questions: list[str], modality: str):
|
||||
def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
|
||||
"""
|
||||
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
|
||||
show how to process image inputs.
|
||||
@ -558,7 +675,7 @@ def run_phi4mm(questions: list[str], modality: str):
|
||||
f"<|user|><|image_1|>{question}<|end|><|assistant|>"
|
||||
for question in questions
|
||||
]
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_path,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
@ -567,24 +684,22 @@ def run_phi4mm(questions: list[str], modality: str):
|
||||
max_lora_rank=320,
|
||||
lora_extra_vocab_size=0,
|
||||
)
|
||||
lora_request = LoRARequest("vision", 1, vision_lora_path)
|
||||
# To maintain code compatibility in this script, we add LoRA here.
|
||||
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||
# You can also add LoRA using:
|
||||
# llm.generate(prompts, lora_request=lora_request,...)
|
||||
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
|
||||
)
|
||||
|
||||
|
||||
# Pixtral HF-format
|
||||
def run_pixtral_hf(questions: list[str], modality: str):
|
||||
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "mistral-community/pixtral-12b"
|
||||
|
||||
# NOTE: Need L40 (or equivalent) to avoid OOM
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
@ -592,15 +707,18 @@ def run_pixtral_hf(questions: list[str], modality: str):
|
||||
)
|
||||
|
||||
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Qwen
|
||||
def run_qwen_vl(questions: list[str], modality: str):
|
||||
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model="Qwen/Qwen-VL",
|
||||
trust_remote_code=True,
|
||||
max_model_len=1024,
|
||||
@ -610,16 +728,19 @@ def run_qwen_vl(questions: list[str], modality: str):
|
||||
)
|
||||
|
||||
prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Qwen2-VL
|
||||
def run_qwen2_vl(questions: list[str], modality: str):
|
||||
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
|
||||
model_name = "Qwen/Qwen2-VL-7B-Instruct"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
@ -642,16 +763,19 @@ def run_qwen2_vl(questions: list[str], modality: str):
|
||||
f"{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n") for question in questions
|
||||
]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Qwen2.5-VL
|
||||
def run_qwen2_5_vl(questions: list[str], modality: str):
|
||||
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
|
||||
model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
@ -674,8 +798,11 @@ def run_qwen2_5_vl(questions: list[str], modality: str):
|
||||
f"{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n") for question in questions
|
||||
]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
model_example_map = {
|
||||
@ -789,18 +916,28 @@ def main(args):
|
||||
data = mm_input["data"]
|
||||
questions = mm_input["questions"]
|
||||
|
||||
llm, prompts, stop_token_ids = model_example_map[model](questions,
|
||||
modality)
|
||||
req_data = model_example_map[model](questions, modality)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
# To maintain code compatibility in this script, we add LoRA here.
|
||||
# You can also add LoRA using:
|
||||
# llm.generate(prompts, lora_request=lora_request,...)
|
||||
if req_data.lora_requests:
|
||||
for lora_request in req_data.lora_requests:
|
||||
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||
|
||||
# Don't want to check the flag multiple times, so just hijack `prompts`.
|
||||
prompts = prompts if args.use_different_prompt_per_request else [
|
||||
prompts[0]
|
||||
prompts = req_data.prompts if args.use_different_prompt_per_request else [
|
||||
req_data.prompts[0]
|
||||
]
|
||||
|
||||
# We set temperature to 0.2 so that outputs can be different
|
||||
# even when all prompts are identical when running batch inference.
|
||||
sampling_params = SamplingParams(temperature=0.2,
|
||||
max_tokens=64,
|
||||
stop_token_ids=stop_token_ids)
|
||||
stop_token_ids=req_data.stop_token_ids)
|
||||
|
||||
assert args.num_prompts > 0
|
||||
if args.num_prompts == 1:
|
||||
@ -865,6 +1002,10 @@ if __name__ == "__main__":
|
||||
type=int,
|
||||
default=16,
|
||||
help='Number of frames to extract from the video.')
|
||||
parser.add_argument("--seed",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Set the seed when initializing `vllm.LLM`.")
|
||||
|
||||
parser.add_argument(
|
||||
'--image-repeat-prob',
|
||||
|
@ -7,11 +7,12 @@ For most models, the prompt format should follow corresponding examples
|
||||
on HuggingFace model repository.
|
||||
"""
|
||||
from argparse import Namespace
|
||||
from dataclasses import asdict
|
||||
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
|
||||
|
||||
from PIL.Image import Image
|
||||
|
||||
from vllm import LLM
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
@ -37,12 +38,12 @@ Query = Union[TextQuery, ImageQuery, TextImageQuery]
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
llm: LLM
|
||||
engine_args: EngineArgs
|
||||
prompt: str
|
||||
image: Optional[Image]
|
||||
|
||||
|
||||
def run_e5_v(query: Query):
|
||||
def run_e5_v(query: Query) -> ModelRequestData:
|
||||
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
|
||||
|
||||
if query["modality"] == "text":
|
||||
@ -58,20 +59,20 @@ def run_e5_v(query: Query):
|
||||
modality = query['modality']
|
||||
raise ValueError(f"Unsupported query modality: '{modality}'")
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model="royokong/e5-v",
|
||||
task="embed",
|
||||
max_model_len=4096,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
)
|
||||
|
||||
|
||||
def run_vlm2vec(query: Query):
|
||||
def run_vlm2vec(query: Query) -> ModelRequestData:
|
||||
if query["modality"] == "text":
|
||||
text = query["text"]
|
||||
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501
|
||||
@ -87,7 +88,7 @@ def run_vlm2vec(query: Query):
|
||||
modality = query['modality']
|
||||
raise ValueError(f"Unsupported query modality: '{modality}'")
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model="TIGER-Lab/VLM2Vec-Full",
|
||||
task="embed",
|
||||
trust_remote_code=True,
|
||||
@ -95,7 +96,7 @@ def run_vlm2vec(query: Query):
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
)
|
||||
@@ -126,15 +127,18 @@ def get_query(modality: QueryModality):
     raise ValueError(msg)


-def run_encode(model: str, modality: QueryModality):
+def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
     query = get_query(modality)
     req_data = model_example_map[model](query)

+    engine_args = asdict(req_data.engine_args) | {"seed": seed}
+    llm = LLM(**engine_args)
+
     mm_data = {}
     if req_data.image is not None:
         mm_data["image"] = req_data.image

-    outputs = req_data.llm.embed({
+    outputs = llm.embed({
         "prompt": req_data.prompt,
         "multi_modal_data": mm_data,
     })
@@ -144,7 +148,7 @@ def run_encode(model: str, modality: QueryModality):


 def main(args: Namespace):
-    run_encode(args.model_name, args.modality)
+    run_encode(args.model_name, args.modality, args.seed)


 model_example_map = {
@@ -167,5 +171,10 @@ if __name__ == "__main__":
                         default="image",
                         choices=get_args(QueryModality),
                         help='Modality of the input.')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")

     args = parser.parse_args()
     main(args)
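With the change above, `run_encode` builds the `LLM` itself from the returned `EngineArgs` and calls `llm.embed(...)` with the prompt plus any image. A minimal sketch of that call shape for the e5-v configuration used in this example (illustrative only; the bundled `cherry_blossom` image asset and the Llama-3 style prompt are assumptions standing in for the query built by `get_query`):

```python
# Sketch of the multimodal embedding call pattern shown above.
from dataclasses import asdict

from vllm import LLM, EngineArgs
from vllm.assets.image import ImageAsset

engine_args = EngineArgs(
    model="royokong/e5-v",  # multimodal embedding model from this example
    task="embed",
    max_model_len=4096,
)
llm = LLM(**(asdict(engine_args) | {"seed": 0}))

llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501
prompt = llama3_template.format("<image>\nSummary above image in one word: ")

outputs = llm.embed({
    "prompt": prompt,
    "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
})
print(len(outputs[0].outputs.embedding))  # embedding dimensionality
```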
@ -6,13 +6,14 @@ using the chat template defined by the model.
|
||||
"""
|
||||
import os
|
||||
from argparse import Namespace
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple, Optional
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
from PIL.Image import Image
|
||||
from transformers import AutoProcessor, AutoTokenizer
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
@ -25,11 +26,12 @@ IMAGE_URLS = [
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
llm: LLM
|
||||
engine_args: EngineArgs
|
||||
prompt: str
|
||||
stop_token_ids: Optional[list[int]]
|
||||
image_data: list[Image]
|
||||
chat_template: Optional[str]
|
||||
stop_token_ids: Optional[list[int]] = None
|
||||
chat_template: Optional[str] = None
|
||||
lora_requests: Optional[list[LoRARequest]] = None
|
||||
|
||||
|
||||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||||
@ -37,53 +39,55 @@ class ModelRequestData(NamedTuple):
|
||||
# Unless specified, these settings have been tested to work on a single L4.
|
||||
|
||||
|
||||
def load_aria(question, image_urls: list[str]) -> ModelRequestData:
|
||||
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "rhymes-ai/Aria"
|
||||
llm = LLM(model=model_name,
|
||||
tokenizer_mode="slow",
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
limit_mm_per_prompt={"image": len(image_urls)})
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
tokenizer_mode="slow",
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
|
||||
prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n")
|
||||
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_deepseek_vl2(question: str, image_urls: list[str]):
|
||||
def load_deepseek_vl2(question: str,
|
||||
image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "deepseek-ai/deepseek-vl2-tiny"
|
||||
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
|
||||
limit_mm_per_prompt={"image": len(image_urls)})
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholder = "".join(f"image_{i}:<image>\n"
                          for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"

    return ModelRequestData(
        llm=llm,
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=None,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=None,
    )


def load_gemma3(question, image_urls: list[str]) -> ModelRequestData:
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "google/gemma-3-4b-it"

    llm = LLM(
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
@ -112,18 +116,16 @@ def load_gemma3(question, image_urls: list[str]) -> ModelRequestData:
                                           add_generation_prompt=True)

    return ModelRequestData(
        llm=llm,
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=None,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=None,
    )


def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "h2oai/h2ovl-mississippi-800m"

    llm = LLM(
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
@ -146,19 +148,18 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    stop_token_ids = [tokenizer.eos_token_id]

    return ModelRequestData(
        llm=llm,
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=None,
    )


def load_idefics3(question, image_urls: list[str]) -> ModelRequestData:
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    llm = LLM(
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
@ -177,18 +178,16 @@ def load_idefics3(question, image_urls: list[str]) -> ModelRequestData:
                             for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    return ModelRequestData(
        llm=llm,
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=None,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=None,
    )


def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "OpenGVLab/InternVL2-2B"

    llm = LLM(
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
@ -214,19 +213,18 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        llm=llm,
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=None,
    )


def load_mllama(question, image_urls: list[str]) -> ModelRequestData:
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    llm = LLM(
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=16,
@ -236,19 +234,17 @@ def load_mllama(question, image_urls: list[str]) -> ModelRequestData:
    placeholders = "<|image|>" * len(image_urls)
    prompt = f"{placeholders}<|begin_of_text|>{question}"
    return ModelRequestData(
        llm=llm,
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=None,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=None,
    )


def load_nvlm_d(question: str, image_urls: list[str]):
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    llm = LLM(
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
@ -266,14 +262,11 @@ def load_nvlm_d(question: str, image_urls: list[str]):
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)
    stop_token_ids = None

    return ModelRequestData(
        llm=llm,
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=None,
    )


@ -281,7 +274,7 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "mistral-community/pixtral-12b"

    # Adjust this as necessary to fit in GPU
    llm = LLM(
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
@ -291,14 +284,11 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:

    placeholders = "[IMG]" * len(image_urls)
    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"
    stop_token_ids = None

    return ModelRequestData(
        llm=llm,
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=None,
    )


@ -315,7 +305,7 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    llm = LLM(
    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
@ -326,14 +316,11 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
    placeholders = "\n".join(f"<|image_{i}|>"
                             for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
    stop_token_ids = None

    return ModelRequestData(
        llm=llm,
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=None,
    )


@ -347,7 +334,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")
    llm = LLM(
    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=10000,
@ -357,30 +344,23 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
        max_lora_rank=320,
        lora_extra_vocab_size=0,
    )
    lora_request = LoRARequest("vision", 1, vision_lora_path)
    # To maintain code compatibility in this script, we add LoRA here.
    llm.llm_engine.add_lora(lora_request=lora_request)
    # You can also add LoRA using:
    # llm.generate(prompts, lora_request=lora_request,...)

    placeholders = "".join(f"<|image_{i}|>"
                           for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
    stop_token_ids = None

    return ModelRequestData(
        llm=llm,
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=None,
        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
    )
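The Phi-4-multimodal loader above no longer attaches the LoRA adapter itself; it only records a `LoRARequest` in the new `lora_requests` field, and the runner registers it once the engine exists. A hedged sketch of that deferred-LoRA flow, assuming a locally available adapter directory (the model id and path are placeholders, not taken from the diff):

```python
from vllm import LLM
from vllm.lora.request import LoRARequest

# The engine must be built with LoRA support enabled, mirroring the loader's
# EngineArgs above (enable_lora / max_lora_rank).
llm = LLM(model="microsoft/Phi-4-multimodal-instruct",  # assumed model id
          trust_remote_code=True,
          enable_lora=True,
          max_lora_rank=320)

# Register the adapter after construction, exactly as the runner now does.
vision_lora = LoRARequest("vision", 1, "/path/to/vision-lora")  # placeholder path
llm.llm_engine.add_lora(lora_request=vision_lora)

# Alternatively, attach it per call:
# llm.generate(prompts, lora_request=vision_lora)
```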


def load_qwen_vl_chat(question: str,
                      image_urls: list[str]) -> ModelRequestData:
    model_name = "Qwen/Qwen-VL-Chat"
    llm = LLM(
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=1024,
@ -411,7 +391,7 @@ def load_qwen_vl_chat(question: str,
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        llm=llm,
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
@ -419,7 +399,7 @@ def load_qwen_vl_chat(question: str,
    )


def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    try:
        from qwen_vl_utils import process_vision_info
    except ModuleNotFoundError:
@ -431,7 +411,7 @@ def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    # Tested on L40
    llm = LLM(
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768 if process_vision_info is None else 4096,
        max_num_seqs=5,
@ -460,23 +440,19 @@ def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
                                           tokenize=False,
                                           add_generation_prompt=True)

    stop_token_ids = None

    if process_vision_info is None:
        image_data = [fetch_image(url) for url in image_urls]
    else:
        image_data, _ = process_vision_info(messages)

    return ModelRequestData(
        llm=llm,
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=image_data,
        chat_template=None,
    )


def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    try:
        from qwen_vl_utils import process_vision_info
    except ModuleNotFoundError:
@ -487,7 +463,7 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    llm = LLM(
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768 if process_vision_info is None else 4096,
        max_num_seqs=5,
@ -516,8 +492,6 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
                                           tokenize=False,
                                           add_generation_prompt=True)

    stop_token_ids = None

    if process_vision_info is None:
        image_data = [fetch_image(url) for url in image_urls]
    else:
@ -525,11 +499,9 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
                                               return_video_kwargs=False)

    return ModelRequestData(
        llm=llm,
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=image_data,
        chat_template=None,
    )


@ -551,14 +523,25 @@ model_example_map = {
}


def run_generate(model, question: str, image_urls: list[str]):
def run_generate(model, question: str, image_urls: list[str],
                 seed: Optional[int]):
    req_data = model_example_map[model](question, image_urls)

    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
    llm = LLM(**engine_args)

    # To maintain code compatibility in this script, we add LoRA here.
    # You can also add LoRA using:
    # llm.generate(prompts, lora_request=lora_request,...)
    if req_data.lora_requests:
        for lora_request in req_data.lora_requests:
            llm.llm_engine.add_lora(lora_request=lora_request)

    sampling_params = SamplingParams(temperature=0.0,
                                     max_tokens=128,
                                     stop_token_ids=req_data.stop_token_ids)

    outputs = req_data.llm.generate(
    outputs = llm.generate(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": {
@ -572,13 +555,24 @@ def run_generate(model, question: str, image_urls: list[str]):
        print(generated_text)


def run_chat(model: str, question: str, image_urls: list[str]):
def run_chat(model: str, question: str, image_urls: list[str],
             seed: Optional[int]):
    req_data = model_example_map[model](question, image_urls)

    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    # To maintain code compatibility in this script, we add LoRA here.
    # You can also add LoRA using:
    # llm.generate(prompts, lora_request=lora_request,...)
    if req_data.lora_requests:
        for lora_request in req_data.lora_requests:
            llm.llm_engine.add_lora(lora_request=lora_request)

    sampling_params = SamplingParams(temperature=0.0,
                                     max_tokens=128,
                                     stop_token_ids=req_data.stop_token_ids)
    outputs = req_data.llm.chat(
    outputs = llm.chat(
        [{
            "role":
            "user",
@ -607,11 +601,12 @@ def run_chat(model: str, question: str, image_urls: list[str]):
def main(args: Namespace):
    model = args.model_type
    method = args.method
    seed = args.seed

    if method == "generate":
        run_generate(model, QUESTION, IMAGE_URLS)
        run_generate(model, QUESTION, IMAGE_URLS, seed)
    elif method == "chat":
        run_chat(model, QUESTION, IMAGE_URLS)
        run_chat(model, QUESTION, IMAGE_URLS, seed)
    else:
        raise ValueError(f"Invalid method: {method}")

@ -632,6 +627,10 @@ if __name__ == "__main__":
        default="generate",
        choices=["generate", "chat"],
        help="The method to run in `vllm.LLM`.")
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Set the seed when initializing `vllm.LLM`.")

    args = parser.parse_args()
    main(args)
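Taken together, the example-script changes above switch every loader from building a live `LLM` to returning `EngineArgs`, so the runner constructs the engine once and injects the CLI `--seed`. A condensed sketch of that pattern (the `load_dummy` loader and model id are illustrative, not part of the script):

```python
from dataclasses import asdict
from typing import NamedTuple, Optional

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs


class RequestData(NamedTuple):
    # Stand-in for the script's ModelRequestData: configuration, not an engine.
    engine_args: EngineArgs
    prompt: str


def load_dummy(question: str) -> RequestData:
    # Illustrative loader: describe the engine instead of instantiating it.
    return RequestData(
        engine_args=EngineArgs(model="facebook/opt-125m", max_model_len=1024),
        prompt=question,
    )


def run_generate(question: str, seed: Optional[int]) -> None:
    req = load_dummy(question)
    # Merge the loader's EngineArgs with the CLI-provided seed, mirroring
    # `asdict(req_data.engine_args) | {"seed": seed}` in the diff above.
    llm = LLM(**(asdict(req.engine_args) | {"seed": seed}))
    outputs = llm.generate(req.prompt,
                           SamplingParams(temperature=0.0, max_tokens=32))
    print(outputs[0].outputs[0].text)


if __name__ == "__main__":
    run_generate("What is the capital of France?", seed=0)
```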
@ -20,7 +20,7 @@ tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.11, < 0.11
outlines == 0.1.11
lark == 1.2.2
xgrammar == 0.1.15; platform_machine == "x86_64" or platform_machine == "aarch64"
xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
@ -28,7 +28,7 @@ pyzmq
msgspec
gguf == 0.10.0
importlib_metadata
mistral_common[opencv] >= 1.5.0
mistral_common[opencv] >= 1.5.4
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12

@ -15,7 +15,7 @@ pydantic >= 2.8
torch
py-cpuinfo
transformers
mistral_common >= 1.5.0
mistral_common >= 1.5.4
aiohttp
starlette
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args

@ -27,7 +27,7 @@ torchaudio==2.6.0
torchvision==0.21.0
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.5.0 # required for pixtral test
mistral_common[opencv] >= 1.5.4 # required for pixtral test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.4 # required for model evaluation test
transformers==4.48.2
@ -40,4 +40,4 @@ tritonclient==2.51.0

numpy < 2.0.0
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
runai-model-streamer-s3==0.11.0

@ -235,7 +235,7 @@ mbstrdecoder==1.1.3
    # typepy
mdurl==0.1.2
    # via markdown-it-py
mistral-common==1.5.1
mistral-common==1.5.4
    # via -r requirements/test.in
more-itertools==10.5.0
    # via lm-eval

@ -17,9 +17,9 @@ ray[data]
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
setup.py
@ -294,26 +294,28 @@ class repackage_wheel(build_ext):
            ]).decode("utf-8")
            upstream_main_commit = json.loads(resp_json)["sha"]

            # Check if the local main branch is up-to-date. This is to ensure
            # the base commit we found is the most recent commit on the main
            # branch.
            local_main_commit = subprocess.check_output(
                ["git", "rev-parse", "main"]).decode("utf-8").strip()
            if local_main_commit != upstream_main_commit:
                raise ValueError(
                    f"Local main branch ({local_main_commit}) is not "
                    "up-to-date with upstream main branch "
                    f"({upstream_main_commit}). Please pull the latest "
                    "changes from upstream main branch first.")
            # Check if the upstream_main_commit exists in the local repo
            try:
                subprocess.check_output(
                    ["git", "cat-file", "-e", f"{upstream_main_commit}"])
            except subprocess.CalledProcessError:
                # If not present, fetch it from the remote repository.
                # Note that this does not update any local branches,
                # but ensures that this commit ref and its history are
                # available in our local repo.
                subprocess.check_call([
                    "git", "fetch", "https://github.com/vllm-project/vllm",
                    "main"
                ])

            # Then get the commit hash of the current branch that is the same as
            # the upstream main commit.
            current_branch = subprocess.check_output(
                ["git", "branch", "--show-current"]).decode("utf-8").strip()

            base_commit = subprocess.check_output(
                ["git", "merge-base", "main",
                 current_branch]).decode("utf-8").strip()
            base_commit = subprocess.check_output([
                "git", "merge-base", f"{upstream_main_commit}", current_branch
            ]).decode("utf-8").strip()
            return base_commit
        except ValueError as err:
            raise ValueError(err) from None
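The setup.py change replaces the strict "local main must match upstream" check with a softer flow: ensure the upstream commit object exists locally (fetching it if needed) and compute the merge base against that commit rather than against the local `main` branch. A standalone sketch of the same logic, with error handling trimmed:

```python
import subprocess


def find_base_commit(upstream_main_commit: str) -> str:
    try:
        # Does the commit object already exist in the local repo?
        subprocess.check_output(
            ["git", "cat-file", "-e", upstream_main_commit])
    except subprocess.CalledProcessError:
        # Fetch the ref's history without updating any local branches.
        subprocess.check_call(
            ["git", "fetch", "https://github.com/vllm-project/vllm", "main"])

    current_branch = subprocess.check_output(
        ["git", "branch", "--show-current"]).decode("utf-8").strip()

    # Merge base of the current branch against the upstream main commit.
    return subprocess.check_output(
        ["git", "merge-base", upstream_main_commit,
         current_branch]).decode("utf-8").strip()
```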
|
@ -7,10 +7,10 @@ from vllm import LLM, SamplingParams
|
||||
from vllm.device_allocator.cumem import CuMemAllocator
|
||||
from vllm.utils import GiB_bytes
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
from ..utils import create_new_process_for_each_test
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_python_error():
|
||||
"""
|
||||
Test if Python error occurs when there's low-level
|
||||
@ -36,7 +36,7 @@ def test_python_error():
|
||||
allocator.wake_up()
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_basic_cumem():
|
||||
# some tensors from default memory pool
|
||||
shape = (1024, 1024)
|
||||
@ -69,7 +69,7 @@ def test_basic_cumem():
|
||||
assert torch.allclose(output, torch.ones_like(output) * 3)
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_cumem_with_cudagraph():
|
||||
allocator = CuMemAllocator.get_instance()
|
||||
with allocator.use_memory_pool():
|
||||
@ -114,7 +114,7 @@ def test_cumem_with_cudagraph():
|
||||
assert torch.allclose(y, x + 1)
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
@pytest.mark.parametrize(
|
||||
"model, use_v1",
|
||||
[
|
||||
|
@ -12,7 +12,7 @@ from vllm import LLM, SamplingParams
|
||||
from vllm.config import CompilationLevel
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
from ..utils import create_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.fixture(params=None, name="model_info")
|
||||
@ -78,7 +78,7 @@ def models_list_fixture(request):
|
||||
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
|
||||
)
|
||||
@pytest.mark.parametrize("model_info", "", indirect=True)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_full_graph(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
model_info: tuple[str, dict[str, Any]],
|
||||
|
@ -8,7 +8,7 @@ import pytest
|
||||
from vllm.config import TaskOption
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
logger = init_logger("test_expert_parallel")
|
||||
|
||||
@ -209,7 +209,7 @@ def _compare_tp(
|
||||
for params in settings.iter_params(model_name)
|
||||
],
|
||||
)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_ep(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
|
@ -17,7 +17,7 @@ from vllm.config import TaskOption
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
logger = init_logger("test_pipeline_parallel")
|
||||
|
||||
@ -402,7 +402,7 @@ def _compare_tp(
|
||||
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_tp_language_generation(
|
||||
model_id: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
@ -431,7 +431,7 @@ def test_tp_language_generation(
|
||||
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_tp_language_embedding(
|
||||
model_id: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
@ -460,7 +460,7 @@ def test_tp_language_embedding(
|
||||
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_tp_multimodal_generation(
|
||||
model_id: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
|
@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
|
||||
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing_extensions import LiteralString
|
||||
@ -18,7 +18,7 @@ if TYPE_CHECKING:
|
||||
"FLASH_ATTN",
|
||||
"FLASHINFER",
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_pp_cudagraph(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
PP_SIZE: int,
|
||||
|
@ -4,12 +4,12 @@ import pytest
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
from ...utils import fork_new_process_for_each_test
|
||||
from ...utils import create_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size", [1, 2])
|
||||
@pytest.mark.parametrize("backend", ["mp", "ray"])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_collective_rpc(tp_size, backend):
|
||||
if tp_size == 1 and backend == "ray":
|
||||
pytest.skip("Skip duplicate test case")
|
||||
|
@ -3,10 +3,9 @@
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from tests.utils import fork_new_process_for_each_test
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
from ..utils import create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
MODEL_PATH = "THUDM/chatglm3-6b"
|
||||
|
||||
@ -55,7 +54,7 @@ def v1(run_with_both_engines_lora):
|
||||
pass
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_chatglm3_lora(chatglm3_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
@ -75,7 +74,7 @@ def test_chatglm3_lora(chatglm3_lora_files):
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_chatglm3_lora_tp4(chatglm3_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
@ -96,7 +95,7 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
|
@ -4,10 +4,9 @@ import pytest
|
||||
import ray
|
||||
|
||||
import vllm
|
||||
from tests.utils import fork_new_process_for_each_test
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
from ..utils import create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||
|
||||
@ -82,7 +81,7 @@ def v1(run_with_both_engines_lora):
|
||||
|
||||
# V1 Test: Failing due to numerics on V1.
|
||||
@pytest.mark.skip_v1
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora(sql_lora_files):
|
||||
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
@ -97,7 +96,7 @@ def test_llama_lora(sql_lora_files):
|
||||
# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
|
||||
# used by the engine yet.
|
||||
@pytest.mark.skip_v1
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora_warmup(sql_lora_files):
|
||||
"""Test that the LLM initialization works with a warmup LORA path and
|
||||
is more conservative"""
|
||||
@ -128,7 +127,7 @@ def test_llama_lora_warmup(sql_lora_files):
|
||||
# V1 Test: Failing due to numerics on V1.
|
||||
@pytest.mark.skip_v1
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora_tp4(sql_lora_files):
|
||||
|
||||
llm = vllm.LLM(
|
||||
@ -143,7 +142,7 @@ def test_llama_lora_tp4(sql_lora_files):
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
|
||||
|
||||
llm = vllm.LLM(
|
||||
@ -159,7 +158,7 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):
|
||||
|
||||
llm = vllm.LLM(
|
||||
|
@ -3,11 +3,12 @@
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from tests.utils import fork_new_process_for_each_test
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import create_new_process_for_each_test
|
||||
|
||||
MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
|
||||
|
||||
PROMPT_TEMPLATE = (
|
||||
@ -57,7 +58,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_minicpmv_lora(minicpmv_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
@ -80,7 +81,7 @@ def test_minicpmv_lora(minicpmv_lora_files):
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
@ -101,7 +102,7 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
|
@ -3,10 +3,9 @@
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from tests.utils import fork_new_process_for_each_test
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
from ..utils import create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
MODEL_PATH = "ArthurZ/ilama-3.2-1B"
|
||||
|
||||
@ -56,7 +55,7 @@ def v1(run_with_both_engines_lora):
|
||||
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora(ilama_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
@ -77,7 +76,7 @@ def test_ilama_lora(ilama_lora_files):
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora_tp4(ilama_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
@ -99,7 +98,7 @@ def test_ilama_lora_tp4(ilama_lora_files):
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
|
@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import json
|
||||
import pickle
|
||||
|
||||
import pytest
|
||||
@ -208,8 +209,6 @@ def test_guided_decoding_backend_options():
|
||||
|
||||
|
||||
def test_pickle_xgrammar_tokenizer_data():
|
||||
|
||||
# TODO: move to another test file for xgrammar
|
||||
try:
|
||||
import xgrammar as xgr
|
||||
except ImportError:
|
||||
@ -217,7 +216,11 @@ def test_pickle_xgrammar_tokenizer_data():
|
||||
|
||||
from vllm.model_executor.guided_decoding.xgrammar_decoding import (
|
||||
TokenizerData)
|
||||
tokenizer_data = TokenizerData(vocab_type=xgr.VocabType.RAW)
|
||||
tokenizer_data = TokenizerData(
|
||||
metadata=
|
||||
'{"vocab_type":2,"vocab_size":151665,"add_prefix_space":false,"stop_token_ids":[151645]}',
|
||||
encoded_vocab=['!', '"', '#', '$', '%'],
|
||||
)
|
||||
pickled = pickle.dumps(tokenizer_data)
|
||||
|
||||
assert pickled is not None
|
||||
@ -225,4 +228,5 @@ def test_pickle_xgrammar_tokenizer_data():
|
||||
depickled: TokenizerData = pickle.loads(pickled)
|
||||
|
||||
assert depickled is not None
|
||||
assert depickled.vocab_type == xgr.VocabType.RAW
|
||||
assert json.loads(
|
||||
depickled.metadata)['vocab_type'] == xgr.VocabType.BYTE_LEVEL.value
|
||||
|
@ -17,7 +17,7 @@ from vllm.utils import identity
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets,
|
||||
_VideoAssets)
|
||||
from ....utils import (fork_new_process_for_each_test, large_gpu_mark,
|
||||
from ....utils import (create_new_process_for_each_test, large_gpu_mark,
|
||||
multi_gpu_marks)
|
||||
from ...utils import check_outputs_equal
|
||||
from .vlm_utils import custom_inputs, model_utils, runners
|
||||
@ -592,7 +592,7 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.IMAGE,
|
||||
fork_new_process_for_each_test=False,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_single_image_models(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@ -617,7 +617,7 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.MULTI_IMAGE,
|
||||
fork_new_process_for_each_test=False,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@ -642,7 +642,7 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.EMBEDDING,
|
||||
fork_new_process_for_each_test=False,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_image_embedding_models(model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@ -666,7 +666,7 @@ def test_image_embedding_models(model_type: str,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.VIDEO,
|
||||
fork_new_process_for_each_test=False,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
|
||||
@ -688,7 +688,7 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
fork_new_process_for_each_test=False,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_custom_inputs_models(
|
||||
model_type: str,
|
||||
@ -714,9 +714,9 @@ def test_custom_inputs_models(
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.IMAGE,
|
||||
fork_new_process_for_each_test=True,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
@ -740,9 +740,9 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.MULTI_IMAGE,
|
||||
fork_new_process_for_each_test=True,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
@ -766,9 +766,9 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.EMBEDDING,
|
||||
fork_new_process_for_each_test=True,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_image_embedding_models_heavy(model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
@ -791,7 +791,7 @@ def test_image_embedding_models_heavy(model_type: str,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.VIDEO,
|
||||
fork_new_process_for_each_test=True,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
@ -814,9 +814,9 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
fork_new_process_for_each_test=True,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_custom_inputs_models_heavy(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
|
@ -13,9 +13,9 @@ from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
|
||||
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
|
||||
|
||||
|
||||
def get_filtered_test_settings(test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
fork_per_test: bool) -> dict[str, VLMTestInfo]:
|
||||
def get_filtered_test_settings(
|
||||
test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
|
||||
new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
|
||||
"""Given the dict of potential test settings to run, return a subdict
|
||||
of tests who have the current test type enabled with the matching val for
|
||||
fork_per_test.
|
||||
@ -43,7 +43,7 @@ def get_filtered_test_settings(test_settings: dict[str, VLMTestInfo],
|
||||
|
||||
# Everything looks okay; keep if this is has correct proc handling
|
||||
if (test_info.distributed_executor_backend
|
||||
is not None) == fork_per_test:
|
||||
is not None) == new_proc_per_test:
|
||||
matching_tests[test_name] = test_info
|
||||
|
||||
return matching_tests
|
||||
@ -51,14 +51,14 @@ def get_filtered_test_settings(test_settings: dict[str, VLMTestInfo],
|
||||
|
||||
def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
fork_new_process_for_each_test: bool):
|
||||
create_new_process_for_each_test: bool):
|
||||
"""Converts all of our VLMTestInfo into an expanded list of parameters.
|
||||
This is similar to nesting pytest parametrize calls, but done directly
|
||||
through an itertools product so that each test can set things like
|
||||
size factors etc, while still running in isolated test cases.
|
||||
"""
|
||||
matching_tests = get_filtered_test_settings(
|
||||
test_settings, test_type, fork_new_process_for_each_test)
|
||||
test_settings, test_type, create_new_process_for_each_test)
|
||||
|
||||
# Ensure that something is wrapped as an iterable it's not already
|
||||
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
|
||||
|
@ -10,7 +10,7 @@ import pytest
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
from ....utils import fork_new_process_for_each_test, multi_gpu_test
|
||||
from ....utils import create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
PROMPTS = [
|
||||
{
|
||||
@ -119,7 +119,7 @@ def run_test(
|
||||
assert output.outputs[0].text == expected
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize(
|
||||
"model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
|
||||
|
@ -5,10 +5,10 @@ import pytest
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.assets.image import ImageAsset
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
from ..utils import create_new_process_for_each_test
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_plugin(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
dummy_opt_path: str,
|
||||
@ -24,7 +24,7 @@ def test_plugin(
|
||||
assert (error_msg in str(excinfo.value))
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_oot_registration_text_generation(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
dummy_opt_path: str,
|
||||
@ -44,7 +44,7 @@ def test_oot_registration_text_generation(
|
||||
assert rest == ""
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_oot_registration_embedding(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
dummy_gemma2_embedding_path: str,
|
||||
@ -62,7 +62,7 @@ def test_oot_registration_embedding(
|
||||
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_oot_registration_multimodal(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
dummy_llava_path: str,
|
||||
|
@ -17,7 +17,7 @@ from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
|
||||
ModelRegistry)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
from ..utils import create_new_process_for_each_test
|
||||
from .registry import HF_EXAMPLE_MODELS
|
||||
|
||||
|
||||
@ -45,7 +45,7 @@ def test_registry_imports(model_arch):
|
||||
assert supports_multimodal(model_cls)
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
@pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
|
||||
("LlamaForCausalLM", False, False, False),
|
||||
("MllamaForConditionalGeneration", True, False, False),
|
||||
@ -70,7 +70,7 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
|
||||
stacklevel=2)
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
@pytest.mark.parametrize("model_arch,is_pp,init_cuda", [
|
||||
("MLPSpeculatorPreTrainedModel", False, False),
|
||||
("DeepseekV2ForCausalLM", True, False),
|
||||
|
@ -10,7 +10,8 @@ import pytest
|
||||
import torch
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from tests.utils import compare_two_settings, fork_new_process_for_each_test
|
||||
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
models_4bit_to_test = [
|
||||
("facebook/opt-125m", "quantize opt model inflight"),
|
||||
@ -32,7 +33,7 @@ models_pre_quant_8bit_to_test = [
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@ -45,7 +46,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description",
|
||||
models_pre_qaunt_4bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@ -57,7 +58,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description",
|
||||
models_pre_quant_8bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@ -70,7 +71,7 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@ -88,7 +89,7 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_load_pp_4bit_bnb_model(model_name, description) -> None:
|
||||
common_args = [
|
||||
"--disable-log-stats",
|
||||
|
@ -42,7 +42,7 @@ from transformers import AutoTokenizer
|
||||
|
||||
from vllm import SamplingParams
|
||||
|
||||
from ...utils import fork_new_process_for_each_test
|
||||
from ...utils import create_new_process_for_each_test
|
||||
from .conftest import (get_output_from_llm_generator,
|
||||
run_equality_correctness_test)
|
||||
|
||||
@ -82,7 +82,7 @@ from .conftest import (get_output_from_llm_generator,
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 32])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_e2e_with_detokenization(test_llm_generator,
|
||||
batch_size: int):
|
||||
"""Run generation with speculative decoding on a batch. Verify the engine
|
||||
@ -170,7 +170,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [1])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||
@ -244,7 +244,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [64])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
|
||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||
@ -300,7 +300,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [32])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
|
||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
|
||||
@ -356,7 +356,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
|
||||
256,
|
||||
])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
|
||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||
@ -411,7 +411,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
|
||||
64,
|
||||
])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
|
||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||
@ -469,7 +469,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [4])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_e2e_greedy_correctness_with_preemption(
|
||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||
@ -534,7 +534,7 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
|
||||
32,
|
||||
])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs,
|
||||
@ -594,7 +594,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
|
||||
64,
|
||||
])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_skip_speculation(vllm_runner, common_llm_kwargs,
|
||||
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||
test_llm_kwargs, batch_size: int, output_len: int,
|
||||
@ -644,7 +644,7 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
@pytest.mark.parametrize("output_len", [10])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_disable_speculation(vllm_runner, common_llm_kwargs,
|
||||
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||
test_llm_kwargs, batch_size: int, output_len: int,
|
||||
@ -697,7 +697,7 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
|
||||
32,
|
||||
])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
|
||||
output_len: int, seed: int):
|
||||
@ -752,7 +752,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
32,
|
||||
])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_typical_acceptance_sampling(vllm_runner, common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs,
|
||||
@ -16,7 +16,7 @@ from vllm.utils import (FlexibleArgumentParser, MemorySnapshot,
                        deprecate_kwargs, get_open_port, memory_profiling,
                        merge_async_iterators, supports_kw, swap_dict_values)

from .utils import error_on_warning, fork_new_process_for_each_test
from .utils import create_new_process_for_each_test, error_on_warning


@pytest.mark.asyncio
@ -276,7 +276,7 @@ def test_supports_kw(callable,kw_name,requires_kw_only,
    ) == is_supported


@fork_new_process_for_each_test
@create_new_process_for_each_test()
def test_memory_profiling():
    # Fake out some model loading + inference memory usage to test profiling
    # Memory used by other processes will show up as cuda usage outside of torch

@ -7,12 +7,14 @@ import os
import signal
import subprocess
import sys
import tempfile
import time
import warnings
from contextlib import contextmanager
from contextlib import contextmanager, suppress
from pathlib import Path
from typing import Any, Callable, Optional, Union
from typing import Any, Callable, Literal, Optional, Union

import cloudpickle
import openai
import pytest
import requests
@ -703,6 +705,78 @@ def fork_new_process_for_each_test(
    return wrapper


def spawn_new_process_for_each_test(
        f: Callable[_P, None]) -> Callable[_P, None]:
    """Decorator to spawn a new process for each test function.
    """

    @functools.wraps(f)
    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
        # Check if we're already in a subprocess
        if os.environ.get('RUNNING_IN_SUBPROCESS') == '1':
            # If we are, just run the function directly
            return f(*args, **kwargs)

        import torch.multiprocessing as mp
        with suppress(RuntimeError):
            mp.set_start_method('spawn')

        # Get the module
        module_name = f.__module__

        # Create a process with environment variable set
        env = os.environ.copy()
        env['RUNNING_IN_SUBPROCESS'] = '1'

        with tempfile.TemporaryDirectory() as tempdir:
            output_filepath = os.path.join(tempdir, "new_process.tmp")

            # `cloudpickle` allows pickling complex functions directly
            input_bytes = cloudpickle.dumps((f, output_filepath))

            cmd = [sys.executable, "-m", f"{module_name}"]

            returned = subprocess.run(cmd,
                                      input=input_bytes,
                                      capture_output=True,
                                      env=env)

            # check if the subprocess is successful
            try:
                returned.check_returncode()
            except Exception as e:
                # wrap raised exception to provide more information
                raise RuntimeError(f"Error raised in subprocess:\n"
                                   f"{returned.stderr.decode()}") from e

    return wrapper


def create_new_process_for_each_test(
    method: Optional[Literal["spawn", "fork"]] = None
) -> Callable[[Callable[_P, None]], Callable[_P, None]]:
    """Creates a decorator that runs each test function in a new process.

    Args:
        method: The process creation method. Can be either "spawn" or "fork".
            If not specified,
            it defaults to "spawn" on ROCm platforms and "fork" otherwise.

    Returns:
        A decorator to run test functions in separate processes.
    """
    if method is None:
        method = "spawn" if current_platform.is_rocm() else "fork"

    assert method in ["spawn",
                      "fork"], "Method must be either 'spawn' or 'fork'"

    if method == "fork":
        return fork_new_process_for_each_test

    return spawn_new_process_for_each_test


def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
    """
    Get a pytest mark, which skips the test if the GPU doesn't meet
@ -762,7 +836,7 @@ def multi_gpu_test(*, num_gpus: int):
    marks = multi_gpu_marks(num_gpus=num_gpus)

    def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
        func = fork_new_process_for_each_test(f)
        func = create_new_process_for_each_test()(f)
        for mark in reversed(marks):
            func = mark(func)
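With the helpers above, per-test process isolation becomes a decorator factory: callers write `@create_new_process_for_each_test()` (or force `"spawn"` where forking after device initialization is unsafe) instead of the bare `@fork_new_process_for_each_test`. A hedged usage sketch, using the `tests.utils` import path shown in the diff and placeholder test bodies:

```python
import pytest

from tests.utils import create_new_process_for_each_test


@create_new_process_for_each_test()  # defaults: "spawn" on ROCm, "fork" elsewhere
def test_engine_smoke():
    assert 2 + 2 == 4


@create_new_process_for_each_test("spawn")  # force spawn explicitly
@pytest.mark.parametrize("value", [1, 2])
def test_param(value: int):
    assert value > 0
```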
|
@ -9,7 +9,6 @@ from concurrent.futures import Future
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from tests.utils import fork_new_process_for_each_test
|
||||
from vllm import SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.platforms import current_platform
|
||||
@ -19,6 +18,8 @@ from vllm.v1.executor.abstract import Executor, UniProcExecutor
|
||||
from vllm.v1.kv_cache_interface import KVCacheConfig
|
||||
from vllm.v1.outputs import ModelRunnerOutput
|
||||
|
||||
from ...utils import create_new_process_for_each_test
|
||||
|
||||
if not current_platform.is_cuda():
|
||||
pytest.skip(reason="V1 currently only supported on CUDA.",
|
||||
allow_module_level=True)
|
||||
@ -44,7 +45,7 @@ def make_request() -> EngineCoreRequest:
|
||||
)
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_engine_core(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
@ -158,7 +159,7 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
|
||||
"""
|
||||
A basic end-to-end test to verify that the engine functions correctly
|
||||
@ -208,7 +209,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
|
||||
_check_engine_state()
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
||||
"""
|
||||
Test that the engine can handle multiple concurrent batches.
|
||||
|
@@ -8,7 +8,6 @@ from typing import Optional
import pytest
from transformers import AutoTokenizer

-from tests.utils import fork_new_process_for_each_test
from vllm import SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
@@ -19,6 +18,8 @@ from vllm.v1.engine.core_client import (AsyncMPClient, EngineCoreClient,
                                        SyncMPClient)
from vllm.v1.executor.abstract import Executor

+from ...utils import create_new_process_for_each_test
+
if not current_platform.is_cuda():
    pytest.skip(reason="V1 currently only supported on CUDA.",
                allow_module_level=True)
@@ -88,7 +89,7 @@ def echo(self, msg: str, err_msg: Optional[str] = None) -> str:
    return msg


-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
@pytest.mark.parametrize("multiprocessing_mode", [True, False])
def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
                            multiprocessing_mode: bool):
@@ -18,9 +18,6 @@ MODELS_TO_TEST = [
    "Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410"
]

-# Undo after https://github.com/vllm-project/vllm/pull/14868
-pytest.skip(allow_module_level=True)


@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend",
@@ -1191,7 +1191,7 @@ class EngineArgs:
        NOTE: for autoselection of V0 vs V1 engine, we need to
        create the ModelConfig first, since ModelConfig's attrs
        (e.g. the model arch) are needed to make the decision.

        This function set VLLM_USE_V1=X if VLLM_USE_V1 is
        unspecified by the user.

@@ -1576,10 +1576,6 @@ class EngineArgs:
        #############################################################
        # Experimental Features - allow users to opt in.

-       # MLA is is supported on V1, but off by default for now.
-       if model_config.use_mla and _warn_or_fallback("MLA"):
-           return False
-
        # LoRA is supported on V1, but off by default for now.
        if self.enable_lora and _warn_or_fallback("LORA"):
            return False
@ -379,6 +379,7 @@ class InputPreprocessor:
|
||||
multi_modal_data,
|
||||
mm_processor_kwargs,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
prompt_token_ids = self._tokenize_prompt(
|
||||
@ -401,6 +402,7 @@ class InputPreprocessor:
|
||||
prompt: SingletonPrompt,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> SingletonInputs:
|
||||
"""Async version of :meth:`_extract_prompt_components`."""
|
||||
parsed = parse_singleton_prompt(prompt)
|
||||
@ -431,6 +433,7 @@ class InputPreprocessor:
|
||||
multi_modal_data,
|
||||
mm_processor_kwargs,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
return token_inputs(
|
||||
@ -452,6 +455,7 @@ class InputPreprocessor:
|
||||
multi_modal_data,
|
||||
mm_processor_kwargs,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
prompt_token_ids = await self._tokenize_prompt_async(
|
||||
@ -726,6 +730,7 @@ class InputPreprocessor:
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
return self._build_decoder_only_llm_inputs(
|
||||
@ -746,6 +751,7 @@ class InputPreprocessor:
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
return self._build_decoder_only_llm_inputs(
|
||||
|
@@ -9,7 +9,6 @@ from vllm.model_executor.guided_decoding.reasoner import get_reasoner
from vllm.model_executor.guided_decoding.utils import (
    convert_lark_to_gbnf, grammar_is_likely_lark,
    has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features)
-from vllm.platforms import CpuArchEnum

if TYPE_CHECKING:
    from transformers import PreTrainedTokenizer
@@ -26,7 +25,7 @@ def maybe_backend_fallback(

    def fallback_or_error(guided_params: GuidedDecodingParams, message: str,
                          fallback: str) -> None:
-       """Change the backend to the specified fallback with a warning log,
+       """Change the backend to the specified fallback with a warning log,
        or raise a ValueError if the `no-fallback` option is specified."""
        if guided_params.no_fallback():
            raise ValueError(message)
@@ -53,19 +52,12 @@
    if guided_params.backend_name == "xgrammar":
        from vllm.model_executor.guided_decoding.xgrammar_decoding import (
            xgr_installed)
-       # xgrammar only has x86 wheels for linux, fallback to outlines
-       from vllm.platforms import current_platform
-       if current_platform.get_cpu_architecture() is not CpuArchEnum.X86:
-           fallback_or_error(guided_params,
-                             "xgrammar is only supported on x86 CPUs.",
-                             "outlines")
-
        # xgrammar doesn't support regex, fallback to outlines
        if guided_params.regex is not None:
            fallback_or_error(
                guided_params,
                "xgrammar does not support regex guided decoding.", "outlines")

        # xgrammar doesn't support some JSON schema features
        elif (guided_params.json is not None
              and has_xgrammar_unsupported_json_features(guided_params.json)):
@ -9,13 +9,11 @@ from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Any, List
|
||||
|
||||
import torch
|
||||
from transformers import PreTrainedTokenizerFast
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
try:
|
||||
import xgrammar as xgr
|
||||
from xgrammar.base import _core as xgr_core
|
||||
xgr_installed = True
|
||||
except ImportError:
|
||||
xgr_installed = False
|
||||
@ -35,7 +33,6 @@ if TYPE_CHECKING:
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
# TODO: passing batch size to max threads here
|
||||
def get_local_xgrammar_guided_decoding_logits_processor(
|
||||
guided_params: GuidedDecodingParams,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
@ -52,18 +49,8 @@ def get_local_xgrammar_guided_decoding_logits_processor(
|
||||
@dataclass(frozen=True)
|
||||
class TokenizerData:
|
||||
"""Immutable container for cached tokenizer data."""
|
||||
metadata: str
|
||||
encoded_vocab: list[str] = field(default_factory=list)
|
||||
stop_token_ids: list[int] | None = None
|
||||
# These fields are mutually exclusive: `backend_str` is used to create a
|
||||
# TokenizeInfo with `TokenizerInfo.from_huggingface` while `vocab_type` is
|
||||
# used within the constructor of TokenizeInfo
|
||||
backend_str: str | None = None
|
||||
vocab_type: xgr.VocabType | None = None
|
||||
|
||||
def __post_init__(self):
|
||||
# Check for mutual exclusive
|
||||
assert not (self.backend_str and self.vocab_type), \
|
||||
"backend_str and vocab_type are mutual exclusive"
|
||||
|
||||
|
||||
class TokenizerDataCache:
|
||||
@ -71,46 +58,52 @@ class TokenizerDataCache:
|
||||
_cache: dict[int, TokenizerData] = {}
|
||||
|
||||
@classmethod
|
||||
def get_tokenizer_data(cls,
|
||||
tokenizer: PreTrainedTokenizer) -> TokenizerData:
|
||||
tokenizer_hash = hash(tokenizer)
|
||||
def get_tokenizer_data(
|
||||
cls,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
/,
|
||||
*,
|
||||
tokenizer_hash: int,
|
||||
vocab_size: int,
|
||||
) -> TokenizerData:
|
||||
|
||||
if tokenizer_hash not in cls._cache:
|
||||
# Vendored from xgrammar logic since we cannot pickle the tokenizer
|
||||
# https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98 # noqa: E501
|
||||
tokenizer_info = xgr.TokenizerInfo.from_huggingface(
|
||||
tokenizer,
|
||||
# NOTE: We will need to use lm_head's vocab_size
|
||||
# to determine correct special_token_ids for this tokenizer.
|
||||
# See https://github.com/mlc-ai/xgrammar/commit/70c959fb6d9cea75aae33c414763cd0602022d92 # noqa: E501
|
||||
vocab_size=vocab_size,
|
||||
)
|
||||
metadata = json.loads(tokenizer_info.dump_metadata())
|
||||
|
||||
# Vendored from xgrammar logic to get encoded_vocab
|
||||
# https://github.com/mlc-ai/xgrammar/blob/989222175c2a30fb7987d8bcce35bec1bf6817f2/python/xgrammar/tokenizer_info.py#L127 # noqa: E501
|
||||
try:
|
||||
encoded_vocab = [
|
||||
token for token, _ in sorted(tokenizer.get_vocab().items(),
|
||||
key=lambda x: x[1])
|
||||
]
|
||||
vocab_dict = tokenizer.get_vocab()
|
||||
except AttributeError as e:
|
||||
raise ValueError(
|
||||
f"Cannot get the vocabulary of the tokenizer "
|
||||
f"{type(tokenizer)}. The tokenizer should have a "
|
||||
"get_vocab method.") from e
|
||||
|
||||
stop_token_ids = None
|
||||
backend_str = ""
|
||||
vocab_type = xgr.VocabType.RAW
|
||||
# maintain tokenizer's indexing
|
||||
encoded_vocab = [""] * tokenizer_info.vocab_size
|
||||
for token, idx in vocab_dict.items():
|
||||
if idx < tokenizer_info.vocab_size:
|
||||
encoded_vocab[idx] = token
|
||||
|
||||
if stop_token_ids is None and hasattr(
|
||||
tokenizer,
|
||||
"eos_token_id") and tokenizer.eos_token_id is not None:
|
||||
stop_token_ids = [tokenizer.eos_token_id]
|
||||
|
||||
if isinstance(tokenizer, PreTrainedTokenizerFast):
|
||||
backend_str = tokenizer.backend_tokenizer.to_str()
|
||||
vocab_type = None
|
||||
|
||||
elif isinstance(tokenizer, MistralTokenizer):
|
||||
if isinstance(tokenizer, MistralTokenizer):
|
||||
# REF: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43 # noqa: E501
|
||||
vocab_type = xgr.VocabType.BYTE_FALLBACK
|
||||
metadata.update({
|
||||
"vocab_type": xgr.VocabType.BYTE_FALLBACK,
|
||||
"add_prefix_space": True
|
||||
})
|
||||
|
||||
cls._cache[tokenizer_hash] = TokenizerData(
|
||||
encoded_vocab=encoded_vocab,
|
||||
stop_token_ids=stop_token_ids,
|
||||
backend_str=backend_str,
|
||||
vocab_type=vocab_type)
|
||||
metadata=json.dumps(metadata),
|
||||
)
|
||||
|
||||
return cls._cache[tokenizer_hash]
|
||||
|
||||
@ -129,30 +122,15 @@ class GrammarCompilerCache:
|
||||
cache_key = str(config.tokenizer_hash)
|
||||
|
||||
if cache_key not in cls._cache:
|
||||
assert config.tokenizer_data is not None
|
||||
assert config.tokenizer_data.encoded_vocab is not None
|
||||
|
||||
config_data = config.tokenizer_data
|
||||
|
||||
# In TokenizerDataCache.get_tokenizer_data, a serializable
|
||||
# tokenizer_data is created and cached. This data is used to build
|
||||
# a tokenizer_info and create an xgrammar compiler.
|
||||
# - If tokenizer_data has backend_str set, use
|
||||
# xgr_core.TokenizerInfo.from_huggingface (a C++ bind).
|
||||
# - Otherwise, use the default constructor with vocab_type.
|
||||
# - xgr_core.TokenizerInfo.from_huggingface !=
|
||||
# xgr.TokenizerInfo.from_huggingface.
|
||||
if config_data.backend_str:
|
||||
tokenizer_info = xgr.TokenizerInfo._create_from_handle(
|
||||
xgr_core.TokenizerInfo.from_huggingface(
|
||||
config_data.encoded_vocab, config_data.backend_str,
|
||||
config.vocab_size, config_data.stop_token_ids))
|
||||
else:
|
||||
tokenizer_info = xgr.TokenizerInfo(
|
||||
config_data.encoded_vocab,
|
||||
config_data.vocab_type,
|
||||
vocab_size=config.vocab_size,
|
||||
stop_token_ids=config_data.stop_token_ids)
|
||||
tokenizer_info = xgr.TokenizerInfo.from_vocab_and_metadata(
|
||||
encoded_vocab=config_data.encoded_vocab,
|
||||
metadata=config_data.metadata,
|
||||
)
|
||||
cls._cache[cache_key] = xgr.GrammarCompiler(
|
||||
tokenizer_info, max_threads=config.max_threads)
|
||||
|
||||
@ -163,13 +141,12 @@ class GrammarCompilerCache:
|
||||
class GrammarConfig:
|
||||
"""Serializable configuration for grammar compilation"""
|
||||
tokenizer_hash: int
|
||||
vocab_size: int
|
||||
tokenizer_data: TokenizerData
|
||||
json_str: str | None = None
|
||||
grammar_str: str | None = None
|
||||
json_object: bool | None = None
|
||||
any_whitespace: bool = True
|
||||
max_threads: int = 8
|
||||
tokenizer_data: TokenizerData | None = None
|
||||
|
||||
@classmethod
|
||||
def from_guided_params(cls,
|
||||
@ -179,7 +156,11 @@ class GrammarConfig:
|
||||
max_threads: int = 8) -> GrammarConfig:
|
||||
|
||||
tokenizer_hash = hash(tokenizer)
|
||||
tokenizer_data = TokenizerDataCache.get_tokenizer_data(tokenizer)
|
||||
tokenizer_data = TokenizerDataCache.get_tokenizer_data(
|
||||
tokenizer,
|
||||
tokenizer_hash=tokenizer_hash,
|
||||
vocab_size=model_config.hf_text_config.vocab_size,
|
||||
)
|
||||
|
||||
if guided_params.json:
|
||||
if not isinstance(guided_params.json, str):
|
||||
@ -218,7 +199,6 @@ class GrammarConfig:
|
||||
raise ValueError(str(err)) from err
|
||||
|
||||
return cls(json_str=json_str,
|
||||
vocab_size=model_config.hf_text_config.vocab_size,
|
||||
tokenizer_hash=tokenizer_hash,
|
||||
max_threads=max_threads,
|
||||
tokenizer_data=tokenizer_data,
|
||||
@ -246,14 +226,12 @@ class GrammarConfig:
|
||||
raise ValueError(str(err)) from err
|
||||
|
||||
return cls(grammar_str=grammar_str,
|
||||
vocab_size=model_config.hf_text_config.vocab_size,
|
||||
tokenizer_hash=tokenizer_hash,
|
||||
max_threads=max_threads,
|
||||
tokenizer_data=tokenizer_data)
|
||||
elif guided_params.json_object:
|
||||
return cls(
|
||||
json_object=True,
|
||||
vocab_size=model_config.hf_text_config.vocab_size,
|
||||
tokenizer_hash=tokenizer_hash,
|
||||
max_threads=max_threads,
|
||||
tokenizer_data=tokenizer_data,
|
||||
@ -267,7 +245,6 @@ class GrammarConfig:
|
||||
|
||||
return cls(
|
||||
grammar_str=choice_str,
|
||||
vocab_size=model_config.hf_text_config.vocab_size,
|
||||
tokenizer_hash=tokenizer_hash,
|
||||
max_threads=max_threads,
|
||||
tokenizer_data=tokenizer_data,
|
||||
@ -291,6 +268,13 @@ class GrammarConfig:
|
||||
grammar = ('root ::= ' + ' | '.join(f'"{c}"' for c in escaped_choices))
|
||||
return grammar
|
||||
|
||||
@staticmethod
|
||||
def tokenizer_info(tokenizer_data: TokenizerData) -> xgr.TokenizerInfo:
|
||||
return xgr.TokenizerInfo.from_vocab_and_metadata(
|
||||
encoded_vocab=tokenizer_data.encoded_vocab,
|
||||
metadata=tokenizer_data.metadata,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class XGrammarLogitsProcessor:
|
||||
@ -299,11 +283,16 @@ class XGrammarLogitsProcessor:
|
||||
reasoner: Reasoner | None = None
|
||||
|
||||
ctx: xgr.CompiledGrammar | None = None
|
||||
tokenizer_info: xgr.TokenizerInfo = None # type: ignore[assignment]
|
||||
token_bitmask: torch.Tensor = None # type: ignore[assignment]
|
||||
matchers: list[xgr.GrammarMatcher] = field(default_factory=list)
|
||||
batch_size: int = field(default=1)
|
||||
prefilled: bool = field(default=False)
|
||||
|
||||
def __post_init__(self):
|
||||
self.tokenizer_info = self.config.tokenizer_info(
|
||||
self.config.tokenizer_data)
|
||||
|
||||
def __getstate__(self) -> dict[str, Any]:
|
||||
return {'config': self.config, 'reasoner': self.reasoner}
|
||||
|
||||
@ -311,6 +300,8 @@ class XGrammarLogitsProcessor:
|
||||
self.config = state['config']
|
||||
self.reasoner = state['reasoner']
|
||||
|
||||
self.tokenizer_info = GrammarConfig.tokenizer_info(
|
||||
self.config.tokenizer_data)
|
||||
self.ctx = None
|
||||
self.matchers = []
|
||||
self.batch_size = 1
|
||||
@ -352,7 +343,7 @@ class XGrammarLogitsProcessor:
|
||||
xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size)
|
||||
]
|
||||
self.token_bitmask = xgr.allocate_token_bitmask(
|
||||
self.batch_size, self.config.vocab_size)
|
||||
self.batch_size, self.tokenizer_info.vocab_size)
|
||||
|
||||
if not self.prefilled:
|
||||
# Have not sampled a token yet
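
Editor's note on the xgrammar hunks above: the refactor drops the backend_str/vocab_type plumbing and instead caches a picklable (encoded_vocab, metadata) pair per tokenizer hash, rebuilding the TokenizerInfo where it is needed. The following is only an illustrative sketch of that pattern, not the vLLM code; the helper name cached_tokenizer_info and the module-level _tokenizer_cache are invented here, while the xgr calls mirror the ones that appear in the diff.

import xgrammar as xgr

_tokenizer_cache: dict[int, tuple[list[str], str]] = {}

def cached_tokenizer_info(tokenizer, vocab_size: int) -> xgr.TokenizerInfo:
    """Rebuild a TokenizerInfo from cached, serializable tokenizer data."""
    key = hash(tokenizer)
    if key not in _tokenizer_cache:
        # One-time, potentially expensive inspection of the tokenizer.
        info = xgr.TokenizerInfo.from_huggingface(tokenizer, vocab_size=vocab_size)
        encoded_vocab = [""] * info.vocab_size  # keep the tokenizer's indexing
        for token, idx in tokenizer.get_vocab().items():
            if idx < info.vocab_size:
                encoded_vocab[idx] = token
        _tokenizer_cache[key] = (encoded_vocab, info.dump_metadata())
    encoded_vocab, metadata = _tokenizer_cache[key]
    # Cheap reconstruction from plain lists/strings (safe to pickle and ship).
    return xgr.TokenizerInfo.from_vocab_and_metadata(encoded_vocab=encoded_vocab,
                                                     metadata=metadata)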
@ -25,7 +25,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
||||
SupportsMultiModal, SupportsPP)
|
||||
SupportsMultiModal, SupportsPP, SupportsV0Only)
|
||||
from .siglip import SiglipVisionModel
|
||||
from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
|
||||
maybe_prefix, merge_multimodal_embeddings)
|
||||
@ -374,7 +374,7 @@ class Gemma3MultiModalProjector(nn.Module):
|
||||
info=Gemma3ProcessingInfo,
|
||||
dummy_inputs=Gemma3DummyInputsBuilder)
|
||||
class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
|
||||
SupportsLoRA):
|
||||
SupportsLoRA, SupportsV0Only):
|
||||
packed_modules_mapping = {
|
||||
"qkv_proj": [
|
||||
"q_proj",
|
||||
|
@ -111,6 +111,7 @@ class MixtralAttention(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: MixtralConfig,
|
||||
hidden_size: int,
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
@ -136,7 +137,9 @@ class MixtralAttention(nn.Module):
|
||||
# the KV heads across multiple tensor parallel GPUs.
|
||||
assert tp_size % self.total_num_kv_heads == 0
|
||||
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
||||
self.head_dim = hidden_size // self.total_num_heads
|
||||
# MixtralConfig has an optional head_dim argument
|
||||
self.head_dim = getattr(config, "head_dim",
|
||||
self.hidden_size // self.total_num_heads)
|
||||
self.q_size = self.num_heads * self.head_dim
|
||||
self.kv_size = self.num_kv_heads * self.head_dim
|
||||
self.scaling = self.head_dim**-0.5
|
||||
@ -200,6 +203,7 @@ class MixtralDecoderLayer(nn.Module):
|
||||
# Requires transformers > 4.32.0
|
||||
rope_theta = getattr(config, "rope_theta", 10000)
|
||||
self.self_attn = MixtralAttention(
|
||||
config=config,
|
||||
hidden_size=self.hidden_size,
|
||||
num_heads=config.num_attention_heads,
|
||||
max_position=config.max_position_embeddings,
|
||||
|
@ -165,6 +165,7 @@ class MixtralAttention(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: MixtralConfig,
|
||||
hidden_size: int,
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
@ -190,7 +191,9 @@ class MixtralAttention(nn.Module):
|
||||
# the KV heads across multiple tensor parallel GPUs.
|
||||
assert tp_size % self.total_num_kv_heads == 0
|
||||
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
||||
self.head_dim = hidden_size // self.total_num_heads
|
||||
# MixtralConfig has an optional head_dim argument
|
||||
self.head_dim = getattr(config, "head_dim",
|
||||
self.hidden_size // self.total_num_heads)
|
||||
self.q_size = self.num_heads * self.head_dim
|
||||
self.kv_size = self.num_kv_heads * self.head_dim
|
||||
self.scaling = self.head_dim**-0.5
|
||||
@ -252,6 +255,7 @@ class MixtralDecoderLayer(nn.Module):
|
||||
# Requires transformers > 4.32.0
|
||||
rope_theta = getattr(config, "rope_theta", 10000)
|
||||
self.self_attn = MixtralAttention(
|
||||
config=config,
|
||||
hidden_size=self.hidden_size,
|
||||
num_heads=config.num_attention_heads,
|
||||
max_position=config.max_position_embeddings,
|
||||
|
@ -56,6 +56,8 @@ try:
|
||||
except ImportError:
|
||||
USE_XFORMERS_OPS = False
|
||||
|
||||
PATCH_MERGE = "patch_merge"
|
||||
|
||||
|
||||
class PixtralImagePixelInputs(TypedDict):
|
||||
type: Literal["pixel_values"]
|
||||
@ -71,7 +73,7 @@ class PixtralImagePixelInputs(TypedDict):
|
||||
"""
|
||||
A boolean mask indicating which image embeddings correspond
|
||||
to patch tokens.
|
||||
|
||||
|
||||
Shape: `(batch_size, num_images, num_embeds)`
|
||||
"""
|
||||
|
||||
@ -155,7 +157,6 @@ class PixtralProcessorAdapter:
|
||||
|
||||
for image in images:
|
||||
image_inputs = self.image_processor(ImageChunk(image=image))
|
||||
|
||||
image_processed = torch.tensor(image_inputs.image)
|
||||
image_tokens = torch.tensor(image_inputs.tokens)
|
||||
|
||||
@ -353,6 +354,27 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
)
|
||||
|
||||
self.vision_encoder = VisionTransformer(self.vision_args)
|
||||
|
||||
if self.vision_args.add_pre_mm_projector_layer_norm:
|
||||
self.pre_mm_projector_norm = RMSNorm(self.vision_args.hidden_size,
|
||||
eps=1e-5)
|
||||
|
||||
if self.vision_args.mm_projector_id == PATCH_MERGE:
|
||||
self.patch_merger = PatchMerger(
|
||||
vision_encoder_dim=self.vision_args.hidden_size,
|
||||
spatial_merge_size=self.vision_args.spatial_merge_size,
|
||||
use_mlp_bias=False,
|
||||
)
|
||||
if self.vision_args.add_pre_mm_projector_layer_norm:
|
||||
self.pre_mm_projector_norm = RMSNorm(self.vision_args.hidden_size,
|
||||
eps=1e-5)
|
||||
|
||||
if self.vision_args.mm_projector_id == PATCH_MERGE:
|
||||
self.patch_merger = PatchMerger(
|
||||
vision_encoder_dim=self.vision_args.hidden_size,
|
||||
spatial_merge_size=self.vision_args.spatial_merge_size,
|
||||
use_mlp_bias=False,
|
||||
)
|
||||
self.vision_language_adapter = VisionLanguageAdapter(
|
||||
self.vision_args, dim=config.text_config.hidden_size)
|
||||
|
||||
@ -398,13 +420,25 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
image_input: PixtralImagePixelInputs,
|
||||
) -> tuple[torch.Tensor, ...]:
|
||||
images = image_input["images"]
|
||||
|
||||
image_features = self.vision_encoder(images)
|
||||
feature_sizes = [
|
||||
image_feature.shape[0] for image_feature in image_features
|
||||
]
|
||||
|
||||
image_embeds = self.vision_language_adapter(torch.cat(image_features))
|
||||
image_features = torch.cat(image_features)
|
||||
if self.vision_args.add_pre_mm_projector_layer_norm:
|
||||
image_features = self.pre_mm_projector_norm(image_features)
|
||||
if self.vision_args.mm_projector_id == PATCH_MERGE:
|
||||
patch_size = self.vision_args.patch_size
|
||||
spatial_merge_size_square = self.vision_args.spatial_merge_size**2
|
||||
img_patch_dims = [(img.shape[1] // patch_size,
|
||||
img.shape[2] // patch_size) for img in images]
|
||||
feature_sizes = [
|
||||
feature_size // spatial_merge_size_square
|
||||
for feature_size in feature_sizes
|
||||
]
|
||||
image_features = self.patch_merger(image_features,
|
||||
image_sizes=img_patch_dims)
|
||||
image_embeds = self.vision_language_adapter(image_features)
|
||||
image_embeds = torch.split(image_embeds, feature_sizes)
|
||||
return image_embeds
|
||||
|
||||
@ -524,8 +558,19 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
def is_vision_lang_adapter_weights(weight: Tuple[str, torch.Tensor]):
|
||||
return weight[0].startswith("vision_language_adapter")
|
||||
|
||||
def is_patch_merger(weight: Tuple[str, torch.Tensor]):
|
||||
return weight[0].startswith("patch_merger")
|
||||
|
||||
def is_pre_mm_projector_norm(weight: Tuple[str, torch.Tensor]):
|
||||
return weight[0].startswith("pre_mm_projector_norm")
|
||||
|
||||
# Get references to parameters for direct loading
|
||||
vision_encoder_dict = dict(self.vision_encoder.named_parameters())
|
||||
patch_merger_dict = dict(self.patch_merger.named_parameters(
|
||||
)) if self.vision_args.mm_projector_id == PATCH_MERGE else dict()
|
||||
pre_mm_projector_norm_dict = dict(
|
||||
self.pre_mm_projector_norm.named_parameters(
|
||||
)) if self.vision_args.add_pre_mm_projector_layer_norm else dict()
|
||||
vision_lang_adapter_dict = dict(
|
||||
self.vision_language_adapter.named_parameters())
|
||||
|
||||
@ -538,6 +583,18 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
param = vision_encoder_dict[trimmed_name]
|
||||
with torch.no_grad():
|
||||
default_weight_loader(param, w)
|
||||
elif is_patch_merger((name, w)):
|
||||
# Load vision patch merger weights directly
|
||||
trimmed_name = '.'.join(name.split(".")[1:])
|
||||
param = patch_merger_dict[trimmed_name]
|
||||
with torch.no_grad():
|
||||
default_weight_loader(param, w)
|
||||
elif is_pre_mm_projector_norm((name, w)):
|
||||
# Load vision pre_mm_projector_norm weights directly
|
||||
trimmed_name = '.'.join(name.split(".")[1:])
|
||||
param = pre_mm_projector_norm_dict[trimmed_name]
|
||||
with torch.no_grad():
|
||||
default_weight_loader(param, w)
|
||||
elif is_vision_lang_adapter_weights((name, w)):
|
||||
# Load vision-language adapter weights directly
|
||||
trimmed_name = '.'.join(name.split(".")[1:])
|
||||
@ -566,6 +623,9 @@ class VisionEncoderArgs:
|
||||
rope_theta: float # for rope-2D
|
||||
image_token_id: int
|
||||
adapter_bias: bool = True
|
||||
spatial_merge_size: int = 1
|
||||
add_pre_mm_projector_layer_norm: bool = False
|
||||
mm_projector_id: str = ""
|
||||
|
||||
|
||||
def _reshape_for_broadcast(freqs_cis: torch.Tensor,
|
||||
@ -789,10 +849,10 @@ class VisionTransformer(nn.Module):
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Args:
|
||||
images: list of N_img images of variable sizes,
|
||||
images: list of N_img images of variable sizes,
|
||||
each of shape (C, H, W)
|
||||
Returns:
|
||||
image_features: tensor of token features for
|
||||
image_features: tensor of token features for
|
||||
all tokens of all images of shape (N_toks, D)
|
||||
"""
|
||||
# pass images through initial convolution independently
|
||||
@ -843,6 +903,105 @@ class VisionLanguageAdapter(nn.Module):
|
||||
return self.w_out(self.gelu(self.w_in(x)))
|
||||
|
||||
|
||||
class PatchMerger(nn.Module):
|
||||
"""
|
||||
Learned merging of spatial_merge_size ** 2 patches
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vision_encoder_dim: int,
|
||||
spatial_merge_size: int,
|
||||
use_mlp_bias: bool = False,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
mlp_input_dim = vision_encoder_dim * (spatial_merge_size**2)
|
||||
|
||||
self.spatial_merge_size = spatial_merge_size
|
||||
self.mlp_input_dim = mlp_input_dim
|
||||
|
||||
self.merging_layer = nn.Linear(
|
||||
mlp_input_dim,
|
||||
vision_encoder_dim,
|
||||
bias=use_mlp_bias,
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor,
|
||||
image_sizes: list[tuple[int, int]]) -> torch.Tensor:
|
||||
# image_sizes specified in tokens
|
||||
assert sum([h * w for h, w in image_sizes]) == len(x)
|
||||
|
||||
# x is (N, vision_encoder_dim)
|
||||
x = self.permute(x, image_sizes)
|
||||
|
||||
# x is (N / spatial_merge_size ** 2,
|
||||
# vision_encoder_dim * spatial_merge_size ** 2)
|
||||
x = self.merging_layer(x)
|
||||
|
||||
# x is (N / spatial_merge_size ** 2, vision_encoder_dim)
|
||||
return x
|
||||
|
||||
def permute(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
image_sizes: list[tuple[int, int]],
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Args:
|
||||
x: (N, D) where N is flattened and concatenated patch tokens
|
||||
for all images
|
||||
image_sizes: list of tuple of (height, width) in tokens for
|
||||
each image
|
||||
Returns:
|
||||
image_features: reorders patch tokens so each grid of
|
||||
(spatial_merge_size, spatial_merge_size) is contiguous.
|
||||
now (N / spatial_merge_size ** 2, D * spatial_merge_size ** 2)
|
||||
"""
|
||||
|
||||
sub_grids = get_sub_grids(
|
||||
x=x,
|
||||
image_sizes=image_sizes,
|
||||
spatial_merge_size=self.spatial_merge_size
|
||||
) # list of [d x sub_grid_size x sub_grid_size x n_patches]
|
||||
permuted_tensor: list[torch.Tensor] = []
|
||||
for grid in sub_grids:
|
||||
n_patches = grid.shape[-1]
|
||||
permuted_tensor.append(grid.view(-1, n_patches).t(
|
||||
)) # n_patches x d * sub_grid_size * sub_grid_size
|
||||
return torch.cat(
|
||||
permuted_tensor, dim=0
|
||||
) # (N / spatial_merge_size ** 2, d * spatial_merge_size ** 2)
|
||||
|
||||
|
||||
def get_sub_grids(
|
||||
x: torch.Tensor,
|
||||
image_sizes: list[tuple[int, int]],
|
||||
spatial_merge_size: int,
|
||||
) -> list[torch.Tensor]:
|
||||
# image_sizes specified in tokens
|
||||
tokens_per_image = [h * w for h, w in image_sizes]
|
||||
d = x.shape[-1]
|
||||
all_img_sub_grids: list[torch.Tensor] = []
|
||||
sub_grid_size = spatial_merge_size
|
||||
|
||||
for image_index, image_tokens in enumerate(x.split(tokens_per_image)):
|
||||
# Reshape image_tokens into a 2D grid
|
||||
h, w = image_sizes[image_index]
|
||||
image_grid = image_tokens.view(h, w, d).permute(
|
||||
2, 0, 1)[None, :, :, :] # 1 x d x h x w
|
||||
sub_grids = torch.nn.functional.unfold(image_grid,
|
||||
kernel_size=sub_grid_size,
|
||||
stride=sub_grid_size)
|
||||
sub_grids = sub_grids.view(
|
||||
1, d, sub_grid_size, sub_grid_size,
|
||||
-1) # 1 x d x sub_grid_size x sub_grid_size x n_patches
|
||||
|
||||
all_img_sub_grids.append(sub_grids[0])
|
||||
|
||||
return all_img_sub_grids
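
As a quick sanity check on the PatchMerger defined above (illustrative only; the hidden size of 1024 and the 4x6 token grid are made-up numbers, and the class comes from the code in this diff): each spatial_merge_size x spatial_merge_size block of patch tokens is folded into one token before the merging linear layer.

import torch

merger = PatchMerger(vision_encoder_dim=1024, spatial_merge_size=2)
x = torch.randn(4 * 6, 1024)           # 24 patch tokens from one 4x6-token image
out = merger(x, image_sizes=[(4, 6)])  # every 2x2 patch grid becomes a single token
print(out.shape)                       # torch.Size([6, 1024]); 24 / 2**2 tokens remain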
#### HF Transformers version of Pixtral ####
|
||||
# Based off https://github.com/huggingface/transformers/blob/d7950bff82b18c823193d17d72188c5e46d06c83/src/transformers/models/pixtral/modeling_pixtral.py
|
||||
# This model follows the Llava family, meaning image embeddings are placed
|
||||
|
@ -5,7 +5,7 @@
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from functools import cached_property
|
||||
from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union
|
||||
from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union
|
||||
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
@ -36,7 +36,7 @@ from vllm.sequence import IntermediateTensors
|
||||
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
|
||||
|
||||
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
||||
SupportsMultiModal, SupportsPP, SupportsV0Only)
|
||||
SupportsMultiModal, SupportsPP)
|
||||
from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
|
||||
init_vllm_registered_model, maybe_prefix,
|
||||
merge_multimodal_embeddings,
|
||||
@ -50,14 +50,14 @@ _MAX_ENCODER_BATCH_SIZE = 16
|
||||
|
||||
class UltravoxAudioFeatureInputs(TypedDict):
|
||||
type: Literal["audio_features"]
|
||||
data: NestedTensors
|
||||
data: Union[torch.Tensor, list[torch.Tensor], list[list[torch.Tensor]]]
|
||||
"""Shape: `(batch_size, num_chunks, 80, M)`"""
|
||||
lens: NestedTensors
|
||||
lens: Union[torch.Tensor, list[torch.Tensor]]
|
||||
"""
|
||||
Length of the audio frames. Used for attention mask in WhisperEncoder.
|
||||
Shape: `(batch_size, num_chunks)`
|
||||
"""
|
||||
token_len: NestedTensors
|
||||
token_len: Union[torch.Tensor, list[torch.Tensor]]
|
||||
"""
|
||||
Length of the audio tokens. Used for flattening the audio features.
|
||||
Shape: `(batch_size, num_chunks)`
|
||||
@ -405,8 +405,7 @@ class ModifiedWhisperEncoder(WhisperEncoder):
|
||||
UltravoxMultiModalProcessor,
|
||||
info=UltravoxProcessingInfo,
|
||||
dummy_inputs=UltravoxDummyInputsBuilder)
|
||||
class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
|
||||
SupportsV0Only):
|
||||
class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
|
||||
|
||||
packed_modules_mapping = {
|
||||
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
|
||||
@ -506,6 +505,12 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
|
||||
if not isinstance(audio_features, (torch.Tensor, list)):
|
||||
raise ValueError("Incorrect type of audio features. "
|
||||
f"Got type: {type(audio_features)}")
|
||||
if not isinstance(audio_lens, (torch.Tensor, list)):
|
||||
raise ValueError("Incorrect type of audio_lens. "
|
||||
f"Got type: {type(audio_features)}")
|
||||
if not isinstance(audio_token_len, (torch.Tensor, list)):
|
||||
raise ValueError("Incorrect type of audio_token_len. "
|
||||
f"Got type: {type(audio_features)}")
|
||||
|
||||
return UltravoxAudioFeatureInputs(type="audio_features",
|
||||
data=audio_features,
|
||||
@ -523,7 +528,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
|
||||
raise AssertionError("This line should be unreachable.")
|
||||
|
||||
def _process_audio_input(
|
||||
self, audio_input: UltravoxAudioInputs) -> NestedTensors:
|
||||
self,
|
||||
audio_input: UltravoxAudioInputs,
|
||||
) -> Union[NestedTensors, tuple[torch.Tensor, ...]]:
|
||||
if audio_input["type"] == "audio_embeds":
|
||||
return audio_input["data"]
|
||||
|
||||
@ -531,13 +538,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
|
||||
# [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)]
|
||||
audio_features = pad_and_concat_to_dim3(audio_input["data"])
|
||||
|
||||
if isinstance(audio_input['lens'], list):
|
||||
# [B1, B2] -> [B1+B2]
|
||||
audio_lens = torch.cat(audio_input['lens'])
|
||||
audio_token_len = torch.cat(audio_input['token_len'])
|
||||
else:
|
||||
audio_lens = flatten_bn(audio_input['lens'])
|
||||
audio_token_len = flatten_bn(audio_input['token_len'])
|
||||
# [B1, B2] -> [B1+B2]
|
||||
audio_lens = flatten_bn(audio_input['lens'], concat=True)
|
||||
audio_token_len = flatten_bn(audio_input['token_len'], concat=True)
|
||||
|
||||
embeddings = self._audio_features_to_embeddings(
|
||||
audio_features, audio_lens)
|
||||
@ -554,7 +557,12 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
|
||||
# Apply mask and flatten
|
||||
flattened_embeddings = embeddings[mask]
|
||||
|
||||
return flattened_embeddings
|
||||
# Return one tensor per input audio
|
||||
embed_lens = [
|
||||
token_len_item.sum().item()
|
||||
for token_len_item in audio_input['token_len']
|
||||
]
|
||||
return flattened_embeddings.split(embed_lens)
|
||||
|
||||
def get_multimodal_embeddings(
|
||||
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
||||
@ -646,7 +654,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
|
||||
|
||||
|
||||
def pad_and_concat_to_dim3(
|
||||
features: Union[torch.Tensor, List[torch.Tensor], List[List[torch.Tensor]]]
|
||||
features: Union[torch.Tensor, list[torch.Tensor], list[list[torch.Tensor]]]
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Pad and concatenate a list of tensors.
|
||||
|
@@ -218,8 +218,10 @@ class MultiModalProfiler(Generic[_I]):

        # V0 does not support chunked prefill.
        if total_len > seq_len and not envs.VLLM_USE_V1:
+           # `max_num_batched_tokens` is defined by `SchedulerConfig`
            logger.warning(
-               "The context length (%d) of the model is too short "
+               "The sequence length used for profiling ("
+               "max_num_batched_tokens / max_num_seqs = %d) is too short "
                "to hold the multi-modal embeddings in the worst case "
                "(%d tokens in total, out of which %s are reserved for "
                "multi-modal embeddings). This may cause certain "
@@ -37,10 +37,11 @@ class XPUPlatform(Platform):
        return "vllm.attention.backends.ipex_attn.IpexAttnBackend"

    @staticmethod
-   def get_device_capability(device_id: int = 0) -> DeviceCapability:
-       major, minor, *_ = torch.xpu.get_device_capability(
-           device_id)['version'].split('.')
-       return DeviceCapability(major=int(major), minor=int(minor))
+   def get_device_capability(
+           device_id: int = 0) -> Optional[DeviceCapability]:
+       # capacity format differs from cuda's and will cause unexpected
+       # failure, so use None directly
+       return None

    @staticmethod
    def get_device_name(device_id: int = 0) -> str:
@@ -52,7 +52,7 @@ class EngineCoreRequest(
    # Detokenizer, but set to None when it is added to EngineCoreClient.
    prompt: Optional[str]
    prompt_token_ids: list[int]
-   mm_inputs: Optional[list[Optional[MultiModalKwargs]]]
+   mm_inputs: Optional[list[MultiModalKwargs]]
    mm_hashes: Optional[list[str]]
    mm_placeholders: Optional[list[PlaceholderRange]]
    sampling_params: SamplingParams
@ -1,131 +1,30 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.envs import VLLM_MM_INPUT_CACHE_GIB
|
||||
from vllm.logger import init_logger
|
||||
from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
|
||||
MultiModalKwargs, MultiModalRegistry)
|
||||
from vllm.multimodal import MultiModalKwargs
|
||||
from vllm.multimodal.processing import ProcessingCache
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# The idea of multimodal preprocessing caching is based on having a client and
|
||||
# a server, where the client executes in the frontend process (=P0) and the
|
||||
# server in the core process (=P1).
|
||||
#
|
||||
# -- Client:
|
||||
# - Apply legacy input_mapper (if one exists) to generate MultiModalKwargs.
|
||||
# - Perform caching of the generated MultiModalKwargs.
|
||||
# - This client can be deprecated once all mutimodal models migrate to use
|
||||
# merged preprocessor with built-in caching functionality.
|
||||
# - BaseMultiModalProcessor to process MultiModalData into MultiModalKwargs
|
||||
# with built-in caching functionality, with mm_hash as its identifier.
|
||||
#
|
||||
# -- Server:
|
||||
# - Perform caching of the received MultiModalKwargs.
|
||||
# - MMInputCacheServer to perform caching of the received MultiModalKwargs.
|
||||
#
|
||||
# The caching for both client and server is mirrored/similar, and this allows us
|
||||
# The caching for both client and server is mirrored, and this allows us
|
||||
# to avoid the serialization of "mm_inputs" (like pixel values) between
|
||||
# client (=P0) and server (=P1) processes.
|
||||
# client (=P0) and server (=P1) processes if the mm_hash is found in the client
|
||||
# cache.
|
||||
|
||||
# Both Client and Server must use the same cache size
|
||||
# (to perform mirrored caching). This cache size is set by the environment
|
||||
# variable VLLM_MM_INPUT_CACHE_GIB.
|
||||
|
||||
|
||||
# TODO(ywang96): Deprecate this class once all multimodal models migrate to use
|
||||
# merged preprocessor with built-in caching functionality.
|
||||
class MMInputCacheClient:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_config: ModelConfig,
|
||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||
):
|
||||
self.model_config = model_config
|
||||
self.mm_registry = mm_registry
|
||||
self.multi_modal_input_mapper = mm_registry.create_input_mapper(
|
||||
model_config)
|
||||
self.mm_registry.init_mm_limits_per_prompt(model_config)
|
||||
|
||||
# Init cache
|
||||
self.use_cache = not model_config.disable_mm_preprocessor_cache
|
||||
self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB,
|
||||
MultiModalKwargs)
|
||||
|
||||
# DEBUG: Set to None to disable
|
||||
self.mm_debug_cache_hit_ratio_steps = None
|
||||
self.mm_debug_cache_hits = 0
|
||||
self.mm_debug_cache_total = 0
|
||||
|
||||
def cache_hit_ratio(self, steps):
|
||||
total = self.mm_debug_cache_total
|
||||
|
||||
if total > 0 and total % steps == 0:
|
||||
logger.debug("MMInputMapper: cache_hit_ratio = %.2f ",
|
||||
self.mm_debug_cache_hits / total)
|
||||
|
||||
# NOTE: process_inputs only supports image inputs since all multimodal
|
||||
# models with other modalities have migrated to use merged preprocessor.
|
||||
def process_inputs(
|
||||
self,
|
||||
mm_data: MultiModalDataDict,
|
||||
mm_hashes: Optional[list[str]],
|
||||
mm_processor_kwargs: Optional[dict[str, Any]],
|
||||
precomputed_mm_inputs: Optional[list[MultiModalKwargs]],
|
||||
) -> list[Optional[MultiModalKwargs]]:
|
||||
if precomputed_mm_inputs is None:
|
||||
image_inputs = mm_data["image"]
|
||||
if not isinstance(image_inputs, list):
|
||||
image_inputs = [image_inputs]
|
||||
num_inputs = len(image_inputs)
|
||||
else:
|
||||
num_inputs = len(precomputed_mm_inputs)
|
||||
|
||||
# Sanity
|
||||
if self.use_cache:
|
||||
assert mm_hashes is not None
|
||||
assert num_inputs == len(mm_hashes)
|
||||
|
||||
# Process each image input separately, so that later we can schedule
|
||||
# them in a fine-grained manner.
|
||||
# Apply caching (if enabled) and reuse precomputed inputs (if provided)
|
||||
ret_inputs: list[Optional[MultiModalKwargs]] = []
|
||||
for input_id in range(num_inputs):
|
||||
if self.mm_debug_cache_hit_ratio_steps is not None:
|
||||
self.cache_hit_ratio(self.mm_debug_cache_hit_ratio_steps)
|
||||
|
||||
mm_input = None
|
||||
if self.use_cache:
|
||||
assert mm_hashes is not None
|
||||
mm_hash = mm_hashes[input_id]
|
||||
mm_input = self.mm_cache.get(mm_hash)
|
||||
|
||||
self.mm_debug_cache_total += 1
|
||||
if mm_input is None:
|
||||
if precomputed_mm_inputs is not None:
|
||||
# Reuse precomputed input (for merged preprocessor)
|
||||
mm_input = precomputed_mm_inputs[input_id]
|
||||
else:
|
||||
# Apply legacy input_mapper
|
||||
mm_input = self.multi_modal_input_mapper(
|
||||
{"image": [image_inputs[input_id]]},
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
)
|
||||
|
||||
if self.use_cache:
|
||||
# Add to cache
|
||||
assert mm_hash is not None
|
||||
self.mm_cache[mm_hash] = mm_input
|
||||
else:
|
||||
self.mm_debug_cache_hits += 1
|
||||
mm_input = None # Avoids sending mm_input to Server
|
||||
|
||||
ret_inputs.append(mm_input)
|
||||
|
||||
return ret_inputs
|
||||
|
||||
|
||||
class MMInputCacheServer:
|
||||
|
||||
def __init__(self, model_config):
|
||||
@ -135,9 +34,9 @@ class MMInputCacheServer:
|
||||
|
||||
def get_and_update(
|
||||
self,
|
||||
mm_inputs: list[Optional[MultiModalKwargs]],
|
||||
mm_inputs: list[MultiModalKwargs],
|
||||
mm_hashes: list[str],
|
||||
) -> list[Optional[MultiModalKwargs]]:
|
||||
) -> list[MultiModalKwargs]:
|
||||
assert len(mm_inputs) == len(mm_hashes)
|
||||
|
||||
if not self.use_cache:
|
||||
@ -147,8 +46,7 @@ class MMInputCacheServer:
|
||||
for mm_input, mm_hash in zip(mm_inputs, mm_hashes):
|
||||
assert mm_hash is not None
|
||||
if mm_input is None:
|
||||
mm_input = self.mm_cache.get(mm_hash)
|
||||
assert mm_input is not None
|
||||
mm_input = self.mm_cache[mm_hash]
|
||||
else:
|
||||
self.mm_cache[mm_hash] = mm_input
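
To make the mirrored client/server caching described in the comments at the top of this file easier to follow, here is a stripped-down sketch. It is not vLLM code; the class and method names are invented, and "cache" can be any dict-like store of the same fixed capacity on both sides, as the comments above require.

class MirroredMMCacheClient:
    """Frontend (P0): only ships mm inputs the server has not seen yet."""

    def __init__(self, cache):
        self.cache = cache  # mm_hash -> MultiModalKwargs, same size as the server's

    def prepare(self, mm_hash, mm_input):
        if mm_hash in self.cache:
            return None          # server's mirrored cache is guaranteed to have it
        self.cache[mm_hash] = mm_input
        return mm_input          # first occurrence: serialize and send it once


class MirroredMMCacheServer:
    """Core process (P1): fills in None entries from its own mirror."""

    def __init__(self, cache):
        self.cache = cache

    def get_and_update(self, mm_hash, maybe_input):
        if maybe_input is None:
            return self.cache[mm_hash]   # reuse the mirrored entry
        self.cache[mm_hash] = maybe_input
        return maybe_input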
@ -11,15 +11,15 @@ from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
|
||||
from vllm.inputs.parse import is_encoder_decoder_inputs
|
||||
from vllm.inputs.preprocess import InputPreprocessor
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalHasher,
|
||||
MultiModalKwargs, MultiModalRegistry)
|
||||
from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
|
||||
MultiModalRegistry)
|
||||
from vllm.multimodal.inputs import PlaceholderRange
|
||||
from vllm.multimodal.utils import merge_and_sort_multimodal_metadata
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.mm_input_cache import MMInputCacheClient
|
||||
from vllm.v1.structured_output.utils import validate_structured_output_request
|
||||
|
||||
|
||||
@ -45,11 +45,6 @@ class Processor:
|
||||
self.input_preprocessor = InputPreprocessor(self.model_config,
|
||||
self.tokenizer,
|
||||
mm_registry)
|
||||
self.input_processor = input_registry.create_input_processor(
|
||||
self.model_config)
|
||||
|
||||
# Multi-modal (huggingface) input mapper
|
||||
self.mm_input_cache_client = MMInputCacheClient(self.model_config)
|
||||
|
||||
# Multi-modal hasher (for images)
|
||||
self.use_hash = (
|
||||
@ -171,7 +166,7 @@ class Processor:
|
||||
# 2. For multimodal models with a merged preprocessor, preprocess
|
||||
# multimodal data and expand prompt token ids accordingly.
|
||||
# 3. Apply prompt adapter to prompt token ids if one exists.
|
||||
preprocessed_inputs = self.input_preprocessor.preprocess(
|
||||
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
@ -180,10 +175,6 @@ class Processor:
|
||||
)
|
||||
eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
|
||||
|
||||
# Process prompt and prompt token ids.
|
||||
# Only applicable to multimodal models with legacy input processor.
|
||||
processed_inputs = self.input_processor(preprocessed_inputs)
|
||||
|
||||
self._validate_model_inputs(processed_inputs, lora_request)
|
||||
|
||||
if is_encoder_decoder_inputs(processed_inputs):
|
||||
@ -212,36 +203,22 @@ class Processor:
|
||||
self.tokenizer.get_lora_tokenizer(lora_request))
|
||||
|
||||
# Multimodal related.
|
||||
# Compute MM hashes (if enabled)
|
||||
mm_hashes = None
|
||||
if self.use_hash:
|
||||
# Use mm_hashes from processed inputs if the model has merged
|
||||
# input processor.
|
||||
if decoder_inputs.multi_modal_hashes:
|
||||
mm_hashes = decoder_inputs.multi_modal_hashes
|
||||
# Fallback to using MultiModalHasher directly.
|
||||
else:
|
||||
mm_hashes = MultiModalHasher.hash_prompt_mm_data(prompt)
|
||||
sorted_mm_inputs: Optional[list[MultiModalKwargs]] = None
|
||||
sorted_mm_positions: Optional[list[PlaceholderRange]] = None
|
||||
sorted_mm_hashes: Optional[list[str]] = None
|
||||
if (decoder_mm_inputs := decoder_inputs.multi_modal_data):
|
||||
assert isinstance(decoder_mm_inputs, MultiModalKwargs)
|
||||
|
||||
# For merged preprocessor, mm_data is already mm_inputs
|
||||
precomputed_mm_inputs: Optional[list[MultiModalKwargs]] = None
|
||||
decoder_mm_data = decoder_inputs.multi_modal_data
|
||||
if isinstance(decoder_mm_data, MultiModalKwargs):
|
||||
# The output of merged multi-modal processor (`decoder_mm_data`)
|
||||
# The output of merged multi-modal processor (`decoder_mm_inputs`)
|
||||
# contains the kwargs for all items from all modalities.
|
||||
# This code separates them so that there is one set of kwargs
|
||||
# per item per modality.
|
||||
precomputed_mm_inputs = [
|
||||
individual_mm_inputs = [
|
||||
MultiModalKwargs.from_items([item])
|
||||
for modality in decoder_mm_data.modalities
|
||||
for item in decoder_mm_data.get_items(modality)
|
||||
for modality in decoder_mm_inputs.modalities
|
||||
for item in decoder_mm_inputs.get_items(modality)
|
||||
]
|
||||
|
||||
mm_positions = decoder_inputs.multi_modal_placeholders
|
||||
|
||||
# Last-mile processing of multimodal metadata and inputs.
|
||||
if mm_positions:
|
||||
|
||||
# Merge and flatten multimodal placeholders, hashes and inputs
|
||||
# from dictionaries to lists, and sort them by each item's position
|
||||
# in the input sequence.
|
||||
@ -251,14 +228,13 @@ class Processor:
|
||||
sorted_mm_positions,
|
||||
sorted_mm_hashes,
|
||||
) = merge_and_sort_multimodal_metadata(
|
||||
mm_positions,
|
||||
mm_hashes,
|
||||
decoder_inputs.multi_modal_placeholders,
|
||||
decoder_inputs.multi_modal_hashes if self.use_hash else None,
|
||||
)
|
||||
|
||||
# NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple
|
||||
# modalities involved AND the model supports merged input processor.
|
||||
if len(sorted_modalities) > 1 and precomputed_mm_inputs:
|
||||
|
||||
# modalities involved.
|
||||
if len(sorted_modalities) > 1:
|
||||
modality_order_dict = {
|
||||
modality: order
|
||||
for order, modality in enumerate(sorted_modalities)
|
||||
@ -266,26 +242,16 @@ class Processor:
|
||||
|
||||
# Sanity check to make sure each multimodal input has only one
|
||||
# modality key.
|
||||
for mm_input in precomputed_mm_inputs:
|
||||
for mm_input in individual_mm_inputs:
|
||||
assert len(mm_input.modalities) == 1
|
||||
|
||||
# Sort MultiModalKwags to match sorted_mm_positions
|
||||
precomputed_mm_inputs = sorted(
|
||||
precomputed_mm_inputs,
|
||||
# Sort MultiModalKwargs to match sorted_mm_positions
|
||||
sorted_mm_inputs = sorted(
|
||||
individual_mm_inputs,
|
||||
key=lambda mm_input: modality_order_dict[list(
|
||||
mm_input.modalities)[0]])
|
||||
|
||||
# Apply mm input cache update and legacy input mapper if one exists.
|
||||
sorted_mm_inputs = self.mm_input_cache_client.process_inputs(
|
||||
mm_data=decoder_mm_data,
|
||||
mm_hashes=sorted_mm_hashes,
|
||||
mm_processor_kwargs=decoder_inputs.mm_processor_kwargs,
|
||||
precomputed_mm_inputs=precomputed_mm_inputs,
|
||||
)
|
||||
else:
|
||||
sorted_mm_inputs = None
|
||||
sorted_mm_hashes = None
|
||||
sorted_mm_positions = None
|
||||
else:
|
||||
sorted_mm_inputs = individual_mm_inputs
|
||||
|
||||
return EngineCoreRequest(
|
||||
request_id=request_id,
|
||||
|
@@ -40,7 +40,7 @@ class StructuredOutputManager:
        tokenizer_group.ping()

        tokenizer = tokenizer_group.get_lora_tokenizer(None)
-       self.vocab_size = len(tokenizer.get_vocab())
+       self.vocab_size = self.vllm_config.model_config.get_vocab_size()
        if isinstance(tokenizer, MistralTokenizer):
            # NOTE: ideally, xgrammar should handle this accordingly.
            # refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98
@ -29,7 +29,6 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
|
||||
is_pin_memory_available)
|
||||
from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
|
||||
from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
|
||||
from vllm.v1.engine.mm_input_cache import MMInputCacheClient
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheSpec)
|
||||
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
|
||||
@ -133,14 +132,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
self.mm_registry = MULTIMODAL_REGISTRY
|
||||
self.uses_mrope = model_config.uses_mrope
|
||||
|
||||
if self.is_multimodal_model:
|
||||
# NOTE: Initialized client is only used for processing dummy
|
||||
# multimodal data into multimodal kwargs for GPU memory profiling.
|
||||
# Only applicable to multimodal models with legacy input mapper.
|
||||
self.mm_input_mapper_profiling = MMInputCacheClient(
|
||||
self.model_config)
|
||||
self.mm_input_mapper_profiling.use_cache = False
|
||||
|
||||
encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
|
||||
model_config=model_config,
|
||||
scheduler_config=scheduler_config,
|
||||
@ -1376,32 +1367,18 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
mm_registry=self.mm_registry,
|
||||
)
|
||||
dummy_mm_data = dummy_request_data.multi_modal_data
|
||||
if not isinstance(dummy_mm_data, MultiModalKwargs):
|
||||
# TODO: Delete this check once input mapper is fully removed.
|
||||
raise RuntimeError(
|
||||
"Legacy input mapper is not supported in V1")
|
||||
|
||||
# Dummy data definition in V0 may contain multiple multimodal items
|
||||
# Dummy data definition may contain multiple multimodal items
|
||||
# (e.g, multiple images) for a single request, therefore here we
|
||||
# always replicate first item by max_num_mm_items times since in V1
|
||||
# they are scheduled to be processed separately.
|
||||
|
||||
# Case when models have a merged processor, their dummy data is
|
||||
# already batched `MultiModalKwargs`, therefore we take the first
|
||||
# `MultiModalKwargsItem` from the desired modality to profile on.
|
||||
if isinstance(dummy_mm_data, MultiModalKwargs):
|
||||
dummy_mm_item = dummy_mm_data.get_item(
|
||||
modality=dummy_data_modality, item_index=0)
|
||||
dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item])
|
||||
|
||||
# Case when models have dummy data explicitly defined as
|
||||
# `MultiModalDataDict`, so they need to be processed through input
|
||||
# mapper.
|
||||
# TODO (ywang96): deprecate this path once merged processor is
|
||||
# supported on all models.
|
||||
else:
|
||||
mm_kwargs_list = self.mm_input_mapper_profiling.process_inputs(
|
||||
mm_data=dummy_mm_data,
|
||||
mm_hashes=None,
|
||||
mm_processor_kwargs=None,
|
||||
precomputed_mm_inputs=None)
|
||||
dummy_mm_kwargs = mm_kwargs_list[0]
|
||||
dummy_mm_item = dummy_mm_data.get_item(
|
||||
modality=dummy_data_modality, item_index=0)
|
||||
dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item])
|
||||
|
||||
batched_dummy_mm_inputs = MultiModalKwargs.batch(
|
||||
[dummy_mm_kwargs] * max_num_mm_items)
|
||||
|
@ -23,8 +23,7 @@ from vllm.multimodal.utils import group_mm_inputs_by_modality
|
||||
from vllm.sampling_params import SamplingType
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available
|
||||
from vllm.v1.attention.backends.pallas import (NUM_KV_PAGES_PER_BLOCK,
|
||||
PallasAttentionBackend,
|
||||
from vllm.v1.attention.backends.pallas import (PallasAttentionBackend,
|
||||
PallasMetadata)
|
||||
from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
@ -139,10 +138,8 @@ class TPUModelRunner:
|
||||
device="cpu")
|
||||
self.slot_mapping_np = self.slot_mapping_cpu.numpy()
|
||||
|
||||
padded_max_num_blocks_per_req = _get_padded_number(
|
||||
self.max_num_blocks_per_req, NUM_KV_PAGES_PER_BLOCK)
|
||||
self.block_table_cpu = torch.zeros(
|
||||
(self.max_num_tokens, padded_max_num_blocks_per_req),
|
||||
(self.max_num_tokens, self.max_num_blocks_per_req),
|
||||
dtype=self.input_batch.block_table.get_cpu_tensor().dtype,
|
||||
device="cpu")