Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)
Remove V0 Encoder-Decoder Support (#24907)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
@@ -66,7 +66,6 @@ function cpu_tests() {
  pytest -x -v -s tests/models/language/pooling -m cpu_model
  pytest -x -v -s tests/models/multimodal/generation \
    --ignore=tests/models/multimodal/generation/test_mllama.py \
    --ignore=tests/models/multimodal/generation/test_pixtral.py \
    -m cpu_model"
@@ -549,15 +549,6 @@ steps:
  commands: # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/

- label: Encoder Decoder tests # 12min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/encoder_decoder
  commands:
  - pytest -v -s encoder_decoder

- label: OpenAI-Compatible Tool Use # 23 min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
@@ -840,7 +840,6 @@ Some HF processors directly insert feature tokens without replacing anything in

Examples:

- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
- Florence2 (insert at start of prompt): <gh-file:vllm/model_executor/models/florence2.py>
- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
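
Insertions like these are expressed with `PromptInsertion` rather than `PromptReplacement`. A minimal, BLIP-2-style sketch of the pattern (the config attributes used to derive the token id and count are assumptions, not the exact model code):

```python
from collections.abc import Mapping, Sequence

from vllm.multimodal.processing import (PromptIndexTargets, PromptInsertion,
                                        PromptUpdate)


def _get_prompt_updates(
    self,  # method on the model's multimodal processor
    mm_items,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs,
) -> Sequence[PromptUpdate]:
    hf_config = self.info.get_hf_config()
    image_token_id = hf_config.image_token_index  # assumed attribute
    num_image_tokens = hf_config.num_query_tokens  # assumed attribute
    bos_token_id = hf_config.text_config.bos_token_id

    # Insert the image feature placeholders (followed by BOS) at the start
    # of the prompt instead of replacing an existing placeholder token.
    return [
        PromptInsertion(
            modality="image",
            target=PromptIndexTargets.start(),
            insertion=[image_token_id] * num_image_tokens + [bos_token_id],
        )
    ]
```
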
### Handling prompt updates unrelated to multi-modal data
@@ -331,8 +331,6 @@ th {
| `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ |
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | ✅︎ |
| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | |
| `MBartForConditionalGeneration` | mBART | `facebook/mbart-large-en-ro`, `facebook/mbart-large-50`, etc. | | | |
| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ |
@@ -426,9 +424,6 @@ Some models are supported only via the [Transformers backend](#transformers). Th

!!! note
    Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.

!!! note
    Some mBART models' config files do not have an `architecture` defined. Therefore, you need to use `--hf-overrides '{"architectures": ["MBartForConditionalGeneration"]}'` to explicitly specify the use of the `MBartForConditionalGeneration` architecture.
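
The same override can also be passed programmatically through `hf_overrides` when constructing the engine; a minimal sketch mirroring the CLI flag above:

```python
from vllm import LLM

# Equivalent of --hf-overrides '{"architectures": ["MBartForConditionalGeneration"]}'
llm = LLM(
    model="facebook/mbart-large-en-ro",
    hf_overrides={"architectures": ["MBartForConditionalGeneration"]},
)
```
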
### Pooling Models

See [this page](./pooling_models.md) for more information on how to use pooling models.
@@ -625,9 +620,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | ✅︎ |
| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ |
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ |
| `DonutForConditionalGeneration`<sup>^</sup> | Donut | T + I | `ByteDance/Dolphin`, `naver-clova-ix/donut-base-finetuned-docvqa`, etc. | | | |
| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | ✅︎ |
| `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | |
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
@@ -654,7 +647,6 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ |
| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ |
| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | |
| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ |
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ |
@@ -120,7 +120,7 @@ Please note that prefix caching is not yet supported for any of the above models

Whisper is supported. Other models requiring cross-attention between separate
encoder and decoder (e.g., `BartForConditionalGeneration`,
`MllamaForConditionalGeneration`) are not yet supported.
`MllamaForConditionalGeneration`) are not supported.

### Features
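
A minimal offline transcription sketch for the supported Whisper case (the model length, audio asset, and decoder prompt token below follow vLLM's bundled examples and should be treated as illustrative assumptions):

```python
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

llm = LLM(
    model="openai/whisper-large-v3",
    max_model_len=448,
    limit_mm_per_prompt={"audio": 1},
)

# Explicit encoder/decoder prompt: the audio goes to the encoder,
# and decoding starts from Whisper's transcript token.
prompt = {
    "encoder_prompt": {
        "prompt": "",
        "multi_modal_data": {
            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
        },
    },
    "decoder_prompt": "<|startoftranscript|>",
}

outputs = llm.generate(prompt, SamplingParams(temperature=0, max_tokens=200))
print(outputs[0].outputs[0].text)
```
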
@@ -1,311 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import copy
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import regex as re
|
||||
from PIL import Image
|
||||
from transformers import DonutProcessor
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
|
||||
|
||||
# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
|
||||
@dataclass
|
||||
class ImageDimensions:
|
||||
original_w: int
|
||||
original_h: int
|
||||
padded_w: int
|
||||
padded_h: int
|
||||
|
||||
|
||||
# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
|
||||
def map_to_original_coordinates(
|
||||
x1, y1, x2, y2, dims: ImageDimensions
|
||||
) -> tuple[int, int, int, int]:
|
||||
try:
|
||||
top = (dims.padded_h - dims.original_h) // 2
|
||||
left = (dims.padded_w - dims.original_w) // 2
|
||||
orig_x1 = max(0, x1 - left)
|
||||
orig_y1 = max(0, y1 - top)
|
||||
orig_x2 = min(dims.original_w, x2 - left)
|
||||
orig_y2 = min(dims.original_h, y2 - top)
|
||||
if orig_x2 <= orig_x1:
|
||||
orig_x2 = min(orig_x1 + 1, dims.original_w)
|
||||
if orig_y2 <= orig_y1:
|
||||
orig_y2 = min(orig_y1 + 1, dims.original_h)
|
||||
return int(orig_x1), int(orig_y1), int(orig_x2), int(orig_y2)
|
||||
except Exception as e:
|
||||
print(f"map_to_original_coordinates error: {str(e)}")
|
||||
return 0, 0, min(100, dims.original_w), min(100, dims.original_h)
|
||||
|
||||
|
||||
# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
|
||||
def adjust_box_edges(image, boxes: list[list[float]], max_pixels=15, threshold=0.2):
|
||||
if isinstance(image, str):
|
||||
image = cv2.imread(image)
|
||||
img_h, img_w = image.shape[:2]
|
||||
new_boxes = []
|
||||
for box in boxes:
|
||||
best_box = copy.deepcopy(box)
|
||||
|
||||
def check_edge(img, current_box, i, is_vertical):
|
||||
edge = current_box[i]
|
||||
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
||||
_, binary = cv2.threshold(
|
||||
gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
|
||||
)
|
||||
if is_vertical:
|
||||
line = binary[current_box[1] : current_box[3] + 1, edge]
|
||||
else:
|
||||
line = binary[edge, current_box[0] : current_box[2] + 1]
|
||||
transitions = np.abs(np.diff(line))
|
||||
return np.sum(transitions) / len(transitions)
|
||||
|
||||
edges = [(0, -1, True), (2, 1, True), (1, -1, False), (3, 1, False)]
|
||||
current_box = copy.deepcopy(box)
|
||||
current_box[0] = min(max(current_box[0], 0), img_w - 1)
|
||||
current_box[1] = min(max(current_box[1], 0), img_h - 1)
|
||||
current_box[2] = min(max(current_box[2], 0), img_w - 1)
|
||||
current_box[3] = min(max(current_box[3], 0), img_h - 1)
|
||||
|
||||
for i, direction, is_vertical in edges:
|
||||
best_score = check_edge(image, current_box, i, is_vertical)
|
||||
if best_score <= threshold:
|
||||
continue
|
||||
for step in range(max_pixels):
|
||||
current_box[i] += direction
|
||||
if i == 0 or i == 2:
|
||||
current_box[i] = min(max(current_box[i], 0), img_w - 1)
|
||||
else:
|
||||
current_box[i] = min(max(current_box[i], 0), img_h - 1)
|
||||
score = check_edge(image, current_box, i, is_vertical)
|
||||
if score < best_score:
|
||||
best_score = score
|
||||
best_box = copy.deepcopy(current_box)
|
||||
if score <= threshold:
|
||||
break
|
||||
new_boxes.append(best_box)
|
||||
return new_boxes
|
||||
|
||||
|
||||
# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
|
||||
def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_box=None):
|
||||
try:
|
||||
x1, y1 = int(coords[0] * dims.padded_w), int(coords[1] * dims.padded_h)
|
||||
x2, y2 = int(coords[2] * dims.padded_w), int(coords[3] * dims.padded_h)
|
||||
x1, y1, x2, y2 = (
|
||||
max(0, min(x1, dims.padded_w - 1)),
|
||||
max(0, min(y1, dims.padded_h - 1)),
|
||||
max(0, min(x2, dims.padded_w)),
|
||||
max(0, min(y2, dims.padded_h)),
|
||||
)
|
||||
if x2 <= x1:
|
||||
x2 = min(x1 + 1, dims.padded_w)
|
||||
if y2 <= y1:
|
||||
y2 = min(y1 + 1, dims.padded_h)
|
||||
new_boxes = adjust_box_edges(padded_image, [[x1, y1, x2, y2]])
|
||||
x1, y1, x2, y2 = new_boxes[0]
|
||||
x1, y1, x2, y2 = (
|
||||
max(0, min(x1, dims.padded_w - 1)),
|
||||
max(0, min(y1, dims.padded_h - 1)),
|
||||
max(0, min(x2, dims.padded_w)),
|
||||
max(0, min(y2, dims.padded_h)),
|
||||
)
|
||||
if x2 <= x1:
|
||||
x2 = min(x1 + 1, dims.padded_w)
|
||||
if y2 <= y1:
|
||||
y2 = min(y1 + 1, dims.padded_h)
|
||||
if previous_box is not None:
|
||||
prev_x1, prev_y1, prev_x2, prev_y2 = previous_box
|
||||
if (x1 < prev_x2 and x2 > prev_x1) and (y1 < prev_y2 and y2 > prev_y1):
|
||||
y1 = prev_y2
|
||||
y1 = min(y1, dims.padded_h - 1)
|
||||
if y2 <= y1:
|
||||
y2 = min(y1 + 1, dims.padded_h)
|
||||
new_previous_box = [x1, y1, x2, y2]
|
||||
orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(
|
||||
x1, y1, x2, y2, dims
|
||||
)
|
||||
return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box
|
||||
except Exception as e:
|
||||
print(f"process_coordinates error: {str(e)}")
|
||||
orig_x1, orig_y1, orig_x2, orig_y2 = (
|
||||
0,
|
||||
0,
|
||||
min(100, dims.original_w),
|
||||
min(100, dims.original_h),
|
||||
)
|
||||
return 0, 0, 100, 100, orig_x1, orig_y1, orig_x2, orig_y2, [0, 0, 100, 100]
|
||||
|
||||
|
||||
# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
|
||||
def prepare_image(image) -> tuple[np.ndarray, ImageDimensions]:
|
||||
try:
|
||||
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
||||
original_h, original_w = image_cv.shape[:2]
|
||||
max_size = max(original_h, original_w)
|
||||
top = (max_size - original_h) // 2
|
||||
bottom = max_size - original_h - top
|
||||
left = (max_size - original_w) // 2
|
||||
right = max_size - original_w - left
|
||||
padded_image = cv2.copyMakeBorder(
|
||||
image_cv, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0)
|
||||
)
|
||||
padded_h, padded_w = padded_image.shape[:2]
|
||||
dimensions = ImageDimensions(
|
||||
original_w=original_w,
|
||||
original_h=original_h,
|
||||
padded_w=padded_w,
|
||||
padded_h=padded_h,
|
||||
)
|
||||
return padded_image, dimensions
|
||||
except Exception as e:
|
||||
print(f"prepare_image error: {str(e)}")
|
||||
h, w = image.height, image.width
|
||||
dimensions = ImageDimensions(original_w=w, original_h=h, padded_w=w, padded_h=h)
|
||||
return np.zeros((h, w, 3), dtype=np.uint8), dimensions
|
||||
|
||||
|
||||
# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
|
||||
def parse_layout_string(bbox_str):
|
||||
"""Parse layout string using regular expressions"""
|
||||
pattern = r"\[(\d*\.?\d+),\s*(\d*\.?\d+),\s*(\d*\.?\d+),\s*(\d*\.?\d+)\]\s*(\w+)"
|
||||
matches = re.finditer(pattern, bbox_str)
|
||||
|
||||
parsed_results = []
|
||||
for match in matches:
|
||||
coords = [float(match.group(i)) for i in range(1, 5)]
|
||||
label = match.group(5).strip()
|
||||
parsed_results.append((coords, label))
|
||||
|
||||
return parsed_results
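
# Illustrative usage (hypothetical layout string, not from the original script):
#   parse_layout_string("[0.1, 0.2, 0.5, 0.6] text [0.0, 0.7, 1.0, 0.9] tab")
# returns
#   [([0.1, 0.2, 0.5, 0.6], "text"), ([0.0, 0.7, 1.0, 0.9], "tab")]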
|
||||
|
||||
|
||||
model_id = "ByteDance/Dolphin"
|
||||
|
||||
# The input image size for Dolphin is 896 x 896,
|
||||
# and the patch_size is 4 x 4.
|
||||
# Therefore, the initial number of patches is:
|
||||
# Height: 896 / 4 = 224 patches
|
||||
# Width: 896 / 4 = 224 patches
|
||||
|
||||
# The Dolphin model uses a staged downsampling approach,
|
||||
# defined by the "depths": [2, 2, 14, 2] configuration.
|
||||
# Before entering stages 2, 3, and 4, a "Patch Merging" operation is performed,
|
||||
# which halves the feature map's dimensions (dividing both height and width by 2).
|
||||
# Before Stage 2: The size changes from 224 x 224 to (224/2) x (224/2) = 112 x 112.
|
||||
# Before Stage 3: The size changes from 112 x 112 to (112/2) x (112/2) = 56 x 56.
|
||||
# Before Stage 4: The size changes from 56 x 56 to (56/2) x (56/2) = 28 x 28.
|
||||
|
||||
# Because vLLM needs to fill the image features with an encoder_prompt,
|
||||
# and the encoder_prompt will have `<pad>` tokens added when tokenized,
|
||||
# we need to construct an encoder_prompt with a length of 28 x 28 - 1 = 783.
|
||||
encoder_prompt = "".join(["0"] * 783)
|
||||
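
# Sanity check of the arithmetic above: 28 x 28 patches minus the one slot
# taken by the tokenizer-added `<pad>` token.
assert len(encoder_prompt) == 28 * 28 - 1
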
sampling_params = SamplingParams(
|
||||
temperature=0.0,
|
||||
max_tokens=2048,
|
||||
)
|
||||
|
||||
processor = DonutProcessor.from_pretrained(model_id)
|
||||
llm = LLM(
|
||||
model=model_id,
|
||||
dtype="float16",
|
||||
max_num_seqs=8,
|
||||
hf_overrides={"architectures": ["DonutForConditionalGeneration"]},
|
||||
)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--image_path", type=str, default=None, help="Path to a local image file."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.image_path:
|
||||
if not os.path.exists(args.image_path):
|
||||
raise FileNotFoundError(f"Error: File not found at {args.image_path}")
|
||||
image = Image.open(args.image_path).convert("RGB")
|
||||
else:
|
||||
image = fetch_image(
|
||||
"https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/0.jpg"
|
||||
)
|
||||
|
||||
|
||||
prompt = "Parse the reading order of this document. "
|
||||
decoder_prompt = f"<s>{prompt}<Answer/>"
|
||||
decoder_prompt_tokens = TokensPrompt(
|
||||
prompt_token_ids=processor.tokenizer(decoder_prompt, add_special_tokens=False)[
|
||||
"input_ids"
|
||||
]
|
||||
)
|
||||
enc_dec_prompt = ExplicitEncoderDecoderPrompt(
|
||||
encoder_prompt=TextPrompt(prompt=encoder_prompt, multi_modal_data={"image": image}),
|
||||
decoder_prompt=decoder_prompt_tokens,
|
||||
)
|
||||
layout_outputs = llm.generate(prompts=enc_dec_prompt, sampling_params=sampling_params)
|
||||
layout_result_str = layout_outputs[0].outputs[0].text
|
||||
print(f"Layout analysis output:\n{layout_result_str}")
|
||||
|
||||
padded_image, dims = prepare_image(image)
|
||||
layout_results = parse_layout_string(layout_result_str)
|
||||
text_table_elements = []
|
||||
previous_box = None
|
||||
reading_order = 0
|
||||
for bbox_coords, label in layout_results:
|
||||
if label == "fig":
|
||||
continue
|
||||
try:
|
||||
x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, previous_box = (
|
||||
process_coordinates(bbox_coords, padded_image, dims, previous_box)
|
||||
)
|
||||
cropped = padded_image[y1:y2, x1:x2]
|
||||
if cropped.size > 0 and cropped.shape[0] > 3 and cropped.shape[1] > 3:
|
||||
pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
|
||||
prompt_ocr = (
|
||||
"Parse the table in the image. "
|
||||
if label == "tab"
|
||||
else "Read text in the image. "
|
||||
)
|
||||
text_table_elements.append(
|
||||
{
|
||||
"crop": pil_crop,
|
||||
"prompt": prompt_ocr,
|
||||
"reading_order": reading_order,
|
||||
}
|
||||
)
|
||||
reading_order += 1
|
||||
except Exception as e:
|
||||
print(f"Error processing bbox (label: {label}): {str(e)}")
|
||||
continue
|
||||
|
||||
if text_table_elements:
|
||||
batch_prompts = []
|
||||
for elem in text_table_elements:
|
||||
decoder_prompt_str = f"<s>{elem['prompt']}<Answer/>"
|
||||
decoder_prompt_tokens = TokensPrompt(
|
||||
prompt_token_ids=processor.tokenizer(
|
||||
decoder_prompt_str, add_special_tokens=False
|
||||
)["input_ids"]
|
||||
)
|
||||
enc_dec_prompt = ExplicitEncoderDecoderPrompt(
|
||||
encoder_prompt=TextPrompt(
|
||||
prompt=encoder_prompt, multi_modal_data={"image": elem["crop"]}
|
||||
),
|
||||
decoder_prompt=decoder_prompt_tokens,
|
||||
)
|
||||
batch_prompts.append(enc_dec_prompt)
|
||||
batch_outputs = llm.generate(prompts=batch_prompts, sampling_params=sampling_params)
|
||||
for i, output in enumerate(batch_outputs):
|
||||
text_table_elements[i]["text"] = output.outputs[0].text.strip()
|
||||
|
||||
print("------" * 8)
|
||||
text_table_elements.sort(key=lambda x: x["reading_order"])
|
||||
for elem in text_table_elements:
|
||||
print(elem.get("text", ""))
|
@@ -1,195 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Demonstrate prompting of text-to-text
|
||||
encoder/decoder models, specifically BART and mBART.
|
||||
|
||||
This script is refactored to allow model selection via command-line arguments.
|
||||
|
||||
NOTE: This example is not yet supported in V1.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from typing import NamedTuple, Optional
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.inputs import (
|
||||
ExplicitEncoderDecoderPrompt,
|
||||
TextPrompt,
|
||||
TokensPrompt,
|
||||
zip_enc_dec_prompts,
|
||||
)
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
"""
|
||||
Holds the configuration for a specific model, including its
|
||||
HuggingFace ID and the prompts to use for the demo.
|
||||
"""
|
||||
|
||||
model_id: str
|
||||
encoder_prompts: list
|
||||
decoder_prompts: list
|
||||
hf_overrides: Optional[dict] = None
|
||||
|
||||
|
||||
def get_bart_config() -> ModelRequestData:
|
||||
"""
|
||||
Returns the configuration for facebook/bart-large-cnn.
|
||||
This uses the exact test cases from the original script.
|
||||
"""
|
||||
encoder_prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"An encoder prompt",
|
||||
]
|
||||
decoder_prompts = [
|
||||
"A decoder prompt",
|
||||
"Another decoder prompt",
|
||||
]
|
||||
return ModelRequestData(
|
||||
model_id="facebook/bart-large-cnn",
|
||||
encoder_prompts=encoder_prompts,
|
||||
decoder_prompts=decoder_prompts,
|
||||
)
|
||||
|
||||
|
||||
def get_mbart_config() -> ModelRequestData:
|
||||
"""
|
||||
Returns the configuration for facebook/mbart-large-en-ro.
|
||||
This uses prompts suitable for an English-to-Romanian translation task.
|
||||
"""
|
||||
encoder_prompts = [
|
||||
"The quick brown fox jumps over the lazy dog.",
|
||||
"How are you today?",
|
||||
]
|
||||
decoder_prompts = ["", ""]
|
||||
hf_overrides = {"architectures": ["MBartForConditionalGeneration"]}
|
||||
return ModelRequestData(
|
||||
model_id="facebook/mbart-large-en-ro",
|
||||
encoder_prompts=encoder_prompts,
|
||||
decoder_prompts=decoder_prompts,
|
||||
hf_overrides=hf_overrides,
|
||||
)
|
||||
|
||||
|
||||
MODEL_GETTERS = {
|
||||
"bart": get_bart_config,
|
||||
"mbart": get_mbart_config,
|
||||
}
|
||||
|
||||
|
||||
def create_all_prompt_types(
|
||||
encoder_prompts_raw: list,
|
||||
decoder_prompts_raw: list,
|
||||
tokenizer,
|
||||
) -> list:
|
||||
"""
|
||||
Generates a list of diverse prompt types for demonstration.
|
||||
This function is generic and uses the provided raw prompts
|
||||
to create various vLLM input objects.
|
||||
"""
|
||||
text_prompt_raw = encoder_prompts_raw[0]
|
||||
text_prompt = TextPrompt(prompt=encoder_prompts_raw[1 % len(encoder_prompts_raw)])
|
||||
tokens_prompt = TokensPrompt(
|
||||
prompt_token_ids=tokenizer.encode(
|
||||
encoder_prompts_raw[2 % len(encoder_prompts_raw)]
|
||||
)
|
||||
)
|
||||
|
||||
decoder_tokens_prompt = TokensPrompt(
|
||||
prompt_token_ids=tokenizer.encode(decoder_prompts_raw[0])
|
||||
)
|
||||
single_prompt_examples = [
|
||||
text_prompt_raw,
|
||||
text_prompt,
|
||||
tokens_prompt,
|
||||
]
|
||||
explicit_pair_examples = [
|
||||
ExplicitEncoderDecoderPrompt(
|
||||
encoder_prompt=text_prompt_raw,
|
||||
decoder_prompt=decoder_tokens_prompt,
|
||||
),
|
||||
ExplicitEncoderDecoderPrompt(
|
||||
encoder_prompt=text_prompt,
|
||||
decoder_prompt=decoder_prompts_raw[1 % len(decoder_prompts_raw)],
|
||||
),
|
||||
ExplicitEncoderDecoderPrompt(
|
||||
encoder_prompt=tokens_prompt,
|
||||
decoder_prompt=text_prompt,
|
||||
),
|
||||
]
|
||||
zipped_prompt_list = zip_enc_dec_prompts(
|
||||
encoder_prompts_raw,
|
||||
decoder_prompts_raw,
|
||||
)
|
||||
return single_prompt_examples + explicit_pair_examples + zipped_prompt_list
|
||||
|
||||
|
||||
def create_sampling_params() -> SamplingParams:
|
||||
"""Create a sampling params object."""
|
||||
return SamplingParams(
|
||||
temperature=0,
|
||||
top_p=1.0,
|
||||
min_tokens=0,
|
||||
max_tokens=30,
|
||||
)
|
||||
|
||||
|
||||
def print_outputs(outputs: list):
|
||||
"""Formats and prints the generation outputs."""
|
||||
print("-" * 80)
|
||||
for i, output in enumerate(outputs):
|
||||
prompt = output.prompt
|
||||
encoder_prompt = output.encoder_prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Output {i + 1}:")
|
||||
print(f"Encoder Prompt: {encoder_prompt!r}")
|
||||
print(f"Decoder Prompt: {prompt!r}")
|
||||
print(f"Generated Text: {generated_text!r}")
|
||||
print("-" * 80)
|
||||
|
||||
|
||||
def main(args):
|
||||
"""Main execution function."""
|
||||
model_key = args.model
|
||||
if model_key not in MODEL_GETTERS:
|
||||
raise ValueError(
|
||||
f"Unknown model: {model_key}. "
|
||||
f"Available models: {list(MODEL_GETTERS.keys())}"
|
||||
)
|
||||
config_getter = MODEL_GETTERS[model_key]
|
||||
model_config = config_getter()
|
||||
|
||||
print(f"🚀 Running demo for model: {model_config.model_id}")
|
||||
llm = LLM(
|
||||
model=model_config.model_id,
|
||||
dtype="float",
|
||||
hf_overrides=model_config.hf_overrides,
|
||||
)
|
||||
tokenizer = llm.llm_engine.get_tokenizer_group()
|
||||
prompts = create_all_prompt_types(
|
||||
encoder_prompts_raw=model_config.encoder_prompts,
|
||||
decoder_prompts_raw=model_config.decoder_prompts,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
sampling_params = create_sampling_params()
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
print_outputs(outputs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="A flexible demo for vLLM encoder-decoder models."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
"-m",
|
||||
type=str,
|
||||
default="bart",
|
||||
choices=MODEL_GETTERS.keys(),
|
||||
help="The short name of the model to run.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
@@ -13,8 +13,6 @@ from typing import NamedTuple
|
||||
|
||||
from vllm import LLM, EngineArgs, PromptType, SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
@@ -23,113 +21,6 @@ class ModelRequestData(NamedTuple):
|
||||
prompts: Sequence[PromptType]
|
||||
|
||||
|
||||
def run_donut():
|
||||
engine_args = EngineArgs(
|
||||
model="naver-clova-ix/donut-base-finetuned-docvqa",
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
dtype="float16",
|
||||
hf_overrides={"architectures": ["DonutForConditionalGeneration"]},
|
||||
)
|
||||
|
||||
# The input image size for donut-base-finetuned-docvqa is 2560 x 1920,
|
||||
# and the patch_size is 4 x 4.
|
||||
# Therefore, the initial number of patches is:
|
||||
# Height: 1920 / 4 = 480 patches
|
||||
# Width: 2560 / 4 = 640 patches
|
||||
# The Swin model uses a staged downsampling approach,
|
||||
# defined by the "depths": [2, 2, 14, 2] configuration.
|
||||
# Before entering stages 2, 3, and 4, a "Patch Merging" operation is performed,
|
||||
# which halves the feature map's dimensions (dividing both height and width by 2).
|
||||
# Before Stage 2: The size changes from 480 x 640 to (480/2) x (640/2) = 240 x 320.
|
||||
# Before Stage 3: The size changes from 240 x 320 to (240/2) x (320/2) = 120 x 160.
|
||||
# Before Stage 4: The size changes from 120 x 160 to (120/2) x (160/2) = 60 x 80.
|
||||
# Because vLLM needs to fill the image features with an encoder_prompt,
|
||||
# and the encoder_prompt will have `<pad>` tokens added when tokenized,
|
||||
# we need to construct an encoder_prompt with a length of 60 x 80 - 1 = 4799.
|
||||
prompts = [
|
||||
{
|
||||
"encoder_prompt": {
|
||||
"prompt": "".join(["$"] * 4799),
|
||||
"multi_modal_data": {
|
||||
"image": fetch_image(
|
||||
"https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/0.jpg"
|
||||
) # noqa: E501
|
||||
},
|
||||
},
|
||||
"decoder_prompt": "<s_docvqa><s_question>What time is the coffee break?</s_question><s_answer>", # noqa: E501
|
||||
},
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
def run_florence2():
|
||||
engine_args = EngineArgs(
|
||||
model="microsoft/Florence-2-large",
|
||||
tokenizer="Isotr0py/Florence-2-tokenizer",
|
||||
max_num_seqs=8,
|
||||
trust_remote_code=True,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
dtype="half",
|
||||
)
|
||||
|
||||
prompts = [
|
||||
{ # implicit prompt with task token
|
||||
"prompt": "<DETAILED_CAPTION>",
|
||||
"multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
|
||||
},
|
||||
{ # explicit encoder/decoder prompt
|
||||
"encoder_prompt": {
|
||||
"prompt": "Describe in detail what is shown in the image.",
|
||||
"multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
|
||||
},
|
||||
"decoder_prompt": "",
|
||||
},
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
def run_mllama():
|
||||
engine_args = EngineArgs(
|
||||
model="meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
dtype="half",
|
||||
)
|
||||
|
||||
prompts = [
|
||||
{ # Implicit prompt
|
||||
"prompt": "<|image|><|begin_of_text|>What is the content of this image?", # noqa: E501
|
||||
"multi_modal_data": {
|
||||
"image": ImageAsset("stop_sign").pil_image,
|
||||
},
|
||||
},
|
||||
{ # Explicit prompt
|
||||
"encoder_prompt": {
|
||||
"prompt": "<|image|>",
|
||||
"multi_modal_data": {
|
||||
"image": ImageAsset("stop_sign").pil_image,
|
||||
},
|
||||
},
|
||||
"decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501
|
||||
},
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
def run_whisper():
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
@@ -166,9 +57,6 @@ def run_whisper():
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"donut": run_donut,
|
||||
"florence2": run_florence2,
|
||||
"mllama": run_mllama,
|
||||
"whisper": run_whisper,
|
||||
}
|
||||
|
||||
@@ -182,7 +70,7 @@ def parse_args():
|
||||
"--model-type",
|
||||
"-m",
|
||||
type=str,
|
||||
default="mllama",
|
||||
default="whisper",
|
||||
choices=model_example_map.keys(),
|
||||
help='Huggingface "model_type".',
|
||||
)
|
||||
|
@@ -204,28 +204,6 @@ def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# Florence2
|
||||
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="microsoft/Florence-2-large",
|
||||
tokenizer="Isotr0py/Florence-2-tokenizer",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Fuyu
|
||||
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@@ -1008,44 +986,6 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# LLama 3.2
|
||||
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||
|
||||
# Note: The default setting of max_num_seqs (256) and
|
||||
# max_model_len (131072) for this model may cause OOM.
|
||||
# You may lower either to run this example on lower-end GPUs.
|
||||
|
||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [{"type": "image"}, {"type": "text", "text": question}],
|
||||
}
|
||||
]
|
||||
for question in questions
|
||||
]
|
||||
prompts = tokenizer.apply_chat_template(
|
||||
messages, add_generation_prompt=True, tokenize=False
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Molmo
|
||||
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@@ -1665,7 +1605,6 @@ model_example_map = {
|
||||
"command_a_vision": run_command_a_vision,
|
||||
"deepseek_vl_v2": run_deepseek_vl2,
|
||||
"ernie45_vl": run_ernie45_vl,
|
||||
"florence2": run_florence2,
|
||||
"fuyu": run_fuyu,
|
||||
"gemma3": run_gemma3,
|
||||
"gemma3n": run_gemma3n,
|
||||
@@ -1691,7 +1630,6 @@ model_example_map = {
|
||||
"minicpmv": run_minicpmv,
|
||||
"minimax_vl_01": run_minimax_vl_01,
|
||||
"mistral3": run_mistral3,
|
||||
"mllama": run_mllama,
|
||||
"molmo": run_molmo,
|
||||
"nemotron_vl": run_nemotron_vl,
|
||||
"NVLM_D": run_nvlm_d,
|
||||
|
@@ -637,26 +637,6 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||
|
||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
img_prompt = "Given the first image <|image|> and the second image<|image|>"
|
||||
prompt = f"<|begin_of_text|>{img_prompt}, {question}?"
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "nvidia/NVLM-D-72B"
|
||||
|
||||
@@ -1253,7 +1233,6 @@ model_example_map = {
|
||||
"llava-next": load_llava_next,
|
||||
"llava-onevision": load_llava_onevision,
|
||||
"mistral3": load_mistral3,
|
||||
"mllama": load_mllama,
|
||||
"NVLM_D": load_nvlm_d,
|
||||
"ovis": load_ovis,
|
||||
"ovis2_5": load_ovis2_5,
|
||||
|
@@ -3,15 +3,12 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
|
||||
STR_NOT_IMPL_ENC_DEC_SWA)
|
||||
from vllm.core.block_manager import SelfAttnBlockSpaceManager
|
||||
from vllm.core.interfaces import AllocStatus
|
||||
from vllm.sequence import Logprob, SequenceStatus
|
||||
from vllm.utils import chunk_list
|
||||
|
||||
from ..utils import (create_dummy_prompt, create_seq_group,
|
||||
create_seq_group_encoder_decoder)
|
||||
from ..utils import create_dummy_prompt, create_seq_group
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@@ -58,156 +55,6 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
|
||||
assert can_allocate_result == AllocStatus.LATER
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160])
|
||||
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
|
||||
@pytest.mark.parametrize("watermark", [0.0, 0.5])
|
||||
def test_can_allocate_seq_group_encoder_decoder(block_size: int,
|
||||
num_seqs_per_group: int,
|
||||
num_gpu_blocks: int,
|
||||
watermark: float):
|
||||
block_manager = SelfAttnBlockSpaceManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
watermark=watermark,
|
||||
)
|
||||
num_watermark_blocks = int(watermark * num_gpu_blocks)
|
||||
|
||||
num_output_blocks_per_seq = 1
|
||||
|
||||
# NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
|
||||
# the current implementation assumes all seqs are new prompts / don't have
|
||||
# different output lens.
|
||||
num_output_blocks = num_output_blocks_per_seq
|
||||
|
||||
for bdx, num_prompt_blocks in enumerate(
|
||||
range(1, num_gpu_blocks - num_output_blocks)):
|
||||
num_cross_blocks_per_seq = num_prompt_blocks
|
||||
|
||||
seq_group = create_seq_group_encoder_decoder(
|
||||
seq_prompt_len=block_size * num_prompt_blocks,
|
||||
seq_output_lens=[
|
||||
block_size * num_output_blocks_per_seq
|
||||
for _ in range(num_seqs_per_group)
|
||||
],
|
||||
request_id=str(bdx))
|
||||
|
||||
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
|
||||
|
||||
can_allocate_result = block_manager.can_allocate(seq_group)
|
||||
|
||||
num_required_blocks = num_prompt_blocks + \
|
||||
num_output_blocks + \
|
||||
num_cross_blocks_per_seq
|
||||
|
||||
if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
|
||||
assert can_allocate_result == AllocStatus.NEVER
|
||||
elif num_gpu_blocks >= num_required_blocks:
|
||||
assert can_allocate_result == AllocStatus.OK
|
||||
else:
|
||||
assert can_allocate_result == AllocStatus.LATER
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [16])
|
||||
@pytest.mark.parametrize("num_seqs_per_group", [1])
|
||||
@pytest.mark.parametrize("watermark", [0.0, 0.5])
|
||||
def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
|
||||
num_seqs_per_group: int,
|
||||
num_gpu_blocks: int,
|
||||
watermark: float):
|
||||
'''
|
||||
SWA is short for Sliding Window Attention.
|
||||
|
||||
At time of writing block manager does not support SWA.
|
||||
|
||||
However even when SWA is implemented for block manager,
|
||||
there will still most likely be a separate workstream required
|
||||
to enable SWA for encoder/decoder models.
|
||||
|
||||
Therefore this test enforces that one of the following cases
holds true:
|
||||
1. Block manager does not support SWA at all (true at time of writing)
|
||||
2. Block manager fails with NotImplementedError when SWA is enabled
|
||||
AND a SequenceGroup with an encoder sequence (i.e. in support of an
|
||||
encoder/decoder model) is passed into can_allocate() as an argument
|
||||
|
||||
The setup for this test is stripped down version of
|
||||
test_can_allocate_seq_group_encoder_decoder()
|
||||
'''
|
||||
|
||||
with pytest.raises((NotImplementedError, AssertionError)) as exc_info:
|
||||
block_manager = SelfAttnBlockSpaceManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
watermark=watermark,
|
||||
sliding_window=5 # SWA
|
||||
)
|
||||
|
||||
num_output_blocks_per_seq = 1
|
||||
num_prompt_blocks = 1
|
||||
num_output_blocks = num_output_blocks_per_seq
|
||||
seq_group = create_seq_group_encoder_decoder(
|
||||
seq_prompt_len=block_size * num_prompt_blocks,
|
||||
seq_output_lens=[
|
||||
block_size * num_output_blocks_per_seq
|
||||
for _ in range(num_seqs_per_group)
|
||||
],
|
||||
request_id="0")
|
||||
|
||||
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
|
||||
block_manager.can_allocate(seq_group)
|
||||
|
||||
# Assert that either
|
||||
# 1. Block manager constructor fails with assertion that sliding window
|
||||
# is not yet supported (most likely near-term outcome at time of
|
||||
# writing), or
|
||||
# 2. can_allocate() fails with NotImplementedError due to combination of
|
||||
# encoder/decoder and sliding window attention
|
||||
if isinstance(exc_info.value, NotImplementedError):
|
||||
assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA
|
||||
elif isinstance(exc_info.value, AssertionError):
|
||||
assert str(exc_info.value) == "Sliding window not yet supported"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [16])
|
||||
@pytest.mark.parametrize("num_seqs_per_group", [1])
|
||||
@pytest.mark.parametrize("watermark", [0.0, 0.5])
|
||||
def test_can_allocate_encoder_decoder_fails_with_prefix_cache(
|
||||
block_size: int, num_seqs_per_group: int, num_gpu_blocks: int,
|
||||
watermark: float):
|
||||
|
||||
block_manager = SelfAttnBlockSpaceManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
watermark=watermark,
|
||||
enable_caching=True # Prefix cache
|
||||
)
|
||||
|
||||
num_output_blocks_per_seq = 1
|
||||
num_prompt_blocks = 1
|
||||
num_output_blocks = num_output_blocks_per_seq
|
||||
seq_group = create_seq_group_encoder_decoder(
|
||||
seq_prompt_len=block_size * num_prompt_blocks,
|
||||
seq_output_lens=[
|
||||
block_size * num_output_blocks_per_seq
|
||||
for _ in range(num_seqs_per_group)
|
||||
],
|
||||
request_id="0")
|
||||
|
||||
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
|
||||
|
||||
# Assert that can_allocate() fails with NotImplementedError
# due to the combination of encoder/decoder and prefix cache
|
||||
with pytest.raises(NotImplementedError) as exc_info:
|
||||
block_manager.can_allocate(seq_group)
|
||||
assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
|
||||
@pytest.mark.parametrize("prompt_len", [1, 7, 8])
|
||||
@pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
|
||||
|
@@ -1,105 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest # noqa
|
||||
|
||||
from vllm.config import CacheConfig, SchedulerConfig
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.sequence import SequenceGroup
|
||||
|
||||
from .utils import (append_new_token, create_dummy_prompt_encoder_decoder,
|
||||
get_sequence_groups, schedule_and_update_computed_tokens)
|
||||
|
||||
|
||||
def test_scheduler_schedule_simple_encoder_decoder():
|
||||
'''
|
||||
Test basic scheduler functionality in the context
of an encoder/decoder model. Focus on testing
enc/dec-specific functionality, since tests already
exist for decoder-only functionality
|
||||
|
||||
Test behavior:
|
||||
* Construct Scheduler
|
||||
* Construct dummy encoder/decoder sequence groups
|
||||
* Add dummy seq groups to scheduler backlog
|
||||
* Schedule the next seq group & validate:
|
||||
* Cross-attn block tables
|
||||
* Updated states of seq groups
|
||||
* Number of batched tokens
|
||||
* Number of blocks to copy/swap-in/swap-out
|
||||
* Number of scheduled seq groups
|
||||
* Repeat for both prefill- and decode-phase
|
||||
* Abort scheduled seq groups
|
||||
* Assert that aborted seq groups no longer appear in
|
||||
cross-attention block table
|
||||
'''
|
||||
|
||||
block_size = 4
|
||||
num_seq_group = 4
|
||||
max_model_len = 16
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens=64,
|
||||
max_num_seqs=num_seq_group,
|
||||
max_model_len=max_model_len,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group
|
||||
cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
running: list[SequenceGroup] = []
|
||||
|
||||
# Add seq groups to scheduler.
|
||||
req_id_list = []
|
||||
for i in range(num_seq_group):
|
||||
req_id = str(i)
|
||||
req_id_list.append(req_id)
|
||||
_, _, seq_group = create_dummy_prompt_encoder_decoder(
|
||||
req_id, block_size, block_size, block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
running.append(seq_group)
|
||||
|
||||
# Schedule seq groups prefill.
|
||||
num_tokens = block_size * num_seq_group
|
||||
seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
|
||||
# - Verify that sequence group cross-attention block tables are
|
||||
# registered with the block manager
|
||||
assert all([(req_id in scheduler.block_manager.cross_block_tables)
|
||||
for req_id in req_id_list])
|
||||
# - Validate sequence-group status
|
||||
assert set(get_sequence_groups(out)) == set(running)
|
||||
# - Validate number of batched tokens
|
||||
assert out.num_batched_tokens == num_tokens
|
||||
# - Validate there are no remaining blocks to swap
|
||||
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
|
||||
and not out.blocks_to_swap_out)
|
||||
# - Validate all seq groups were scheduled
|
||||
assert len(seq_group_meta_list) == num_seq_group
|
||||
append_new_token(out, 1)
|
||||
|
||||
# Schedule seq groups decode.
|
||||
seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
|
||||
# - Verify that sequence group metadata includes encoder attention
|
||||
# and cross-attention metadata
|
||||
assert all([
|
||||
not ((seq_group_meta.encoder_seq_data is None) or
|
||||
(seq_group_meta.cross_block_table is None))
|
||||
for seq_group_meta in seq_group_meta_list
|
||||
])
|
||||
# - Validate sequence-group status
|
||||
assert set(get_sequence_groups(out)) == set(running)
|
||||
# - Validate there is one batched token per seq group
|
||||
assert out.num_batched_tokens == num_seq_group
|
||||
# - Validate there are no remaining blocks to swap
|
||||
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
|
||||
and not out.blocks_to_swap_out)
|
||||
# - Validate that all seq groups were scheduled
|
||||
assert len(seq_group_meta_list) == num_seq_group
|
||||
append_new_token(out, 1)
|
||||
|
||||
# Abort sequences
|
||||
for req_id in req_id_list:
|
||||
scheduler.abort_seq_group(req_id)
|
||||
# - Verify that sequence group cross-attention block tables are
|
||||
# NO LONGER registered with the block manager
|
||||
assert req_id not in scheduler.block_manager.cross_block_tables
|
@@ -242,9 +242,6 @@ MULTIMODAL_MODELS = {
    "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
    "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
    # [Encoder-decoder]
    # TODO: Implement PP
    # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
}
# yapf: enable
@@ -1,131 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""E2E tests to verify the correctness of the encoder-decoder framework
|
||||
|
||||
Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
|
||||
"""
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
|
||||
global_force_attn_backend_context_manager)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import DecoderPromptType
|
||||
from ..models.utils import check_logprobs_close
|
||||
|
||||
LIST_ENC_DEC_SUPPORTED_BACKENDS = [
|
||||
_Backend.XFORMERS, _Backend.FLASH_ATTN, None
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v0_only(monkeypatch):
|
||||
"""
|
||||
Since this module is V0 only, set VLLM_USE_V1=0 for
|
||||
all tests in the module.
|
||||
"""
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
|
||||
def vllm_to_hf_output(
|
||||
vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
|
||||
decoder_prompt_type: DecoderPromptType,
|
||||
):
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = output_str + "</s>"
|
||||
if decoder_prompt_type == DecoderPromptType.NONE:
|
||||
hf_output_str = "<s>" + hf_output_str
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def clear_cache():
|
||||
"""Fixture to clear backend cache before each test."""
|
||||
_cached_get_attn_backend.cache_clear() # Clear the cache
|
||||
yield # This allows the test to run
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_cpu(),
|
||||
reason="CPU backend is not currently supported with encoder/decoder models"
|
||||
)
|
||||
@pytest.mark.skip(reason="bart not supported in V1")
|
||||
def test_encoder_decoder_e2e(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_encoder_decoder_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
decoder_prompt_type: DecoderPromptType,
|
||||
enforce_eager: bool,
|
||||
attn_backend: _Backend,
|
||||
) -> None:
|
||||
'''
|
||||
End-to-End (E2E) test for the encoder-decoder framework.
|
||||
This test evaluates the encoder-decoder functionality using the BART
|
||||
model. We compare the outputs of the Hugging Face and vLLM
|
||||
implementations to ensure that both implementations produce consistent
|
||||
and correct results.
|
||||
'''
|
||||
with global_force_attn_backend_context_manager(attn_backend):
|
||||
if attn_backend == _Backend.FLASH_ATTN:
|
||||
# Flash Attention works only with bfloat16 data-type
|
||||
dtype = 'bfloat16'
|
||||
test_case_prompts = example_encoder_decoder_prompts[
|
||||
decoder_prompt_type]
|
||||
|
||||
# Configuration settings for HF baseline
|
||||
hf_kwargs = {
|
||||
"top_k": None,
|
||||
"num_beams": 1,
|
||||
"repetition_penalty": 1.0,
|
||||
"top_p": 1.0,
|
||||
"length_penalty": 1.0,
|
||||
"early_stopping": False,
|
||||
"no_repeat_ngram_size": None,
|
||||
"min_length": 0
|
||||
}
|
||||
|
||||
with hf_runner(model, dtype=dtype,
|
||||
auto_cls=AutoModelForSeq2SeqLM) as hf_model:
|
||||
hf_outputs = (
|
||||
hf_model.generate_encoder_decoder_greedy_logprobs_limit(
|
||||
test_case_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
**hf_kwargs,
|
||||
))
|
||||
with vllm_runner(model, dtype=dtype,
|
||||
enforce_eager=enforce_eager) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
|
||||
test_case_prompts, max_tokens, num_logprobs)
|
||||
|
||||
hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
|
||||
else 0)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=[
|
||||
vllm_to_hf_output(vllm_output, decoder_prompt_type)
|
||||
for vllm_output in vllm_outputs
|
||||
],
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
num_outputs_0_skip_tokens=hf_skip_tokens,
|
||||
)
|
@@ -1,56 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "facebook/bart-base"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--enforce-eager",
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.skip(reason="bart is not yet supported in V1")
|
||||
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
|
||||
completion = await client.completions.create(model=model_name,
|
||||
prompt="Hello, my name is",
|
||||
max_tokens=5,
|
||||
temperature=0.0)
|
||||
|
||||
assert completion.id is not None
|
||||
assert completion.choices is not None and len(completion.choices) == 1
|
||||
|
||||
choice = completion.choices[0]
|
||||
assert len(choice.text) >= 5
|
||||
assert choice.finish_reason == "length"
|
||||
assert completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=5, prompt_tokens=2, total_tokens=7)
|
||||
|
||||
# test using token IDs
|
||||
completion = await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=[0, 0, 0, 0, 0],
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
)
|
||||
assert len(completion.choices[0].text) >= 1
|
@@ -20,7 +20,6 @@ from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
|
||||
parse_chat_messages_futures,
|
||||
resolve_chat_template_content_format,
|
||||
resolve_hf_chat_template)
|
||||
from vllm.entrypoints.llm import apply_hf_chat_template
|
||||
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
|
||||
from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64,
|
||||
encode_video_base64)
|
||||
@@ -38,7 +37,6 @@ QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
|
||||
QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
|
||||
QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
|
||||
MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||
LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B"
|
||||
HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
@@ -125,27 +123,6 @@ def qwen25omni_tokenizer():
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def mllama_model_config():
|
||||
return ModelConfig(
|
||||
MLLAMA_MODEL_ID,
|
||||
runner="generate",
|
||||
limit_mm_per_prompt={
|
||||
"image": 2,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def mllama_tokenizer():
|
||||
return TokenizerGroup(
|
||||
MLLAMA_MODEL_ID,
|
||||
enable_lora=False,
|
||||
max_num_seqs=5,
|
||||
max_input_length=None,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def mistral_model_config():
|
||||
return ModelConfig(
|
||||
@@ -2249,180 +2226,6 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
|
||||
)
|
||||
|
||||
|
||||
### Mllama currently wraps images / texts as interleaved dictionaries
|
||||
def test_mllama_single_image(
|
||||
mllama_model_config,
|
||||
mllama_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
"""Ensures that a single image is parsed correctly mllama."""
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "The content of this image is:"
|
||||
},
|
||||
{
|
||||
"image_url": image_url
|
||||
},
|
||||
],
|
||||
}],
|
||||
mllama_model_config,
|
||||
mllama_tokenizer,
|
||||
content_format="openai",
|
||||
)
|
||||
_assert_mm_data_is_image_input(mm_data, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
|
||||
assert conversation == [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "The content of this image is:"
|
||||
},
|
||||
{
|
||||
"type": "image"
|
||||
},
|
||||
],
|
||||
}]
|
||||
|
||||
|
||||
def test_mllama_interleaved_images(
|
||||
mllama_model_config,
|
||||
mllama_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
"""Ensures that multiple image are parsed as interleaved dicts."""
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "The content of the first image is:",
|
||||
},
|
||||
{
|
||||
"image_url": image_url
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "The content of the second image is:",
|
||||
},
|
||||
{
|
||||
"image_url": image_url
|
||||
},
|
||||
],
|
||||
}],
|
||||
mllama_model_config,
|
||||
mllama_tokenizer,
|
||||
content_format="openai",
|
||||
)
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
assert conversation == [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "The content of the first image is:"
|
||||
},
|
||||
{
|
||||
"type": "image"
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "The content of the second image is:"
|
||||
},
|
||||
{
|
||||
"type": "image"
|
||||
},
|
||||
],
|
||||
}]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [MLLAMA_MODEL_ID])
|
||||
def test_multimodal_image_parsing_matches_hf(model, image_url):
|
||||
"""Checks end to end hf alignment for multimodal [image] parsing."""
|
||||
|
||||
def get_conversation(is_hf: bool):
|
||||
img_part = {"type": "image_url", "image_url": {"url": image_url}}
|
||||
if is_hf:
|
||||
img_part = {"type": "image"}
|
||||
return [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "The content of the first image is:",
|
||||
},
|
||||
img_part,
|
||||
{
|
||||
"type": "text",
|
||||
"text": "The content of the second image is:",
|
||||
},
|
||||
img_part,
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What animal is in the first image?",
|
||||
},
|
||||
],
|
||||
}]
|
||||
|
||||
# Build a config for the model
|
||||
model_config = ModelConfig(
|
||||
model,
|
||||
runner="generate",
|
||||
limit_mm_per_prompt={
|
||||
"image": 2,
|
||||
},
|
||||
)
|
||||
|
||||
# Build the tokenizer group and grab the underlying tokenizer
|
||||
tokenizer_group = TokenizerGroup(
|
||||
model,
|
||||
enable_lora=False,
|
||||
max_num_seqs=5,
|
||||
max_input_length=None,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
tokenizer = tokenizer_group.tokenizer
|
||||
|
||||
# Build and parse a conversation with {"type": "image"} using the tokenizer
|
||||
hf_conversation = get_conversation(is_hf=True)
|
||||
hf_result = tokenizer.apply_chat_template(
|
||||
hf_conversation,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
|
||||
# Now parse with vLLM's chat utils & apply the template
|
||||
vllm_conversation = get_conversation(is_hf=False)
|
||||
conversation, _, _ = parse_chat_messages(
|
||||
vllm_conversation,
|
||||
model_config,
|
||||
tokenizer_group,
|
||||
content_format="openai",
|
||||
)
|
||||
|
||||
vllm_result = apply_hf_chat_template(
|
||||
tokenizer=tokenizer,
|
||||
conversation=conversation,
|
||||
chat_template=None,
|
||||
model_config=model_config,
|
||||
tools=None,
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
|
||||
assert hf_result == vllm_result
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
@ -2486,7 +2289,6 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
|
||||
(QWEN25VL_MODEL_ID, "openai"),
|
||||
(ULTRAVOX_MODEL_ID, "string"),
|
||||
(QWEN2AUDIO_MODEL_ID, "openai"),
|
||||
(MLLAMA_MODEL_ID, "openai"),
|
||||
(LLAMA_GUARD_MODEL_ID, "openai")],
|
||||
)
|
||||
# yapf: enable
|
||||
@ -2545,7 +2347,6 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
||||
[("Salesforce/blip2-opt-2.7b", "string"),
|
||||
("facebook/chameleon-7b", "string"),
|
||||
("deepseek-ai/deepseek-vl2-tiny", "string"),
|
||||
("microsoft/Florence-2-base", "string"),
|
||||
("adept/fuyu-8b", "string"),
|
||||
("google/paligemma-3b-mix-224", "string"),
|
||||
("Qwen/Qwen-VL", "string"),
|
||||
|
File diff suppressed because it is too large
@ -1,222 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
|
||||
HfRunner, VllmRunner)
|
||||
from ....utils import multi_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
def vllm_to_hf_output(
|
||||
vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
|
||||
decoder_prompt_type: DecoderPromptType,
|
||||
):
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = output_str + "</s>"
|
||||
if decoder_prompt_type == DecoderPromptType.NONE:
|
||||
hf_output_str = "<s>" + hf_output_str
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
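# For example, with decoder_prompt_type == DecoderPromptType.NONE an output
# string "The rain in Spain" is rewritten to "<s>The rain in Spain</s>" so
# that it lines up with HF, which emits both special tokens (the sample text
# here is illustrative only).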
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
|
||||
decoder_prompt_type: DecoderPromptType,
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
) -> None:
|
||||
'''
Test the vLLM BART model for a variety of encoder/decoder input prompts,
by validating it against HuggingFace (HF) BART.

Arguments:

* hf_runner: HuggingFace (HF) test model runner
* vllm_runner: vLLM test model runner
* prompts: the test prompts, drawn from the example_encoder_decoder_prompts
  fixture (a dictionary of dummy prompts)
* model: the HF ID of the specific BART variant under test
* dtype: the tensor datatype to employ
* max_tokens
* num_logprobs
* decoder_prompt_type: key into the example_encoder_decoder_prompts
  dictionary; selects specific encoder/decoder prompt scenarios to test

A note on using HF BART as a baseline for validating vLLM BART,
specifically when the decoder prompt is None.

The HF GenerationMixin's default behavior is to force the first
decoded token to be <BOS> if the prompt does not already contain
<BOS> (this is accomplished using a logit processor setting).

So when we use HF BART as our baseline for comparison, note that
when the user provides a request with a None decoder prompt
(i.e. a singleton encoder prompt, or else an explicit encoder/
decoder prompt with the decoder sub-prompt set to None), HF and
vLLM handle this in different ways:

* HF will (1) tokenize the None prompt as an empty token list,
  (2) append <decoder-start-token> to the beginning, yielding
  [<decoder-start-token>], (3) pass this token list to the model, and
  then (4) after computing logits during prefill, override the model
  logits & force <BOS> to be the first generated token.

* vLLM will (1) tokenize the None prompt as [<BOS>], (2) append
  <decoder-start-token> to the beginning, yielding
  [<decoder-start-token><BOS>], (3) pass these tokens to the model &
  proceed with generation.

The net effect is that compared to vLLM, the list of HF *decoded* tokens
will contain one more initial <BOS> than the vLLM generated tokens,
because vLLM's <BOS> token is injected into the prompt rather than into
the generated output. This is in spite of the fact that overall, the
complete sequences (prompt + decoded tokens) produced by vLLM will match
HF.

So when we use HF decoded token output to validate vLLM's decoded token
output, the testing process must account for the difference in decoded
token sequences between vLLM and HF specifically in the
decoder-prompt-is-None case.

One option is to disable the logit processor feature that forces the
<BOS> token to be decoded (forced_bos_token_id = None), eliminating
the problem entirely. However, this is not "normal" BART usage.

The other option is - only in the decoder-prompt-is-None case - to
discard the first decoded token from the HF output before comparing it
to vLLM.

To that end, when testing the scenario where the decoder prompt is None
(and only in that one scenario), this test skips the first HF decoded
token during the process of validating the vLLM decoded output.
'''
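# A minimal sketch of the workaround described above (hypothetical variable
# names, not part of the fixture API): drop the first HF-decoded token before
# comparing whenever the decoder prompt is None, e.g.
#
#     skip = 1 if decoder_prompt_type == DecoderPromptType.NONE else 0
#     trimmed_hf_tokens = hf_decoded_tokens[skip:]
#
# which is what the num_outputs_0_skip_tokens argument passed to
# check_logprobs_close below accomplishes.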
|
||||
|
||||
# NOTE: take care of the order: run vLLM first, and then run HF.
# vLLM needs a fresh new process without CUDA initialization;
# if we run HF first, CUDA will already be initialized, which
# hurts the multiprocessing backend with the fork method (the default).

# Note: currently encoder/decoder models are only compatible with
# enforce_eager=True. Normally this is not a problem because vLLM
# defaults to enforce_eager=True for encoder/decoder models when
# enforce_eager is left unspecified. However, the VllmRunner test
# fixture (which wraps the LLM class) defaults to enforce_eager=False
# (a behavior that a number of existing decoder-only unit tests expect),
# so when testing an encoder/decoder model we must explicitly specify
# enforce_eager=True in the VllmRunner constructor.
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
|
||||
prompts, max_tokens, num_logprobs)
|
||||
|
||||
# Configuration settings for HF baseline
|
||||
hf_kwargs = {
|
||||
"top_k": None,
|
||||
"num_beams": 1,
|
||||
"repetition_penalty": 1.0,
|
||||
"top_p": 1.0,
|
||||
"length_penalty": 1.0,
|
||||
"early_stopping": False,
|
||||
"no_repeat_ngram_size": None,
|
||||
"min_length": 0
|
||||
}
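# For context, the vLLM half of this comparison runs greedy decoding with
# logprobs via the runner helper above; a rough stand-alone equivalent
# (a sketch, not what the fixture actually does internally) would be:
#
#     from vllm import SamplingParams
#     greedy_params = SamplingParams(temperature=0.0,
#                                    max_tokens=max_tokens,
#                                    logprobs=num_logprobs)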
|
||||
|
||||
with hf_runner(model, dtype=dtype,
|
||||
auto_cls=AutoModelForSeq2SeqLM) as hf_model:
|
||||
hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
**hf_kwargs,
|
||||
))
|
||||
|
||||
hf_skip_tokens = (1
|
||||
if decoder_prompt_type == DecoderPromptType.NONE else 0)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=[
|
||||
vllm_to_hf_output(vllm_output, decoder_prompt_type)
|
||||
for vllm_output in vllm_outputs
|
||||
],
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
num_outputs_0_skip_tokens=hf_skip_tokens,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
pytest.param("facebook/bart-base",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
|
||||
pytest.param("facebook/bart-large-cnn"),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
|
||||
@pytest.mark.skip(reason="bart not supported in V1")
|
||||
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
|
||||
dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_encoder_decoder_prompts[decoder_prompt_type],
|
||||
decoder_prompt_type,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
|
||||
@pytest.mark.skip(reason="bart not supported in V1")
|
||||
def test_models_distributed(hf_runner, vllm_runner,
|
||||
example_encoder_decoder_prompts,
|
||||
distributed_executor_backend, model, dtype,
|
||||
max_tokens, num_logprobs,
|
||||
decoder_prompt_type) -> None:
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_encoder_decoder_prompts[decoder_prompt_type],
|
||||
decoder_prompt_type,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
@ -1,123 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import DecoderPromptType, HfRunner, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
def vllm_to_hf_output(
|
||||
vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
|
||||
decoder_prompt_type: DecoderPromptType,
|
||||
):
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
hf_output_str = output_str + "</s>"
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
prompts: list[dict[str, str]],
|
||||
decoder_prompt_type: DecoderPromptType,
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
) -> None:
|
||||
'''
|
||||
Test the vLLM mBART model by validating it against HuggingFace (HF).
|
||||
(Docstring content is omitted for brevity)
|
||||
'''
|
||||
|
||||
vllm_prompts = prompts
|
||||
if decoder_prompt_type == DecoderPromptType.NONE:
|
||||
vllm_prompts = [{
|
||||
"encoder_prompt": p['encoder_prompt'],
|
||||
"decoder_prompt": ""
|
||||
} for p in prompts]
|
||||
|
||||
vllm_kwargs = {
|
||||
"hf_overrides": {
|
||||
"architectures": ["MBartForConditionalGeneration"]
|
||||
}
|
||||
}
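# The same hf_overrides mapping can also be passed when constructing an
# engine directly; a hedged sketch mirroring the vllm_kwargs above:
#
#     from vllm import LLM
#     llm = LLM(model="facebook/mbart-large-en-ro",
#               hf_overrides={"architectures":
#                             ["MBartForConditionalGeneration"]})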
|
||||
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
**vllm_kwargs) as vllm_model: # type: ignore
|
||||
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
|
||||
vllm_prompts, max_tokens, num_logprobs)
|
||||
|
||||
hf_kwargs = {
|
||||
"top_k": None,
|
||||
"num_beams": 1,
|
||||
"repetition_penalty": 1.0,
|
||||
"top_p": 1.0,
|
||||
"length_penalty": 1.0,
|
||||
"early_stopping": False,
|
||||
"no_repeat_ngram_size": None,
|
||||
"min_length": 0
|
||||
}
|
||||
|
||||
with hf_runner(model, dtype=dtype,
|
||||
auto_cls=AutoModelForSeq2SeqLM) as hf_model:
|
||||
hf_kwargs["decoder_start_token_id"] = (
|
||||
hf_model.tokenizer.lang_code_to_id["ro_RO"])
|
||||
|
||||
hf_outputs = (
|
||||
hf_model.generate_encoder_decoder_greedy_logprobs_limit(
|
||||
prompts, # HF runner still uses the original prompts
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
**hf_kwargs,
|
||||
))
|
||||
|
||||
hf_skip_tokens = 0
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=[
|
||||
vllm_to_hf_output(vllm_output, decoder_prompt_type)
|
||||
for vllm_output in vllm_outputs
|
||||
],
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
num_outputs_0_skip_tokens=hf_skip_tokens,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[pytest.param("facebook/mbart-large-en-ro")],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
|
||||
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
|
||||
dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_encoder_decoder_prompts[decoder_prompt_type],
|
||||
decoder_prompt_type,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
@ -1,147 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.inputs.data import ExplicitEncoderDecoderPrompt, TextPrompt
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, ImageTestAssets, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = ["microsoft/Florence-2-base"]
|
||||
# Florence-2 model repo's tokenizer config is missing some special tokens.
|
||||
# Therefore, we use a converted tokenizer from a forked repo
|
||||
TOKENIZER = "Isotr0py/Florence-2-tokenizer"
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"<OD>", # special task token which will output special tokens
|
||||
"cherry_blossom":
|
||||
"Describe in detail what is shown in the image.",
|
||||
})
|
||||
|
||||
|
||||
def get_hf_images_prompts(
|
||||
prompts_: list[ExplicitEncoderDecoderPrompt[str, TextPrompt]],
|
||||
) -> tuple[list[ExplicitEncoderDecoderPrompt[str, str]], list[Image.Image]]:
|
||||
prompts, images = [], []
|
||||
for prompt in prompts_:
|
||||
encoder_prompt = prompt["encoder_prompt"]
|
||||
prompts.append(
|
||||
ExplicitEncoderDecoderPrompt(
|
||||
encoder_prompt=encoder_prompt["prompt"],
|
||||
decoder_prompt=None,
|
||||
))
|
||||
images.append(encoder_prompt["multi_modal_data"]["image"])
|
||||
return prompts, images
|
||||
|
||||
|
||||
def hf_to_vllm_output(hf_output: tuple[list[int], str,
|
||||
Optional[SampleLogprobs]]):
|
||||
"""Sanitize hf output to be comparable with vllm output."""
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
|
||||
output_str = output_str.replace("</s>", "").replace("<s>", "")
|
||||
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
inputs: list[list[ExplicitEncoderDecoderPrompt]],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
) -> None:
|
||||
with vllm_runner(model,
|
||||
max_num_seqs=8,
|
||||
tokenizer_name=TOKENIZER,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_encoder_decoder_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
skip_special_tokens=False,
|
||||
) for prompts in inputs
|
||||
]
|
||||
|
||||
hf_inputs = [get_hf_images_prompts(prompts) for prompts in inputs]
|
||||
|
||||
with hf_runner(model, dtype=dtype, skip_tokenizer_init=True) as hf_model:
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.lm_head
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_encoder_decoder_greedy_logprobs_limit(
|
||||
prompts, max_tokens, num_logprobs=num_logprobs, images=images)
|
||||
for prompts, images in hf_inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
|
||||
vllm_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=[hf_to_vllm_output(output) for output in hf_outputs],
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
num_outputs_0_skip_tokens=1,
|
||||
)
|
||||
|
||||
|
||||
# FIXME: https://github.com/huggingface/transformers/issues/38358
|
||||
@pytest.mark.skip("Model initialization fails")
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
[],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets, model: str,
|
||||
size_factors: list[int], dtype: str, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_image = [[
|
||||
ExplicitEncoderDecoderPrompt(
|
||||
encoder_prompt=TextPrompt(
|
||||
prompt=prompt,
|
||||
multi_modal_data={"image": rescale_image_size(image, factor)}),
|
||||
decoder_prompt=None,
|
||||
) for factor in size_factors
|
||||
] for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_per_image,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
@ -1,768 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional, overload
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from packaging.version import Version
|
||||
from transformers import AutoConfig, AutoModelForImageTextToText, AutoTokenizer
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.attention.backends.flash_attn import FlashAttentionMetadata
|
||||
from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
|
||||
global_force_attn_backend_context_manager)
|
||||
from vllm.model_executor.models.mllama import MllamaForConditionalGeneration
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, ImageTestAssets,
|
||||
PromptImageInput, VllmRunner)
|
||||
from ....quantization.utils import is_quant_method_supported
|
||||
from ....utils import (create_new_process_for_each_test, large_gpu_test,
|
||||
multi_gpu_test)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
_LIMIT_IMAGE_PER_PROMPT = 3
|
||||
MLLAMA_IMAGE_TOKEN_ID = 128256
|
||||
|
||||
LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN]
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"<|image|><|begin_of_text|>The meaning of the image is",
|
||||
"cherry_blossom":
|
||||
"<|image|><|begin_of_text|>The city is",
|
||||
})
|
||||
|
||||
text_only_prompts = [
|
||||
"The color of the sky is blue but sometimes it can also be",
|
||||
]
|
||||
|
||||
models = [
|
||||
"meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
]
|
||||
|
||||
# Indices for inputs
|
||||
TEXT_ONLY = '0'
|
||||
IMAGE_AT_BEG = '1'
|
||||
IMAGE_AT_MIDDLE = '2'
|
||||
TWO_IMAGES = '3'
|
||||
|
||||
# Input tokenized
|
||||
prompt_data = {
|
||||
# Tell me a story
|
||||
TEXT_ONLY: [41551, 757, 264, 3446],
|
||||
# <|image|> What's the content of this image
|
||||
IMAGE_AT_BEG:
|
||||
[MLLAMA_IMAGE_TOKEN_ID, 3639, 596, 279, 2262, 315, 420, 2217, 220],
|
||||
# Hello <|image|>What' the content of this image
|
||||
IMAGE_AT_MIDDLE:
|
||||
[9906, 220, MLLAMA_IMAGE_TOKEN_ID, 3923, 6, 279, 2262, 315, 420, 2217],
|
||||
#<|image|>Is there a duck in this image?<|image|>What's the animal in this image? # noqa: E501
|
||||
TWO_IMAGES: [
|
||||
MLLAMA_IMAGE_TOKEN_ID, 3957, 1070, 264, 37085, 304, 420, 2217, 30,
|
||||
MLLAMA_IMAGE_TOKEN_ID, 3923, 596, 279, 10065, 304, 420, 2217, 30
|
||||
]
|
||||
}
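# The hard-coded id lists above correspond to the commented prompt strings
# under the Llama-3.2 tokenizer; a sketch of how such lists can be
# regenerated (assuming access to the gated checkpoint):
#
#     from transformers import AutoTokenizer
#     tok = AutoTokenizer.from_pretrained(models[0])
#     ids = tok.encode("Tell me a story", add_special_tokens=False)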
|
||||
|
||||
|
||||
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
|
||||
Optional[SampleLogprobs]],
|
||||
model: str):
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
image_token_id = config.image_token_index
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
|
||||
]
|
||||
|
||||
hf_output_str = output_str
|
||||
if hf_output_ids[-1] == eos_token_id:
|
||||
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
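# Worked example: output_ids [128256, 128256, 128256, 791, ...] collapse to
# [128256, 791, ...], i.e. a run of image-token ids is reduced to a single
# occurrence, and the decoded EOS token is appended to the string when the
# generation ended with eos_token_id.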
|
||||
|
||||
|
||||
def _get_inputs(
|
||||
image_assets: ImageTestAssets,
|
||||
*,
|
||||
size_factors: Optional[list[float]] = None,
|
||||
sizes: Optional[list[tuple[int, int]]] = None,
|
||||
) -> list[tuple[list[str], PromptImageInput]]:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
if size_factors is not None:
|
||||
inputs_per_image = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
elif sizes is not None:
|
||||
inputs_per_image = [(
|
||||
[
|
||||
prompt if size is not None else text_only_prompts[0]
|
||||
for size in sizes
|
||||
],
|
||||
[
|
||||
image.resize(size) if size is not None else None
|
||||
for size in sizes
|
||||
],
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
if len(sizes) == 0:
|
||||
inputs_per_image.append(
|
||||
(text_only_prompts, [None] * len(text_only_prompts)))
|
||||
else:
|
||||
raise ValueError("You must provide either `size_factors` or `sizes`")
|
||||
|
||||
return inputs_per_image
|
||||
|
||||
|
||||
@overload
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
model: str,
|
||||
*,
|
||||
size_factors: list[float],
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
model: str,
|
||||
*,
|
||||
sizes: list[tuple[int, int]],
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
...
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
model: str,
|
||||
*,
|
||||
size_factors: Optional[list[float]] = None,
|
||||
sizes: Optional[list[tuple[int, int]]] = None,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
_run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
_get_inputs(image_assets, size_factors=size_factors, sizes=sizes),
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
|
||||
|
||||
def _run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
inputs: list[tuple[list[str], PromptImageInput]],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=19212, # 3 max size images
|
||||
max_num_seqs=3,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
limit_mm_per_prompt={"image":
|
||||
_LIMIT_IMAGE_PER_PROMPT}) as vllm_model:
|
||||
vllm_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs
|
||||
]
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
model_kwargs={"device_map": "auto"},
|
||||
auto_cls=AutoModelForImageTextToText) as hf_model:
|
||||
hf_outputs_per_image = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
||||
vllm_outputs_per_image):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=[
|
||||
vllm_to_hf_output(vllm_output, model)
|
||||
for vllm_output in vllm_outputs
|
||||
],
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def clear_cache():
|
||||
"""Fixture to clear backend cache before each test."""
|
||||
_cached_get_attn_backend.cache_clear() # Clear the cache
|
||||
yield # This allows the test to run
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"sizes",
|
||||
[
|
||||
# Text only
|
||||
[],
|
||||
# Single-size
|
||||
[(512, 512)],
|
||||
# Single-size, batched
|
||||
[(512, 512), (512, 512), (512, 512)],
|
||||
# Multi-size, batched
|
||||
[(512, 512), (1024, 512), (1536, 512), (2048, 512), (512, 1024),
|
||||
(1024, 1024), (512, 1536), (512, 2028)],
|
||||
# Multi-size, batched, including text only
|
||||
[(512, 512), (1024, 512), (1536, 512), (2048, 512), (512, 1024),
|
||||
(1024, 1024), (512, 1536), (512, 2028), None],
|
||||
# mllama has 8 possible aspect ratios, carefully set the sizes
|
||||
# to cover all of them
|
||||
])
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
|
||||
@pytest.mark.skipif(
|
||||
Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
|
||||
reason="Transformers v4.55 has a regression issue on mllama, "
|
||||
"see: https://github.com/huggingface/transformers/pull/40083")
|
||||
def test_models_single_leading_image(hf_runner, vllm_runner, image_assets,
|
||||
model, sizes, dtype, max_tokens,
|
||||
num_logprobs,
|
||||
attn_backend: _Backend) -> None:
|
||||
with global_force_attn_backend_context_manager(attn_backend):
|
||||
if attn_backend == _Backend.FLASH_ATTN:
|
||||
# Flash Attention works only with bfloat16 data-type
|
||||
dtype = 'bfloat16'
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
sizes=sizes,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
|
||||
@pytest.mark.skipif(
|
||||
Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
|
||||
reason="Transformers v4.55 has a regression issue on mllama, "
|
||||
"see: https://github.com/huggingface/transformers/pull/40083")
|
||||
def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets,
|
||||
model, dtype, max_tokens, num_logprobs,
|
||||
attn_backend: _Backend) -> None:
|
||||
|
||||
stop_sign = image_assets[0].pil_image
|
||||
cherry_blossom = image_assets[1].pil_image
|
||||
|
||||
inputs = [(
|
||||
[
|
||||
"<|image|><|image|><|begin_of_text|>Describe 2 images.", # noqa: E501
|
||||
"<|image|><|image|><|begin_of_text|>Describe 2 images.", # noqa: E501
|
||||
"<|image|><|image|><|image|><|begin_of_text|>Describe 3 images.", # noqa: E501
|
||||
],
|
||||
[
|
||||
[stop_sign, cherry_blossom],
|
||||
# Images with different sizes.
|
||||
[
|
||||
stop_sign.resize((512, 512)),
|
||||
stop_sign,
|
||||
],
|
||||
[
|
||||
stop_sign,
|
||||
stop_sign.resize((512, 1536)),
|
||||
cherry_blossom.resize((512, 1024)),
|
||||
],
|
||||
])]
|
||||
with global_force_attn_backend_context_manager(attn_backend):
|
||||
if attn_backend == _Backend.FLASH_ATTN:
|
||||
# Flash Attention works only with bfloat16 data-type
|
||||
dtype = 'bfloat16'
|
||||
_run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
|
||||
@pytest.mark.skipif(
|
||||
Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
|
||||
reason="Transformers v4.55 has a regression issue on mllama, "
|
||||
"see: https://github.com/huggingface/transformers/pull/40083")
|
||||
def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
|
||||
dtype, max_tokens, num_logprobs,
|
||||
attn_backend: _Backend) -> None:
|
||||
|
||||
stop_sign = image_assets[0].pil_image
|
||||
cherry_blossom = image_assets[1].pil_image
|
||||
|
||||
inputs = [(
|
||||
[
|
||||
"<|begin_of_text|>The content of the image <|image|> is", # noqa: E501
|
||||
"<|begin_of_text|>Between the first image <|image|> and the second image<|image|>, " # noqa: E501
|
||||
"which is a stop sign and which is a cherry blossom?", # noqa: E501
|
||||
],
|
||||
[
|
||||
[stop_sign],
|
||||
[stop_sign, cherry_blossom],
|
||||
])]
|
||||
with global_force_attn_backend_context_manager(attn_backend):
|
||||
if attn_backend == _Backend.FLASH_ATTN:
|
||||
# Flash Attention works only with bfloat16 data-type
|
||||
dtype = 'bfloat16'
|
||||
_run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.skipif(
|
||||
Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
|
||||
reason="Transformers v4.55 has a regression issue on mllama, "
|
||||
"see: https://github.com/huggingface/transformers/pull/40083")
|
||||
def test_models_distributed(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
distributed_executor_backend,
|
||||
model,
|
||||
dtype,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
) -> None:
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model=model,
|
||||
size_factors=[0.25, 0.5, 1.0],
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", ["float16"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
def test_bnb_regression(
|
||||
image_assets: ImageTestAssets,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
):
|
||||
stop_sign = image_assets[0].pil_image
|
||||
prompts = [
|
||||
{
|
||||
"prompt": "<|begin_of_text|>The content of the image <|image|> is",
|
||||
"multi_modal_data": {
|
||||
"image": stop_sign
|
||||
},
|
||||
},
|
||||
{
|
||||
"prompt":
|
||||
"The color of the sky is blue but sometimes it can also be",
|
||||
},
|
||||
]
|
||||
# Test regression about QKVCrossParallelLinear
|
||||
llm = LLM(
|
||||
model=model,
|
||||
dtype=dtype,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
quantization="bitsandbytes",
|
||||
)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=max_tokens,
|
||||
)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
assert outputs
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
def test_explicit_implicit_prompt(
|
||||
image_assets: ImageTestAssets,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
):
|
||||
stop_sign = image_assets[0].pil_image
|
||||
# yapf: disable
|
||||
prompts = [
|
||||
# explicit prompt
|
||||
{
|
||||
"encoder_prompt": {
|
||||
"prompt": "<|image|>",
|
||||
"multi_modal_data": {"image": stop_sign},
|
||||
},
|
||||
"decoder_prompt": {
|
||||
"prompt_token_ids": [128000, 791, 2262, 315, 279, 2217, 220, 128256, 374], # noqa: E501
|
||||
}
|
||||
},
|
||||
{
|
||||
"encoder_prompt": "Not <|image|>",
|
||||
"decoder_prompt": "The color of the sky is blue but sometimes it can also be", # noqa: E501
|
||||
},
|
||||
# implicit prompt
|
||||
{
|
||||
"prompt": "<|begin_of_text|>The content of the image <|image|> is", # noqa: E501
|
||||
"multi_modal_data": {"image": stop_sign},
|
||||
},
|
||||
{
|
||||
"prompt": "The color of the sky is blue but sometimes it can also be", # noqa: E501
|
||||
},
|
||||
]
|
||||
# yapf: enable
|
||||
llm = LLM(
|
||||
model=model,
|
||||
dtype=dtype,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=max_tokens,
|
||||
)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
n_prompts = len(prompts)
|
||||
explicit_outputs = outputs[:n_prompts // 2]
|
||||
implicit_outputs = outputs[n_prompts // 2:]
|
||||
for exp_output, imp_output in zip(explicit_outputs, implicit_outputs):
|
||||
assert exp_output.outputs[0].text == imp_output.outputs[0].text
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
|
||||
def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
|
||||
num_logprobs, attn_backend: _Backend) -> None:
|
||||
|
||||
stop_sign = image_assets[0].pil_image
|
||||
|
||||
with global_force_attn_backend_context_manager(attn_backend), vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=4,
|
||||
tensor_parallel_size=1,
|
||||
limit_mm_per_prompt={"image":
|
||||
_LIMIT_IMAGE_PER_PROMPT}) as vllm_model:
|
||||
|
||||
# Regression tests for https://github.com/vllm-project/vllm/issues/10648
|
||||
|
||||
# Number of groups of image tokens is greater than the number of images
|
||||
# provided (the whitespace between the tags is necessary)
|
||||
prompt = "<|begin_of_text|><|image|> <|image|> Compare the two images" # noqa: E501
|
||||
image = stop_sign
|
||||
with pytest.raises(ValueError):
|
||||
vllm_model.generate_greedy_logprobs([prompt],
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
images=[image])
|
||||
|
||||
# Batch of a text-only and image request that requires cross-attention
|
||||
prompts = [
|
||||
"What is the capital of spain?",
|
||||
"Text before the image...<|image|>What is in the image?", # noqa: E501
|
||||
]
|
||||
images = [
|
||||
None,
|
||||
[stop_sign],
|
||||
]
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
images=images)
|
||||
|
||||
# Test the reverse order too for good measure
|
||||
prompts = [
|
||||
"<|begin_of_text|>Text before the image...<|image|>What is in the image?", # noqa: E501
|
||||
"<|begin_of_text|>Hello!",
|
||||
]
|
||||
images = [
|
||||
[stop_sign],
|
||||
None,
|
||||
]
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
images=images)
|
||||
|
||||
# Mixed batch with text and images with different numbers of tiles
|
||||
prompts = [
|
||||
"<|begin_of_text|>Hello!",
|
||||
"<|begin_of_text|>Some text before.<|image|>What is in the image?", # noqa: E501
|
||||
"<|begin_of_text|>Some text before.<|image|>What is in the image?", # noqa: E501
|
||||
]
|
||||
images = [
|
||||
None,
|
||||
[stop_sign],
|
||||
# smaller image must be 2nd for the repro
|
||||
[stop_sign.resize((448, 448))],
|
||||
]
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
images=images)
|
||||
|
||||
|
||||
class DummyModel:
|
||||
image_token_id = MLLAMA_IMAGE_TOKEN_ID
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize(
|
||||
"input_indices_and_output",
|
||||
# inputs, (cross_attention_mask, kv_range_for_decode)
|
||||
[([TEXT_ONLY], (None, None)), ([IMAGE_AT_BEG], (None, None)),
|
||||
([TEXT_ONLY, IMAGE_AT_BEG], (None, None)),
|
||||
([IMAGE_AT_MIDDLE], ((10, 12), [[0, 6]])),
|
||||
([TEXT_ONLY, IMAGE_AT_MIDDLE], ((14, 12), [[0, 6]])),
|
||||
([TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE],
|
||||
((23, 24), [[0, 6], [6, 12]])),
|
||||
([IMAGE_AT_MIDDLE, TEXT_ONLY], ((14, 12), [[0, 6]])),
|
||||
([TWO_IMAGES], ((18, 12), [[6, 12]])),
|
||||
([TEXT_ONLY, TWO_IMAGES], ((22, 12), [[6, 12]]))])
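# Shape arithmetic for the cases above where a mask is expected: the row
# count is the total number of prompt tokens in the batch (e.g. 4 + 10 = 14
# for [TEXT_ONLY, IMAGE_AT_MIDDLE]) and the column count is the total tile
# count times tokens per tile (e.g. (2 + 2) * 3 = 12, using the num_tiles
# and num_tokens_per_tile values passed in the test body below).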
|
||||
def test_get_cross_attention_mask(input_indices_and_output) -> None:
|
||||
|
||||
input_indices, expected_output = input_indices_and_output
|
||||
|
||||
sequences = [torch.tensor(prompt_data[i]) for i in input_indices]
|
||||
num_tiles = [[2, 2] if i != TEXT_ONLY else [] for i in input_indices
|
||||
if i != TEXT_ONLY]
|
||||
input = torch.cat(sequences)
|
||||
|
||||
seq_lens = [len(s) for s in sequences]
|
||||
|
||||
attn_data = FlashAttentionMetadata(
|
||||
seq_lens=seq_lens,
|
||||
# Dummy values
|
||||
enable_kv_scales_calculation=False,
|
||||
num_prefills=0,
|
||||
num_prefill_tokens=0,
|
||||
num_decode_tokens=0,
|
||||
slot_mapping=0,
|
||||
multi_modal_placeholder_index_maps=None,
|
||||
seq_lens_tensor=0,
|
||||
max_prefill_seq_len=0,
|
||||
max_decode_seq_len=0,
|
||||
context_lens_tensor=None,
|
||||
block_tables=None,
|
||||
use_cuda_graph=False,
|
||||
)
|
||||
|
||||
dummy = DummyModel()
|
||||
|
||||
cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\
|
||||
.get_cross_attention_mask(dummy,
|
||||
input,
|
||||
attn_data,
|
||||
num_tiles=num_tiles,
|
||||
num_tokens_per_tile=3,
|
||||
dtype=torch.bfloat16)
|
||||
|
||||
expected_cross_attention_mask, expected_kv_range_for_decode = \
|
||||
expected_output
|
||||
|
||||
assert kv_range_for_decode == expected_kv_range_for_decode
|
||||
if expected_cross_attention_mask is not None:
|
||||
assert cross_attention_mask is not None
|
||||
assert cross_attention_mask.shape == expected_cross_attention_mask
|
||||
else:
|
||||
assert cross_attention_mask is None
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize(
|
||||
"input_indices",
|
||||
[[TEXT_ONLY], [IMAGE_AT_BEG], [TEXT_ONLY, IMAGE_AT_BEG], [IMAGE_AT_MIDDLE],
|
||||
[TEXT_ONLY, IMAGE_AT_MIDDLE], [TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE],
|
||||
[IMAGE_AT_MIDDLE, TEXT_ONLY], [TWO_IMAGES], [TEXT_ONLY, TWO_IMAGES]])
|
||||
def test_get_full_text_row_masked_out_mask(input_indices) -> None:
|
||||
|
||||
sequences = [torch.tensor(prompt_data[i]) for i in input_indices]
|
||||
|
||||
seq_lens = [len(s) for s in sequences]
|
||||
|
||||
num_prefill_tokens = sum(seq_lens)
|
||||
|
||||
# TEXT_ONLY is zero, so it will be masked out,
|
||||
# other instances should not be.
|
||||
encoder_seq_lens = [int(i) for i in input_indices]
|
||||
|
||||
attn_data = FlashAttentionMetadata(
|
||||
seq_lens=seq_lens,
|
||||
encoder_seq_lens=encoder_seq_lens,
|
||||
num_prefill_tokens=num_prefill_tokens,
|
||||
# Dummy values
|
||||
enable_kv_scales_calculation=False,
|
||||
num_prefills=0,
|
||||
num_decode_tokens=0,
|
||||
slot_mapping=0,
|
||||
multi_modal_placeholder_index_maps=None,
|
||||
seq_lens_tensor=0,
|
||||
max_prefill_seq_len=0,
|
||||
max_decode_seq_len=0,
|
||||
context_lens_tensor=None,
|
||||
block_tables=None,
|
||||
use_cuda_graph=False,
|
||||
)
|
||||
|
||||
dummy = DummyModel()
|
||||
|
||||
full_text_row_masked_out_mask = MllamaForConditionalGeneration\
|
||||
.get_full_text_row_masked_out_mask(dummy,
|
||||
attn_data,
|
||||
torch.get_default_device())
|
||||
|
||||
full_text_row_masked_out_mask = full_text_row_masked_out_mask.squeeze()
|
||||
full_text_row_masked_out_mask = full_text_row_masked_out_mask.tolist()
|
||||
|
||||
idx = 0
|
||||
assert len(full_text_row_masked_out_mask) == num_prefill_tokens
|
||||
for i, seq_len in enumerate(seq_lens):
|
||||
must_be_masked = input_indices[i] != TEXT_ONLY
|
||||
for _ in range(seq_len):
|
||||
assert full_text_row_masked_out_mask[idx] == must_be_masked, \
|
||||
f"full_text_row_masked_out_mask[{idx}] must be " \
|
||||
f"'{must_be_masked}' "
|
||||
idx += 1
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("encoder_seq_lens, num_tiles, expected", [
|
||||
([6404], [[4]], [6404]),
|
||||
([0, 6404], [[4]], [6404]),
|
||||
([0, 1601, 8005], [[1], [4, 1]], [1601, 8005]),
|
||||
([0, 19212, 0, 3202], [[4, 4, 4], [2]], [19212, 3202]),
|
||||
])
|
||||
def test_parse_and_validate_encoder_lens(encoder_seq_lens, num_tiles,
|
||||
expected) -> None:
|
||||
|
||||
dummy = DummyModel()
|
||||
num_tokens_per_tile = 1601
|
||||
actual_encoder_seq_lens = MllamaForConditionalGeneration \
|
||||
._get_and_validate_encoder_lens(
|
||||
dummy,
|
||||
encoder_seq_lens,
|
||||
num_tiles,
|
||||
num_tokens_per_tile,
|
||||
)
|
||||
assert actual_encoder_seq_lens == expected, \
|
||||
f"Expected {expected} but got {actual_encoder_seq_lens}"
|
@ -167,8 +167,6 @@ def _test_processing_correctness(
|
||||
# incorrect token ids. So we need to use `add_special_tokens=False` here
# to leave bos_token to be added by the processor.
|
||||
_ADD_SPECIAL_TOKENS_OVERRIDES = {
|
||||
"donut": False,
|
||||
"mllama": False,
|
||||
"ovis": False,
|
||||
"ovis2_5": False,
|
||||
"paligemma": False,
|
||||
@ -278,9 +276,7 @@ def _test_processing_correctness_one(
|
||||
"facebook/chameleon-7b",
|
||||
"CohereLabs/command-a-vision-07-2025",
|
||||
"deepseek-ai/deepseek-vl2-tiny",
|
||||
"naver-clova-ix/donut-base-finetuned-docvqa",
|
||||
"baidu/ERNIE-4.5-VL-28B-A3B-PT",
|
||||
"microsoft/Florence-2-base",
|
||||
"adept/fuyu-8b",
|
||||
"google/gemma-3-4b-it",
|
||||
"google/gemma-3n-E2B-it",
|
||||
@ -305,7 +301,6 @@ def _test_processing_correctness_one(
|
||||
"llava-hf/llava-v1.6-mistral-7b-hf",
|
||||
"llava-hf/LLaVA-NeXT-Video-7B-hf",
|
||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
||||
"meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
"TIGER-Lab/Mantis-8B-siglip-llama3",
|
||||
"mispeech/midashenglm-7b",
|
||||
"openbmb/MiniCPM-Llama3-V-2_5",
|
||||
|
@ -1,72 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for mllama's multimodal preprocessing and profiling."""
|
||||
import pytest
|
||||
from transformers import MllamaConfig
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.profiling import MultiModalProfiler
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id",
|
||||
["meta-llama/Llama-3.2-11B-Vision-Instruct"])
|
||||
@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072])
|
||||
@pytest.mark.parametrize("max_num_seqs", [1, 2, 8])
|
||||
def test_profiling(
|
||||
model_id: str,
|
||||
max_model_len: int,
|
||||
max_num_seqs: int,
|
||||
):
|
||||
# regression test for https://github.com/vllm-project/vllm/issues/13929
|
||||
from vllm.model_executor.models.mllama import calc_token_per_chunk
|
||||
|
||||
model_config_kwargs = {
|
||||
"max_model_len": max_model_len,
|
||||
}
|
||||
ctx = build_model_context(
|
||||
model_id,
|
||||
model_config_kwargs=model_config_kwargs,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
mm_config = ctx.get_mm_config()
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
profiler = MultiModalProfiler(processor)
|
||||
|
||||
dummy_encoder_data = profiler.get_encoder_dummy_data(
|
||||
max_model_len,
|
||||
mm_counts=mm_config.limit_per_prompt,
|
||||
)
|
||||
dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
|
||||
max_model_len,
|
||||
mm_counts=mm_config.limit_per_prompt,
|
||||
)
|
||||
|
||||
hf_config = ctx.get_hf_config(MllamaConfig)
|
||||
image_size = hf_config.vision_config.image_size
|
||||
encoder_seq_lens = [len(dummy_encoder_data.prompt_token_ids)
|
||||
] * max_num_seqs
|
||||
|
||||
mm_data = processor.apply(
|
||||
prompt=dummy_mm_data.prompt,
|
||||
mm_data=dummy_mm_data.mm_data,
|
||||
hf_processor_mm_kwargs=dict(),
|
||||
)["mm_kwargs"].get_data()
|
||||
|
||||
# Get the actual number of encoder tokens for each sample.
|
||||
# Because attn_metadata.encoder_seq_lens only counts the last
|
||||
# group of images for each sample, which is used to cheat the
|
||||
# block manager to allocate blocks for those images only.
|
||||
# See MllamaMultiModalProcessor for more details.
|
||||
num_tiles = [[t] for t in mm_data.pop("num_tiles")]
|
||||
num_tokens_per_tile = calc_token_per_chunk(image_size)
|
||||
actual_encoder_seq_lens = [
|
||||
sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles
|
||||
]
|
||||
|
||||
# simulate mllama image-present prefill.
|
||||
for actual_len, last_group_len in zip(actual_encoder_seq_lens,
|
||||
encoder_seq_lens):
|
||||
assert actual_len >= last_group_len
|
@ -31,7 +31,6 @@ from ...utils import dummy_hf_overrides
|
||||
|
||||
ARCH_TO_SKIP = {
|
||||
"MolmoForCausalLM": "incompatible requirements",
|
||||
"Florence2ForConditionalGeneration": "not supported in V1",
|
||||
}
|
||||
ARCH_NEEDS_EXTRAS = [
|
||||
"InternVLChatModel",
|
||||
|
@ -354,11 +354,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
|
||||
trust_remote_code=True),
|
||||
"Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"),
|
||||
# [Encoder-decoder]
|
||||
"BartModel": _HfExamplesInfo("facebook/bart-base"),
|
||||
"BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),
|
||||
"MBartForConditionalGeneration": _HfExamplesInfo("facebook/mbart-large-en-ro", # noqa: E501
|
||||
hf_overrides={"architectures": ["MBartForConditionalGeneration"]}), # noqa: E501
|
||||
}
|
||||
|
||||
_EMBEDDING_EXAMPLE_MODELS = {
|
||||
@ -583,15 +578,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
is_available_online=False,
|
||||
),
|
||||
# [Encoder-decoder]
|
||||
"DonutForConditionalGeneration": _HfExamplesInfo("naver-clova-ix/donut-base-finetuned-docvqa", # noqa: E501
|
||||
hf_overrides={"architectures": ["DonutForConditionalGeneration"], "model_type": "donut"}, # noqa: E501
|
||||
extras={"dolphin": "ByteDance/Dolphin"}), # noqa: E501
|
||||
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
|
||||
# Therefore, we borrow the BartTokenizer from the original Bart model
|
||||
"Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501
|
||||
tokenizer="Isotr0py/Florence-2-tokenizer", # noqa: E501
|
||||
trust_remote_code=True), # noqa: E501
|
||||
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
|
||||
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
|
||||
# [Cross-encoder]
|
||||
"JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"), # noqa: E501
|
||||
|
@ -92,10 +92,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
||||
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
|
||||
# L4 supports FA3.
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
|
||||
if model_arch == "Florence2ForConditionalGeneration":
|
||||
# An encoder-decoder model that's V0-only. Just skip it
|
||||
# since V0 is about to be removed.
|
||||
pytest.skip("Skipping Florence2ForConditionalGeneration")
|
||||
if model_arch == "WhisperForConditionalGeneration":
|
||||
m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
|
||||
LLM(
|
||||
|
@ -50,7 +50,6 @@ def test_registry_imports(model_arch):
|
||||
@create_new_process_for_each_test()
|
||||
@pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
|
||||
("LlamaForCausalLM", False, False, False),
|
||||
("MllamaForConditionalGeneration", True, False, False),
|
||||
("LlavaForConditionalGeneration", True, True, False),
|
||||
("BertForSequenceClassification", False, False, True),
|
||||
("RobertaForSequenceClassification", False, False, True),
|
||||
|
@ -299,9 +299,8 @@ def test_rope_customization():
|
||||
reason="Encoder Decoder models not supported on ROCm.")
|
||||
@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
|
||||
("facebook/opt-125m", False),
|
||||
("facebook/bart-base", True),
|
||||
("openai/whisper-tiny", True),
|
||||
("meta-llama/Llama-3.2-1B-Instruct", False),
|
||||
("meta-llama/Llama-3.2-11B-Vision", True),
|
||||
])
|
||||
def test_is_encoder_decoder(model_id, is_encoder_decoder):
|
||||
config = ModelConfig(model_id)
|
||||
|
@ -501,34 +501,6 @@ def test_bind_kv_cache_non_attention():
|
||||
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
|
||||
|
||||
|
||||
def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch):
|
||||
# V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
|
||||
from vllm.attention import Attention, AttentionType
|
||||
|
||||
# example from bart
|
||||
ctx = {
|
||||
'encoder.layers.0.self_attn.attn':
|
||||
Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
|
||||
'decoder.layers.0.encoder_attn.attn':
|
||||
Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
|
||||
'decoder.layers.0.self_attn.attn':
|
||||
Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
|
||||
}
|
||||
|
||||
kv_cache = [
|
||||
torch.zeros((1, )),
|
||||
]
|
||||
encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache
|
||||
|
||||
bind_kv_cache(ctx, [kv_cache])
|
||||
assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
|
||||
assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
|
||||
assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]
|
||||
|
||||
|
||||
def test_bind_kv_cache_pp():
|
||||
with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
|
||||
# this test runs with 1 GPU, but we simulate 2 GPUs
|
||||
|
@ -9,24 +9,9 @@ from vllm import LLM
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
|
||||
UNSUPPORTED_MODELS_V1 = [
|
||||
"facebook/bart-large-cnn", # encoder decoder
|
||||
]
|
||||
|
||||
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1)
|
||||
def test_reject_unsupported_models(monkeypatch, model):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
args = AsyncEngineArgs(model=model)
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
_ = args.create_engine_config()
|
||||
m.delenv("VLLM_USE_V1")
|
||||
|
||||
|
||||
def test_reject_bad_config(monkeypatch):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
@ -77,12 +62,6 @@ def test_enable_by_default_fallback(monkeypatch):
|
||||
assert envs.VLLM_USE_V1
|
||||
m.delenv("VLLM_USE_V1")
|
||||
|
||||
# Should fall back to V0 for a model that V1 does not support.
|
||||
_ = AsyncEngineArgs(
|
||||
model=UNSUPPORTED_MODELS_V1[0]).create_engine_config()
|
||||
assert not envs.VLLM_USE_V1
|
||||
m.delenv("VLLM_USE_V1")
|
||||
|
||||
|
||||
def test_v1_llm_by_default(monkeypatch):
|
||||
with monkeypatch.context() as m:
|
||||
|
@ -1,648 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import itertools
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
|
||||
from vllm.utils import make_tensor_with_pad
|
||||
from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
|
||||
|
||||
BATCH_SIZES = [1, 4, 16, 64, 256]
|
||||
|
||||
|
||||
def _create_model_runner(model: str, *args,
|
||||
**kwargs) -> EncoderDecoderModelRunner:
|
||||
engine_args = EngineArgs(model, *args, **kwargs)
|
||||
engine_config = engine_args.create_engine_config()
|
||||
model_runner = EncoderDecoderModelRunner(
|
||||
vllm_config=engine_config,
|
||||
is_driver_worker=True,
|
||||
)
|
||||
return model_runner
|
||||
|
||||
|
||||
@pytest.mark.skipif(condition=current_platform.is_cpu(),
|
||||
reason="CPU backend is currently "
|
||||
"unsupported for encoder/ "
|
||||
"decoder models")
|
||||
def test_empty_seq_group():
|
||||
"""Verify prepare prompt and decode returns empty output
|
||||
for empty seq group list"""
|
||||
|
||||
model_runner = _create_model_runner(
|
||||
"facebook/bart-base",
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
max_num_batched_tokens=100000,
|
||||
max_num_seqs=100000,
|
||||
enable_chunked_prefill=False,
|
||||
enforce_eager=True,
|
||||
)
|
||||
seq_group_metadata_list: list[SequenceGroupMetadata] = []
|
||||
model_input = model_runner._prepare_model_input_tensors(
|
||||
seq_group_metadata_list)
|
||||
(
|
||||
input_tokens,
|
||||
input_positions,
|
||||
encoder_input_tokens,
|
||||
encoder_input_positions,
|
||||
attn_metadata,
|
||||
return_seq_lens,
|
||||
) = (
|
||||
model_input.input_tokens,
|
||||
model_input.input_positions,
|
||||
model_input.encoder_input_tokens,
|
||||
model_input.encoder_input_positions,
|
||||
model_input.attn_metadata,
|
||||
model_input.seq_lens,
|
||||
)
|
||||
assert input_tokens is None
|
||||
assert input_positions is None
|
||||
assert encoder_input_tokens is None
|
||||
assert encoder_input_positions is None
|
||||
assert attn_metadata is None
|
||||
assert return_seq_lens is None
|
||||
|
||||
|
||||
@pytest.mark.skipif(condition=current_platform.is_cpu(),
|
||||
reason="CPU backend is currently "
|
||||
"unsupported for encoder/ "
|
||||
"decoder models")
|
||||
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
|
||||
def test_prepare_prompt(batch_size):
|
||||
'''
|
||||
Test the ability of the encoder/decoder model runner subclass to
|
||||
produce prefill-phase model inputs & attention metadata.
|
||||
|
||||
Test behavior:
|
||||
|
||||
* Instantiate BART base model & enc/dec model runner
|
||||
* Construct sequence-group metadata for dummy prompts
|
||||
* Test that encoder attention, decoder self-attention,
|
||||
and encoder/decoder cross-attention inputs are correct
|
||||
|
||||
Arguments:
|
||||
|
||||
* batch_size
|
||||
* backend_name: The attention backend under test
|
||||
* enforce_eager: Enforce eager mode if True (i.e. no CUDAGraph)
|
||||
'''
|
||||
|
||||
model_runner = _create_model_runner(
|
||||
"facebook/bart-base",
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
max_num_batched_tokens=100000,
|
||||
max_num_seqs=100000,
|
||||
enable_chunked_prefill=False,
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
seq_lens: list[int] = []
|
||||
encoder_seq_lens: list[int] = []
|
||||
seq_group_metadata_list: list[SequenceGroupMetadata] = []
|
||||
block_tables = {0: [1]}
|
||||
cross_block_table = [2]
|
||||
for i in range(batch_size):
|
||||
# make sure all tokens fit into one block
|
||||
seq_len = i % (model_runner.block_size - 1) + 1
|
||||
seq_lens.append(seq_len)
|
||||
seq_data = SequenceData.from_seqs(range(seq_len))
|
||||
encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1
|
||||
encoder_seq_lens.append(encoder_seq_len)
|
||||
encoder_seq_data = SequenceData.from_seqs(range(encoder_seq_len))
|
||||
seq_group_metadata = SequenceGroupMetadata(
|
||||
request_id=f"test_{i}",
|
||||
is_prompt=True,
|
||||
seq_data={0: seq_data},
|
||||
sampling_params=SamplingParams(temperature=0),
|
||||
block_tables=block_tables,
|
||||
encoder_seq_data=encoder_seq_data,
|
||||
cross_block_table=cross_block_table,
|
||||
)
|
||||
assert seq_group_metadata.token_chunk_size == seq_data.get_len()
|
||||
seq_group_metadata_list.append(seq_group_metadata)
|
||||
|
||||
# Build
|
||||
# * Decoder model inputs
|
||||
# * Decoder self-attention KV caching data structures
|
||||
# * Encoder model inputs
|
||||
# * Encoder/decoder cross-attention KV caching data structures
|
||||
model_input = model_runner.prepare_model_input(seq_group_metadata_list)
|
||||
|
||||
input_tokens = model_input.input_tokens
|
||||
input_positions = model_input.input_positions
|
||||
attn_metadata = model_input.attn_metadata
|
||||
return_seq_lens = model_input.seq_lens
|
||||
slot_mapping = attn_metadata.slot_mapping
|
||||
encoder_input_tokens = model_input.encoder_input_tokens
|
||||
encoder_input_positions = model_input.encoder_input_positions
|
||||
cross_slot_mapping = attn_metadata.cross_slot_mapping
|
||||
assert return_seq_lens == seq_lens
|
||||
assert len(slot_mapping) == len(input_tokens)
|
||||
assert len(cross_slot_mapping) == len(encoder_input_tokens)
|
||||
|
||||
# Verify input metadata is correct for prompts.
|
||||
# - Decoder attention metadata
|
||||
device = model_runner.device
|
||||
assert attn_metadata.num_prefills > 0
|
||||
assert attn_metadata.num_decode_tokens == 0
|
||||
assert torch.equal(attn_metadata.seq_lens_tensor,
|
||||
torch.tensor(seq_lens, device=device, dtype=torch.int))
|
||||
assert attn_metadata.seq_lens == seq_lens
|
||||
assert attn_metadata.max_prefill_seq_len == max(seq_lens)
|
||||
assert attn_metadata.max_decode_seq_len == 0
|
||||
# - Encoder attention metadata
|
||||
assert attn_metadata.encoder_seq_lens == encoder_seq_lens
|
||||
assert torch.equal(
|
||||
attn_metadata.encoder_seq_lens_tensor,
|
||||
torch.tensor(encoder_seq_lens, device=device, dtype=torch.int))
|
||||
assert attn_metadata.max_encoder_seq_len == max(encoder_seq_lens)
|
||||
assert attn_metadata.num_encoder_tokens == sum(encoder_seq_lens)
|
||||
|
||||
# Test decoder subquery start locs.
|
||||
start_idx = 0
|
||||
start_loc = [start_idx]
|
||||
for seq_len in seq_lens:
|
||||
start_idx += seq_len
|
||||
start_loc.append(start_idx)
|
||||
assert torch.equal(
|
||||
attn_metadata.query_start_loc,
|
||||
torch.tensor(start_loc, dtype=torch.int32, device=device),
|
||||
)
|
||||
|
||||
# Test decoder seq start locs & context lengths
|
||||
|
||||
assert torch.equal(
|
||||
attn_metadata.seq_start_loc,
|
||||
torch.tensor(start_loc, dtype=torch.int32, device=device),
|
||||
)
|
||||
assert torch.equal(
|
||||
attn_metadata.context_lens_tensor,
|
||||
torch.zeros(attn_metadata.context_lens_tensor.shape[0],
|
||||
dtype=torch.int,
|
||||
device=device),
|
||||
)
|
||||
|
||||
# Verify block tables are correct for prompts
|
||||
# - Decoder self-attention
|
||||
expected = torch.tensor(
|
||||
[[] for _ in range(len(seq_group_metadata_list))],
|
||||
dtype=torch.int32,
|
||||
device=model_runner.device,
|
||||
)
|
||||
assert torch.equal(
|
||||
attn_metadata.block_tables,
|
||||
expected,
|
||||
)
|
||||
# - Encoder/decoder cross-attention
|
||||
assert torch.equal(
|
||||
attn_metadata.cross_block_tables,
|
||||
expected,
|
||||
)
|
||||
|
||||
# Cuda graph should not be used for prefill.
|
||||
assert attn_metadata.use_cuda_graph is False
|
||||
|
||||
# Verify the lengths of input tokens & positions
|
||||
# - Decoder
|
||||
assert len(input_tokens) == sum(seq_lens)
|
||||
assert len(input_positions) == sum(seq_lens)
|
||||
# -- An indirect check that model_input.input_tokens
|
||||
# and model_input.input_positions are correct -
|
||||
# by design of the test, the input tokens are
|
||||
# equal to the input position values, so if
|
||||
# the model_input data structure has the correct
|
||||
# values then these two should be equal
|
||||
assert torch.equal(
|
||||
input_tokens,
|
||||
input_positions,
|
||||
)
|
||||
# - Encoder
|
||||
assert len(encoder_input_tokens) == sum(encoder_seq_lens)
|
||||
# -- An indirect check that model_input.encoder_input_tokens
|
||||
# and model_input.encoder_input_positions are correct -
|
||||
# by design of the test, the input tokens are
|
||||
# equal to the input position values, so if
|
||||
# the model_input data structure has the correct
|
||||
# values then these two should be equal
|
||||
assert torch.equal(
|
||||
encoder_input_tokens,
|
||||
encoder_input_positions,
|
||||
)
|
||||
|
||||
# Test that vLLM sampling infrastructure chooses the correct
|
||||
# sequence positions at which to sample (i.e. the end of
|
||||
# each sequence) in the prefill phase
|
||||
|
||||
expected_selected_token_indices = []
|
||||
selected_token_start_idx = 0
|
||||
for seq_len in seq_lens:
|
||||
# Compute the index offset of the final token in each
|
||||
# prompt (recall that the prompts are concatenated)
|
||||
expected_selected_token_indices.append(selected_token_start_idx +
|
||||
seq_len - 1)
|
||||
selected_token_start_idx += seq_len
|
||||
|
||||
sampling_metadata = model_input.sampling_metadata
|
||||
actual = sampling_metadata.selected_token_indices
|
||||
expected = torch.tensor(
|
||||
expected_selected_token_indices,
|
||||
device=actual.device,
|
||||
dtype=actual.dtype,
|
||||
)
|
||||
assert torch.equal(actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.skipif(condition=current_platform.is_cpu(),
|
||||
reason="CPU backend is currently "
|
||||
"unsupported for encoder/ "
|
||||
"decoder models")
|
||||
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
|
||||
@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False])
|
||||
def test_prepare_decode(batch_size, multiple_seqs_per_seq_group):
|
||||
'''
|
||||
Test the ability of the encoder/decoder model runner subclass to
|
||||
produce decode-phase model inputs & attention metadata.
|
||||
|
||||
Test behavior:
|
||||
|
||||
* Instantiate BART base model & enc/dec model runner
|
||||
* Construct sequence-group metadata for dummy prompts
|
||||
* Test that encoder attention, decoder self-attention,
|
||||
and encoder/decoder cross-attention inputs are correct
|
||||
|
||||
Arguments:
|
||||
|
||||
* batch_size
|
||||
* multiple_seqs_per_seq_group
|
||||
* backend_name: The attention backend under test
|
||||
* enforce_eager: Enforce eager mode if True (i.e. no CUDAGraph)
|
||||
'''
|
||||
|
||||
model_runner = _create_model_runner(
|
||||
"facebook/bart-base",
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
max_num_batched_tokens=100000,
|
||||
max_num_seqs=100000,
|
||||
enable_chunked_prefill=False,
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
seq_lens: list[int] = []
|
||||
encoder_seq_lens: list[int] = []
|
||||
seq_group_metadata_list: list[SequenceGroupMetadata] = []
|
||||
block_tables = {
|
||||
0: [1],
|
||||
1: [3]
|
||||
} if multiple_seqs_per_seq_group else {
|
||||
0: [1]
|
||||
}
|
||||
cross_block_table = [2]
|
||||
for i in range(batch_size):
|
||||
# make sure all tokens fit into one block
|
||||
seq_len = i % (model_runner.block_size - 1) + 1
|
||||
seq_data = SequenceData.from_seqs(range(seq_len))
|
||||
encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1
|
||||
encoder_seq_data = SequenceData.from_seqs(range(encoder_seq_len))
|
||||
|
||||
seq_group_metadata = SequenceGroupMetadata(
|
||||
request_id=f"test_{i}",
|
||||
is_prompt=False,
|
||||
seq_data={
|
||||
0: seq_data,
|
||||
1: seq_data
|
||||
} if multiple_seqs_per_seq_group else {0: seq_data},
|
||||
sampling_params=SamplingParams(temperature=0),
|
||||
block_tables=block_tables,
|
||||
encoder_seq_data=encoder_seq_data,
|
||||
cross_block_table=cross_block_table,
|
||||
)
|
||||
assert seq_group_metadata.token_chunk_size == 1
|
||||
seq_group_metadata_list.append(seq_group_metadata)
|
||||
seq_lens.extend(
|
||||
[seq_len for _ in range(len(seq_group_metadata.seq_data))])
|
||||
encoder_seq_lens.extend(
|
||||
[encoder_seq_len for _ in range(len(seq_group_metadata.seq_data))])
|
||||
|
||||
# Build
|
||||
# * Decoder model inputs
|
||||
# * Decoder self-attention KV caching data structures
|
||||
# * Encoder model inputs
|
||||
# * Encoder/decoder cross-attention KV caching data structures
|
||||
model_input = model_runner.prepare_model_input(seq_group_metadata_list)
|
||||
input_tokens = model_input.input_tokens
|
||||
input_positions = model_input.input_positions
|
||||
attn_metadata = model_input.attn_metadata
|
||||
return_seq_lens = model_input.seq_lens
|
||||
slot_mapping = attn_metadata.slot_mapping
|
||||
encoder_input_tokens = model_input.encoder_input_tokens
|
||||
encoder_input_positions = model_input.encoder_input_positions
|
||||
cross_slot_mapping = attn_metadata.cross_slot_mapping
|
||||
assert return_seq_lens == seq_lens
|
||||
assert len(slot_mapping) == len(input_tokens)
|
||||
assert len(cross_slot_mapping) == len(encoder_input_tokens)
|
||||
|
||||
# Verify input metadata is correct for decode phase.
|
||||
# - Decoder attention metadata
|
||||
device = model_runner.device
|
||||
assert attn_metadata.num_prefills == 0
|
||||
assert attn_metadata.num_decode_tokens > 0
|
||||
assert torch.equal(attn_metadata.seq_lens_tensor,
|
||||
torch.tensor(seq_lens, device=device, dtype=torch.int))
|
||||
assert attn_metadata.seq_lens == seq_lens
|
||||
assert attn_metadata.max_prefill_seq_len == 0
|
||||
assert attn_metadata.max_decode_seq_len == max(seq_lens)
|
||||
# - Encoder attention metadata
|
||||
assert attn_metadata.encoder_seq_lens == encoder_seq_lens
|
||||
assert torch.equal(
|
||||
attn_metadata.encoder_seq_lens_tensor,
|
||||
torch.tensor(encoder_seq_lens, device=device, dtype=torch.int))
|
||||
assert attn_metadata.max_encoder_seq_len == max(encoder_seq_lens)
|
||||
assert attn_metadata.num_encoder_tokens == sum(encoder_seq_lens)
|
||||
|
||||
# Test decoder subquery start locs.
|
||||
start_idx = 0
|
||||
start_loc = [start_idx]
|
||||
for seq_len in seq_lens:
|
||||
start_idx += 1
|
||||
start_loc.append(start_idx)
|
||||
assert torch.equal(
|
||||
attn_metadata.query_start_loc,
|
||||
torch.tensor(start_loc, dtype=torch.int32, device=device),
|
||||
)
|
||||
|
||||
# Test decoder seq start locs. Note that for normal prefill it is
|
||||
# equivalent to query_start_loc.
|
||||
start_idx = 0
|
||||
seq_start_loc = [start_idx]
|
||||
for seq_len in seq_lens:
|
||||
start_idx += seq_len
|
||||
seq_start_loc.append(start_idx)
|
||||
|
||||
# Test seq_start_loc and context lengths
|
||||
|
||||
assert torch.equal(
|
||||
attn_metadata.seq_start_loc,
|
||||
torch.tensor(seq_start_loc, dtype=torch.int32, device=device),
|
||||
)
|
||||
assert torch.equal(
|
||||
attn_metadata.context_lens_tensor,
|
||||
torch.tensor([seq_len - 1 for seq_len in seq_lens],
|
||||
dtype=torch.int,
|
||||
device=device))
|
||||
|
||||
# Verify block tables are correct for prompts
|
||||
# - Decoder self-attention
|
||||
flattened_block_tables = [
|
||||
block_table for block_table in block_tables.values()
|
||||
]
|
||||
expected = torch.tensor(flattened_block_tables *
|
||||
len(seq_group_metadata_list),
|
||||
dtype=torch.int32,
|
||||
device=model_runner.device)
|
||||
assert torch.equal(
|
||||
attn_metadata.block_tables,
|
||||
expected,
|
||||
)
|
||||
# - Encoder/decoder cross-attention
|
||||
expected = torch.tensor([
|
||||
cross_block_table for seq_group_metadata in seq_group_metadata_list
|
||||
for _ in range(len(seq_group_metadata.seq_data))
|
||||
],
|
||||
dtype=torch.int32,
|
||||
device=model_runner.device)
|
||||
assert torch.equal(
|
||||
attn_metadata.cross_block_tables,
|
||||
expected,
|
||||
)
|
||||
|
||||
# Model runner's CUDAGraph setting should be propagated to attention
|
||||
# metadata.
|
||||
assert attn_metadata.use_cuda_graph is False
|
||||
|
||||
# Verify the lengths of input tokens & positions
|
||||
# - Decoder
|
||||
assert len(input_tokens) == len(seq_lens)
|
||||
assert len(input_positions) == len(seq_lens)
|
||||
# -- An indirect check that model_input.input_tokens
|
||||
# and model_input.input_positions are correct -
|
||||
# by design of the test, the input tokens are
|
||||
# equal to the input position values, so if
|
||||
# the model_input data structure has the correct
|
||||
# values then these two should be equal
|
||||
assert torch.equal(
|
||||
input_tokens,
|
||||
input_positions,
|
||||
)
|
||||
# - Encoder
|
||||
assert len(encoder_input_tokens) == 0
|
||||
assert len(encoder_input_tokens) == 0
|
||||
# -- An indirect check that model_input.encoder_input_tokens
|
||||
# and model_input.encoder_input_positions are correct -
|
||||
# by design of the test, the input tokens are
|
||||
# equal to the input position values, so if
|
||||
# the model_input data structure has the correct
|
||||
# values then these two should be equal
|
||||
assert torch.equal(
|
||||
encoder_input_tokens,
|
||||
encoder_input_positions,
|
||||
)
|
||||
|
||||
# Test that vLLM sampling infrastructure chooses the correct
|
||||
# sequence positions at which to sample (i.e. the end of
|
||||
# each sequence) in the decode phase
|
||||
|
||||
expected_selected_token_indices = []
|
||||
for selected_token_start_idx, seq_len in enumerate(seq_lens):
|
||||
# Compute the index offset of the final token in each
|
||||
# sequence's decoded outputs; since a single token is
|
||||
# decoded per iteration per sequence, then the length
|
||||
# of the decoded tokens for a given sequence is 1 and
|
||||
# the final index offset into a given sequence's
|
||||
# generated tokens is 0 (i.e. the expected sampling index
|
||||
# for a given sequence is just `selected_token_start_idx`)
|
||||
expected_selected_token_indices.append(selected_token_start_idx)
|
||||
|
||||
sampling_metadata = model_input.sampling_metadata
|
||||
actual = sampling_metadata.selected_token_indices
|
||||
expected = torch.tensor(
|
||||
expected_selected_token_indices,
|
||||
device=actual.device,
|
||||
dtype=actual.dtype,
|
||||
)
|
||||
assert torch.equal(actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batch_size", list(range(1, 257)))
|
||||
@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False])
|
||||
def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group):
|
||||
"""
|
||||
Tests that for encoder-decoder models with CUDA Graph capture and replay
|
||||
enabled, the tensors used during the decode phase are correctly padded
|
||||
for varying input batch sizes.
|
||||
"""
|
||||
model_runner = _create_model_runner(
|
||||
"facebook/bart-base",
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
max_num_batched_tokens=100000,
|
||||
max_num_seqs=100000,
|
||||
enable_chunked_prefill=False,
|
||||
enforce_eager=False,
|
||||
)
|
||||
block_tables = {
|
||||
0: [1],
|
||||
1: [3]
|
||||
} if multiple_seqs_per_seq_group else {
|
||||
0: [1]
|
||||
}
|
||||
seq_lens: list[int] = []
|
||||
encoder_seq_lens: list[int] = []
|
||||
seq_group_metadata_list: list[SequenceGroupMetadata] = []
|
||||
|
||||
cross_block_table = [2]
|
||||
expanded_batch_size = 0
|
||||
for i in range(batch_size):
|
||||
# make sure all tokens fit into one block
|
||||
seq_len = i % (model_runner.block_size - 1) + 1
|
||||
seq_data = SequenceData.from_seqs(range(seq_len))
|
||||
encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1
|
||||
encoder_seq_data = SequenceData.from_seqs(range(encoder_seq_len))
|
||||
seq_group_metadata = SequenceGroupMetadata(
|
||||
request_id=f"test_{i}",
|
||||
is_prompt=False,
|
||||
seq_data={
|
||||
0: seq_data,
|
||||
1: seq_data
|
||||
} if multiple_seqs_per_seq_group else {0: seq_data},
|
||||
sampling_params=SamplingParams(temperature=0),
|
||||
block_tables=block_tables,
|
||||
encoder_seq_data=encoder_seq_data,
|
||||
cross_block_table=cross_block_table,
|
||||
)
|
||||
assert seq_group_metadata.token_chunk_size == 1
|
||||
seq_lens.extend(
|
||||
[seq_len for _ in range(len(seq_group_metadata.seq_data))])
|
||||
encoder_seq_lens.extend(
|
||||
[encoder_seq_len for _ in range(len(seq_group_metadata.seq_data))])
|
||||
expanded_batch_size = expanded_batch_size + len(
|
||||
seq_group_metadata.seq_data)
|
||||
seq_group_metadata_list.append(seq_group_metadata)
|
||||
|
||||
model_input = model_runner.prepare_model_input(seq_group_metadata_list)
|
||||
input_tokens = model_input.input_tokens
|
||||
input_positions = model_input.input_positions
|
||||
attn_metadata = model_input.attn_metadata
|
||||
return_seq_lens = model_input.seq_lens
|
||||
slot_mapping = attn_metadata.slot_mapping
|
||||
encoder_input_tokens = model_input.encoder_input_tokens
|
||||
encoder_input_positions = model_input.encoder_input_positions
|
||||
cross_slot_mapping = attn_metadata.cross_slot_mapping
|
||||
|
||||
# With CUDA Graph capture and replay enabled, the decoder and encoder
|
||||
# input sequences will be padded. Create the expected padded tensors
|
||||
# accordingly.
|
||||
graph_batch_size = model_runner.vllm_config.pad_for_cudagraph(
|
||||
expanded_batch_size)
|
||||
cuda_graph_pad_size = graph_batch_size - expanded_batch_size
|
||||
padded_seq_lens = seq_lens + list(itertools.repeat(1, cuda_graph_pad_size))
|
||||
padded_encoder_seq_lens = encoder_seq_lens + list(
|
||||
itertools.repeat(1, cuda_graph_pad_size))
|
||||
|
||||
assert return_seq_lens == padded_seq_lens
|
||||
assert len(slot_mapping) == len(input_tokens)
|
||||
assert len(cross_slot_mapping) == len(encoder_input_tokens)
|
||||
|
||||
# Verify attention metadata
|
||||
device = model_runner.device
|
||||
assert attn_metadata.num_prefills == 0
|
||||
assert attn_metadata.num_decode_tokens > 0
|
||||
assert torch.equal(
|
||||
attn_metadata.seq_lens_tensor,
|
||||
torch.tensor(padded_seq_lens, device=device, dtype=torch.int))
|
||||
assert attn_metadata.seq_lens == padded_seq_lens
|
||||
assert attn_metadata.max_prefill_seq_len == 0
|
||||
assert attn_metadata.max_decode_seq_len == max(seq_lens)
|
||||
# - Encoder attention metadata
|
||||
assert attn_metadata.encoder_seq_lens == padded_encoder_seq_lens
|
||||
assert torch.equal(
|
||||
attn_metadata.encoder_seq_lens_tensor,
|
||||
torch.tensor(padded_encoder_seq_lens, device=device, dtype=torch.int))
|
||||
assert attn_metadata.max_encoder_seq_len == max(padded_encoder_seq_lens)
|
||||
assert attn_metadata.num_encoder_tokens == sum(padded_encoder_seq_lens)
|
||||
|
||||
# Verify block tables are correct for prompts
|
||||
# - Decoder self-attention. Pad the block tables as expected.
|
||||
flattened_block_tables = [
|
||||
block_table for _ in range(len(seq_group_metadata_list))
|
||||
for block_table in block_tables.values()
|
||||
]
|
||||
flattened_block_tables.extend([[] for _ in range(cuda_graph_pad_size)])
|
||||
expected = make_tensor_with_pad(
|
||||
flattened_block_tables,
|
||||
max_len=64,
|
||||
pad=0,
|
||||
dtype=torch.int32,
|
||||
device=model_runner.device,
|
||||
)
|
||||
assert torch.equal(
|
||||
attn_metadata.block_tables,
|
||||
expected,
|
||||
)
|
||||
# - Encoder/decoder cross-attention. Pad the cross-attention block tables
|
||||
# as expected.
|
||||
expected = [
|
||||
cross_block_table for seq_group_metadata in seq_group_metadata_list
|
||||
for _ in range(len(seq_group_metadata.seq_data))
|
||||
]
|
||||
expected.extend([[] for _ in range(cuda_graph_pad_size)])
|
||||
expected = make_tensor_with_pad(
|
||||
expected,
|
||||
max_len=64,
|
||||
pad=0,
|
||||
dtype=torch.int32,
|
||||
device=model_runner.device,
|
||||
)
|
||||
assert torch.equal(
|
||||
attn_metadata.cross_block_tables,
|
||||
expected,
|
||||
)
|
||||
|
||||
# Model runner's CUDAGraph setting should be propagated to attention
|
||||
# metadata.
|
||||
assert attn_metadata.use_cuda_graph is True
|
||||
|
||||
# Verify the lengths of input tokens & positions
|
||||
# - Decoder
|
||||
assert len(input_tokens) == len(padded_seq_lens)
|
||||
assert len(input_positions) == len(padded_seq_lens)
|
||||
# -- An indirect check that model_input.input_tokens
|
||||
# and model_input.input_positions are correct -
|
||||
# by design of the test, the input tokens are
|
||||
# equal to the input position values, so if
|
||||
# the model_input data structure has the correct
|
||||
# values then these two should be equal
|
||||
assert torch.equal(
|
||||
input_tokens,
|
||||
input_positions,
|
||||
)
|
||||
# - Encoder
|
||||
assert len(encoder_input_tokens) == 0
|
||||
assert len(encoder_input_tokens) == 0
|
||||
# -- An indirect check that model_input.encoder_input_tokens
|
||||
# and model_input.encoder_input_positions are correct -
|
||||
# by design of the test, the input tokens are
|
||||
# equal to the input position values, so if
|
||||
# the model_input data structure has the correct
|
||||
# values then these two should be equal
|
||||
assert torch.equal(
|
||||
encoder_input_tokens,
|
||||
encoder_input_positions,
|
||||
)
|
@ -1201,11 +1201,8 @@ class ModelConfig:
                getattr(self.hf_config, "max_source_positions", 0))
            self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                              effective_max_seq_len)
        # CUDAGraph capture not supported for enc-dec models and mllama on ROCm
        ROCM_UNSUPPORTED_MODELS = ['mllama']
        unsupported_rocm = (self.hf_config.model_type
                            in ROCM_UNSUPPORTED_MODELS
                            or self.is_encoder_decoder)
        # CUDAGraph capture not supported for encoder-decoder models on ROCm
        unsupported_rocm = self.is_encoder_decoder

        if (unsupported_rocm and not self.enforce_eager
                and current_platform.is_rocm()):

@ -1671,10 +1668,6 @@ class ModelConfig:
    @property
    def is_encoder_decoder(self) -> bool:
        """Extract the HF encoder/decoder model flag."""
        """
        For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to
        True to enable cross-attention
        """
        return is_encoder_decoder(self.hf_config)

    @property

@ -1789,7 +1789,7 @@ class LLMEngine:
        assert isinstance(mm_processor, EncDecMultiModalProcessor)

        if mm_processor.pad_dummy_encoder_prompt:
            return  # Skip encoder length check for Whisper and Donut
            return  # Skip encoder length check for Whisper

        if model_config.is_multimodal_model:
            suggestion = (
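For context, the simplified `is_encoder_decoder` property above just forwards the Hugging Face config flag. A minimal sketch of that check (illustrative only; it assumes a `transformers`-style config object, not vLLM internals):

from transformers import AutoConfig

def is_encoder_decoder(hf_config) -> bool:
    # HF configs expose an `is_encoder_decoder` flag; treat a missing
    # attribute as decoder-only. Sketch of the behavior the property relies on.
    return getattr(hf_config, "is_encoder_decoder", False)

# e.g. BART reports True, OPT reports False.
assert is_encoder_decoder(AutoConfig.from_pretrained("facebook/bart-base"))
assert not is_encoder_decoder(AutoConfig.from_pretrained("facebook/opt-125m"))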
File diff suppressed because it is too large
@ -1,381 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Annotated, Literal, Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import BatchFeature, NougatProcessor
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.model_executor.models.bart import BartParallelLMHead, MBartDecoder
|
||||
from vllm.model_executor.models.interfaces import (MultiModalEmbeddings,
|
||||
SupportsMultiModal,
|
||||
SupportsV0Only)
|
||||
from vllm.model_executor.models.swin import SwinModel
|
||||
from vllm.model_executor.models.utils import (AutoWeightsLoader,
|
||||
_flatten_embeddings, flatten_bn)
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||
MultiModalKwargsItems)
|
||||
from vllm.multimodal.parse import MultiModalDataItems
|
||||
from vllm.multimodal.processing import (BaseProcessingInfo,
|
||||
EncDecMultiModalProcessor,
|
||||
PromptIndexTargets, PromptInsertion,
|
||||
PromptUpdate)
|
||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
|
||||
class MBartDecoderWrapper(nn.Module):
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
config = vllm_config.model_config.hf_config
|
||||
cache_config = vllm_config.cache_config
|
||||
quant_config = vllm_config.quant_config
|
||||
|
||||
self.decoder = MBartDecoder(config,
|
||||
cache_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.decoder")
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
return self.decoder(*args, **kwargs)
|
||||
|
||||
|
||||
class DonutLanguageForConditionalGeneration(nn.Module, SupportsV0Only):
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
|
||||
config = vllm_config.model_config.hf_config
|
||||
|
||||
self.config = config
|
||||
self.model = MBartDecoderWrapper(vllm_config=vllm_config,
|
||||
prefix=f"{prefix}.model")
|
||||
embed_scale = math.sqrt(
|
||||
config.d_model) if config.scale_embedding else 1.0
|
||||
|
||||
self.vocab_size = config.vocab_size
|
||||
self.lm_head = BartParallelLMHead(self.vocab_size,
|
||||
config.d_model,
|
||||
embed_scale=embed_scale)
|
||||
|
||||
self.logits_processor = LogitsProcessor(self.vocab_size,
|
||||
config.vocab_size)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
inputs_embeds: torch.Tensor,
|
||||
**kwargs,
|
||||
) -> torch.Tensor:
|
||||
r"""
|
||||
Args:
|
||||
input_ids: torch.Tensor of *decoder* input token ids.
|
||||
positions: torch.Tensor of *decoder* position indices.
|
||||
Returns:
|
||||
Output torch.Tensor
|
||||
"""
|
||||
|
||||
return self.model(decoder_input_ids=input_ids,
|
||||
decoder_positions=positions,
|
||||
encoder_hidden_states=inputs_embeds)
|
||||
|
||||
def compute_logits(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata,
|
||||
) -> Optional[torch.Tensor]:
|
||||
logits = self.logits_processor(self.lm_head, hidden_states,
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
("qkv_proj", "k_proj", "k"),
|
||||
("qkv_proj", "v_proj", "v"),
|
||||
]
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
||||
if weight_name not in name:
|
||||
continue
|
||||
name = name.replace(weight_name, param_name)
|
||||
param = params_dict[name]
|
||||
weight_loader = param.weight_loader
|
||||
weight_loader(param, loaded_weight, shard_id)
|
||||
break
|
||||
else:
|
||||
if "final_logits_bias" in name:
|
||||
continue
|
||||
# if self.config.tie_word_embeddings and "embed_tokens" in name:
|
||||
# continue
|
||||
param = params_dict[name]
|
||||
weight_loader = getattr(param, "weight_loader",
|
||||
default_weight_loader)
|
||||
weight_loader(param, loaded_weight)
|
||||
loaded_params.add(name)
|
||||
return loaded_params
|
||||
|
||||
|
||||
class DonutImagePixelInputs(TensorSchema):
|
||||
"""
|
||||
Dimensions:
|
||||
- b: Batch size
|
||||
- c: Number of channels (3)
|
||||
- h: Height
|
||||
- w: Width
|
||||
"""
|
||||
type: Literal["pixel_values"]
|
||||
data: Annotated[torch.Tensor, TensorShape("b", 3, "h", "w")]
|
||||
|
||||
|
||||
class DonutProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
def get_hf_config(self):
|
||||
return self.ctx.get_hf_config()
|
||||
|
||||
def get_hf_processor(self):
|
||||
return self.ctx.get_hf_processor()
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": 1}
|
||||
|
||||
def get_num_image_tokens(self) -> int:
|
||||
return 1
|
||||
|
||||
|
||||
class DonutDummyInputsBuilder(BaseDummyInputsBuilder[DonutProcessingInfo]):
|
||||
|
||||
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||
return ""
|
||||
|
||||
def get_dummy_mm_data(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = self.info.get_hf_config(
|
||||
).encoder.image_size
|
||||
|
||||
return {
|
||||
"image":
|
||||
self._get_dummy_images(width=target_width,
|
||||
height=target_height,
|
||||
num_images=num_images)
|
||||
}
|
||||
|
||||
|
||||
class DonutMultiModalProcessor(EncDecMultiModalProcessor[DonutProcessingInfo]):
|
||||
|
||||
def _hf_processor_applies_updates(
|
||||
self,
|
||||
prompt_text: str,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
) -> bool:
|
||||
return False
|
||||
|
||||
def create_encoder_prompt(
|
||||
self,
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data: MultiModalDataDict,
|
||||
) -> Union[str, list[int]]:
|
||||
return prompt
|
||||
|
||||
def create_decoder_prompt(
|
||||
self,
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data: MultiModalDataDict,
|
||||
) -> Union[str, list[int]]:
|
||||
return prompt
|
||||
|
||||
@property
|
||||
def pad_dummy_encoder_prompt(self) -> bool:
|
||||
return True
|
||||
|
||||
def _call_hf_processor(
|
||||
self,
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
hf_processor = self.info.get_hf_processor()
|
||||
if mm_data:
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt, mm_data, mm_kwargs, tok_kwargs)
|
||||
if isinstance(hf_processor, NougatProcessor):
|
||||
processed_outputs["input_ids"] = processed_outputs["labels"]
|
||||
else:
|
||||
tokenizer = hf_processor.tokenizer
|
||||
processed_outputs = tokenizer(prompt,
|
||||
add_special_tokens=False,
|
||||
return_tensors="pt")
|
||||
return processed_outputs
|
||||
|
||||
def _get_mm_fields_config(
|
||||
self,
|
||||
hf_inputs: BatchFeature,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
) -> Mapping[str, MultiModalFieldConfig]:
|
||||
return dict(pixel_values=MultiModalFieldConfig.batched("image"))
|
||||
|
||||
def _get_prompt_updates(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_processor = self.info.get_hf_processor()
|
||||
tokenizer = hf_processor.tokenizer
|
||||
pad_token_id = tokenizer.pad_token_id
|
||||
num_image_tokens = self.info.get_num_image_tokens()
|
||||
image_tokens = [pad_token_id] * num_image_tokens
|
||||
|
||||
return [
|
||||
PromptInsertion(
|
||||
modality="image",
|
||||
target=PromptIndexTargets.start(),
|
||||
insertion=image_tokens,
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
@MULTIMODAL_REGISTRY.register_processor(DonutMultiModalProcessor,
|
||||
info=DonutProcessingInfo,
|
||||
dummy_inputs=DonutDummyInputsBuilder)
|
||||
class DonutForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
SupportsV0Only):
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
config = vllm_config.model_config.hf_config
|
||||
processor_config = vllm_config.model_config.hf_image_processor_config
|
||||
|
||||
self.config = config
|
||||
self.vision_config = config.encoder
|
||||
self.processor_config = processor_config
|
||||
self.encoder = SwinModel(config=config.encoder)
|
||||
|
||||
self.decoder = DonutLanguageForConditionalGeneration(
|
||||
vllm_config=vllm_config.with_hf_config(config.decoder),
|
||||
prefix=f"{prefix}.decoder",
|
||||
)
|
||||
self.pad_token_id = config.pad_token_id
|
||||
|
||||
def _parse_and_validate_image_input(self, **kwargs: object):
|
||||
pixel_values: Optional[Union[list[list[torch.Tensor]],
|
||||
list[torch.Tensor],
|
||||
torch.Tensor]] = kwargs.pop(
|
||||
"pixel_values", None)
|
||||
image_embeds: Optional[Union[list[list[torch.Tensor]],
|
||||
list[torch.Tensor],
|
||||
torch.Tensor]] = kwargs.pop(
|
||||
"image_embeds", None)
|
||||
|
||||
if pixel_values is None and image_embeds is None:
|
||||
return None
|
||||
|
||||
if pixel_values is not None and image_embeds is not None:
|
||||
raise ValueError(
|
||||
"Both pixel values and image embeds are provided.")
|
||||
|
||||
if pixel_values is not None:
|
||||
h, w = self.config.encoder.image_size
|
||||
return DonutImagePixelInputs(type="pixel_values",
|
||||
data=flatten_bn(pixel_values,
|
||||
concat=True),
|
||||
resolve_bindings={
|
||||
"h": h,
|
||||
"w": w,
|
||||
})
|
||||
|
||||
if image_embeds is not None:
|
||||
raise NotImplementedError
|
||||
|
||||
raise AssertionError("This line should be unreachable.")
|
||||
|
||||
def _process_image_input(
|
||||
self, image_input: DonutImagePixelInputs) -> torch.Tensor:
|
||||
assert image_input["type"] == "pixel_values"
|
||||
pixel_values = image_input["data"]
|
||||
dtype = next(self.encoder.parameters()).dtype
|
||||
pixel_values = pixel_values.to(dtype)
|
||||
return self.encoder(pixel_values)
|
||||
|
||||
def get_language_model(self) -> torch.nn.Module:
|
||||
return self.decoder
|
||||
|
||||
def get_multimodal_embeddings(
|
||||
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
||||
image_input = self._parse_and_validate_image_input(**kwargs)
|
||||
if image_input is None:
|
||||
return None
|
||||
vision_embeddings = self._process_image_input(image_input)
|
||||
return vision_embeddings
|
||||
|
||||
def get_input_embeddings(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
multimodal_embeddings: MultiModalEmbeddings,
|
||||
) -> torch.Tensor:
|
||||
return _flatten_embeddings(multimodal_embeddings)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
*,
|
||||
encoder_input_ids: torch.Tensor,
|
||||
encoder_positions: torch.Tensor,
|
||||
**kwargs,
|
||||
) -> torch.Tensor:
|
||||
r"""
|
||||
Args:
|
||||
input_ids: torch.Tensor of *decoder* input token ids.
|
||||
positions: torch.Tensor of *decoder* position indices.
|
||||
encoder_input_ids: torch.Tensor of *encoder* input token ids.
|
||||
encoder_positions: torch.Tensor of *encoder* position indices
|
||||
Returns:
|
||||
Output torch.Tensor
|
||||
"""
|
||||
|
||||
inputs_embeds = None
|
||||
if encoder_input_ids.numel() > 0:
|
||||
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
|
||||
inputs_embeds = self.get_input_embeddings(encoder_input_ids,
|
||||
vision_embeddings)
|
||||
|
||||
hidden_states = self.decoder(input_ids,
|
||||
positions,
|
||||
inputs_embeds=inputs_embeds)
|
||||
return hidden_states
|
||||
|
||||
def compute_logits(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata,
|
||||
) -> Optional[torch.Tensor]:
|
||||
return self.decoder.compute_logits(hidden_states, sampling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -147,10 +147,6 @@ _TEXT_GENERATION_MODELS = {
    "TeleFLMForCausalLM": ("teleflm", "TeleFLMForCausalLM"),
    "XverseForCausalLM": ("llama", "LlamaForCausalLM"),
    "Zamba2ForCausalLM": ("zamba2", "Zamba2ForCausalLM"),
    # [Encoder-decoder]
    "BartModel": ("bart", "BartForConditionalGeneration"),
    "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"),
    "MBartForConditionalGeneration": ("bart", "MBartForConditionalGeneration"),
}

_EMBEDDING_MODELS = {

@ -237,6 +233,7 @@ _MULTIMODAL_MODELS = {
    "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
    "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"),  # noqa: E501
    "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
    "Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"),  # noqa: E501
    "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
    "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"),  # noqa: E501
    "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"),  # noqa: E501

@ -263,16 +260,12 @@ _MULTIMODAL_MODELS = {
    "Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
    "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
    "UltravoxModel": ("ultravox", "UltravoxModel"),
    "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
    "Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"),  # noqa: E501
    "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"),  # noqa: E501
    "Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"),  # noqa: E501
    "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),  # noqa: E501
    # [Encoder-decoder]
    "DonutForConditionalGeneration": ("donut", "DonutForConditionalGeneration"),
    "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"),  # noqa: E501
    "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"),  # noqa: E501
    "Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"),  # noqa: E501
    "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
    "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"),  # noqa: E501
}
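Each registry entry above maps an architecture name to a (module, class) pair that is imported lazily. A rough sketch of how such a lookup could resolve (an assumption for illustration, not vLLM's actual loader code):

import importlib

# Illustrative subset of the mapping shown in the diff above.
_MULTIMODAL_MODELS = {
    "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"),
}

def resolve_model_class(architecture: str):
    # Import the module on demand and fetch the class by name; the package
    # prefix below is assumed for the sake of the example.
    module_name, class_name = _MULTIMODAL_MODELS[architecture]
    module = importlib.import_module(f"vllm.model_executor.models.{module_name}")
    return getattr(module, class_name)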
@ -209,7 +209,7 @@ class MultiModalProfiler(Generic[_I]):
        if processor.pad_dummy_encoder_prompt:
            num_tokens_to_pad = max(total_len, seq_len) - total_len
            encoder_prompt_token_ids.extend([0] * num_tokens_to_pad)
        # NOTE: Whisper and Donut allows total_len > seq_len.
        # NOTE: Whisper allows total_len > seq_len.
        elif total_len > seq_len and not envs.VLLM_USE_V1:
            # `max_num_batched_tokens` is defined by `SchedulerConfig`
            logger.warning_once(
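To make the padding arithmetic above concrete, here is a toy example; the values are made up and `encoder_prompt_token_ids` merely stands in for the profiler's dummy prompt:

seq_len = 8                                  # hypothetical target length
encoder_prompt_token_ids = [101, 102, 103]   # hypothetical dummy prompt
total_len = len(encoder_prompt_token_ids)

# Pad with zeros only when the dummy prompt is shorter than seq_len;
# if total_len >= seq_len, num_tokens_to_pad is 0 and nothing changes.
num_tokens_to_pad = max(total_len, seq_len) - total_len
encoder_prompt_token_ids.extend([0] * num_tokens_to_pad)
assert len(encoder_prompt_token_ids) == max(total_len, seq_len)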
@ -36,7 +36,6 @@ MODELS_ON_S3 = [
    "llava-hf/llava-v1.6-mistral-7b-hf",
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
    # "meta-llama/Llama-2-7b-hf",
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Meta-Llama-3-8B",
@ -35,7 +35,6 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
    "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
    "chameleon": CHAT_TEMPLATES_DIR / "template_basic.jinja",
    "deepseek_vl_v2": CHAT_TEMPLATES_DIR / "template_deepseek_vl2.jinja",
    "florence2": CHAT_TEMPLATES_DIR / "template_basic.jinja",
    "fuyu": CHAT_TEMPLATES_DIR / "template_fuyu.jinja",
    "minicpmv": _get_minicpmv_chat_template_fallback,
    "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
@ -90,11 +90,6 @@ _AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = {
    "internvl_chat": {
        "has_no_defaults_at_init": True
    },
    # transformers regards mllama as is_encoder_decoder=False
    # vllm needs is_encoder_decoder=True to enable cross-attention
    "mllama": {
        "is_encoder_decoder": True
    },
    "NVLM_D": {
        "has_no_defaults_at_init": True
    },
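The per-model-type kwargs above are applied onto the loaded HF config; a toy sketch of that merge (function name and mechanics are assumptions for illustration, not vLLM's implementation):

def apply_auto_config_overrides(hf_config, overrides: dict) -> None:
    # Copy each override onto the config object, e.g. the now-removed
    # mllama entry used to force is_encoder_decoder=True.
    for key, value in overrides.items():
        setattr(hf_config, key, value)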
@ -498,7 +498,7 @@ class Processor:
        assert isinstance(mm_processor, EncDecMultiModalProcessor)

        if mm_processor.pad_dummy_encoder_prompt:
            return  # Skip encoder length check for Whisper and Donut
            return  # Skip encoder length check for Whisper

        if model_config.is_multimodal_model:
            suggestion = (
@ -1,553 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import dataclasses
|
||||
import itertools
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type, cast
|
||||
|
||||
import torch
|
||||
import torch.distributed
|
||||
|
||||
from vllm.attention.backends.abstract import (AttentionBackend,
|
||||
AttentionMetadata)
|
||||
from vllm.attention.backends.utils import PAD_SLOT_ID
|
||||
from vllm.attention.selector import (get_env_variable_attn_backend,
|
||||
get_global_forced_attn_backend)
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.forward_context import set_forward_context
|
||||
from vllm.inputs import INPUT_REGISTRY, InputRegistry
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor import SamplingMetadata
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
|
||||
MultiModalRegistry)
|
||||
from vllm.platforms import _Backend
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
|
||||
from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad
|
||||
from vllm.worker.model_runner import (GPUModelRunnerBase,
|
||||
ModelInputForGPUBuilder,
|
||||
ModelInputForGPUWithSamplingMetadata)
|
||||
from vllm.worker.model_runner_base import (
|
||||
_add_attn_metadata_broadcastable_dict,
|
||||
_add_sampling_metadata_broadcastable_dict)
|
||||
from vllm.worker.utils import assert_enc_dec_mr_supported_scenario
|
||||
|
||||
logger = init_logger(__name__)
|
||||
LORA_WARMUP_RANK = 8
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata):
|
||||
"""
|
||||
Used by the EncoderDecoderModelRunner.
|
||||
"""
|
||||
encoder_input_tokens: Optional[torch.Tensor] = None
|
||||
encoder_input_positions: Optional[torch.Tensor] = None
|
||||
|
||||
def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
|
||||
tensor_dict = {
|
||||
"input_tokens": self.input_tokens,
|
||||
"inputs_embeds": self.inputs_embeds,
|
||||
"input_positions": self.input_positions,
|
||||
"encoder_input_tokens": self.encoder_input_tokens,
|
||||
"encoder_input_positions": self.encoder_input_positions,
|
||||
"virtual_engine": self.virtual_engine,
|
||||
"request_ids_to_seq_ids": self.request_ids_to_seq_ids,
|
||||
"finished_requests_ids": self.finished_requests_ids,
|
||||
"multi_modal_kwargs": self.multi_modal_kwargs,
|
||||
}
|
||||
_add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
|
||||
_add_sampling_metadata_broadcastable_dict(tensor_dict,
|
||||
self.sampling_metadata)
|
||||
return tensor_dict
|
||||
|
||||
@classmethod
|
||||
def from_broadcasted_tensor_dict(
|
||||
cls,
|
||||
tensor_dict: Dict[str, Any],
|
||||
attn_backend: Optional["AttentionBackend"] = None,
|
||||
) -> "EncoderDecoderModelInput":
|
||||
return cast(
|
||||
EncoderDecoderModelInput,
|
||||
super().from_broadcasted_tensor_dict(tensor_dict, attn_backend))
|
||||
|
||||
|
||||
class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
|
||||
_model_input_cls: Type[EncoderDecoderModelInput] = (
|
||||
EncoderDecoderModelInput)
|
||||
_builder_cls: Type[ModelInputForGPUBuilder] = (ModelInputForGPUBuilder)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vllm_config: VllmConfig,
|
||||
kv_cache_dtype: Optional[str] = "auto",
|
||||
is_driver_worker: bool = False,
|
||||
input_registry: InputRegistry = INPUT_REGISTRY,
|
||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||
):
|
||||
'''
|
||||
EncoderDecoderModelRunner constructor.
|
||||
|
||||
`lora_config` is unused (since these features are not yet supported
|
||||
for encoder/decoder models) but these arguments are present here for
|
||||
compatibility with the base-class constructor.
|
||||
'''
|
||||
self._maybe_force_supported_attention_backend()
|
||||
|
||||
super().__init__(
|
||||
vllm_config=vllm_config,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
is_driver_worker=is_driver_worker,
|
||||
input_registry=input_registry,
|
||||
mm_registry=mm_registry,
|
||||
)
|
||||
|
||||
# Crash for unsupported encoder/scenarios
|
||||
assert_enc_dec_mr_supported_scenario(self)
|
||||
|
||||
def _maybe_force_supported_attention_backend(self):
|
||||
'''
|
||||
Force vLLM to use the XFormers attention backend,
|
||||
which is currently the only supported option.
|
||||
'''
|
||||
|
||||
def raise_backend_err():
|
||||
# The user has specified an attention backend override
|
||||
# which is invalid for encoder/decoder models
|
||||
raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_BACKEND)
|
||||
|
||||
maybe_env_var_forced_backend = get_env_variable_attn_backend()
|
||||
maybe_global_forced_backend = get_global_forced_attn_backend()
|
||||
is_forced_by_global = maybe_global_forced_backend is not None
|
||||
is_forced_by_env_var = maybe_env_var_forced_backend is not None
|
||||
if is_forced_by_global: # noqa: SIM102
|
||||
# Backend override enforced by global variable takes
|
||||
# precedence over vLLM backend environment variable.
|
||||
if maybe_global_forced_backend not in\
|
||||
[_Backend.XFORMERS, _Backend.FLASH_ATTN]:
|
||||
raise_backend_err()
|
||||
elif is_forced_by_env_var: # noqa: SIM102
|
||||
# Backend override enforced by vLLM backend
|
||||
# environment variable
|
||||
if maybe_env_var_forced_backend not in\
|
||||
[_Backend.XFORMERS, _Backend.FLASH_ATTN]:
|
||||
raise_backend_err()
|
||||
|
||||
def _list_to_int32_tensor(
|
||||
self,
|
||||
_list: List[int],
|
||||
) -> torch.Tensor:
|
||||
return torch.tensor(_list, dtype=torch.int32, device=self.device)
|
||||
|
||||
def _list_to_long_tensor(
|
||||
self,
|
||||
_list: List[int],
|
||||
) -> torch.Tensor:
|
||||
return torch.tensor(_list, dtype=torch.long, device=self.device)
|
||||
|
||||
def _empty_int32_tensor(self) -> torch.Tensor:
|
||||
return self._list_to_int32_tensor([])
|
||||
|
||||
def _empty_long_tensor(self) -> torch.Tensor:
|
||||
return self._list_to_long_tensor([])
|
||||
|
||||
@torch.inference_mode()
|
||||
def execute_model(
|
||||
self,
|
||||
model_input: EncoderDecoderModelInput,
|
||||
kv_caches: List[torch.Tensor],
|
||||
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||
num_steps: int = 1,
|
||||
) -> Optional[List[SamplerOutput]]:
|
||||
if num_steps > 1:
|
||||
raise ValueError("num_steps > 1 is not supported in "
|
||||
"EncoderDecoderModelRunner")
|
||||
if self.lora_config:
|
||||
assert model_input.lora_requests is not None
|
||||
assert model_input.lora_mapping is not None
|
||||
self.set_active_loras(model_input.lora_requests,
|
||||
model_input.lora_mapping)
|
||||
if (model_input.attn_metadata is not None
|
||||
and model_input.attn_metadata.prefill_metadata is None
|
||||
and model_input.attn_metadata.decode_metadata.use_cuda_graph):
|
||||
if model_input.inputs_embeds is None:
|
||||
assert model_input.input_tokens is not None
|
||||
graph_batch_size = model_input.input_tokens.shape[0]
|
||||
model_executable = (
|
||||
self.graph_runners[model_input.virtual_engine][(
|
||||
graph_batch_size, False)])
|
||||
else:
|
||||
graph_batch_size = model_input.inputs_embeds.shape[0]
|
||||
model_executable = (
|
||||
self.graph_runners[model_input.virtual_engine][(
|
||||
graph_batch_size, True)])
|
||||
else:
|
||||
model_executable = self.model
|
||||
|
||||
seqlen_agnostic_kwargs = {
|
||||
"finished_requests_ids": model_input.finished_requests_ids,
|
||||
"request_ids_to_seq_ids": model_input.request_ids_to_seq_ids,
|
||||
} if self.has_inner_state else {}
|
||||
|
||||
multi_modal_kwargs = model_input.multi_modal_kwargs or {}
|
||||
with set_forward_context(model_input.attn_metadata, self.vllm_config,
|
||||
model_input.virtual_engine):
|
||||
hidden_or_intermediate_states = model_executable(
|
||||
input_ids=model_input.input_tokens,
|
||||
inputs_embeds=model_input.inputs_embeds,
|
||||
positions=model_input.input_positions,
|
||||
encoder_input_ids=model_input.encoder_input_tokens,
|
||||
encoder_positions=model_input.encoder_input_positions,
|
||||
intermediate_tensors=intermediate_tensors,
|
||||
**MultiModalKwargs.as_kwargs(
|
||||
multi_modal_kwargs,
|
||||
device=self.device,
|
||||
),
|
||||
**seqlen_agnostic_kwargs,
|
||||
)
|
||||
|
||||
logits = self.model.compute_logits(hidden_or_intermediate_states,
|
||||
model_input.sampling_metadata)
|
||||
|
||||
if not self.is_driver_worker:
|
||||
return []
|
||||
|
||||
if model_input.async_callback is not None:
|
||||
model_input.async_callback()
|
||||
|
||||
# Sample the next token.
|
||||
output: SamplerOutput = self.sampler(
|
||||
logits=logits,
|
||||
sampling_metadata=model_input.sampling_metadata,
|
||||
)
|
||||
|
||||
return [output]
|
||||
|
||||
def make_model_input_from_broadcasted_tensor_dict(
|
||||
self, tensor_dict: Dict[str, Any]) -> EncoderDecoderModelInput:
|
||||
return EncoderDecoderModelInput.from_broadcasted_tensor_dict(
|
||||
tensor_dict,
|
||||
attn_backend=self.attn_backend,
|
||||
)
|
||||
|
||||
def prepare_model_input(
|
||||
self,
|
||||
seq_group_metadata_list: List[SequenceGroupMetadata],
|
||||
virtual_engine: int = 0,
|
||||
finished_requests_ids: Optional[List[str]] = None
|
||||
) -> EncoderDecoderModelInput:
|
||||
"""Prepare the model input based on a given sequence group, including
|
||||
metadata for the sampling step.
|
||||
|
||||
Since chunked prefill is not supported for encoder/decoder models,
|
||||
`input_tokens` is assumed to be either entirely prefill tokens or
|
||||
entirely decode tokens.
|
||||
|
||||
"""
|
||||
model_input = self._prepare_model_input_tensors(
|
||||
seq_group_metadata_list, finished_requests_ids)
|
||||
(
|
||||
attn_metadata,
|
||||
encoder_input_tokens_tensor,
|
||||
encoder_input_positions_tensor,
|
||||
) = (self._prepare_encoder_model_input_tensors(seq_group_metadata_list,
|
||||
model_input))
|
||||
# Inject attn_metadata encoder/cross-attention fields &
|
||||
# encoder input tokens/positions into model_input.
|
||||
# Frozen dataclass fields cannot be modified, so use
|
||||
# dataclasses.replace to construct a new model input
|
||||
# instance.
|
||||
model_input = dataclasses.replace(
|
||||
model_input,
|
||||
attn_metadata=attn_metadata,
|
||||
encoder_input_tokens=encoder_input_tokens_tensor,
|
||||
encoder_input_positions=encoder_input_positions_tensor,
|
||||
)
|
||||
|
||||
generators = self.get_generators(finished_requests_ids)
|
||||
sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list,
|
||||
model_input.seq_lens,
|
||||
model_input.query_lens,
|
||||
self.device,
|
||||
self.pin_memory,
|
||||
generators=generators)
|
||||
is_prompt = (seq_group_metadata_list[0].is_prompt
|
||||
if seq_group_metadata_list else None)
|
||||
return dataclasses.replace(model_input,
|
||||
sampling_metadata=sampling_metadata,
|
||||
is_prompt=is_prompt,
|
||||
virtual_engine=virtual_engine)
|
||||
|
||||
    @torch.inference_mode()
    def profile_run(self) -> None:
        # Enable top-k sampling to reflect the accurate memory usage.
        sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
        max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
        max_num_seqs = self.scheduler_config.max_num_seqs

        # This represents the maximum number of different requests
        # that will have unique loras, and therefore the max amount of
        # memory consumption. Create dummy lora request copies from the
        # lora request passed in, which contains a lora from the lora
        # warmup path.
        dummy_lora_requests: List[LoRARequest] = []
        dummy_lora_requests_per_seq: List[LoRARequest] = []
        if self.lora_config:
            dummy_lora_requests = self._add_dummy_loras(
                self.lora_config.max_loras)
            assert len(dummy_lora_requests) == self.lora_config.max_loras
            dummy_lora_requests_per_seq = [
                dummy_lora_requests[idx % len(dummy_lora_requests)]
                for idx in range(max_num_seqs)
            ]

        # Profile memory usage with max_num_sequences sequences and the total
        # number of tokens equal to max_num_batched_tokens.
        seqs: List[SequenceGroupMetadata] = []

        max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
            self.model_config)
        if max_mm_tokens > 0:
            logger.info("Starting profile run for multi-modal models.")

        batch_size = 0
        for group_id in range(max_num_seqs):
            seq_len = (max_num_batched_tokens // max_num_seqs +
                       (group_id < max_num_batched_tokens % max_num_seqs))
            batch_size += seq_len

            decoder_dummy_data = self.input_registry \
                .dummy_data_for_profiling(self.model_config,
                                          seq_len,
                                          self.mm_registry,
                                          is_encoder_data=False)
            encoder_dummy_data = self.input_registry \
                .dummy_data_for_profiling(self.model_config,
                                          seq_len,
                                          self.mm_registry,
                                          is_encoder_data=True)

            # Having more tokens is over-conservative but otherwise fine
            assert len(
                decoder_dummy_data.seq_data.prompt_token_ids
            ) >= seq_len, (
                f"Expected at least {seq_len} dummy tokens for profiling, "
                f"but got: {len(decoder_dummy_data.seq_data.prompt_token_ids)}"
            )

            assert decoder_dummy_data.multi_modal_data is None or \
                encoder_dummy_data.multi_modal_data is None, (
                "Multi-modal data can't be provided in both encoder and decoder"
            )

            seq = SequenceGroupMetadata(
                request_id=str(group_id),
                is_prompt=True,
                seq_data={group_id: decoder_dummy_data.seq_data},
                sampling_params=sampling_params,
                block_tables=None,
                encoder_seq_data=encoder_dummy_data.seq_data,
                cross_block_table=None,
                lora_request=dummy_lora_requests_per_seq[group_id]
                if dummy_lora_requests_per_seq else None,
                multi_modal_data=decoder_dummy_data.multi_modal_data
                or encoder_dummy_data.multi_modal_data,
                multi_modal_placeholders=decoder_dummy_data.
                multi_modal_placeholders
                or encoder_dummy_data.multi_modal_placeholders)
            seqs.append(seq)

        finished_requests_ids = [seq.request_id for seq in seqs]
        model_input = self.prepare_model_input(
            seqs, finished_requests_ids=finished_requests_ids)
        intermediate_tensors = None
        self.execute_model(model_input, None, intermediate_tensors)
        torch.cuda.synchronize()
        return
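
# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original file): profile_run above spreads
# max_num_batched_tokens across max_num_seqs dummy sequences, giving the first
# (max_num_batched_tokens % max_num_seqs) groups one extra token so the lengths
# sum exactly to the budget. A quick check of that arithmetic with example
# values (the numbers below are illustrative, not defaults):
max_num_batched_tokens = 2048
max_num_seqs = 3
seq_lens = [
    max_num_batched_tokens // max_num_seqs +
    (group_id < max_num_batched_tokens % max_num_seqs)
    for group_id in range(max_num_seqs)
]
assert seq_lens == [683, 683, 682]
assert sum(seq_lens) == max_num_batched_tokens
# ---------------------------------------------------------------------------
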
    def _prepare_encoder_model_input_tensors(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        model_input: EncoderDecoderModelInput,
    ) -> Tuple[AttentionMetadata, Optional[torch.Tensor],
               Optional[torch.Tensor]]:
        """Helper method to prepare the encoder- and cross-attn-related
        model inputs based on a given sequence group. These additional inputs
        are used to augment an already-computed `EncoderDecoderModelInput`
        data structure which already has decoder-related model inputs
        populated.

        Sets the following attn_metadata fields:
        * `num_encoder_tokens`
        * `encoder_seq_lens`
        * `encoder_seq_lens_tensor`
        * `max_encoder_seq_len`
        * `cross_slot_mapping`
        * `cross_block_tables`

        Constructs a new model inputs data structure, based on
        (1) the existing fields in the `model_inputs` argument,
        and (2) the following additional fields which are
        computed (or in the case of `attn_metadata`, updated)
        by this function:
        * attn_metadata
        * encoder_input_tokens
        * encoder_input_positions

        Arguments:

        * seq_group_metadata_list: list of sequence groups for which to
          compute inputs
        * model_inputs: model inputs data structure with decoder-oriented
          fields already computed.

        Return:

        * Updated model inputs data structure
        """

        if len(seq_group_metadata_list) == 0:
            return (model_input.attn_metadata, None, None)

        # Since we are not supporting chunked prefill, either the entire
        # batch is prefill or it is decode
        is_prompt = seq_group_metadata_list[0].is_prompt

        # Build encoder inputs
        encoder_seq_lens: List[int] = []
        if is_prompt:
            # Prefill phase.
            cross_block_tables = self._empty_int32_tensor().view(
                len(seq_group_metadata_list), -1)

            # Extract input tokens/positions, cross-attention slot-mapping,
            # & seq len from each sequence group metadata
            (
                encoder_input_tokens,
                encoder_input_positions,
                cross_slot_mapping,
            ) = (
                [],
                [],
                [],
            )
            for seq_group_metadata in seq_group_metadata_list:
                # Build seq lens
                seq_len = seq_group_metadata.encoder_seq_data.get_len()
                token_ids = seq_group_metadata.encoder_seq_data.get_token_ids()
                encoder_seq_lens.append(seq_len)

                # Build slot mapping
                is_profile_run = (seq_group_metadata.block_tables is None)
                if is_profile_run:
                    # During memory profiling, the block tables are not
                    # initialized yet. In this case, we just use a dummy
                    # slot mapping.
                    # In embeddings, the block tables are {seq_id: None}.
                    cross_slot_mapping.extend([PAD_SLOT_ID] * seq_len)
                else:
                    for i in range(0, seq_len):
                        block_number = seq_group_metadata.cross_block_table[
                            i // self.block_size]
                        block_offset = i % self.block_size
                        slot = block_number * self.block_size + block_offset
                        cross_slot_mapping.append(slot)

                # Build encoder input tokens
                encoder_input_tokens.extend(token_ids)
                encoder_input_positions.extend(list(range(0, seq_len)))

            # Convert tokens/positions & cross-attention
            # slot-mapping to encoder input tensors
            encoder_input_tokens_tensor = self._list_to_long_tensor(
                encoder_input_tokens)
            encoder_input_positions_tensor = self._list_to_long_tensor(
                encoder_input_positions)
            cross_slot_mapping_tensor = self._list_to_long_tensor(
                cross_slot_mapping)

        else:
            # Decode phase.
            encoder_input_tokens_tensor = self._empty_long_tensor()
            encoder_input_positions_tensor = self._empty_long_tensor()
            cross_slot_mapping_tensor = self._empty_long_tensor()
            # Extract cross-attention block tables &
            # seq len from each sequence group metadata.
            # Cross-attention block tables are empty
            # during vLLM memory profiling.
            cross_block_tables = []
            for seq_group_metadata in seq_group_metadata_list:
                for _ in range(len(seq_group_metadata.seq_data)):
                    encoder_seq_lens.append(
                        seq_group_metadata.encoder_seq_data.get_len())
                    cross_block_table = seq_group_metadata.cross_block_table
                    cross_block_tables.append([] if (
                        cross_block_table is None) else cross_block_table)

            if (model_input.attn_metadata is not None
                    and model_input.attn_metadata.use_cuda_graph):
                # We will be using CUDA graph replay for this decode.
                max_len_of_block_table = self.get_max_block_per_batch()
                batch_size = len(encoder_seq_lens)
                graph_batch_size = self.vllm_config.pad_for_cudagraph(
                    batch_size)
                assert graph_batch_size >= batch_size
                cuda_graph_pad_size = graph_batch_size - batch_size
                # extend the cross_block_tables and encoder_seq_lens to match
                # the graph_batch_size.
                cross_block_tables.extend([[]
                                           for _ in range(cuda_graph_pad_size)
                                           ])
                encoder_seq_lens.extend(
                    itertools.repeat(1, cuda_graph_pad_size))

            else:
                max_len_of_block_table = max(
                    len(block_table) for block_table in cross_block_tables)

            cross_block_tables = make_tensor_with_pad(
                cross_block_tables,
                max_len=max_len_of_block_table,
                pad=0,
                dtype=torch.int32,
                device=self.device,
            )

        # Compute encoder sequence lengths & encoder
        # sequence starting offset tensors
        max_encoder_seq_len = max(encoder_seq_lens, default=0)
        encoder_seq_lens_tensor = self._list_to_int32_tensor(encoder_seq_lens)
        encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] +
                                            1,
                                            dtype=torch.int32,
                                            device=self.device)
        torch.cumsum(encoder_seq_lens_tensor,
                     dim=0,
                     dtype=encoder_seq_start_loc.dtype,
                     out=encoder_seq_start_loc[1:])

        # Update attention metadata with encoder-oriented attributes
        attn_metadata = model_input.attn_metadata
        assert attn_metadata is not None
        (
            attn_metadata.num_encoder_tokens,
            attn_metadata.encoder_seq_lens,
            attn_metadata.encoder_seq_lens_tensor,
            attn_metadata.max_encoder_seq_len,
            attn_metadata.encoder_seq_start_loc,
            attn_metadata.cross_slot_mapping,
            attn_metadata.cross_block_tables,
        ) = (
            sum(encoder_seq_lens),
            encoder_seq_lens,
            encoder_seq_lens_tensor,
            max_encoder_seq_len,
            encoder_seq_start_loc,
            cross_slot_mapping_tensor,
            cross_block_tables,
        )

        return (attn_metadata, encoder_input_tokens_tensor,
                encoder_input_positions_tensor)
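
# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original file): in the prefill branch above,
# encoder token i of a sequence is assigned the paged-KV slot
# cross_block_table[i // block_size] * block_size + i % block_size. A
# standalone run of that indexing on a toy block table (values are
# illustrative, not taken from a real run):
block_size = 4
cross_block_table = [7, 2, 9]      # physical block numbers for one sequence
encoder_seq_len = 10               # tokens 0..9 span three blocks

cross_slot_mapping = []
for i in range(encoder_seq_len):
    block_number = cross_block_table[i // block_size]
    block_offset = i % block_size
    cross_slot_mapping.append(block_number * block_size + block_offset)

# Block 7 holds tokens 0-3 (slots 28..31), block 2 holds tokens 4-7
# (slots 8..11), and block 9 holds tokens 8-9 (slots 36, 37).
assert cross_slot_mapping == [28, 29, 30, 31, 8, 9, 10, 11, 36, 37]
# ---------------------------------------------------------------------------
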
@ -1,49 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
'''
Worker-related helper functions.
'''

from vllm.utils import STR_NOT_IMPL_ENC_DEC_ERR_STRS
from vllm.worker.model_runner import GPUModelRunnerBase


def assert_enc_dec_mr_supported_scenario(
        enc_dec_mr: GPUModelRunnerBase) -> None:
    '''
    Assert that the provided encoder/decoder model runner instance reflects
    a supported scenario.
    '''

    # Reminder: Please update docs/features/compatibility_matrix.md
    # if the feature combo becomes valid

    if enc_dec_mr.cache_config.enable_prefix_caching:
        raise NotImplementedError(
            STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE'])

    if enc_dec_mr.sliding_window is not None:
        raise NotImplementedError(
            STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SWA'])

    if enc_dec_mr.scheduler_config.chunked_prefill_enabled:
        raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_ERR_STRS[
            'STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL'])

    if getattr(enc_dec_mr.model_config.hf_config, 'attn_logit_softcapping',
               None) is not None:
        raise NotImplementedError(
            STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP']
        )

    if enc_dec_mr.lora_config is not None:
        raise NotImplementedError(
            STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_LORA'])

    if enc_dec_mr.parallel_config.pipeline_parallel_size > 1:
        raise NotImplementedError(
            STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PP'])

    if enc_dec_mr.scheduler_config.num_lookahead_slots > 0:
        raise NotImplementedError(
            STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SPEC_DEC'])
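
# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original file): the helper above centralizes
# "unsupported feature combination" checks for the V0 encoder/decoder runner and
# fails fast with NotImplementedError. A self-contained illustration of the same
# fail-fast guard pattern with toy config objects; every name below is a
# hypothetical stand-in, not vLLM's real configuration API.
from dataclasses import dataclass


@dataclass
class _ToyCacheConfig:
    enable_prefix_caching: bool = False


@dataclass
class _ToySchedulerConfig:
    chunked_prefill_enabled: bool = False


def _assert_toy_enc_dec_supported(cache_config: _ToyCacheConfig,
                                  scheduler_config: _ToySchedulerConfig) -> None:
    # Reject configurations the toy encoder/decoder path does not handle.
    if cache_config.enable_prefix_caching:
        raise NotImplementedError("prefix caching is unsupported for enc/dec")
    if scheduler_config.chunked_prefill_enabled:
        raise NotImplementedError("chunked prefill is unsupported for enc/dec")


_assert_toy_enc_dec_supported(_ToyCacheConfig(), _ToySchedulerConfig())  # passes
# ---------------------------------------------------------------------------
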
@ -28,7 +28,6 @@ from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache,
                        memory_profiling)
from vllm.worker.cache_engine import CacheEngine
from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
                                     WorkerInput)

@ -82,10 +81,7 @@ class Worker(LocalOrDistributedWorkerBase):
                    "qwen3_next_mtp")) \
            else {"return_hidden_states": True}

        ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner
        if self.model_config.is_encoder_decoder:
            ModelRunnerClass = EncoderDecoderModelRunner
        self.model_runner: GPUModelRunnerBase = ModelRunnerClass(
        self.model_runner: GPUModelRunnerBase = ModelRunner(
            vllm_config=self.vllm_config,
            kv_cache_dtype=self.cache_config.cache_dtype,
            is_driver_worker=is_driver_worker,