Compare commits

...

3 Commits

SHA1        Message                          Date
043b223d34  helper for structured data       2025-10-15 19:18:53 +00:00
9263a16ed5  remove unused import             2025-10-15 18:36:28 +00:00
04cf031330  remove formatting to user side   2025-10-15 18:35:55 +00:00

View File

@@ -14,7 +14,6 @@
 import contextlib
 import os
-import re
 from collections import defaultdict
 from dataclasses import dataclass
 from pathlib import Path
@@ -81,60 +80,6 @@ def get_dataset_column_names(dataset: Union[Dataset, IterableDataset]) -> list[str]:
     return list(next(iter(dataset)).keys()) if dataset.column_names is None else dataset.column_names


-def convert_to_structured_content(messages: list[dict[str, Any]], images: list, videos: list) -> list[dict[str, Any]]:
-    """
-    Convert messages with <image> and <video> placeholder tags to structured content format.
-
-    This format is required by some VLM processors (like Qwen) that expect typed content objects rather than plain
-    text with placeholder tags.
-
-    Args:
-        messages: List of message dicts with role and content
-        images: List of image paths/objects corresponding to <image> tags
-        videos: List of video paths/objects corresponding to <video> tags
-
-    Returns:
-        List of messages with structured content format
-
-    Example:
-        Input: {"role": "user", "content": "<video>\nWhat's happening?"}
-        Output: {"role": "user", "content": [
-            {"type": "video", "video": "/path/to/video.mp4"},
-            {"type": "text", "text": "What's happening?"}
-        ]}
-    """
-    structured_messages = []
-    image_idx = 0
-    video_idx = 0
-
-    for msg in messages:
-        role = msg["role"]
-        content_str = msg["content"]
-
-        # Check if this message contains media placeholders
-        if "<video>" in content_str or "<image>" in content_str:
-            # Parse placeholders and create structured content
-            content = []
-            parts = re.split(r"(<video>|<image>)", content_str)
-            for part in parts:
-                if part == "<video>":
-                    if video_idx < len(videos):
-                        content.append({"type": "video", "video": videos[video_idx]})
-                        video_idx += 1
-                elif part == "<image>":
-                    if image_idx < len(images):
-                        content.append({"type": "image", "image": images[image_idx]})
-                        image_idx += 1
-                elif part.strip():
-                    content.append({"type": "text", "text": part.strip()})
-            structured_messages.append({"role": role, "content": content})
-        else:
-            # No media placeholders - keep as plain text
-            structured_messages.append({"role": role, "content": content_str})
-
-    return structured_messages
-
-
 @dataclass
 class DataCollatorForLanguageModeling(DataCollatorMixin):
     """
@@ -401,62 +346,75 @@ class DataCollatorForVisionLanguageModeling(DataCollatorMixin):
         else:
             raise KeyError(f"Unexpected input keys in examples: {list(examples[0].keys())}.")

+    def _has_structured_content(self, messages: list[dict]) -> tuple[bool, bool]:
+        """
+        Check if messages contain structured content with images or videos.
+
+        Returns:
+            tuple[bool, bool]: (has_image_content, has_video_content)
+        """
+        has_image_content = False
+        has_video_content = False
+        if messages and isinstance(messages, list):
+            for msg in messages:
+                if isinstance(msg.get("content"), list):
+                    for item in msg["content"]:
+                        if isinstance(item, dict):
+                            if item.get("type") == "image":
+                                has_image_content = True
+                            elif item.get("type") == "video":
+                                has_video_content = True
+                if has_image_content and has_video_content:
+                    break
+        return has_image_content, has_video_content
+
     def _collate_language_modeling(self, examples: list[dict[str, Any]]) -> dict[str, Any]:
-        # Handle images
+        # Extract images and videos from examples
         images = [example.get("images", []) for example in examples]
-        # Transformers requires at least one image in the batch, otherwise it throws an error
-        if all(img_list == [] for img_list in images):
-            images = None
-        # Handle videos
         videos = [example.get("videos", []) for example in examples]
-        if all(vid_list == [] for vid_list in videos):
-            videos = None
+        images = None if all(img == [] for img in images) else images
+        videos = None if all(vid == [] for vid in videos) else videos

-        if "messages" in examples[0]:  # conversational case
-            messages_list = []
-            for example in examples:
-                num_images = len(example.get("images", []))
-                num_videos = len(example.get("videos", []))
-
-                # Use structured content format when we have any media (images or videos)
-                # This format works for processors like Qwen that expect typed content objects
-                if num_videos > 0 or num_images > 0:
-                    structured_messages = convert_to_structured_content(
-                        example["messages"], example.get("images", []), example.get("videos", [])
-                    )
-                    messages_list.append(structured_messages)
-                else:
-                    # No media - keep original messages
-                    messages_list.append(example["messages"])
-            texts = self.processor.apply_chat_template(messages_list)
-        elif self.dataset_text_field in examples[0]:  # standard case
+        # Apply chat template for conversational data
+        if "messages" in examples[0]:
+            messages_list = [example["messages"] for example in examples]
+            # Check if messages use structured content format ({"type": "image"} or {"type": "video"})
+            has_image_content, has_video_content = self._has_structured_content(messages_list[0])
+            # For structured content, pass images/videos to apply_chat_template for extraction
+            template_kwargs = {}
+            if has_image_content and images:
+                template_kwargs["images"] = images
+            if has_video_content and videos:
+                template_kwargs["videos"] = videos
+            texts = self.processor.apply_chat_template(messages_list, **template_kwargs)
+        elif self.dataset_text_field in examples[0]:
             texts = [example[self.dataset_text_field] for example in examples]
+            has_image_content = has_video_content = False
         else:
             raise KeyError(
-                "The input examples must contain either 'messages' for conversational data or 'text' for standard "
-                "data."
+                "The input examples must contain either 'messages' for conversational data or 'text' for standard data."
             )

-        # Process with both images and videos
+        # Build processor kwargs
         processor_kwargs = {
             "text": texts,
             "padding": True,
             "padding_side": "right",
             "pad_to_multiple_of": self.pad_to_multiple_of,
             "return_tensors": self.return_tensors,
-            "add_special_tokens": False,  # to avoid adding the BOS, twice see https://huggingface.co/blog/qgallouedec/gotchas-in-tokenizer-behavior#7-chat-template-and-tokenization-dont-compose-due-to-special-tokens
+            "add_special_tokens": False,
         }
         # Pass truncation parameters to processor if max_length is set
         # The processor will handle truncation appropriately for both images and videos
         if self.max_length is not None:
             processor_kwargs["truncation"] = True
             processor_kwargs["max_length"] = self.max_length
-        if images is not None:
+        # Add images/videos to processor only if not already in structured content
+        if images and not has_image_content:
             processor_kwargs["images"] = images
-        if videos is not None:
+        if videos and not has_video_content:
             processor_kwargs["videos"] = videos
         output = self.processor(**processor_kwargs)
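Note: the new routing hinges on _has_structured_content. When the first example's messages already carry typed content, images/videos are handed to apply_chat_template; otherwise they go to the processor call. A minimal sketch of the two outcomes, where collator stands for a DataCollatorForVisionLanguageModeling instance and the messages are made up:

    structured = [{"role": "user", "content": [{"type": "image", "image": "cat.png"}]}]
    plain = [{"role": "user", "content": "Describe the image."}]

    collator._has_structured_content(structured)  # (True, False): images go via apply_chat_template
    collator._has_structured_content(plain)       # (False, False): images go via the processor call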
@@ -475,25 +433,28 @@ class DataCollatorForVisionLanguageModeling(DataCollatorMixin):
                 "Padding to a multiple of a value is not yet implemented for vision-language modeling and "
                 "prompt-completion data yet."
             )
-        # Handle images
+        # Extract images and videos from examples
         images = [example.get("images", []) for example in examples]
-        # Transformers requires at least one image in the batch, otherwise it throws an error
-        if all(img_list == [] for img_list in images):
-            images = None
-        # Handle videos
         videos = [example.get("videos", []) for example in examples]
-        if all(vid_list == [] for vid_list in videos):
-            videos = None
+        images = None if all(img == [] for img in images) else images
+        videos = None if all(vid == [] for vid in videos) else videos

-        if is_conversational(examples[0]):  # conversational case
-            for example in examples:
-                num_images = len(example.get("images", []))
-                num_videos = len(example.get("videos", []))
-                # Only prepare multimodal messages for images; videos use native <video> tags
-                if num_images > 0 and num_videos == 0:
-                    prepare_multimodal_messages(example["prompt"] + example["completion"], num_images=num_images)
+        # Apply chat template for conversational data
+        if is_conversational(examples[0]):
+            # Check if messages use structured content format
+            first_prompt_completion = examples[0]["prompt"] + examples[0]["completion"]
+            has_image_content, has_video_content = self._has_structured_content(first_prompt_completion)
+            # For non-structured content, add image placeholders (videos require structured content)
+            if not (has_image_content or has_video_content):
+                for example in examples:
+                    num_images = len(example.get("images", []))
+                    if num_images > 0 and not example.get("videos"):
+                        prepare_multimodal_messages(example["prompt"] + example["completion"], num_images=num_images)
             examples = [apply_chat_template(example, self.processor) for example in examples]
         else:
+            has_image_content = has_video_content = False
             prompts = [example["prompt"] for example in examples]
             completions = [example["completion"] for example in examples]
@@ -504,11 +465,12 @@ class DataCollatorForVisionLanguageModeling(DataCollatorMixin):
             "padding": True,
             "padding_side": "left",
             "return_tensors": self.return_tensors,
-            "add_special_tokens": False,  # to avoid adding the BOS, twice see https://huggingface.co/blog/qgallouedec/gotchas-in-tokenizer-behavior#7-chat-template-and-tokenization-dont-compose-due-to-special-tokens
+            "add_special_tokens": False,
         }
-        if images is not None:
+        # Add images/videos to processor only if not already in structured content
+        if images and not has_image_content:
             prompt_kwargs["images"] = images
-        if videos is not None:
+        if videos and not has_video_content:
             prompt_kwargs["videos"] = videos
         processed_prompts = self.processor(**prompt_kwargs)
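Note: the prompt-completion path applies the same rule. Plain-text conversations with images trigger prepare_multimodal_messages, while a prompt already in typed content form skips it and also bypasses the images/videos kwargs above. A sketch of an example that takes the non-structured branch (field values are hypothetical):

    example = {
        "prompt": [{"role": "user", "content": "What is in the image?"}],
        "completion": [{"role": "assistant", "content": "A cat on a mat."}],
        "images": ["cat.png"],  # one image, no videos -> image placeholders are added
    }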