🚨 [unbloating] unify TypedDict usage in processing (#40931)

* just squash commits into one

* fix style
Author: Raushan Turganbay
Date: 2025-10-03 14:17:59 +02:00
Committed by: GitHub
Parent: 42bcc81ba2
Commit: 5339f72b9b
208 changed files with 1578 additions and 1988 deletions

View File

@ -292,7 +292,7 @@ The `@auto_docstring` decorator automatically generates docstrings by:
8. Unrolling kwargs typed with the unpack operator. For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentations from the `TypedDict` and adds each parameter to the function's docstring.
Currently only supported for [`FastImageProcessorKwargs`].
Currently only supported for [`ImagesKwargs`].
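For illustration (not part of this diff), a minimal sketch of the pattern being documented, assuming a `transformers` install that includes this change; `ExampleImagesKwargs`, `crop_pct`, and the free-standing `preprocess` function are hypothetical names used only to show the `Unpack[...]` typing that the decorator unrolls:

```python
from typing import Optional

from transformers.processing_utils import ImagesKwargs, Unpack


class ExampleImagesKwargs(ImagesKwargs):
    """
    crop_pct (`float`, *optional*):
        Hypothetical extra parameter, documented directly on the TypedDict so that
        `@auto_docstring` can lift its description into the method docstring.
    """

    crop_pct: Optional[float]


def preprocess(images, **kwargs: Unpack[ExampleImagesKwargs]) -> None:
    """Preprocess an image or a batch of images."""
    # The Unpack annotation is inert at runtime; it only drives typing and doc unrolling.
    print(kwargs.get("crop_pct"))


preprocess(None, crop_pct=0.875)
```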
## Best practices

View File

@ -20,7 +20,8 @@ import numpy as np
from .image_processing_base import BatchFeature, ImageProcessingMixin
from .image_transforms import center_crop, normalize, rescale
from .image_utils import ChannelDimension, get_image_size
from .image_utils import ChannelDimension, ImageInput, get_image_size
from .processing_utils import ImagesKwargs, Unpack
from .utils import logging
from .utils.import_utils import requires
@ -36,6 +37,8 @@ INIT_SERVICE_KWARGS = [
@requires(backends=("vision",))
class BaseImageProcessor(ImageProcessingMixin):
valid_kwargs = ImagesKwargs
def __init__(self, **kwargs):
super().__init__(**kwargs)
@ -46,9 +49,9 @@ class BaseImageProcessor(ImageProcessingMixin):
"""
return False
def __call__(self, images, **kwargs) -> BatchFeature:
def __call__(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
"""Preprocess an image or a batch of images."""
return self.preprocess(images, **kwargs)
return self.preprocess(images, *args, **kwargs)
def preprocess(self, images, **kwargs) -> BatchFeature:
raise NotImplementedError("Each image processor must implement its own preprocess method")
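As an aside (not part of the diff), a minimal sketch of what this unification buys, assuming a `transformers` install that includes this change: both base image processor classes now point `valid_kwargs` at the same `ImagesKwargs` TypedDict, so the accepted preprocessing kwargs can be introspected in one place.

```python
from transformers.image_processing_utils import BaseImageProcessor
from transformers.processing_utils import ImagesKwargs

# Holds only with this change applied; earlier releases used per-class TypedDicts.
assert BaseImageProcessor.valid_kwargs is ImagesKwargs

# Standard TypedDict introspection; the exact field set depends on the installed version.
print(sorted(ImagesKwargs.__annotations__))
```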

View File

@ -15,7 +15,7 @@
from collections.abc import Iterable
from copy import deepcopy
from functools import lru_cache, partial
from typing import Any, Optional, TypedDict, Union
from typing import Any, Optional, Union
import numpy as np
@ -40,7 +40,7 @@ from .image_utils import (
validate_kwargs,
validate_preprocess_arguments,
)
from .processing_utils import Unpack
from .processing_utils import ImagesKwargs, Unpack
from .utils import (
TensorType,
auto_docstring,
@ -163,28 +163,6 @@ def divide_to_patches(
return patches
class DefaultFastImageProcessorKwargs(TypedDict, total=False):
do_resize: Optional[bool]
size: Optional[dict[str, int]]
default_to_square: Optional[bool]
resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]]
do_center_crop: Optional[bool]
crop_size: Optional[dict[str, int]]
do_rescale: Optional[bool]
rescale_factor: Optional[Union[int, float]]
do_normalize: Optional[bool]
image_mean: Optional[Union[float, list[float]]]
image_std: Optional[Union[float, list[float]]]
do_pad: Optional[bool]
pad_size: Optional[dict[str, int]]
do_convert_rgb: Optional[bool]
return_tensors: Optional[Union[str, TensorType]]
data_format: Optional[ChannelDimension]
input_data_format: Optional[Union[str, ChannelDimension]]
device: Optional["torch.device"]
disable_grouping: Optional[bool]
@auto_docstring
class BaseImageProcessorFast(BaseImageProcessor):
resample = None
@ -206,10 +184,10 @@ class BaseImageProcessorFast(BaseImageProcessor):
input_data_format = None
device = None
model_input_names = ["pixel_values"]
valid_kwargs = DefaultFastImageProcessorKwargs
valid_kwargs = ImagesKwargs
unused_kwargs = None
def __init__(self, **kwargs: Unpack[DefaultFastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[ImagesKwargs]):
super().__init__(**kwargs)
kwargs = self.filter_out_unused_kwargs(kwargs)
size = kwargs.pop("size", self.size)
@ -728,11 +706,8 @@ class BaseImageProcessorFast(BaseImageProcessor):
data_format=data_format,
)
def __call__(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature:
return self.preprocess(images, *args, **kwargs)
@auto_docstring
def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature:
def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
# args are not validated, but their order in the `preprocess` and `_preprocess` signatures must be the same
validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names)
# Set default kwargs from self. This ensures that if a kwarg is not provided
@ -765,7 +740,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
do_convert_rgb: bool,
input_data_format: ChannelDimension,
device: Optional[Union[str, "torch.device"]] = None,
**kwargs: Unpack[DefaultFastImageProcessorKwargs],
**kwargs: Unpack[ImagesKwargs],
) -> BatchFeature:
"""
Preprocess image-like inputs.

View File

@ -959,8 +959,6 @@ class AriaProcessor(ProcessorMixin):
self,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
images: Optional[ImageInput] = None,
audio=None,
videos=None,
**kwargs: Unpack[AriaProcessorKwargs],
) -> BatchFeature:
"""

View File

@ -85,8 +85,6 @@ class AriaProcessor(ProcessorMixin):
self,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
images: Optional[ImageInput] = None,
audio=None,
videos=None,
**kwargs: Unpack[AriaProcessorKwargs],
) -> BatchFeature:
"""

View File

@ -19,18 +19,11 @@ import numpy as np
from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput, make_flat_list_of_images
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
class AyaVisionImagesKwargs(ImagesKwargs, total=False):
crop_to_patches: Optional[bool]
min_patches: Optional[int]
max_patches: Optional[int]
class AyaVisionProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: AyaVisionImagesKwargs
_defaults = {
"text_kwargs": {
"padding_side": "left",
@ -140,8 +133,6 @@ class AyaVisionProcessor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
audio=None,
videos=None,
**kwargs: Unpack[AyaVisionProcessorKwargs],
) -> BatchFeature:
"""

View File

@ -33,6 +33,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import (
TensorType,
filter_out_non_signature_kwargs,
@ -54,6 +55,17 @@ if is_torch_available():
logger = logging.get_logger(__name__)
class BeitImageProcessorKwargs(ImagesKwargs):
r"""
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
is used for background, and background itself is not included in all classes of a dataset (e.g.
ADE20k). The background label will be replaced by 255.
"""
do_reduce_labels: Optional[bool]
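A hedged usage sketch (not from the diff): because `do_reduce_labels` is declared on the shared kwargs class above, it can be passed straight through the processor call; the checkpoint name is only illustrative and a torch backend is assumed for `return_tensors="pt"`.

```python
import numpy as np
from PIL import Image
from transformers import AutoImageProcessor

image = Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))
segmentation_map = Image.fromarray(np.zeros((64, 64), dtype=np.uint8))

processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k-ft22k")
batch = processor(
    images=image,
    segmentation_maps=segmentation_map,
    do_reduce_labels=True,  # label 0 (background) becomes 255, per the docstring above
    return_tensors="pt",
)
print(batch["pixel_values"].shape, batch["labels"].shape)
```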
@requires(backends=("vision",))
class BeitImageProcessor(BaseImageProcessor):
r"""
@ -99,6 +111,7 @@ class BeitImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values"]
valid_kwargs = BeitImageProcessorKwargs
@filter_out_non_signature_kwargs(extra=INIT_SERVICE_KWARGS)
def __init__(

View File

@ -22,7 +22,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
@ -40,17 +39,7 @@ from ...utils import (
TensorType,
auto_docstring,
)
class BeitFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
r"""
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
is used for background, and background itself is not included in all classes of a dataset (e.g.
ADE20k). The background label will be replaced by 255.
"""
do_reduce_labels: Optional[bool]
from .image_processing_beit import BeitImageProcessorKwargs
@auto_docstring
@ -66,9 +55,9 @@ class BeitImageProcessorFast(BaseImageProcessorFast):
do_rescale = True
do_normalize = True
do_reduce_labels = False
valid_kwargs = BeitFastImageProcessorKwargs
valid_kwargs = BeitImageProcessorKwargs
def __init__(self, **kwargs: Unpack[BeitFastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[BeitImageProcessorKwargs]):
super().__init__(**kwargs)
def reduce_label(self, labels: list["torch.Tensor"]):
@ -86,7 +75,7 @@ class BeitImageProcessorFast(BaseImageProcessorFast):
self,
images: ImageInput,
segmentation_maps: Optional[ImageInput] = None,
**kwargs: Unpack[BeitFastImageProcessorKwargs],
**kwargs: Unpack[BeitImageProcessorKwargs],
) -> BatchFeature:
r"""
segmentation_maps (`ImageInput`, *optional*):
@ -101,7 +90,7 @@ class BeitImageProcessorFast(BaseImageProcessorFast):
do_convert_rgb: bool,
input_data_format: ChannelDimension,
device: Optional[Union[str, "torch.device"]] = None,
**kwargs: Unpack[BeitFastImageProcessorKwargs],
**kwargs: Unpack[BeitImageProcessorKwargs],
) -> BatchFeature:
"""
Preprocess image-like inputs.

View File

@ -36,7 +36,6 @@ class BlipProcessorKwargs(ProcessingKwargs, total=False):
"return_length": False,
"verbose": True,
},
"images_kwargs": {},
}
@ -67,8 +66,6 @@ class BlipProcessor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[BlipProcessorKwargs],
) -> BatchEncoding:
"""

View File

@ -41,7 +41,6 @@ class Blip2ProcessorKwargs(ProcessingKwargs, total=False):
"return_length": False,
"verbose": True,
},
"images_kwargs": {},
}
@ -81,8 +80,6 @@ class Blip2Processor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[Blip2ProcessorKwargs],
) -> BatchEncoding:
"""

View File

@ -35,6 +35,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
@ -122,6 +123,10 @@ def get_resize_output_image_size(
return new_height, new_width
class BridgeTowerImageProcessorKwargs(ImagesKwargs):
size_divisor: Optional[int]
class BridgeTowerImageProcessor(BaseImageProcessor):
r"""
Constructs a BridgeTower image processor.
@ -169,6 +174,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values", "pixel_mask"]
valid_kwargs = BridgeTowerImageProcessorKwargs
def __init__(
self,

View File

@ -23,7 +23,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
BatchFeature,
DefaultFastImageProcessorKwargs,
ImageInput,
SizeDict,
TensorType,
@ -33,6 +32,7 @@ from ...image_processing_utils_fast import (
)
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling
from ...utils import auto_docstring
from .image_processing_bridgetower import BridgeTowerImageProcessorKwargs
def make_pixel_mask(
@ -85,17 +85,6 @@ def get_resize_output_image_size(
return new_height, new_width
class BridgeTowerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
Args:
size_divisor (`int`, *optional*, defaults to 32):
The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize`
is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method.
"""
size_divisor: Optional[int]
@auto_docstring
class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
resample = PILImageResampling.BICUBIC
@ -110,14 +99,14 @@ class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
do_normalize = True
do_pad = True
size_divisor = 32
valid_kwargs = BridgeTowerFastImageProcessorKwargs
valid_kwargs = BridgeTowerImageProcessorKwargs
model_input_names = ["pixel_values", "pixel_mask"]
def __init__(self, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[BridgeTowerImageProcessorKwargs]):
super().__init__(**kwargs)
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]) -> BatchFeature:
def preprocess(self, images: ImageInput, **kwargs: Unpack[BridgeTowerImageProcessorKwargs]) -> BatchFeature:
return super().preprocess(images, **kwargs)
def resize(

View File

@ -16,17 +16,10 @@
Processor class for BridgeTower.
"""
from typing import Optional
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin
class BridgeTowerImagesKwargs(ImagesKwargs):
size_divisor: Optional[int]
from ...processing_utils import ProcessingKwargs, ProcessorMixin
class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: BridgeTowerImagesKwargs
_defaults = {
"text_kwargs": {
"add_special_tokens": True,

View File

@ -92,8 +92,6 @@ class ChameleonProcessor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
audio=None,
videos=None,
**kwargs: Unpack[ChameleonProcessorKwargs],
) -> BatchFeature:
"""

View File

@ -27,18 +27,13 @@ import torch
from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict
from ...processing_utils import Unpack
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import TensorType, auto_docstring
class Cohere2VisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
"""
crop_to_patches (`bool`, *optional*, defaults to `False`):
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the

View File

@ -30,8 +30,10 @@ from transformers.models.aya_vision.modeling_aya_vision import (
from transformers.models.got_ocr2.image_processing_got_ocr2_fast import GotOcr2ImageProcessorFast
from ...cache_utils import Cache
from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...processing_utils import Unpack
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import TransformersKwargs, auto_docstring, logging
from ...utils.generic import check_model_inputs
from .configuration_cohere2_vision import Cohere2VisionConfig
@ -301,6 +303,24 @@ def get_optimal_tiled_canvas(
return best_grid
class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
"""
crop_to_patches (`bool`, *optional*, defaults to `False`):
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
`preprocess` method.
min_patches (`int`, *optional*, defaults to 1):
The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
max_patches (`int`, *optional*, defaults to 12):
The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
"""
crop_to_patches: Optional[bool]
min_patches: Optional[int]
max_patches: Optional[int]
@auto_docstring
class Cohere2VisionImageProcessorFast(GotOcr2ImageProcessorFast):
size = {"height": 512, "width": 512}
@ -308,6 +328,14 @@ class Cohere2VisionImageProcessorFast(GotOcr2ImageProcessorFast):
max_patches = 12
crop_to_patches = True
patch_size = 16
valid_kwargs = Cohere2VisionFastImageProcessorKwargs
def __init__(self, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]):
super().__init__(**kwargs)
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]) -> BatchFeature:
return super().preprocess(images, **kwargs)
__all__ = [

View File

@ -19,16 +19,11 @@ import numpy as np
from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
class Cohere2VisionImagesKwargs(ImagesKwargs, total=False):
max_patches: Optional[int]
class Cohere2VisionProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: Cohere2VisionImagesKwargs
_defaults = {
"text_kwargs": {
"padding_side": "left",

View File

@ -90,8 +90,6 @@ class ColPaliProcessor(PaliGemmaProcessor):
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[ColPaliProcessorKwargs],
) -> BatchFeature:
"""

View File

@ -131,8 +131,6 @@ class ColPaliProcessor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[ColPaliProcessorKwargs],
) -> BatchFeature:
"""

View File

@ -93,8 +93,6 @@ class ColQwen2Processor(ColPaliProcessor):
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[ColQwen2ProcessorKwargs],
) -> BatchFeature:
"""

View File

@ -94,8 +94,6 @@ class ColQwen2Processor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[ColQwen2ProcessorKwargs],
) -> BatchFeature:
"""

View File

@ -53,6 +53,7 @@ from ...image_utils import (
validate_kwargs,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import (
TensorType,
is_scipy_available,
@ -774,6 +775,29 @@ def compute_segments(
return segmentation, segments
class ConditionalDetrImageProcessorKwargs(ImagesKwargs):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
Whether to return segmentation masks.
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
Annotations to transform according to the padding that is applied to the images.
masks_path (`str` or `pathlib.Path`, *optional*):
Path to the directory containing the segmentation masks.
"""
format: Optional[Union[str, AnnotationFormat]]
do_convert_annotations: Optional[bool]
return_segmentation_masks: Optional[bool]
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
masks_path: Optional[Union[str, pathlib.Path]]
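A hedged usage sketch (not part of the diff): with `annotations` and `masks_path` folded into this kwargs class, they travel through `**kwargs` and are passed by keyword; the checkpoint name, the toy COCO-style annotation, and a torch/torchvision backend are assumptions.

```python
import numpy as np
from PIL import Image
from transformers import AutoImageProcessor

image = Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))
annotation = {
    "image_id": 0,
    "annotations": [{"bbox": [10.0, 10.0, 20.0, 20.0], "category_id": 1, "area": 400.0, "iscrowd": 0}],
}

processor = AutoImageProcessor.from_pretrained("microsoft/conditional-detr-resnet-50", use_fast=True)
batch = processor(images=image, annotations=annotation, format="coco_detection", return_tensors="pt")
print(sorted(batch.keys()))  # expect pixel_values, pixel_mask and labels
```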
@requires(backends=("vision",))
class ConditionalDetrImageProcessor(BaseImageProcessor):
r"""
@ -829,6 +853,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values", "pixel_mask"]
valid_kwargs = ConditionalDetrImageProcessorKwargs
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
def __init__(

View File

@ -15,7 +15,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature, get_size_dict
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
SizeDict,
get_image_size_for_max_height_width,
get_max_height_width,
@ -37,6 +36,7 @@ from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring, logging
from ...utils.import_utils import requires
from .image_processing_conditional_detr import (
ConditionalDetrImageProcessorKwargs,
compute_segments,
convert_segmentation_to_rle,
get_size_with_aspect_ratio,
@ -46,24 +46,6 @@ from .image_processing_conditional_detr import (
logger = logging.get_logger(__name__)
class ConditionalDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
Whether to return segmentation masks.
"""
format: Optional[Union[str, AnnotationFormat]]
do_convert_annotations: Optional[bool]
return_segmentation_masks: Optional[bool]
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
@ -278,9 +260,9 @@ class ConditionalDetrImageProcessorFast(BaseImageProcessorFast):
size = {"shortest_edge": 800, "longest_edge": 1333}
default_to_square = False
model_input_names = ["pixel_values", "pixel_mask"]
valid_kwargs = ConditionalDetrFastImageProcessorKwargs
valid_kwargs = ConditionalDetrImageProcessorKwargs
def __init__(self, **kwargs: Unpack[ConditionalDetrFastImageProcessorKwargs]) -> None:
def __init__(self, **kwargs: Unpack[ConditionalDetrImageProcessorKwargs]) -> None:
if "pad_and_return_pixel_mask" in kwargs:
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
@ -542,25 +524,8 @@ class ConditionalDetrImageProcessorFast(BaseImageProcessorFast):
def preprocess(
self,
images: ImageInput,
annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
**kwargs: Unpack[ConditionalDetrFastImageProcessorKwargs],
**kwargs: Unpack[ConditionalDetrImageProcessorKwargs],
) -> BatchFeature:
r"""
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
List of annotations associated with the image or batch of images. If annotation is for object
detection, the annotations should be a dictionary with the following keys:
- "image_id" (`int`): The image id.
- "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
dictionary. An image can have no annotations, in which case the list should be empty.
If annotation is for segmentation, the annotations should be a dictionary with the following keys:
- "image_id" (`int`): The image id.
- "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
An image can have no segments, in which case the list should be empty.
- "file_name" (`str`): The file name of the image.
masks_path (`str` or `pathlib.Path`, *optional*):
Path to the directory containing the segmentation masks.
"""
if "pad_and_return_pixel_mask" in kwargs:
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
logger.warning_once(
@ -575,7 +540,7 @@ class ConditionalDetrImageProcessorFast(BaseImageProcessorFast):
)
kwargs["size"] = kwargs.pop("max_size")
return super().preprocess(images, annotations, masks_path, **kwargs)
return super().preprocess(images, **kwargs)
def _preprocess(
self,

View File

@ -38,6 +38,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
from ...utils.import_utils import requires
@ -49,6 +50,16 @@ if is_vision_available():
logger = logging.get_logger(__name__)
class ConvNextImageProcessorKwargs(ImagesKwargs):
"""
crop_pct (`float`, *optional*):
Percentage of the image to crop. Only has an effect if size < 384. Can be
overridden by `crop_pct` in the`preprocess` method.
"""
crop_pct: Optional[float]
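A hedged sketch (not from the diff): the slow and fast ConvNeXT processors now share the kwargs class above, so the same `crop_pct` keyword is accepted by both; the checkpoint name and a torch/torchvision backend are assumptions.

```python
import numpy as np
from PIL import Image
from transformers import AutoImageProcessor

image = Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))
for use_fast in (False, True):
    processor = AutoImageProcessor.from_pretrained("facebook/convnext-tiny-224", use_fast=use_fast)
    batch = processor(images=image, crop_pct=0.875, return_tensors="pt")
    print(type(processor).__name__, tuple(batch["pixel_values"].shape))
```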
@requires(backends=("vision",))
class ConvNextImageProcessor(BaseImageProcessor):
r"""
@ -87,6 +98,7 @@ class ConvNextImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values"]
valid_kwargs = ConvNextImageProcessorKwargs
def __init__(
self,

View File

@ -22,7 +22,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
@ -39,16 +38,7 @@ from ...utils import (
TensorType,
auto_docstring,
)
class ConvNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
crop_pct (`float`, *optional*):
Percentage of the image to crop. Only has an effect if size < 384. Can be
overridden by `crop_pct` in the`preprocess` method.
"""
crop_pct: Optional[float]
from .image_processing_convnext import ConvNextImageProcessorKwargs
@auto_docstring
@ -62,13 +52,13 @@ class ConvNextImageProcessorFast(BaseImageProcessorFast):
do_rescale = True
do_normalize = True
crop_pct = 224 / 256
valid_kwargs = ConvNextFastImageProcessorKwargs
valid_kwargs = ConvNextImageProcessorKwargs
def __init__(self, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[ConvNextImageProcessorKwargs]):
super().__init__(**kwargs)
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]) -> BatchFeature:
def preprocess(self, images: ImageInput, **kwargs: Unpack[ConvNextImageProcessorKwargs]) -> BatchFeature:
return super().preprocess(images, **kwargs)
def resize(

View File

@ -246,9 +246,7 @@ class CsmProcessor(ProcessorMixin):
text_kwargs = output_kwargs["text_kwargs"]
audio_kwargs = output_kwargs["audio_kwargs"]
common_kwargs = output_kwargs["common_kwargs"]
return_tensors = common_kwargs.pop("return_tensors", None)
return_tensors = text_kwargs.get("return_tensors", None)
if return_tensors != "pt":
raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")
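A hedged sketch (not part of the diff) of why reading `return_tensors` from `text_kwargs` is equivalent to the old pop from `common_kwargs`: the processors' `_merge_kwargs` helper propagates common kwargs into every modality bucket, so the merged dict looks roughly like the literal below (values are illustrative only).

```python
output_kwargs = {
    "text_kwargs": {"padding": True, "return_tensors": "pt"},
    "audio_kwargs": {"sampling_rate": 24000, "return_tensors": "pt"},
    "common_kwargs": {"return_tensors": "pt"},
}

return_tensors = output_kwargs["text_kwargs"].get("return_tensors", None)
assert return_tensors == "pt"
```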

View File

@ -38,6 +38,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
@ -48,6 +49,16 @@ if is_vision_available():
logger = logging.get_logger(__name__)
class DeepseekVLImageProcessorKwargs(ImagesKwargs):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
falls below this value after resizing.
"""
min_size: int
class DeepseekVLImageProcessor(BaseImageProcessor):
r"""
Constructs a DEEPSEEK_VL image processor.
@ -90,6 +101,8 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
model_input_names = ["pixel_values"]
valid_kwargs = DeepseekVLImageProcessorKwargs
def __init__(
self,
do_resize: bool = True,

View File

@ -24,25 +24,11 @@ import torch
import torch.nn.functional as F
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring
class DeepseekVLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
falls below this value after resizing.
"""
min_size: int
from .image_processing_deepseek_vl import DeepseekVLImageProcessorKwargs
@auto_docstring
@ -56,9 +42,9 @@ class DeepseekVLImageProcessorFast(BaseImageProcessorFast):
do_rescale = True
do_normalize = True
do_pad = True
valid_kwargs = DeepseekVLFastImageProcessorKwargs
valid_kwargs = DeepseekVLImageProcessorKwargs
def __init__(self, **kwargs: Unpack[DeepseekVLFastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[DeepseekVLImageProcessorKwargs]):
super().__init__(**kwargs)
if kwargs.get("image_mean") is None:
background_color = (127, 127, 127)

View File

@ -39,6 +39,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
@ -49,6 +50,32 @@ if is_vision_available():
logger = logging.get_logger(__name__)
class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
falls below this value after resizing.
high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
method.
high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
overridden by the `high_res_resample` parameter in the `preprocess` method.
high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
"""
min_size: int
high_res_size: dict
high_res_resample: "PILImageResampling"
high_res_image_mean: list[float]
high_res_image_std: list[float]
class DeepseekVLHybridImageProcessor(BaseImageProcessor):
r"""
Constructs a DEEPSEEK_VL_HYBRID image processor.
@ -102,6 +129,7 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values", "high_res_pixel_values"]
valid_kwargs = DeepseekVLHybridImageProcessorKwargs
def __init__(
self,

View File

@ -26,7 +26,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
BatchFeature,
DefaultFastImageProcessorKwargs,
get_size_dict,
group_images_by_shape,
reorder_images,
@ -41,32 +40,7 @@ from ...image_utils import (
)
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring
class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
falls below this value after resizing.
high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
method.
high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
overridden by the `high_res_resample` parameter in the `preprocess` method.
high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
"""
min_size: int
high_res_size: dict
high_res_resample: "PILImageResampling"
high_res_image_mean: list[float]
high_res_image_std: list[float]
from .image_processing_deepseek_vl_hybrid import DeepseekVLHybridImageProcessorKwargs
@auto_docstring
@ -80,14 +54,14 @@ class DeepseekVLHybridImageProcessorFast(BaseImageProcessorFast):
do_rescale = True
do_normalize = True
do_pad = True
valid_kwargs = DeepseekVLHybridFastImageProcessorKwargs
valid_kwargs = DeepseekVLHybridImageProcessorKwargs
high_res_image_mean = OPENAI_CLIP_MEAN
high_res_image_std = OPENAI_CLIP_STD
high_res_size = {"height": 1024, "width": 1024}
high_res_resample = PILImageResampling.BICUBIC
model_input_names = ["pixel_values", "high_res_pixel_values"]
def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[DeepseekVLHybridImageProcessorKwargs]):
if kwargs.get("image_mean") is None:
background_color = (127, 127, 127)
else:

View File

@ -22,7 +22,6 @@ from ...cache_utils import Cache
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
BatchFeature,
DefaultFastImageProcessorKwargs,
get_size_dict,
group_images_by_shape,
reorder_images,
@ -43,7 +42,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import Unpack
from ...processing_utils import ImagesKwargs, Unpack
from ...tokenization_utils_base import (
PreTokenizedInput,
TextInput,
@ -430,6 +429,32 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneratio
return model_inputs
class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
falls below this value after resizing.
high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
method.
high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
overridden by the `high_res_resample` parameter in the `preprocess` method.
high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
"""
min_size: int
high_res_size: dict
high_res_resample: "PILImageResampling"
high_res_image_mean: list[float]
high_res_image_std: list[float]
class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
r"""
Constructs a DEEPSEEK_VL_HYBRID image processor.
@ -483,6 +508,7 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
"""
model_input_names = ["pixel_values", "high_res_pixel_values"]
valid_kwargs = DeepseekVLHybridImageProcessorKwargs
def __init__(
self,
@ -727,32 +753,6 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
return BatchFeature(data=data, tensor_type=return_tensors)
class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
falls below this value after resizing.
high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
method.
high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
overridden by the `high_res_resample` parameter in the `preprocess` method.
high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
"""
min_size: int
high_res_size: dict
high_res_resample: "PILImageResampling"
high_res_image_mean: list[float]
high_res_image_std: list[float]
class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast):
high_res_image_mean = OPENAI_CLIP_MEAN
high_res_image_std = OPENAI_CLIP_STD
@ -760,7 +760,7 @@ class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast):
high_res_resample = PILImageResampling.BICUBIC
model_input_names = ["pixel_values", "high_res_pixel_values"]
def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[DeepseekVLHybridImageProcessorKwargs]):
if kwargs.get("image_mean") is None:
background_color = (127, 127, 127)
else:

View File

@ -53,6 +53,7 @@ from ...image_utils import (
validate_kwargs,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import (
TensorType,
is_scipy_available,
@ -79,6 +80,30 @@ if is_scipy_available():
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
class DeformableDetrImageProcessorKwargs(ImagesKwargs):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
Whether to return segmentation masks.
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
Annotations to transform according to the padding that is applied to the images.
masks_path (`str` or `pathlib.Path`, *optional*):
Path to the directory containing the segmentation masks.
"""
format: Optional[Union[str, AnnotationFormat]]
do_convert_annotations: Optional[bool]
return_segmentation_masks: Optional[bool]
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
masks_path: Optional[Union[str, pathlib.Path]]
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
@ -827,6 +852,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values", "pixel_mask"]
valid_kwargs = DeformableDetrImageProcessorKwargs
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
def __init__(

View File

@ -14,7 +14,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature, get_size_dict
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
SizeDict,
get_image_size_for_max_height_width,
get_max_height_width,
@ -35,29 +34,11 @@ from ...image_utils import (
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring, logging
from ...utils.import_utils import requires
from .image_processing_deformable_detr import get_size_with_aspect_ratio
from .image_processing_deformable_detr import DeformableDetrImageProcessorKwargs, get_size_with_aspect_ratio
logger = logging.get_logger(__name__)
class DeformableDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
Whether to return segmentation masks.
"""
format: Optional[Union[str, AnnotationFormat]]
do_convert_annotations: Optional[bool]
return_segmentation_masks: Optional[bool]
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
@ -272,9 +253,9 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
size = {"shortest_edge": 800, "longest_edge": 1333}
default_to_square = False
model_input_names = ["pixel_values", "pixel_mask"]
valid_kwargs = DeformableDetrFastImageProcessorKwargs
valid_kwargs = DeformableDetrImageProcessorKwargs
def __init__(self, **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs]) -> None:
def __init__(self, **kwargs: Unpack[DeformableDetrImageProcessorKwargs]) -> None:
if "pad_and_return_pixel_mask" in kwargs:
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
@ -536,25 +517,8 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
def preprocess(
self,
images: ImageInput,
annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
**kwargs: Unpack[DeformableDetrFastImageProcessorKwargs],
**kwargs: Unpack[DeformableDetrImageProcessorKwargs],
) -> BatchFeature:
r"""
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
List of annotations associated with the image or batch of images. If annotation is for object
detection, the annotations should be a dictionary with the following keys:
- "image_id" (`int`): The image id.
- "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
dictionary. An image can have no annotations, in which case the list should be empty.
If annotation is for segmentation, the annotations should be a dictionary with the following keys:
- "image_id" (`int`): The image id.
- "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
An image can have no segments, in which case the list should be empty.
- "file_name" (`str`): The file name of the image.
masks_path (`str` or `pathlib.Path`, *optional*):
Path to the directory containing the segmentation masks.
"""
if "pad_and_return_pixel_mask" in kwargs:
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
logger.warning_once(
@ -569,7 +533,7 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
)
kwargs["size"] = kwargs.pop("max_size")
return super().preprocess(images, annotations, masks_path, **kwargs)
return super().preprocess(images, **kwargs)
def _preprocess(
self,

View File

@ -52,6 +52,7 @@ from ...image_utils import (
validate_kwargs,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import (
TensorType,
is_scipy_available,
@ -82,6 +83,29 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
class DetrImageProcessorKwargs(ImagesKwargs):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
Whether to return segmentation masks.
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
Annotations to transform according to the padding that is applied to the images.
masks_path (`str` or `pathlib.Path`, *optional*):
Path to the directory containing the segmentation masks.
"""
format: Optional[Union[str, AnnotationFormat]]
do_convert_annotations: Optional[bool]
return_segmentation_masks: Optional[bool]
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
masks_path: Optional[Union[str, pathlib.Path]]
# From the original repo: https://github.com/facebookresearch/detr/blob/3af9fa878e73b6894ce3596450a8d9b89d918ca9/datasets/transforms.py#L76
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]:
"""
@ -811,6 +835,7 @@ class DetrImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values", "pixel_mask"]
valid_kwargs = DetrImageProcessorKwargs
def __init__(
self,

View File

@ -28,7 +28,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature, get_size_dict
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
SizeDict,
get_image_size_for_max_height_width,
get_max_height_width,
@ -54,6 +53,7 @@ from ...utils import (
)
from ...utils.import_utils import requires
from .image_processing_detr import (
DetrImageProcessorKwargs,
compute_segments,
convert_segmentation_to_rle,
get_size_with_aspect_ratio,
@ -263,23 +263,6 @@ def prepare_coco_panoptic_annotation(
return new_target
class DetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
Whether to return segmentation masks.
"""
format: Optional[Union[str, AnnotationFormat]]
do_convert_annotations: Optional[bool]
return_segmentation_masks: Optional[bool]
@auto_docstring
@requires(backends=("torchvision", "torch"))
class DetrImageProcessorFast(BaseImageProcessorFast):
@ -294,9 +277,9 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
size = {"shortest_edge": 800, "longest_edge": 1333}
default_to_square = False
model_input_names = ["pixel_values", "pixel_mask"]
valid_kwargs = DetrFastImageProcessorKwargs
valid_kwargs = DetrImageProcessorKwargs
def __init__(self, **kwargs: Unpack[DetrFastImageProcessorKwargs]) -> None:
def __init__(self, **kwargs: Unpack[DetrImageProcessorKwargs]) -> None:
if "pad_and_return_pixel_mask" in kwargs:
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
@ -558,25 +541,8 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
def preprocess(
self,
images: ImageInput,
annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
**kwargs: Unpack[DetrFastImageProcessorKwargs],
**kwargs: Unpack[DetrImageProcessorKwargs],
) -> BatchFeature:
r"""
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
List of annotations associated with the image or batch of images. If annotation is for object
detection, the annotations should be a dictionary with the following keys:
- "image_id" (`int`): The image id.
- "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
dictionary. An image can have no annotations, in which case the list should be empty.
If annotation is for segmentation, the annotations should be a dictionary with the following keys:
- "image_id" (`int`): The image id.
- "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
An image can have no segments, in which case the list should be empty.
- "file_name" (`str`): The file name of the image.
masks_path (`str` or `pathlib.Path`, *optional*):
Path to the directory containing the segmentation masks.
"""
if "pad_and_return_pixel_mask" in kwargs:
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
logger.warning_once(
@ -591,7 +557,7 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
)
kwargs["size"] = kwargs.pop("max_size")
return super().preprocess(images, annotations, masks_path, **kwargs)
return super().preprocess(images, **kwargs)
def _preprocess(
self,

View File

@ -111,9 +111,7 @@ class DiaProcessor(ProcessorMixin):
text_kwargs = output_kwargs["text_kwargs"]
audio_kwargs = output_kwargs["audio_kwargs"]
common_kwargs = output_kwargs["common_kwargs"]
return_tensors = common_kwargs.pop("return_tensors", None)
return_tensors = text_kwargs.get("return_tensors", None)
if return_tensors != "pt":
raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")

View File

@ -40,6 +40,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, filter_out_non_signature_kwargs, logging
from ...utils.import_utils import is_vision_available, requires
@ -51,6 +52,18 @@ if is_vision_available():
import PIL
class DonutImageProcessorKwargs(ImagesKwargs):
"""
do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`):
Whether to resize the image using thumbnail method.
do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`):
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
"""
do_thumbnail: Optional[bool]
do_align_long_axis: Optional[bool]
@requires(backends=("vision",))
class DonutImageProcessor(BaseImageProcessor):
r"""
@ -90,6 +103,7 @@ class DonutImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values"]
valid_kwargs = DonutImageProcessorKwargs
def __init__(
self,

View File

@ -19,7 +19,7 @@ from typing import Optional, Union
import torch
from torchvision.transforms.v2 import functional as F
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature
from ...image_transforms import group_images_by_shape, reorder_images
from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict
from ...processing_utils import Unpack
@ -28,24 +28,12 @@ from ...utils import (
auto_docstring,
logging,
)
from .image_processing_donut import DonutImageProcessorKwargs
logger = logging.get_logger(__name__)
class DonutFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
Args:
do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`):
Whether to resize the image using thumbnail method.
do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`):
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
"""
do_thumbnail: Optional[bool]
do_align_long_axis: Optional[bool]
@auto_docstring
class DonutImageProcessorFast(BaseImageProcessorFast):
resample = PILImageResampling.BILINEAR
@ -58,9 +46,9 @@ class DonutImageProcessorFast(BaseImageProcessorFast):
do_thumbnail = True
do_align_long_axis = False
do_pad = True
valid_kwargs = DonutFastImageProcessorKwargs
valid_kwargs = DonutImageProcessorKwargs
def __init__(self, **kwargs: Unpack[DonutFastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[DonutImageProcessorKwargs]):
size = kwargs.pop("size", None)
if isinstance(size, (tuple, list)):
size = size[::-1]
@ -68,7 +56,7 @@ class DonutImageProcessorFast(BaseImageProcessorFast):
super().__init__(**kwargs)
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[DonutFastImageProcessorKwargs]) -> BatchFeature:
def preprocess(self, images: ImageInput, **kwargs: Unpack[DonutImageProcessorKwargs]) -> BatchFeature:
if "size" in kwargs:
size = kwargs.pop("size")
if isinstance(size, (tuple, list)):

View File

@ -74,8 +74,6 @@ class DonutProcessor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[DonutProcessorKwargs],
):
"""

View File

@ -44,6 +44,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import (
TensorType,
filter_out_non_signature_kwargs,
@ -63,6 +64,26 @@ if is_vision_available():
logger = logging.get_logger(__name__)
class DPTImageProcessorKwargs(ImagesKwargs):
"""
ensure_multiple_of (`int`, *optional*, defaults to 1):
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
by `ensure_multiple_of` in `preprocess`.
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
be overridden by `keep_aspect_ratio` in `preprocess`.
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
is used for background, and background itself is not included in all classes of a dataset (e.g.
ADE20k). The background label will be replaced by 255.
"""
ensure_multiple_of: Optional[int]
size_divisor: Optional[int]
keep_aspect_ratio: Optional[bool]
do_reduce_labels: Optional[bool]
def get_resize_output_image_size(
input_image: np.ndarray,
output_size: Union[int, Iterable[int]],
@ -151,6 +172,7 @@ class DPTImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values"]
valid_kwargs = DPTImageProcessorKwargs
def __init__(
self,

View File

@ -28,7 +28,7 @@ import torch
from torchvision.transforms.v2 import functional as F
from ...image_processing_base import BatchFeature
from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs
from ...image_processing_utils_fast import BaseImageProcessorFast
from ...image_transforms import group_images_by_shape, reorder_images
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
@ -41,35 +41,13 @@ from ...image_utils import (
)
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring, requires_backends
from .image_processing_dpt import DPTImageProcessorKwargs
if TYPE_CHECKING:
from ...modeling_outputs import DepthEstimatorOutput
class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
ensure_multiple_of (`int`, *optional*, defaults to 1):
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
by `ensure_multiple_of` in `preprocess`.
size_divisor (`int`, *optional*):
If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
DINOv2 paper, which uses the model in combination with DPT.
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
be overridden by `keep_aspect_ratio` in `preprocess`.
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
is used for background, and background itself is not included in all classes of a dataset (e.g.
ADE20k). The background label will be replaced by 255.
"""
ensure_multiple_of: Optional[int]
size_divisor: Optional[int]
keep_aspect_ratio: Optional[bool]
do_reduce_labels: Optional[bool]
def get_resize_output_image_size(
input_image: "torch.Tensor",
output_size: Union[int, Iterable[int]],
@ -123,13 +101,13 @@ class DPTImageProcessorFast(BaseImageProcessorFast):
do_normalize = True
do_reduce_labels = None
valid_kwargs = DPTFastImageProcessorKwargs
valid_kwargs = DPTImageProcessorKwargs
do_pad = False
rescale_factor = 1 / 255
ensure_multiple_of = 1
keep_aspect_ratio = False
def __init__(self, **kwargs: Unpack[DPTFastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[DPTImageProcessorKwargs]):
super().__init__(**kwargs)
def reduce_label(self, labels: list["torch.Tensor"]):
@ -147,7 +125,7 @@ class DPTImageProcessorFast(BaseImageProcessorFast):
self,
images: ImageInput,
segmentation_maps: Optional[ImageInput] = None,
**kwargs: Unpack[DPTFastImageProcessorKwargs],
**kwargs: Unpack[DPTImageProcessorKwargs],
) -> BatchFeature:
r"""
segmentation_maps (`ImageInput`, *optional*):
@ -162,7 +140,7 @@ class DPTImageProcessorFast(BaseImageProcessorFast):
do_convert_rgb: bool,
input_data_format: ChannelDimension,
device: Optional[Union[str, "torch.device"]] = None,
**kwargs: Unpack[DPTFastImageProcessorKwargs],
**kwargs: Unpack[DPTImageProcessorKwargs],
) -> BatchFeature:
"""
Preprocess image-like inputs.
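
`do_reduce_labels` (documented in the slow DPT kwargs above and kept as a class default here) shifts segmentation class ids down by one and maps the former background id 0 to the ignore value 255. A NumPy sketch of that behaviour:

import numpy as np


def reduce_labels(label_map: np.ndarray) -> np.ndarray:
    # 0 (background) -> 255, every other class id shifted down by one.
    out = label_map.astype(np.int64)
    out[out == 0] = 255
    out = out - 1
    out[out == 254] = 255
    return out


print(reduce_labels(np.array([[0, 1, 2], [3, 0, 1]])))
# [[255   0   1]
#  [  2 255   0]]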

View File

@ -21,7 +21,7 @@ from typing import TYPE_CHECKING, Optional, Union
import torch
from ...image_processing_base import BatchFeature
from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs
from ...image_processing_utils_fast import BaseImageProcessorFast
from ...image_transforms import group_images_by_shape, reorder_images
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
@ -35,6 +35,7 @@ from ...utils import (
requires_backends,
)
from ..beit.image_processing_beit_fast import BeitImageProcessorFast
from .image_processing_dpt import DPTImageProcessorKwargs
if TYPE_CHECKING:
@ -82,29 +83,6 @@ def get_resize_output_image_size(
return SizeDict(height=new_height, width=new_width)
class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
ensure_multiple_of (`int`, *optional*, defaults to 1):
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
by `ensure_multiple_of` in `preprocess`.
size_divisor (`int`, *optional*):
If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
DINOv2 paper, which uses the model in combination with DPT.
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
be overridden by `keep_aspect_ratio` in `preprocess`.
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
is used for background, and background itself is not included in all classes of a dataset (e.g.
ADE20k). The background label will be replaced by 255.
"""
ensure_multiple_of: Optional[int]
size_divisor: Optional[int]
keep_aspect_ratio: Optional[bool]
do_reduce_labels: Optional[bool]
@auto_docstring
class DPTImageProcessorFast(BeitImageProcessorFast):
resample = PILImageResampling.BICUBIC
@ -123,7 +101,7 @@ class DPTImageProcessorFast(BeitImageProcessorFast):
do_center_crop = None
do_reduce_labels = None
valid_kwargs = DPTFastImageProcessorKwargs
valid_kwargs = DPTImageProcessorKwargs
def resize(
self,

View File

@ -34,6 +34,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, logging, requires_backends
@ -49,6 +50,15 @@ if is_vision_available():
logger = logging.get_logger(__name__)
class EfficientLoFTRImageProcessorKwargs(ImagesKwargs):
r"""
do_grayscale (`bool`, *optional*, defaults to `True`):
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
"""
do_grayscale: Optional[bool] = True
# Copied from transformers.models.superpoint.image_processing_superpoint.is_grayscale
def is_grayscale(
image: np.ndarray,
@ -155,6 +165,7 @@ class EfficientLoFTRImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values"]
valid_kwargs = EfficientLoFTRImageProcessorKwargs
def __init__(
self,

View File

@ -22,7 +22,6 @@ from PIL import Image, ImageDraw
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
@ -40,6 +39,7 @@ from ...utils import (
TensorType,
auto_docstring,
)
from .image_processing_efficientloftr import EfficientLoFTRImageProcessorKwargs
if TYPE_CHECKING:
@ -108,15 +108,6 @@ def convert_to_grayscale(
return F.rgb_to_grayscale(image, num_output_channels=3)
class EfficientLoFTRFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
r"""
do_grayscale (`bool`, *optional*, defaults to `True`):
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
"""
do_grayscale: Optional[bool] = True
@auto_docstring
class EfficientLoFTRImageProcessorFast(BaseImageProcessorFast):
resample = PILImageResampling.BILINEAR
@ -126,13 +117,13 @@ class EfficientLoFTRImageProcessorFast(BaseImageProcessorFast):
do_rescale = True
rescale_factor = 1 / 255
do_normalize = None
valid_kwargs = EfficientLoFTRFastImageProcessorKwargs
valid_kwargs = EfficientLoFTRImageProcessorKwargs
def __init__(self, **kwargs: Unpack[EfficientLoFTRFastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[EfficientLoFTRImageProcessorKwargs]):
super().__init__(**kwargs)
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientLoFTRFastImageProcessorKwargs]) -> BatchFeature:
def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientLoFTRImageProcessorKwargs]) -> BatchFeature:
return super().preprocess(images, **kwargs)
def _prepare_images_structure(
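
The `do_grayscale` flag above collapses RGB to luminance while keeping a 3-channel layout (the fast path relies on torchvision's `rgb_to_grayscale(..., num_output_channels=3)`). A NumPy sketch of the same idea using the standard ITU-R BT.601 weights, purely for illustration:

import numpy as np


def to_grayscale_3ch(image: np.ndarray) -> np.ndarray:
    # image: (H, W, 3) array in RGB order; the luminance is repeated over three channels.
    weights = np.array([0.299, 0.587, 0.114], dtype=np.float32)
    gray = image.astype(np.float32) @ weights
    return np.repeat(gray[..., None], 3, axis=-1)


print(to_grayscale_3ch(np.full((2, 2, 3), 255, dtype=np.uint8))[0, 0])  # ~[255. 255. 255.]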

View File

@ -33,6 +33,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
@ -43,6 +44,18 @@ if is_vision_available():
logger = logging.get_logger(__name__)
class EfficientNetImageProcessorKwargs(ImagesKwargs):
"""
rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`):
Whether to rescale the image between [-max_range/2, scale_range/2] instead of [0, scale_range].
include_top (`bool`, *optional*, defaults to `self.include_top`):
Normalize the image again with the standard deviation only for image classification if set to True.
"""
rescale_offset: bool
include_top: bool
class EfficientNetImageProcessor(BaseImageProcessor):
r"""
Constructs an EfficientNet image processor.
@ -83,6 +96,7 @@ class EfficientNetImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values"]
valid_kwargs = EfficientNetImageProcessorKwargs
def __init__(
self,

View File

@ -20,7 +20,7 @@ from typing import Optional, Union
import torch
from torchvision.transforms.v2 import functional as F
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature
from ...image_transforms import group_images_by_shape, reorder_images
from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict
from ...processing_utils import Unpack
@ -28,19 +28,7 @@ from ...utils import (
TensorType,
auto_docstring,
)
class EfficientNetFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
Args:
rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`):
Whether to rescale the image between [-max_range/2, scale_range/2] instead of [0, scale_range].
include_top (`bool`, *optional*, defaults to `self.include_top`):
Normalize the image again with the standard deviation only for image classification if set to True.
"""
rescale_offset: bool
include_top: bool
from .image_processing_efficientnet import EfficientNetImageProcessorKwargs
@auto_docstring
@ -57,9 +45,9 @@ class EfficientNetImageProcessorFast(BaseImageProcessorFast):
rescale_offset = False
do_normalize = True
include_top = True
valid_kwargs = EfficientNetFastImageProcessorKwargs
valid_kwargs = EfficientNetImageProcessorKwargs
def __init__(self, **kwargs: Unpack[EfficientNetFastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[EfficientNetImageProcessorKwargs]):
super().__init__(**kwargs)
def rescale(
@ -195,7 +183,7 @@ class EfficientNetImageProcessorFast(BaseImageProcessorFast):
return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientNetFastImageProcessorKwargs]) -> BatchFeature:
def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientNetImageProcessorKwargs]) -> BatchFeature:
return super().preprocess(images, **kwargs)

View File

@ -37,6 +37,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, is_vision_available, logging
@ -46,6 +47,11 @@ if is_vision_available():
logger = logging.get_logger(__name__)
class Emu3ImageProcessorKwargs(ImagesKwargs):
ratio: Optional[str]
image_area: Optional[int]
def smart_resize(
height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280
):
@ -108,6 +114,7 @@ class Emu3ImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values", "image_sizes"]
valid_kwargs = Emu3ImageProcessorKwargs
def __init__(
self,
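
`smart_resize` above keeps the total pixel count inside a `[min_pixels, max_pixels]` budget while snapping both sides to multiples of `factor`. A hedged sketch of that idea, using the default values from the signature shown above but not the exact library implementation:

import math


def smart_resize_sketch(height, width, factor=28, min_pixels=56 * 56, max_pixels=14 * 14 * 4 * 1280):
    # Snap both sides to multiples of `factor`, then rescale if the pixel budget is violated.
    h = round(height / factor) * factor
    w = round(width / factor) * factor
    if h * w > max_pixels:
        scale = math.sqrt(height * width / max_pixels)
        h = math.floor(height / scale / factor) * factor
        w = math.floor(width / scale / factor) * factor
    elif h * w < min_pixels:
        scale = math.sqrt(min_pixels / (height * width))
        h = math.ceil(height * scale / factor) * factor
        w = math.ceil(width * scale / factor) * factor
    return h, w


print(smart_resize_sketch(1080, 1920))  # (728, 1316) -- within the default pixel budget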

View File

@ -20,7 +20,7 @@ import numpy as np
from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import is_vision_available
@ -33,14 +33,8 @@ class Emu3TextKwargs(TextKwargs, total=False):
return_for_image_generation: bool
class Emu3ImagesKwargs(ImagesKwargs, total=False):
ratio: str
image_area: int
class Emu3ProcessorKwargs(ProcessingKwargs, total=False):
text_kwargs: Emu3TextKwargs
images_kwargs: Emu3ImagesKwargs
_defaults = {
"text_kwargs": {
"return_for_image_generation": False,
@ -95,8 +89,6 @@ class Emu3Processor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
audio=None,
videos=None,
**kwargs: Unpack[Emu3ProcessorKwargs],
) -> BatchFeature:
"""

View File

@ -36,6 +36,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
@ -53,6 +54,21 @@ if is_torch_available():
import torch.nn.functional as F
class EomtImageProcessorKwargs(ImagesKwargs):
"""
do_split_image (`bool`, *optional*, defaults to `False`):
Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the
input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches.
Otherwise, the input images will be padded to the target size.
ignore_index (`int`, *optional*):
Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
denoted with 0 (background) will be replaced with `ignore_index`.
"""
do_split_image: bool
ignore_index: Optional[int] = None
# Adapted from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks
def convert_segmentation_map_to_binary_masks(
segmentation_map: np.ndarray,

View File

@ -24,7 +24,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
@ -43,6 +42,7 @@ from ...utils import (
filter_out_non_signature_kwargs,
)
from .image_processing_eomt import (
EomtImageProcessorKwargs,
compute_segments,
convert_segmentation_map_to_binary_masks,
get_size_with_aspect_ratio,
@ -50,25 +50,6 @@ from .image_processing_eomt import (
)
class EomtImageProcessorFastKwargs(DefaultFastImageProcessorKwargs):
"""
do_split_image (`bool`, *optional*, defaults to `False`):
Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the
input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches.
Otherwise, the input images will be padded to the target size.
do_pad (`bool`, *optional*, defaults to `False`):
Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
number of patches in the batch. Padding will be applied to the bottom and right with zeros.
ignore_index (`int`, *optional*):
Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
denoted with 0 (background) will be replaced with `ignore_index`.
"""
do_split_image: bool
do_pad: bool
ignore_index: Optional[int] = None
def get_target_size(size_dict: dict[str, int]) -> tuple[int, int]:
"""Returns the height and width from a size dict."""
target_height = size_dict["shortest_edge"]
@ -102,9 +83,9 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
do_split_image = False
do_pad = False
ignore_index = None
valid_kwargs = EomtImageProcessorFastKwargs
valid_kwargs = EomtImageProcessorKwargs
def __init__(self, **kwargs: Unpack[EomtImageProcessorFastKwargs]):
def __init__(self, **kwargs: Unpack[EomtImageProcessorKwargs]):
super().__init__(**kwargs)
def _split_image(self, images: torch.Tensor, size: dict, image_indices: int) -> tuple[list, list]:
@ -153,7 +134,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
images: ImageInput,
segmentation_maps: Optional[list[torch.Tensor]] = None,
instance_id_to_semantic_id: Optional[dict[int, int]] = None,
**kwargs: Unpack[EomtImageProcessorFastKwargs],
**kwargs: Unpack[EomtImageProcessorKwargs],
) -> BatchFeature:
r"""
segmentation_maps (`ImageInput`, *optional*):
@ -171,7 +152,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
do_convert_rgb: bool,
input_data_format: ChannelDimension,
device: Optional[Union[str, "torch.device"]] = None,
**kwargs: Unpack[EomtImageProcessorFastKwargs],
**kwargs: Unpack[EomtImageProcessorKwargs],
) -> BatchFeature:
"""
Preprocess image-like inputs.
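
`do_split_image` above tiles a long image into overlapping windows of the target size instead of padding it. A generic sliding-window sketch of that idea; the exact overlap rule in the library may differ:

import math


def window_starts(full_extent: int, window: int) -> list[int]:
    # Start offsets of equally spaced windows that cover [0, full_extent] with overlap.
    if full_extent <= window:
        return [0]
    num_windows = math.ceil(full_extent / window)
    stride = (full_extent - window) / (num_windows - 1)
    return [round(i * stride) for i in range(num_windows)]


print(window_starts(1000, 640))  # [0, 360]: two overlapping 640-wide patches cover 1000 px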

View File

@ -37,6 +37,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
from ...utils.import_utils import requires
@ -56,6 +57,89 @@ FLAVA_CODEBOOK_STD = [1.0, 1.0, 1.0]
LOGIT_LAPLACE_EPS: float = 0.1
class FlavaImageProcessorKwargs(ImagesKwargs):
"""
return_image_mask (`bool`, *optional*, defaults to `False`):
Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
input_size_patches (`int`, *optional*, defaults to 14):
Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden
by the `input_size_patches` parameter in `preprocess`.
total_mask_patches (`int`, *optional*, defaults to 75):
Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in
`preprocess`.
mask_group_min_patches (`int`, *optional*, defaults to 16):
Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches`
parameter in `preprocess`.
mask_group_max_patches (`int`, *optional*):
Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches`
parameter in `preprocess`.
mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3):
Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter
in `preprocess`.
mask_group_max_aspect_ratio (`float`, *optional*):
Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter
in `preprocess`.
return_codebook_pixels (`bool`, *optional*, defaults to `False`):
Whether to return the codebook pixel values.
codebook_do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input for codebook to a certain `codebook_size`. Can be overridden by the
`codebook_do_resize` parameter in `preprocess`.
codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
`preprocess`.
codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
parameter in `preprocess`.
codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to crop the input for codebook at the center. If the input size is smaller than
`codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
overridden by the `codebook_do_center_crop` parameter in `preprocess`.
codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
Desired output size for codebook input when applying center-cropping. Can be overridden by the
`codebook_crop_size` parameter in `preprocess`.
codebook_do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
overridden by the `codebook_do_rescale` parameter in `preprocess`.
codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
`codebook_rescale_factor` parameter in `preprocess`.
codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
`codebook_do_map_pixels` parameter in `preprocess`.
codebook_do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
be overridden by the `codebook_do_normalize` parameter in `preprocess`.
codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
by the `codebook_image_mean` parameter in `preprocess`.
codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
be overridden by the `codebook_image_std` parameter in `preprocess`.
"""
# Mask related params
return_image_mask: Optional[bool]
input_size_patches: Optional[int]
total_mask_patches: Optional[int]
mask_group_min_patches: Optional[int]
mask_group_max_patches: Optional[int]
mask_group_min_aspect_ratio: Optional[float]
mask_group_max_aspect_ratio: Optional[float]
# Codebook related params
return_codebook_pixels: Optional[bool]
codebook_do_resize: Optional[bool]
codebook_size: Optional[bool]
codebook_resample: Optional[int]
codebook_do_center_crop: Optional[bool]
codebook_crop_size: Optional[int]
codebook_do_rescale: Optional[bool]
codebook_rescale_factor: Optional[Union[int, float]]
codebook_do_map_pixels: Optional[bool]
codebook_do_normalize: Optional[bool]
codebook_image_mean: Optional[Union[float, Iterable[float]]]
codebook_image_std: Optional[Union[float, Iterable[float]]]
# Inspired from https://github.com/microsoft/unilm/blob/master/beit/masking_generator.py
class FlavaMaskingGenerator:
def __init__(
@ -225,6 +309,7 @@ class FlavaImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values"]
valid_kwargs = FlavaImageProcessorKwargs
def __init__(
self,
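
`codebook_do_map_pixels` above applies the logit-Laplace mapping `(1 - 2e)x + e`, with `e = LOGIT_LAPLACE_EPS = 0.1` defined earlier in this file, so codebook pixel values land in `[0.1, 0.9]` instead of `[0, 1]`. In plain Python:

LOGIT_LAPLACE_EPS = 0.1  # same constant as above


def map_pixels(x: float) -> float:
    # (1 - 2e) * x + e squeezes [0, 1] pixel values into [e, 1 - e].
    return (1 - 2 * LOGIT_LAPLACE_EPS) * x + LOGIT_LAPLACE_EPS


print(map_pixels(0.0), map_pixels(1.0))  # 0.1 0.9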

View File

@ -16,7 +16,6 @@
import math
import random
from collections.abc import Iterable
from functools import lru_cache
from typing import Any, Optional, Union
@ -26,7 +25,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
BatchFeature,
DefaultFastImageProcessorKwargs,
get_size_dict,
)
from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images
@ -42,6 +40,7 @@ from .image_processing_flava import (
FLAVA_IMAGE_MEAN,
FLAVA_IMAGE_STD,
LOGIT_LAPLACE_EPS,
FlavaImageProcessorKwargs,
)
@ -121,90 +120,6 @@ class FlavaMaskingGenerator:
return mask
class FlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
Args:
return_image_mask (`bool`, *optional*, defaults to `False`):
Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
input_size_patches (`int`, *optional*, defaults to 14):
Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden
by the `input_size_patches` parameter in `preprocess`.
total_mask_patches (`int`, *optional*, defaults to 75):
Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in
`preprocess`.
mask_group_min_patches (`int`, *optional*, defaults to 16):
Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches`
parameter in `preprocess`.
mask_group_max_patches (`int`, *optional*):
Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches`
parameter in `preprocess`.
mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3):
Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter
in `preprocess`.
mask_group_max_aspect_ratio (`float`, *optional*):
Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter
in `preprocess`.
return_codebook_pixels (`bool`, *optional*, defaults to `False`):
Whether to return the codebook pixel values.
codebook_do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input for codebook to a certain `codebook_size`. Can be overridden by the
`codebook_do_resize` parameter in `preprocess`.
codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
`preprocess`.
codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
parameter in `preprocess`.
codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to crop the input for codebook at the center. If the input size is smaller than
`codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
overridden by the `codebook_do_center_crop` parameter in `preprocess`.
codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
Desired output size for codebook input when applying center-cropping. Can be overridden by the
`codebook_crop_size` parameter in `preprocess`.
codebook_do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
overridden by the `codebook_do_rescale` parameter in `preprocess`.
codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
`codebook_rescale_factor` parameter in `preprocess`.
codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
`codebook_do_map_pixels` parameter in `preprocess`.
codebook_do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
be overridden by the `codebook_do_normalize` parameter in `preprocess`.
codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
by the `codebook_image_mean` parameter in `preprocess`.
codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
be overridden by the `codebook_image_std` parameter in `preprocess`.
"""
# Mask related params
return_image_mask: Optional[bool]
input_size_patches: Optional[int]
total_mask_patches: Optional[int]
mask_group_min_patches: Optional[int]
mask_group_max_patches: Optional[int]
mask_group_min_aspect_ratio: Optional[float]
mask_group_max_aspect_ratio: Optional[float]
# Codebook related params
return_codebook_pixels: Optional[bool]
codebook_do_resize: Optional[bool]
codebook_size: Optional[bool]
codebook_resample: Optional[int]
codebook_do_center_crop: Optional[bool]
codebook_crop_size: Optional[int]
codebook_do_rescale: Optional[bool]
codebook_rescale_factor: Optional[Union[int, float]]
codebook_do_map_pixels: Optional[bool]
codebook_do_normalize: Optional[bool]
codebook_image_mean: Optional[Union[float, Iterable[float]]]
codebook_image_std: Optional[Union[float, Iterable[float]]]
@auto_docstring
class FlavaImageProcessorFast(BaseImageProcessorFast):
resample = PILImageResampling.BICUBIC
@ -239,13 +154,13 @@ class FlavaImageProcessorFast(BaseImageProcessorFast):
codebook_do_normalize = True
codebook_image_mean = FLAVA_CODEBOOK_MEAN
codebook_image_std = FLAVA_CODEBOOK_STD
valid_kwargs = FlavaFastImageProcessorKwargs
valid_kwargs = FlavaImageProcessorKwargs
def __init__(self, **kwargs: Unpack[FlavaFastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[FlavaImageProcessorKwargs]):
super().__init__(**kwargs)
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature:
def preprocess(self, images: ImageInput, **kwargs: Unpack[FlavaImageProcessorKwargs]) -> BatchFeature:
return super().preprocess(images, **kwargs)
@classmethod

View File

@ -17,39 +17,8 @@ Image/Text processor class for FLAVA
"""
import warnings
from collections.abc import Iterable
from typing import Optional, Union
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin
class FlavaImagesKwargs(ImagesKwargs):
# Mask related params
return_image_mask: Optional[bool]
input_size_patches: Optional[int]
total_mask_patches: Optional[int]
mask_group_min_patches: Optional[int]
mask_group_max_patches: Optional[int]
mask_group_min_aspect_ratio: Optional[float]
mask_group_max_aspect_ratio: Optional[float]
# Codebook related params
return_codebook_pixels: Optional[bool]
codebook_do_resize: Optional[bool]
codebook_size: Optional[bool]
codebook_resample: Optional[int]
codebook_do_center_crop: Optional[bool]
codebook_crop_size: Optional[int]
codebook_do_rescale: Optional[bool]
codebook_rescale_factor: Optional[Union[int, float]]
codebook_do_map_pixels: Optional[bool]
codebook_do_normalize: Optional[bool]
codebook_image_mean: Optional[Union[float, Iterable[float]]]
codebook_image_std: Optional[Union[float, Iterable[float]]]
class FlavaProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: FlavaImagesKwargs
_defaults = {}
from ...processing_utils import ProcessorMixin
class FlavaProcessor(ProcessorMixin):
@ -67,7 +36,6 @@ class FlavaProcessor(ProcessorMixin):
attributes = ["image_processor", "tokenizer"]
image_processor_class = "FlavaImageProcessor"
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
valid_processor_kwargs = FlavaProcessorKwargs
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = None

View File

@ -39,7 +39,6 @@ logger = logging.get_logger(__name__)
class Florence2ProcessorKwargs(ProcessingKwargs, total=False):
_defaults = {
"text_kwargs": {"padding": False, "return_mm_token_type_ids": False},
"images_kwargs": {},
}

View File

@ -71,7 +71,6 @@ class FuyuProcessorKwargs(ProcessingKwargs, total=False):
"verbose": True,
"return_mm_token_type_ids": False,
},
"images_kwargs": {},
}
@ -487,8 +486,6 @@ class FuyuProcessor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[FuyuProcessorKwargs],
) -> "FuyuBatchFeature":
"""

View File

@ -40,6 +40,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
@ -50,6 +51,24 @@ if is_vision_available():
import PIL
class Gemma3ImageProcessorKwargs(ImagesKwargs):
"""
do_pan_and_scan (`bool`, *optional*):
Whether to apply `pan_and_scan` to images.
pan_and_scan_min_crop_size (`int`, *optional*):
Minimum size of each crop in pan and scan.
pan_and_scan_max_num_crops (`int`, *optional*):
Maximum number of crops per image in pan and scan.
pan_and_scan_min_ratio_to_activate (`float`, *optional*):
Minimum aspect ratio to activate pan and scan.
"""
do_pan_and_scan: Optional[bool]
pan_and_scan_min_crop_size: Optional[int]
pan_and_scan_max_num_crops: Optional[int]
pan_and_scan_min_ratio_to_activate: Optional[float]
class Gemma3ImageProcessor(BaseImageProcessor):
r"""
Constructs a SigLIP image processor.
@ -91,6 +110,7 @@ class Gemma3ImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values", "num_crops"]
valid_kwargs = Gemma3ImageProcessorKwargs
def __init__(
self,

View File

@ -24,7 +24,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
BatchFeature,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
@ -35,29 +34,12 @@ from ...utils import (
auto_docstring,
logging,
)
from .image_processing_gemma3 import Gemma3ImageProcessorKwargs
logger = logging.get_logger(__name__)
class Gemma3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
do_pan_and_scan (`bool`, *optional*):
Whether to apply `pan_and_scan` to images.
pan_and_scan_min_crop_size (`int`, *optional*):
Minimum size of each crop in pan and scan.
pan_and_scan_max_num_crops (`int`, *optional*):
Maximum number of crops per image in pan and scan.
pan_and_scan_min_ratio_to_activate (`float`, *optional*):
Minimum aspect ratio to activate pan and scan.
"""
do_pan_and_scan: Optional[bool]
pan_and_scan_min_crop_size: Optional[int]
pan_and_scan_max_num_crops: Optional[int]
pan_and_scan_min_ratio_to_activate: Optional[float]
@auto_docstring
class Gemma3ImageProcessorFast(BaseImageProcessorFast):
resample = PILImageResampling.BILINEAR
@ -73,9 +55,9 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
pan_and_scan_min_crop_size = None
pan_and_scan_max_num_crops = None
pan_and_scan_min_ratio_to_activate = None
valid_kwargs = Gemma3FastImageProcessorKwargs
valid_kwargs = Gemma3ImageProcessorKwargs
def __init__(self, **kwargs: Unpack[Gemma3FastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[Gemma3ImageProcessorKwargs]):
super().__init__(**kwargs)
def pan_and_scan_batched(
@ -167,7 +149,7 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
def preprocess(
self,
images: ImageInput,
**kwargs: Unpack[Gemma3FastImageProcessorKwargs],
**kwargs: Unpack[Gemma3ImageProcessorKwargs],
) -> BatchFeature:
return super().preprocess(images, **kwargs)
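
The pan-and-scan kwargs above govern when a very wide or tall image is cut into several crops instead of being resized as a whole. A heavily simplified sketch of that decision; the threshold and crop-count heuristic are illustrative, not the model's real defaults or layout:

import math


def pan_and_scan_plan(height, width, min_crop_size=256, max_num_crops=4, min_ratio_to_activate=1.2):
    # Illustrative heuristic only; the real crop layout lives in the Gemma3 image processor.
    long_side, short_side = max(height, width), min(height, width)
    if long_side / short_side < min_ratio_to_activate:
        return 1  # keep the image whole
    num_crops = min(max_num_crops, max(2, math.floor(long_side / short_side + 0.5)))
    if long_side / num_crops < min_crop_size:
        return 1  # crops would get too small
    return num_crops


print(pan_and_scan_plan(512, 2048))  # 4 crops along the wide axis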

View File

@ -20,21 +20,12 @@ import numpy as np
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, make_nested_list_of_images
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import to_py_obj
class Gemma3ImagesKwargs(ImagesKwargs):
do_pan_and_scan: Optional[bool]
pan_and_scan_min_crop_size: Optional[int]
pan_and_scan_max_num_crops: Optional[int]
pan_and_scan_min_ratio_to_activate: Optional[float]
do_convert_rgb: Optional[bool]
class Gemma3ProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: Gemma3ImagesKwargs
_defaults = {
"text_kwargs": {
"padding": False,
@ -81,8 +72,6 @@ class Gemma3Processor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
videos=None,
audio=None,
**kwargs: Unpack[Gemma3ProcessorKwargs],
) -> BatchFeature:
if text is None and images is None:

View File

@ -19,21 +19,13 @@ import numpy as np
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, make_nested_list_of_images
from ...processing_utils import AudioKwargs, ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
class Gemma3nImagesKwargs(ImagesKwargs):
do_convert_rgb: Optional[bool]
class Gemma3nProcessorKwargs(ProcessingKwargs, total=False):
audio_kwargs: AudioKwargs
images_kwargs: Gemma3nImagesKwargs
_defaults = {
"text_kwargs": {
"padding": False,
},
"text_kwargs": {"padding": False},
}
@ -101,7 +93,6 @@ class Gemma3nProcessor(ProcessorMixin):
images: Optional[ImageInput] = None,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
audio: Optional[Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]]] = None,
videos=None,
**kwargs: Unpack[Gemma3nProcessorKwargs],
) -> BatchFeature:
if text is None and images is None and audio is None:

View File

@ -39,6 +39,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, logging
from ...video_utils import VideoInput
@ -46,6 +47,21 @@ from ...video_utils import VideoInput
logger = logging.get_logger(__name__)
class Glm4vImageProcessorKwargs(ImagesKwargs):
"""
patch_size (`int`, *optional*, defaults to 14):
The spatial patch size of the vision encoder.
temporal_patch_size (`int`, *optional*, defaults to 2):
The temporal patch size of the vision encoder.
merge_size (`int`, *optional*, defaults to 2):
The merge size of the vision encoder to llm encoder.
"""
patch_size: Optional[int]
temporal_patch_size: Optional[int]
merge_size: Optional[int]
def smart_resize(
num_frames: int,
height: int,
@ -120,6 +136,7 @@ class Glm4vImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values", "image_grid_thw"]
valid_kwargs = Glm4vImageProcessorKwargs
def __init__(
self,

View File

@ -24,7 +24,6 @@ from ...image_processing_utils import (
)
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
@ -41,27 +40,12 @@ from ...utils import (
auto_docstring,
logging,
)
from .image_processing_glm4v import smart_resize
from .image_processing_glm4v import Glm4vImageProcessorKwargs, smart_resize
logger = logging.get_logger(__name__)
class Glm4vFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
patch_size (`int`, *optional*, defaults to 14):
The spatial patch size of the vision encoder.
temporal_patch_size (`int`, *optional*, defaults to 2):
The temporal patch size of the vision encoder.
merge_size (`int`, *optional*, defaults to 2):
The merge size of the vision encoder to llm encoder.
"""
patch_size: Optional[int]
temporal_patch_size: Optional[int]
merge_size: Optional[int]
@auto_docstring
class Glm4vImageProcessorFast(BaseImageProcessorFast):
do_resize = True
@ -75,10 +59,10 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
patch_size = 14
temporal_patch_size = 2
merge_size = 2
valid_kwargs = Glm4vFastImageProcessorKwargs
valid_kwargs = Glm4vImageProcessorKwargs
model_input_names = ["pixel_values", "image_grid_thw"]
def __init__(self, **kwargs: Unpack[Glm4vFastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[Glm4vImageProcessorKwargs]):
super().__init__(**kwargs)
if self.size is not None and (
self.size.get("shortest_edge", None) is None or self.size.get("longest_edge", None) is None
@ -205,7 +189,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
def preprocess(
self,
images: ImageInput,
**kwargs: Unpack[Glm4vFastImageProcessorKwargs],
**kwargs: Unpack[Glm4vImageProcessorKwargs],
) -> BatchFeature:
return super().preprocess(images, **kwargs)
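
`patch_size`, `temporal_patch_size`, and `merge_size` above determine how many vision tokens the language model sees per image or clip. A rough sketch of that bookkeeping, assuming the sides are already factor-aligned by `smart_resize`:

def vision_token_count(num_frames, height, width, patch_size=14, temporal_patch_size=2, merge_size=2):
    # Illustrative only: grid of patches, then merge_size x merge_size patches per LLM token.
    grid_t = max(num_frames // temporal_patch_size, 1)
    grid_h = height // patch_size
    grid_w = width // patch_size
    return grid_t * grid_h * grid_w // (merge_size * merge_size)


print(vision_token_count(1, 336, 336))  # 144 tokens for a single 336x336 image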

View File

@ -32,7 +32,7 @@ from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_rope_utils import rope_config_validation
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import ImagesKwargs, Unpack
from ...processing_utils import Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
from ...utils.generic import check_model_inputs
@ -52,7 +52,6 @@ from ..qwen2_5_vl.modeling_qwen2_5_vl import (
Qwen2_5_VLVisionAttention,
Qwen2_5_VLVisionBlock,
)
from ..qwen2_5_vl.processing_qwen2_5_vl import Qwen2_5_VLVideosProcessorKwargs
from ..qwen2_vl.processing_qwen2_vl import (
Qwen2_VLProcessor,
Qwen2_VLProcessorKwargs,
@ -1508,19 +1507,7 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
return image_counts, video_counts
class Glm4vVideosProcessorKwargs(Qwen2_5_VLVideosProcessorKwargs):
pass
class Glm4vImagesKwargs(ImagesKwargs):
patch_size: Optional[int]
temporal_patch_size: Optional[int]
merge_size: Optional[int]
class Glm4vProcessorKwargs(Qwen2_VLProcessorKwargs):
images_kwargs: Glm4vImagesKwargs
videos_kwargs: Glm4vVideosProcessorKwargs
_defaults = {
"text_kwargs": {
"padding": False,

View File

@ -24,7 +24,7 @@ import numpy as np
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import logging
from ...video_utils import VideoInput
@ -33,18 +33,7 @@ from ...video_utils import VideoInput
logger = logging.get_logger(__name__)
class Glm4vVideosProcessorKwargs(VideosKwargs, total=False):
fps: Union[list[float], float]
class Glm4vImagesKwargs(ImagesKwargs):
patch_size: Optional[int]
temporal_patch_size: Optional[int]
merge_size: Optional[int]
class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: Glm4vImagesKwargs
_defaults = {
"text_kwargs": {
"padding": False,
@ -53,7 +42,6 @@ class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
},
"videos_kwargs": {"return_metadata": True},
}
videos_kwargs: Glm4vVideosProcessorKwargs
class Glm4vProcessor(ProcessorMixin):

View File

@ -37,12 +37,11 @@ from .image_processing_glm4v import smart_resize
class Glm4vVideoProcessorInitKwargs(VideosKwargs):
max_image_size: dict[str, int] = None
patch_size: Optional[int] = None
temporal_patch_size: Optional[int] = None
merge_size: Optional[int] = None
image_mean: Optional[list[float]] = None
image_std: Optional[list[float]] = None
max_image_size: Optional[dict[str, int]]
patch_size: Optional[int]
temporal_patch_size: Optional[int]
merge_size: Optional[int]
max_duration: Optional[int]
@add_start_docstrings(

View File

@ -38,6 +38,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
@ -48,6 +49,24 @@ if is_vision_available():
logger = logging.get_logger(__name__)
class GotOcr2ImageProcessorKwargs(ImagesKwargs):
"""
crop_to_patches (`bool`, *optional*, defaults to `False`):
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
`preprocess` method.
min_patches (`int`, *optional*, defaults to 1):
The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
max_patches (`int`, *optional*, defaults to 12):
The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
"""
crop_to_patches: Optional[bool]
min_patches: Optional[int]
max_patches: Optional[int]
# Similar to image_processing_mllama.get_all_supported_aspect_ratios
@lru_cache(maxsize=10)
def get_all_supported_aspect_ratios(min_image_tiles: int, max_image_tiles: int) -> list[tuple[int, int]]:
@ -168,6 +187,7 @@ class GotOcr2ImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values"]
valid_kwargs = GotOcr2ImageProcessorKwargs
def __init__(
self,

View File

@ -22,7 +22,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
@ -32,25 +31,7 @@ from ...utils import (
TensorType,
auto_docstring,
)
from .image_processing_got_ocr2 import get_optimal_tiled_canvas
class GotOcr2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
crop_to_patches (`bool`, *optional*, defaults to `False`):
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
`preprocess` method.
min_patches (`int`, *optional*, defaults to 1):
The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
max_patches (`int`, *optional*, defaults to 12):
The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
"""
crop_to_patches: Optional[bool]
min_patches: Optional[int]
max_patches: Optional[int]
from .image_processing_got_ocr2 import GotOcr2ImageProcessorKwargs, get_optimal_tiled_canvas
@auto_docstring
@ -66,13 +47,13 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
crop_to_patches = False
min_patches = 1
max_patches = 12
valid_kwargs = GotOcr2FastImageProcessorKwargs
valid_kwargs = GotOcr2ImageProcessorKwargs
def __init__(self, **kwargs: Unpack[GotOcr2FastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[GotOcr2ImageProcessorKwargs]):
super().__init__(**kwargs)
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2FastImageProcessorKwargs]) -> BatchFeature:
def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2ImageProcessorKwargs]) -> BatchFeature:
return super().preprocess(images, **kwargs)
def crop_image_to_patches(
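
`crop_to_patches` with `min_patches`/`max_patches` above tiles the image onto the grid whose shape best matches its aspect ratio (see `get_optimal_tiled_canvas`). A coarse sketch of that selection, ignoring the area term the real helper also weighs:

def best_grid(height, width, min_patches=1, max_patches=12):
    target_ratio = width / height
    candidates = [
        (rows, cols)
        for rows in range(1, max_patches + 1)
        for cols in range(1, max_patches + 1)
        if min_patches <= rows * cols <= max_patches
    ]
    # Pick the tiling whose aspect ratio is closest to the image's.
    return min(candidates, key=lambda rc: abs(rc[1] / rc[0] - target_ratio))


print(best_grid(1000, 3000))  # (1, 3): one row of three tiles for a 3:1 panorama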

View File

@ -18,11 +18,10 @@ from typing import Optional, Union
import numpy as np
from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import is_vision_available, logging
@ -37,13 +36,13 @@ class GotOcr2TextKwargs(TextKwargs, total=False):
class GotOcr2ImagesKwargs(ImagesKwargs, total=False):
crop_to_patches: Optional[bool]
min_patches: Optional[int]
max_patches: Optional[int]
box: Optional[Union[list, tuple[float, float], tuple[float, float, float, float]]]
color: Optional[str]
num_image_tokens: Optional[int]
multi_page: Optional[bool]
crop_to_patches: Optional[bool]
min_patches: Optional[int]
max_patches: Optional[int]
class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False):
@ -136,8 +135,6 @@ class GotOcr2Processor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
audio=None,
videos=None,
**kwargs: Unpack[GotOcr2ProcessorKwargs],
) -> BatchFeature:
"""

View File

@ -49,8 +49,6 @@ class GraniteSpeechProcessor(ProcessorMixin):
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
audio: Union["torch.Tensor", list["torch.Tensor"]] = None,
device: str = "cpu",
images=None,
videos=None,
**kwargs,
) -> BatchFeature:
requires_backends(self, ["torch"])

View File

@ -51,6 +51,7 @@ from ...image_utils import (
validate_kwargs,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import (
ExplicitEnum,
TensorType,
@ -91,6 +92,29 @@ class AnnotationFormat(ExplicitEnum):
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
class GroundingDinoImageProcessorKwargs(ImagesKwargs):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
Whether to return segmentation masks.
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
Annotations to transform according to the padding that is applied to the images.
masks_path (`str` or `pathlib.Path`, *optional*):
Path to the directory containing the segmentation masks.
"""
format: Optional[Union[str, AnnotationFormat]]
do_convert_annotations: Optional[bool]
return_segmentation_masks: Optional[bool]
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
masks_path: Optional[Union[str, pathlib.Path]]
# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]:
"""
@ -865,6 +889,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values", "pixel_mask"]
valid_kwargs = GroundingDinoImageProcessorKwargs
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
def __init__(

View File

@ -4,6 +4,26 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_grounding_dino.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib
from typing import TYPE_CHECKING, Any, Optional, Union
@ -14,7 +34,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature, get_size_dict
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
SizeDict,
get_image_size_for_max_height_width,
get_max_height_width,
@ -35,7 +54,7 @@ from ...image_utils import (
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring, logging
from ...utils.import_utils import requires
from .image_processing_grounding_dino import get_size_with_aspect_ratio
from .image_processing_grounding_dino import GroundingDinoImageProcessorKwargs, get_size_with_aspect_ratio
if TYPE_CHECKING:
@ -44,24 +63,6 @@ if TYPE_CHECKING:
logger = logging.get_logger(__name__)
class GroundingDinoFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
Whether to return segmentation masks.
"""
format: Optional[Union[str, AnnotationFormat]]
do_convert_annotations: Optional[bool]
return_segmentation_masks: Optional[bool]
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
@ -304,9 +305,9 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
size = {"shortest_edge": 800, "longest_edge": 1333}
default_to_square = False
model_input_names = ["pixel_values", "pixel_mask"]
valid_kwargs = GroundingDinoFastImageProcessorKwargs
valid_kwargs = GroundingDinoImageProcessorKwargs
def __init__(self, **kwargs: Unpack[GroundingDinoFastImageProcessorKwargs]) -> None:
def __init__(self, **kwargs: Unpack[GroundingDinoImageProcessorKwargs]) -> None:
if "pad_and_return_pixel_mask" in kwargs:
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
@ -568,25 +569,8 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
def preprocess(
self,
images: ImageInput,
annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
**kwargs: Unpack[GroundingDinoFastImageProcessorKwargs],
**kwargs: Unpack[GroundingDinoImageProcessorKwargs],
) -> BatchFeature:
r"""
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
List of annotations associated with the image or batch of images. If annotation is for object
detection, the annotations should be a dictionary with the following keys:
- "image_id" (`int`): The image id.
- "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
dictionary. An image can have no annotations, in which case the list should be empty.
If annotation is for segmentation, the annotations should be a dictionary with the following keys:
- "image_id" (`int`): The image id.
- "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
An image can have no segments, in which case the list should be empty.
- "file_name" (`str`): The file name of the image.
masks_path (`str` or `pathlib.Path`, *optional*):
Path to the directory containing the segmentation masks.
"""
if "pad_and_return_pixel_mask" in kwargs:
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
logger.warning_once(
@ -601,7 +585,7 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
)
kwargs["size"] = kwargs.pop("max_size")
return super().preprocess(images, annotations, masks_path, **kwargs)
return super().preprocess(images, **kwargs)
def _preprocess(
self,
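
The `annotations` and `masks_path` kwargs above (now carried by the unified kwargs class rather than explicit `preprocess` arguments) expect the COCO-style structure the removed docstring spelled out. A minimal detection-format example; the numeric values and object fields are invented for illustration:

# COCO-detection-style annotation for one image.
annotation = {
    "image_id": 0,
    "annotations": [
        {"bbox": [10.0, 20.0, 100.0, 80.0], "category_id": 3, "area": 8000.0, "iscrowd": 0},
    ],
}

# Panoptic-format annotations instead carry "segments_info" and a "file_name" pointing
# into the directory given by `masks_path`.
print(len(annotation["annotations"]))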

View File

@ -1,3 +1,23 @@
# coding=utf-8
# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, Optional, Union
import torch

View File

@ -16,13 +16,12 @@
Processor class for Grounding DINO.
"""
import pathlib
import warnings
from typing import TYPE_CHECKING, Optional, Union
from ...image_transforms import center_to_corners_format
from ...image_utils import AnnotationFormat, ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
from ...utils import TensorType, is_torch_available
@ -99,16 +98,7 @@ class DictWithDeprecationWarning(dict):
return super().get(key, *args, **kwargs)
class GroundingDinoImagesKwargs(ImagesKwargs, total=False):
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
return_segmentation_masks: Optional[bool]
masks_path: Optional[Union[str, pathlib.Path]]
do_convert_annotations: Optional[bool]
format: Optional[Union[str, AnnotationFormat]]
class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: GroundingDinoImagesKwargs
_defaults = {
"text_kwargs": {
"add_special_tokens": True,

View File

@ -28,6 +28,7 @@ from ...image_utils import (
to_numpy_array,
valid_images,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, is_torch_available
@ -35,6 +36,20 @@ IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]
IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]
class IdeficsImageProcessorKwargs(ImagesKwargs):
"""
transform (`Callable`, *optional*):
A custom transform function that accepts a single image can be passed for training. For example,
`torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is
assumed - and then a preset of inference-specific transforms will be applied to the images
image_size (`dict[str, int]`, *optional*):
Resize to image size
"""
transform: Optional[Callable]
image_size: Optional[dict[str, int]]
def convert_to_rgb(image):
# `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
# for transparent images. The call to `alpha_composite` handles this case
@ -74,6 +89,7 @@ class IdeficsImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values"]
valid_kwargs = IdeficsImageProcessorKwargs
def __init__(
self,
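
The change above is the first instance of a pattern repeated across the files below. A schematic sketch of that pattern (hypothetical class names, not taken from this diff): each model subclasses the shared `ImagesKwargs` TypedDict, documents its extra fields, and points `valid_kwargs` at the subclass so that `Unpack[...]` typing and `@auto_docstring` unrolling both pick them up.

```python
# Schematic sketch of the unified-kwargs pattern (hypothetical names, not library code):
# subclass the shared ImagesKwargs TypedDict, document the extra fields, and expose the
# subclass via `valid_kwargs` so Unpack[...] typing and @auto_docstring unrolling use it.
from typing import Optional

from transformers.image_processing_utils import BaseImageProcessor
from transformers.processing_utils import ImagesKwargs, Unpack


class MyModelImageProcessorKwargs(ImagesKwargs):
    """
    do_fancy_crop (`bool`, *optional*):
        Hypothetical model-specific option, documented so @auto_docstring can unroll it.
    """

    do_fancy_crop: Optional[bool]


class MyModelImageProcessor(BaseImageProcessor):
    valid_kwargs = MyModelImageProcessorKwargs

    def preprocess(self, images, **kwargs: Unpack[MyModelImageProcessorKwargs]):
        # Real implementations validate and apply the kwargs here.
        ...
```
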

View File

@ -16,13 +16,12 @@
Processor class for IDEFICS.
"""
from typing import Callable, Optional, Union
from typing import Optional, Union
from urllib.parse import urlparse
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import (
ImagesKwargs,
ProcessingKwargs,
ProcessorMixin,
TextKwargs,
@ -40,13 +39,6 @@ if is_torch_available():
IMAGE_TOKEN = "<image>"
class IdeficsImagesKwargs(ImagesKwargs, total=False):
transform: Optional[Callable]
image_size: Optional[dict[str, int]]
image_mean: Optional[Union[float, list[float]]]
image_std: Optional[Union[float, list[float]]]
class IdeficsTextKwargs(TextKwargs, total=False):
add_eos_token: Optional[bool]
add_end_of_utterance_token: Optional[bool]
@ -54,14 +46,12 @@ class IdeficsTextKwargs(TextKwargs, total=False):
class IdeficsProcessorKwargs(ProcessingKwargs, total=False):
text_kwargs: IdeficsTextKwargs
images_kwargs: IdeficsImagesKwargs
_defaults = {
"text_kwargs": {
"add_special_tokens": False,
"padding": "longest",
"add_eos_token": False,
},
"images_kwargs": {},
"common_kwargs": {"return_tensors": "pt"},
}
@ -198,8 +188,6 @@ class IdeficsProcessor(ProcessorMixin):
list[list[TextInput]],
list[list[PreTokenizedInput]],
] = None,
audio=None,
videos=None,
**kwargs: Unpack[IdeficsProcessorKwargs],
) -> BatchFeature:
"""This method takes batched or non-batched prompts made of text and images and converts them into prompts that

View File

@ -35,6 +35,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, is_vision_available, logging
@ -46,6 +47,15 @@ if is_vision_available():
from PIL import Image
class Idefics2ImageProcessorKwargs(ImagesKwargs):
"""
do_image_splitting (`bool`, *optional*, defaults to `False`):
Whether to split the image into a sequence of 4 equal sub-images concatenated with the original image.
"""
do_image_splitting: Optional[bool]
def get_resize_output_image_size(image, size, input_data_format) -> tuple[int, int]:
"""
Get the output size of the image after resizing given a dictionary specifying the max and min sizes.
@ -186,6 +196,7 @@ class Idefics2ImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values", "pixel_attention_mask"]
valid_kwargs = Idefics2ImageProcessorKwargs
def __init__(
self,

View File

@ -21,7 +21,6 @@ import torch
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
BatchFeature,
DefaultFastImageProcessorKwargs,
SizeDict,
group_images_by_shape,
reorder_images,
@ -35,7 +34,7 @@ from ...image_utils import (
)
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring, is_torchvision_available, logging
from .image_processing_idefics2 import convert_to_rgb
from .image_processing_idefics2 import Idefics2ImageProcessorKwargs, convert_to_rgb
if is_torchvision_available():
@ -105,15 +104,6 @@ def make_pixel_mask(image: "torch.Tensor", output_size: tuple[int, int]) -> "tor
return mask
class Idefics2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
do_image_splitting (`bool`, *optional*, defaults to `False`):
Whether to split the image into a sequence of 4 equal sub-images concatenated with the original image.
"""
do_image_splitting: Optional[bool]
@auto_docstring
class Idefics2ImageProcessorFast(BaseImageProcessorFast):
resample = PILImageResampling.BILINEAR
@ -127,7 +117,7 @@ class Idefics2ImageProcessorFast(BaseImageProcessorFast):
do_image_splitting = False
size = {"shortest_edge": 378, "longest_edge": 980}
model_input_names = ["pixel_values", "pixel_attention_mask"]
valid_kwargs = Idefics2FastImageProcessorKwargs
valid_kwargs = Idefics2ImageProcessorKwargs
def convert_to_rgb(self, image: ImageInput) -> ImageInput:
"""
@ -214,7 +204,7 @@ class Idefics2ImageProcessorFast(BaseImageProcessorFast):
return image, pixel_mask
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics2FastImageProcessorKwargs]) -> BatchFeature:
def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics2ImageProcessorKwargs]) -> BatchFeature:
return super().preprocess(images, **kwargs)
def _preprocess(
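
A minimal call-time sketch for the unified kwargs above (the checkpoint id and the synthetic image are only examples, not part of this diff):

```python
# Sketch: Idefics2ImageProcessorKwargs entries such as do_image_splitting are plain
# keyword arguments at call time; the checkpoint id below is just an example.
from PIL import Image
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
image = Image.new("RGB", (640, 480), color=(127, 127, 127))
inputs = processor(image, do_image_splitting=True, return_tensors="pt")
print(inputs["pixel_values"].shape)          # roughly (batch, num_sub_images, 3, H, W)
print(inputs["pixel_attention_mask"].shape)
```
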

View File

@ -22,7 +22,6 @@ from typing import TYPE_CHECKING, Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, is_valid_image, load_image
from ...processing_utils import (
ImagesKwargs,
ProcessingKwargs,
ProcessorMixin,
Unpack,
@ -46,20 +45,13 @@ def is_image_or_image_url(elem):
return is_url(elem) or is_valid_image(elem)
class Idefics2ImagesKwargs(ImagesKwargs, total=False):
image_seq_len: Optional[int]
class Idefics2ProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: Idefics2ImagesKwargs
_defaults = {
"text_kwargs": {
"add_special_tokens": True,
"padding": False,
"is_split_into_words": False,
},
"images_kwargs": {},
}
@ -123,8 +115,6 @@ class Idefics2Processor(ProcessorMixin):
self,
images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None,
text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None,
audio=None,
videos=None,
**kwargs: Unpack[Idefics2ProcessorKwargs],
) -> BatchFeature:
"""
@ -181,8 +171,6 @@ class Idefics2Processor(ProcessorMixin):
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None)
image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
n_images_in_text = []
@ -197,12 +185,11 @@ class Idefics2Processor(ProcessorMixin):
# Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
fake_image_token = self.fake_image_token
image_token = self.image_token
image_str = f"{fake_image_token}{image_token * image_seq_len}{fake_image_token}"
image_str = f"{fake_image_token}{image_token * self.image_seq_len}{fake_image_token}"
if self.image_processor.do_image_splitting:
# A single image token is split into 4 patches + 1 original image
image_str = image_str * 5
image_seq_len *= 5
prompt_strings = []
for sample in text:

View File

@ -35,6 +35,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, is_vision_available, logging
@ -47,6 +48,22 @@ if is_vision_available():
from PIL import Image
class Idefics3ImageProcessorKwargs(ImagesKwargs):
"""
do_image_splitting (`bool`, *optional*, defaults to `True`):
Whether to split the image into sub-images concatenated with the original image. They are split into patches
such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`.
max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`):
Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge".
return_row_col_info (`bool`, *optional*, defaults to `False`):
Whether to return the row and column information of the images.
"""
do_image_splitting: Optional[bool]
max_image_size: Optional[dict[str, int]]
return_row_col_info: Optional[bool]
def _resize_output_size_rescale_to_max_len(
height: int, width: int, min_len: Optional[int] = 1, max_len: Optional[int] = None
) -> tuple[int, int]:
@ -291,6 +308,7 @@ class Idefics3ImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values", "pixel_attention_mask"]
valid_kwargs = Idefics3ImageProcessorKwargs
def __init__(
self,

View File

@ -22,7 +22,6 @@ import torch
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
BatchFeature,
DefaultFastImageProcessorKwargs,
SizeDict,
group_images_by_shape,
reorder_images,
@ -36,6 +35,7 @@ from ...image_utils import (
)
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring, is_torchvision_available, logging
from .image_processing_idefics3 import Idefics3ImageProcessorKwargs
if is_torchvision_available():
@ -169,22 +169,6 @@ def make_pixel_mask(image: "torch.Tensor", output_size: tuple[int, int]) -> "tor
return mask
class Idefics3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
do_image_splitting (`bool`, *optional*, defaults to `True`):
Whether to split the image into sub-images concatenated with the original image. They are split into patches
such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`.
max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`):
Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge".
return_row_col_info (`bool`, *optional*, defaults to `False`):
Whether to return the row and column information of the images.
"""
do_image_splitting: Optional[bool]
max_image_size: Optional[dict[str, int]]
return_row_col_info: Optional[bool]
@auto_docstring
class Idefics3ImageProcessorFast(BaseImageProcessorFast):
resample = PILImageResampling.LANCZOS
@ -199,7 +183,7 @@ class Idefics3ImageProcessorFast(BaseImageProcessorFast):
do_image_splitting = True
do_pad = True
return_row_col_info = False
valid_kwargs = Idefics3FastImageProcessorKwargs
valid_kwargs = Idefics3ImageProcessorKwargs
def _prepare_images_structure(self, images: ImageInput, expected_ndims: int = 3) -> ImageInput:
"""
@ -367,7 +351,7 @@ class Idefics3ImageProcessorFast(BaseImageProcessorFast):
return image, pixel_mask
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics3FastImageProcessorKwargs]) -> BatchFeature:
def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics3ImageProcessorKwargs]) -> BatchFeature:
return super().preprocess(images, **kwargs)
def _preprocess(
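
A call-time sketch for the row/column metadata kwarg above (the checkpoint id is only an example, not part of this diff):

```python
# Sketch: with return_row_col_info=True the image processor also reports how many
# sub-image rows and columns each input was split into.
from PIL import Image
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
image = Image.new("RGB", (1200, 800))
outputs = processor(image, return_row_col_info=True)
print(list(outputs.keys()))  # expected to include "rows" and "cols" alongside pixel data
```
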

View File

@ -24,7 +24,7 @@ import numpy as np
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, is_valid_image, load_image
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import AddedToken, BatchEncoding, TextInput
from ...utils import logging
@ -87,14 +87,7 @@ def get_image_prompt_string(
)
class Idefics3ImagesKwargs(ImagesKwargs, total=False):
return_row_col_info: Optional[bool]
max_image_size: Optional[dict[str, int]]
class Idefics3ProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: Idefics3ImagesKwargs
_defaults = {
"text_kwargs": {
"add_special_tokens": True,
@ -179,8 +172,6 @@ class Idefics3Processor(ProcessorMixin):
self,
images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None,
text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None,
audio=None,
videos=None,
image_seq_len: Optional[int] = None,
**kwargs: Unpack[Idefics3ProcessorKwargs],
) -> BatchEncoding:

View File

@ -31,17 +31,34 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, is_vision_available, logging
from ...utils.import_utils import requires
if is_vision_available():
import PIL
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
class ImageGPTImageProcessorKwargs(ImagesKwargs):
"""
clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*):
The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters`
in `preprocess`.
do_color_quantize (`bool`, *optional*, defaults to `True`):
Controls whether to apply color quantization to convert continuous pixel values to discrete cluster indices.
When True, each pixel is assigned to its nearest color cluster, enabling ImageGPT's discrete token modeling.
"""
clusters: Optional[Union[np.ndarray, list[list[int]], "torch.Tensor"]]
do_color_quantize: Optional[bool]
def squared_euclidean_distance(a, b):
b = b.T
a2 = np.sum(np.square(a), axis=1)
@ -83,6 +100,7 @@ class ImageGPTImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values"]
valid_kwargs = ImageGPTImageProcessorKwargs
def __init__(
self,
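
A hedged sketch of the color-quantization kwargs above (the random clusters are purely illustrative, not real ImageGPT palette entries):

```python
# Sketch: do_color_quantize maps each normalized ([-1, 1]) pixel to its nearest cluster,
# yielding discrete indices under the "input_ids" key. Random clusters are a stand-in.
import numpy as np
from PIL import Image
from transformers import ImageGPTImageProcessor

clusters = np.random.uniform(-1.0, 1.0, size=(512, 3))  # illustrative only
processor = ImageGPTImageProcessor(clusters=clusters, do_color_quantize=True)
image = Image.new("RGB", (64, 64))
outputs = processor(image, return_tensors="np")
print(outputs["input_ids"].shape)  # one cluster index per pixel of the resized image
```
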

View File

@ -23,7 +23,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
)
from ...image_transforms import group_images_by_shape, reorder_images
from ...image_utils import PILImageResampling
@ -32,6 +31,7 @@ from ...utils import (
TensorType,
auto_docstring,
)
from .image_processing_imagegpt import ImageGPTImageProcessorKwargs
def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
@ -68,20 +68,6 @@ def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tenso
return torch.argmin(d, dim=1)
class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*):
The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters`
in `preprocess`.
do_color_quantize (`bool`, *optional*, defaults to `True`):
Controls whether to apply color quantization to convert continuous pixel values to discrete cluster indices.
When True, each pixel is assigned to its nearest color cluster, enabling ImageGPT's discrete token modeling.
"""
clusters: Optional[Union[np.ndarray, list[list[int]], torch.Tensor]]
do_color_quantize: Optional[bool]
@auto_docstring
class ImageGPTImageProcessorFast(BaseImageProcessorFast):
model_input_names = ["input_ids"]
@ -92,12 +78,12 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast):
image_std = [0.5, 0.5, 0.5]
do_rescale = True
do_normalize = True
valid_kwargs = ImageGPTFastImageProcessorKwargs
valid_kwargs = ImageGPTImageProcessorKwargs
def __init__(
self,
clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, # keep as arg for backwards compatibility
**kwargs: Unpack[ImageGPTFastImageProcessorKwargs],
**kwargs: Unpack[ImageGPTImageProcessorKwargs],
):
r"""
clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*):

View File

@ -43,7 +43,6 @@ class InstructBlipProcessorKwargs(ProcessingKwargs, total=False):
"return_length": False,
"verbose": True,
},
"images_kwargs": {},
}
@ -85,8 +84,6 @@ class InstructBlipProcessor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[InstructBlipProcessorKwargs],
) -> BatchFeature:
"""

View File

@ -19,19 +19,12 @@ import numpy as np
from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput, concatenate_list, make_flat_list_of_images
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...video_utils import VideoInput
class InternVLImagesKwargs(ImagesKwargs, total=False):
crop_to_patches: Optional[bool]
min_patches: Optional[int]
max_patches: Optional[int]
class InternVLProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: InternVLImagesKwargs
_defaults = {
"text_kwargs": {
"padding_side": "left",
@ -159,7 +152,6 @@ class InternVLProcessor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
audio=None,
videos: Optional[VideoInput] = None,
**kwargs: Unpack[InternVLProcessorKwargs],
) -> BatchFeature:

View File

@ -43,7 +43,6 @@ class InternVLVideoProcessor(BaseVideoProcessor):
initial_shift = True
do_sample_frames = False # Set to False for BC, recommended to set `True` in new models
valid_kwargs = InternVLVideoProcessorInitKwargs
model_input_names = ["pixel_values_videos"]
def __init__(self, **kwargs: Unpack[InternVLVideoProcessorInitKwargs]):
super().__init__(**kwargs)

View File

@ -40,6 +40,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
@ -50,6 +51,16 @@ if is_vision_available():
logger = logging.get_logger(__name__)
class JanusImageProcessorKwargs(ImagesKwargs):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
falls below this value after resizing.
"""
min_size: int
class JanusImageProcessor(BaseImageProcessor):
r"""
Constructs a JANUS image processor.
@ -92,6 +103,8 @@ class JanusImageProcessor(BaseImageProcessor):
model_input_names = ["pixel_values"]
valid_kwargs = JanusImageProcessorKwargs
def __init__(
self,
do_resize: bool = True,

View File

@ -22,7 +22,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
@ -38,16 +37,7 @@ from ...utils import (
TensorType,
auto_docstring,
)
class JanusFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
falls below this value after resizing.
"""
min_size: int
from .image_processing_janus import JanusImageProcessorKwargs
@auto_docstring
@ -61,9 +51,9 @@ class JanusImageProcessorFast(BaseImageProcessorFast):
do_rescale = True
do_normalize = True
do_pad = True
valid_kwargs = JanusFastImageProcessorKwargs
valid_kwargs = JanusImageProcessorKwargs
def __init__(self, **kwargs: Unpack[JanusFastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[JanusImageProcessorKwargs]):
if kwargs.get("image_mean") is None:
background_color = (127, 127, 127)
else:

View File

@ -47,7 +47,7 @@ from ...image_utils import (
)
from ...modeling_outputs import ModelOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import (
TensorType,
TransformersKwargs,
@ -1289,6 +1289,16 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
return generated_tokens
class JanusImageProcessorKwargs(ImagesKwargs):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
falls below this value after resizing.
"""
min_size: int
class JanusImageProcessor(BlipImageProcessor):
r"""
Constructs a JANUS image processor.
@ -1329,6 +1339,8 @@ class JanusImageProcessor(BlipImageProcessor):
Whether to pad the image to square or not.
"""
valid_kwargs = JanusImageProcessorKwargs
def __init__(
self,
do_resize: bool = True,

View File

@ -81,8 +81,6 @@ class JanusProcessor(ProcessorMixin):
self,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
images: Optional[ImageInput] = None,
videos=None,
audio=None,
**kwargs: Unpack[JanusProcessorKwargs],
) -> BatchFeature:
"""

View File

@ -136,8 +136,6 @@ class Kosmos2Processor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, list[TextInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[Kosmos2ProcessorKwargs],
) -> BatchFeature:
"""

View File

@ -34,6 +34,7 @@ from ...image_utils import (
to_numpy_array,
valid_images,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, is_torch_available, logging
from ...utils.import_utils import requires_backends
@ -45,6 +46,19 @@ logger = logging.get_logger(__name__)
DEFAULT_FONT_PATH = "ybelkada/fonts"
class Kosmos2_5ImageProcessorKwargs(ImagesKwargs):
r"""
patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
The patch size to use for the image. According to the KOSMOS-2.5 paper and code, the patch size is 16x16.
max_patches (`int`, *optional*, defaults to 4096):
The maximum number of patches to extract from the image as per the
[KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419).
"""
patch_size: Optional[dict[str, int]]
max_patches: Optional[int]
# Copied from transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches
def torch_extract_patches(image_tensor, patch_height, patch_width):
"""
@ -92,6 +106,7 @@ class Kosmos2_5ImageProcessor(BaseImageProcessor):
"""
model_input_names = ["flattened_patches"]
valid_kwargs = Kosmos2_5ImageProcessorKwargs
def __init__(
self,

View File

@ -22,13 +22,13 @@ import torch
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
from ...image_utils import ChannelDimension, ImageInput, get_image_size
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring
from .image_processing_kosmos2_5 import Kosmos2_5ImageProcessorKwargs
# Similar to transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches but dealing with a batch of images directly.
@ -56,19 +56,6 @@ def torch_extract_patches(image_tensor, patch_height, patch_width):
return patches
class Kosmos2_5FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
r"""
patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.
max_patches (`int`, *optional*, defaults to 4096):
The maximum number of patches to extract from the image as per the
[KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419).
"""
patch_size: Optional[dict[str, int]]
max_patches: Optional[int]
@auto_docstring
class Kosmos2_5ImageProcessorFast(BaseImageProcessorFast):
# To be checked against the slow image processor
@ -78,13 +65,13 @@ class Kosmos2_5ImageProcessorFast(BaseImageProcessorFast):
patch_size = {"height": 16, "width": 16}
max_patches = 4096
rescale_factor = None
valid_kwargs = Kosmos2_5FastImageProcessorKwargs
valid_kwargs = Kosmos2_5ImageProcessorKwargs
def __init__(self, **kwargs: Unpack[Kosmos2_5FastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[Kosmos2_5ImageProcessorKwargs]):
super().__init__(**kwargs)
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[Kosmos2_5FastImageProcessorKwargs]) -> BatchFeature:
def preprocess(self, images: ImageInput, **kwargs: Unpack[Kosmos2_5ImageProcessorKwargs]) -> BatchFeature:
r"""
patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.

View File

@ -20,7 +20,7 @@ from typing import Optional, Union
from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import TextInput
from ...utils import is_torch_available
@ -29,14 +29,7 @@ if is_torch_available():
import torch
class Kosmos2_5ImagesKwargs(ImagesKwargs, total=False):
max_patches: Optional[int]
num_image_tokens: Optional[int]
class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False):
text_kwargs: TextKwargs
images_kwargs: Kosmos2_5ImagesKwargs
_defaults = {
"text_kwargs": {
"padding": True,
@ -46,7 +39,6 @@ class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False):
},
"images_kwargs": {
"max_patches": 4096,
"num_image_tokens": 2048,
},
"common_kwargs": {"return_tensors": "pt"},
}
@ -65,24 +57,25 @@ class Kosmos2_5Processor(ProcessorMixin):
An instance of [`Kosmos2_5ImageProcessor`]. The image processor is a required input.
tokenizer (Union[`T5TokenizerFast`, `T5Tokenizer`]):
An instance of ['T5TokenizerFast`] or ['T5Tokenizer`]. The tokenizer is a required input.
num_image_tokens (`int`, *optional*, defaults to 2048):
Number of image tokens used as a placeholder.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "PreTrainedTokenizerFast"
def __init__(self, image_processor, tokenizer):
def __init__(self, image_processor, tokenizer, num_image_tokens: int = 2048):
self.image_start_token = tokenizer.boi_token # "<image>" : fixed token for the start of image
self.image_end_token = tokenizer.eoi_token # "</image>" : fixed token for the end of image
self.image_token = tokenizer.image_token # "<s>" : within a <image> ... </image> pair, these <s> tokens indicate they are positions reserved for an image
self.num_image_tokens = num_image_tokens
super().__init__(image_processor, tokenizer)
def __call__(
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, list[TextInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[Kosmos2_5ProcessorKwargs],
) -> BatchFeature:
"""
@ -104,8 +97,6 @@ class Kosmos2_5Processor(ProcessorMixin):
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
num_image_tokens = output_kwargs["images_kwargs"].setdefault("num_image_tokens", None)
encoding = BatchFeature()
if images is not None:
@ -114,7 +105,7 @@ class Kosmos2_5Processor(ProcessorMixin):
image_encoding.pop("cols")
encoding.update(image_encoding)
prompt = f"{self.tokenizer.bos_token}{self.image_start_token}{self.image_token * num_image_tokens}{self.image_end_token}"
prompt = f"{self.tokenizer.bos_token}{self.image_start_token}{self.image_token * self.num_image_tokens}{self.image_end_token}"
if text is not None:
if isinstance(text, str):
@ -124,7 +115,7 @@ class Kosmos2_5Processor(ProcessorMixin):
input = self.tokenizer(text, **output_kwargs["text_kwargs"])
batch_size, seq_len = input.input_ids.shape
image_embeds_position_mask = [0, -1] + [1] * num_image_tokens + [-1]
image_embeds_position_mask = [0, -1] + [1] * self.num_image_tokens + [-1]
image_embeds_position_mask += [0] * (seq_len - len(image_embeds_position_mask))
image_embeds_position_mask = (
torch.LongTensor(image_embeds_position_mask).unsqueeze(0).repeat(batch_size, 1)
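
For context, a hedged sketch of the new constructor argument (the checkpoint id is used only as an example, not part of this diff):

```python
# Sketch: the number of "<s>" image placeholder tokens is now fixed on the processor at
# construction (default 2048) instead of being read from images_kwargs on every call.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
print(processor.num_image_tokens)  # 2048 unless overridden at construction
```
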

View File

@ -30,6 +30,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import (
TensorType,
filter_out_non_signature_kwargs,
@ -51,6 +52,25 @@ if is_pytesseract_available():
logger = logging.get_logger(__name__)
class LayoutLMv2ImageProcessorKwargs(ImagesKwargs):
r"""
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
the `apply_ocr` parameter in the `preprocess` method.
ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
`preprocess` method.
"""
apply_ocr: Optional[bool]
ocr_lang: Optional[str]
tesseract_config: Optional[str]
def normalize_box(box, width, height):
return [
int(1000 * (box[0] / width)),
@ -125,6 +145,7 @@ class LayoutLMv2ImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values"]
valid_kwargs = LayoutLMv2ImageProcessorKwargs
def __init__(
self,
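
A usage sketch for the OCR-related kwargs above (OCR is disabled here so Tesseract is not required; the size in the comment is the library default):

```python
# Sketch: with apply_ocr=False the processor only resizes the page image and expects
# words and boxes to be provided separately (e.g. straight to the tokenizer).
from PIL import Image
from transformers import LayoutLMv2ImageProcessor

image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
page = Image.new("RGB", (1000, 1414), color="white")
features = image_processor(page, return_tensors="pt")
print(features["pixel_values"].shape)  # (1, 3, 224, 224) with the default size
```
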

View File

@ -19,7 +19,7 @@ from typing import Optional, Union
import torch
from torchvision.transforms.v2 import functional as F
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature
from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images
from ...image_utils import ImageInput, PILImageResampling, SizeDict
from ...processing_utils import Unpack
@ -29,32 +29,12 @@ from ...utils import (
logging,
requires_backends,
)
from .image_processing_layoutlmv2 import apply_tesseract
from .image_processing_layoutlmv2 import LayoutLMv2ImageProcessorKwargs, apply_tesseract
logger = logging.get_logger(__name__)
class LayoutLMv2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
Args:
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
the `apply_ocr` parameter in the `preprocess` method.
ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
`preprocess` method.
"""
apply_ocr: Optional[bool]
ocr_lang: Optional[str]
tesseract_config: Optional[str]
@auto_docstring
class LayoutLMv2ImageProcessorFast(BaseImageProcessorFast):
resample = PILImageResampling.BILINEAR
@ -64,13 +44,13 @@ class LayoutLMv2ImageProcessorFast(BaseImageProcessorFast):
apply_ocr = True
ocr_lang = None
tesseract_config = ""
valid_kwargs = LayoutLMv2FastImageProcessorKwargs
valid_kwargs = LayoutLMv2ImageProcessorKwargs
def __init__(self, **kwargs: Unpack[LayoutLMv2FastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[LayoutLMv2ImageProcessorKwargs]):
super().__init__(**kwargs)
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv2FastImageProcessorKwargs]) -> BatchFeature:
def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv2ImageProcessorKwargs]) -> BatchFeature:
return super().preprocess(images, **kwargs)
def _preprocess(

View File

@ -34,6 +34,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import (
TensorType,
filter_out_non_signature_kwargs,
@ -55,6 +56,25 @@ if is_pytesseract_available():
logger = logging.get_logger(__name__)
class LayoutLMv3ImageProcessorKwargs(ImagesKwargs):
r"""
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
the `apply_ocr` parameter in the `preprocess` method.
ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
`preprocess` method.
"""
apply_ocr: Optional[bool]
ocr_lang: Optional[str]
tesseract_config: Optional[str]
def normalize_box(box, width, height):
return [
int(1000 * (box[0] / width)),
@ -143,6 +163,7 @@ class LayoutLMv3ImageProcessor(BaseImageProcessor):
"""
model_input_names = ["pixel_values"]
valid_kwargs = LayoutLMv3ImageProcessorKwargs
def __init__(
self,

View File

@ -19,7 +19,7 @@ from typing import Optional, Union
import torch
from torchvision.transforms.v2 import functional as F
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature
from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images
from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict
from ...processing_utils import Unpack
@ -29,32 +29,12 @@ from ...utils import (
logging,
requires_backends,
)
from .image_processing_layoutlmv3 import apply_tesseract
from .image_processing_layoutlmv3 import LayoutLMv3ImageProcessorKwargs, apply_tesseract
logger = logging.get_logger(__name__)
class LayoutLMv3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
Args:
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
the `apply_ocr` parameter in the `preprocess` method.
ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
`preprocess` method.
"""
apply_ocr: Optional[bool]
ocr_lang: Optional[str]
tesseract_config: Optional[str]
@auto_docstring
class LayoutLMv3ImageProcessorFast(BaseImageProcessorFast):
resample = PILImageResampling.BILINEAR
@ -67,13 +47,13 @@ class LayoutLMv3ImageProcessorFast(BaseImageProcessorFast):
apply_ocr = True
ocr_lang = None
tesseract_config = ""
valid_kwargs = LayoutLMv3FastImageProcessorKwargs
valid_kwargs = LayoutLMv3ImageProcessorKwargs
def __init__(self, **kwargs: Unpack[LayoutLMv3FastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[LayoutLMv3ImageProcessorKwargs]):
super().__init__(**kwargs)
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv3FastImageProcessorKwargs]) -> BatchFeature:
def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv3ImageProcessorKwargs]) -> BatchFeature:
return super().preprocess(images, **kwargs)
def _preprocess(

View File

@ -22,7 +22,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
@ -33,9 +32,7 @@ from ...image_utils import (
PILImageResampling,
SizeDict,
)
from ...processing_utils import (
Unpack,
)
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import (
TensorType,
auto_docstring,
@ -172,7 +169,7 @@ def pad_along_first_dim(
return images, pixel_mask
class Lfm2VlFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
class Lfm2VlImageProcessorKwargs(ImagesKwargs):
"""
downsample_factor (`int`, *optional*, defaults to `2`):
The downsampling factor for images used when resizing the image.
@ -214,10 +211,10 @@ class Lfm2VlImageProcessorFast(BaseImageProcessorFast):
return_row_col_info = False
image_mean = IMAGENET_STANDARD_STD
image_std = IMAGENET_STANDARD_MEAN
valid_kwargs = Lfm2VlFastImageProcessorKwargs
valid_kwargs = Lfm2VlImageProcessorKwargs
model_input_names = ["pixel_values", "pixel_attention_mask", "spatial_shapes"]
def __init__(self, **kwargs: Unpack[Lfm2VlFastImageProcessorKwargs]):
def __init__(self, **kwargs: Unpack[Lfm2VlImageProcessorKwargs]):
super().__init__(**kwargs)
max_thumbnail_image_patches = self.max_image_tokens * self.downsample_factor**2

View File

@ -25,12 +25,11 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
from ...image_utils import ImageInput, PILImageResampling, SizeDict
from ...processing_utils import Unpack
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import (
TensorType,
auto_docstring,
@ -309,8 +308,8 @@ def get_best_fit(
return optimal_canvas
class Llama4ImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
class Llama4ImageProcessorKwargs(ImagesKwargs):
r"""
max_patches (`int`, *optional*, defaults to 16):
The maximum number of patches to be extracted from the image.
Can be overridden by the `max_patches` parameter in the `preprocess` method.

View File

@ -16,20 +16,14 @@
from typing import Optional, Union
from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput, make_flat_list_of_images
class Llama4ImagesKwargs(ImagesKwargs, total=False):
max_patches: Optional[int]
resize_to_max_canvas: Optional[bool]
class Llama4ProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: Llama4ImagesKwargs
_defaults = {
"text_kwargs": {
"padding_side": "left",
@ -139,8 +133,6 @@ class Llama4Processor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
audio=None,
videos=None,
**kwargs: Unpack[Llama4ProcessorKwargs],
) -> BatchFeature:
"""

Some files were not shown because too many files have changed in this diff.