Mirror of https://github.com/huggingface/transformers.git, synced 2025-10-20 17:13:56 +08:00

🚨 [unbloating] unify TypedDict usage in processing (#40931)

* just squash commits into one
* fix style

Committed by GitHub. Parent: 42bcc81ba2. Commit: 5339f72b9b.
@@ -292,7 +292,7 @@ The `@auto_docstring` decorator automatically generates docstrings by:
8. Unrolling kwargs typed with the unpack operator. For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the `TypedDict` and adds each parameter to the function's docstring.
Currently only supported for [`FastImageProcessorKwargs`].
Currently only supported for [`ImagesKwargs`].
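For reference, a minimal sketch of the pattern point 8 describes. The names `ResizeKwargs` and `preprocess` are illustrative only, not part of the library, and `Unpack` needs Python 3.11+ (older versions can import it from `typing_extensions`): a `TypedDict` documents its fields in its docstring, and a decorator such as `@auto_docstring` can read those docs off the `Unpack[...]` annotation.

```python
from typing import Optional, TypedDict, Unpack  # Unpack: Python 3.11+, else typing_extensions


class ResizeKwargs(TypedDict, total=False):
    """
    do_resize (`bool`, *optional*):
        Whether to resize the input image.
    size (`dict[str, int]`, *optional*):
        Target `{"height": ..., "width": ...}` after resizing.
    """

    do_resize: Optional[bool]
    size: Optional[dict[str, int]]


def preprocess(images, **kwargs: Unpack[ResizeKwargs]):
    # A decorator can inspect the `Unpack[ResizeKwargs]` annotation, pull each
    # documented field from the TypedDict docstring, and append it to this
    # function's own docstring. That is the unrolling described in point 8.
    return kwargs
```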
## Best practices
@@ -20,7 +20,8 @@ import numpy as np
from .image_processing_base import BatchFeature, ImageProcessingMixin
from .image_transforms import center_crop, normalize, rescale
from .image_utils import ChannelDimension, get_image_size
from .image_utils import ChannelDimension, ImageInput, get_image_size
from .processing_utils import ImagesKwargs, Unpack
from .utils import logging
from .utils.import_utils import requires

@@ -36,6 +37,8 @@ INIT_SERVICE_KWARGS = [
@requires(backends=("vision",))
class BaseImageProcessor(ImageProcessingMixin):
    valid_kwargs = ImagesKwargs

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

@@ -46,9 +49,9 @@ class BaseImageProcessor(ImageProcessingMixin):
        """
        return False

    def __call__(self, images, **kwargs) -> BatchFeature:
    def __call__(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
        """Preprocess an image or a batch of images."""
        return self.preprocess(images, **kwargs)
        return self.preprocess(images, *args, **kwargs)

    def preprocess(self, images, **kwargs) -> BatchFeature:
        raise NotImplementedError("Each image processor must implement its own preprocess method")
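As a rough illustration of the slow-path contract shown above (the class name is hypothetical, and it assumes `transformers` with the vision extra, i.e. Pillow, is installed): a subclass only needs to provide `preprocess`, while the `ImagesKwargs`-typed `__call__` is inherited.

```python
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature


class ToyImageProcessor(BaseImageProcessor):
    # Each image processor must implement its own `preprocess`; `__call__` is
    # inherited and simply forwards `images` plus the `ImagesKwargs`-typed kwargs.
    def preprocess(self, images, **kwargs) -> BatchFeature:
        return BatchFeature(data={"pixel_values": images}, tensor_type=kwargs.get("return_tensors"))


processor = ToyImageProcessor()
print(processor([[0, 1], [2, 3]])["pixel_values"])  # goes through __call__ -> preprocess
```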
@@ -15,7 +15,7 @@
from collections.abc import Iterable
from copy import deepcopy
from functools import lru_cache, partial
from typing import Any, Optional, TypedDict, Union
from typing import Any, Optional, Union

import numpy as np

@@ -40,7 +40,7 @@ from .image_utils import (
    validate_kwargs,
    validate_preprocess_arguments,
)
from .processing_utils import Unpack
from .processing_utils import ImagesKwargs, Unpack
from .utils import (
    TensorType,
    auto_docstring,

@@ -163,28 +163,6 @@ def divide_to_patches(
    return patches


class DefaultFastImageProcessorKwargs(TypedDict, total=False):
    do_resize: Optional[bool]
    size: Optional[dict[str, int]]
    default_to_square: Optional[bool]
    resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]]
    do_center_crop: Optional[bool]
    crop_size: Optional[dict[str, int]]
    do_rescale: Optional[bool]
    rescale_factor: Optional[Union[int, float]]
    do_normalize: Optional[bool]
    image_mean: Optional[Union[float, list[float]]]
    image_std: Optional[Union[float, list[float]]]
    do_pad: Optional[bool]
    pad_size: Optional[dict[str, int]]
    do_convert_rgb: Optional[bool]
    return_tensors: Optional[Union[str, TensorType]]
    data_format: Optional[ChannelDimension]
    input_data_format: Optional[Union[str, ChannelDimension]]
    device: Optional["torch.device"]
    disable_grouping: Optional[bool]


@auto_docstring
class BaseImageProcessorFast(BaseImageProcessor):
    resample = None

@@ -206,10 +184,10 @@ class BaseImageProcessorFast(BaseImageProcessor):
    input_data_format = None
    device = None
    model_input_names = ["pixel_values"]
    valid_kwargs = DefaultFastImageProcessorKwargs
    valid_kwargs = ImagesKwargs
    unused_kwargs = None

    def __init__(self, **kwargs: Unpack[DefaultFastImageProcessorKwargs]):
    def __init__(self, **kwargs: Unpack[ImagesKwargs]):
        super().__init__(**kwargs)
        kwargs = self.filter_out_unused_kwargs(kwargs)
        size = kwargs.pop("size", self.size)

@@ -728,11 +706,8 @@ class BaseImageProcessorFast(BaseImageProcessor):
            data_format=data_format,
        )

    def __call__(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature:
        return self.preprocess(images, *args, **kwargs)

    @auto_docstring
    def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature:
    def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
        # args are not validated, but their order in the `preprocess` and `_preprocess` signatures must be the same
        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names)
        # Set default kwargs from self. This ensures that if a kwarg is not provided

@@ -765,7 +740,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
        do_convert_rgb: bool,
        input_data_format: ChannelDimension,
        device: Optional[Union[str, "torch.device"]] = None,
        **kwargs: Unpack[DefaultFastImageProcessorKwargs],
        **kwargs: Unpack[ImagesKwargs],
    ) -> BatchFeature:
        """
        Preprocess image-like inputs.
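For context, a usage sketch of the fast path after this change (the checkpoint name is only an example, and downloading it from the Hub is assumed to work): the kwargs accepted at call time are exactly the fields of `ImagesKwargs`, and anything else is rejected by `validate_kwargs`.

```python
from PIL import Image

from transformers import AutoImageProcessor

# Any checkpoint with a fast image processor works; this one is just an example.
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224", use_fast=True)

image = Image.new("RGB", (640, 480))
# `do_resize`, `size` and `return_tensors` are all fields of `ImagesKwargs`.
batch = processor(image, do_resize=True, size={"height": 224, "width": 224}, return_tensors="pt")
print(batch["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])
```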
@@ -959,8 +959,6 @@ class AriaProcessor(ProcessorMixin):
        self,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
        images: Optional[ImageInput] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[AriaProcessorKwargs],
    ) -> BatchFeature:
        """

@@ -85,8 +85,6 @@ class AriaProcessor(ProcessorMixin):
        self,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
        images: Optional[ImageInput] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[AriaProcessorKwargs],
    ) -> BatchFeature:
        """
@@ -19,18 +19,11 @@ import numpy as np

from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput, make_flat_list_of_images
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput


class AyaVisionImagesKwargs(ImagesKwargs, total=False):
    crop_to_patches: Optional[bool]
    min_patches: Optional[int]
    max_patches: Optional[int]


class AyaVisionProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: AyaVisionImagesKwargs
    _defaults = {
        "text_kwargs": {
            "padding_side": "left",

@@ -140,8 +133,6 @@ class AyaVisionProcessor(ProcessorMixin):
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[AyaVisionProcessorKwargs],
    ) -> BatchFeature:
        """
@@ -33,6 +33,7 @@ from ...image_utils import (
    valid_images,
    validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import (
    TensorType,
    filter_out_non_signature_kwargs,

@@ -54,6 +55,17 @@ if is_torch_available():
logger = logging.get_logger(__name__)


class BeitImageProcessorKwargs(ImagesKwargs):
    r"""
    do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
        Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
        is used for background, and background itself is not included in all classes of a dataset (e.g.
        ADE20k). The background label will be replaced by 255.
    """

    do_reduce_labels: Optional[bool]


@requires(backends=("vision",))
class BeitImageProcessor(BaseImageProcessor):
    r"""

@@ -99,6 +111,7 @@ class BeitImageProcessor(BaseImageProcessor):
    """

    model_input_names = ["pixel_values"]
    valid_kwargs = BeitImageProcessorKwargs

    @filter_out_non_signature_kwargs(extra=INIT_SERVICE_KWARGS)
    def __init__(
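A short usage sketch of what the extra key buys (the constructor arguments shown are illustrative, not required): `do_reduce_labels` is declared on `BeitImageProcessorKwargs`, so it is validated and documented like any other image kwarg.

```python
import numpy as np

from transformers import BeitImageProcessor

processor = BeitImageProcessor(do_resize=True, size={"height": 224, "width": 224})
image = np.zeros((256, 256, 3), dtype=np.uint8)

# `do_reduce_labels` is a declared field of `BeitImageProcessorKwargs`.
out = processor(image, do_reduce_labels=False, return_tensors="np")
print(out["pixel_values"].shape)  # (1, 3, 224, 224)
```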
@@ -22,7 +22,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
    BaseImageProcessorFast,
    DefaultFastImageProcessorKwargs,
    group_images_by_shape,
    reorder_images,
)

@@ -40,17 +39,7 @@ from ...utils import (
    TensorType,
    auto_docstring,
)


class BeitFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    r"""
    do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
        Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
        is used for background, and background itself is not included in all classes of a dataset (e.g.
        ADE20k). The background label will be replaced by 255.
    """

    do_reduce_labels: Optional[bool]
from .image_processing_beit import BeitImageProcessorKwargs


@auto_docstring

@@ -66,9 +55,9 @@ class BeitImageProcessorFast(BaseImageProcessorFast):
    do_rescale = True
    do_normalize = True
    do_reduce_labels = False
    valid_kwargs = BeitFastImageProcessorKwargs
    valid_kwargs = BeitImageProcessorKwargs

    def __init__(self, **kwargs: Unpack[BeitFastImageProcessorKwargs]):
    def __init__(self, **kwargs: Unpack[BeitImageProcessorKwargs]):
        super().__init__(**kwargs)

    def reduce_label(self, labels: list["torch.Tensor"]):

@@ -86,7 +75,7 @@ class BeitImageProcessorFast(BaseImageProcessorFast):
        self,
        images: ImageInput,
        segmentation_maps: Optional[ImageInput] = None,
        **kwargs: Unpack[BeitFastImageProcessorKwargs],
        **kwargs: Unpack[BeitImageProcessorKwargs],
    ) -> BatchFeature:
        r"""
        segmentation_maps (`ImageInput`, *optional*):

@@ -101,7 +90,7 @@ class BeitImageProcessorFast(BaseImageProcessorFast):
        do_convert_rgb: bool,
        input_data_format: ChannelDimension,
        device: Optional[Union[str, "torch.device"]] = None,
        **kwargs: Unpack[BeitFastImageProcessorKwargs],
        **kwargs: Unpack[BeitImageProcessorKwargs],
    ) -> BatchFeature:
        """
        Preprocess image-like inputs.
@@ -36,7 +36,6 @@ class BlipProcessorKwargs(ProcessingKwargs, total=False):
            "return_length": False,
            "verbose": True,
        },
        "images_kwargs": {},
    }

@@ -67,8 +66,6 @@ class BlipProcessor(ProcessorMixin):
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[BlipProcessorKwargs],
    ) -> BatchEncoding:
        """
@@ -41,7 +41,6 @@ class Blip2ProcessorKwargs(ProcessingKwargs, total=False):
            "return_length": False,
            "verbose": True,
        },
        "images_kwargs": {},
    }

@@ -81,8 +80,6 @@ class Blip2Processor(ProcessorMixin):
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[Blip2ProcessorKwargs],
    ) -> BatchEncoding:
        """
@ -35,6 +35,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
|
||||
|
||||
@ -122,6 +123,10 @@ def get_resize_output_image_size(
|
||||
return new_height, new_width
|
||||
|
||||
|
||||
class BridgeTowerImageProcessorKwargs(ImagesKwargs):
|
||||
size_divisor: Optional[int]
|
||||
|
||||
|
||||
class BridgeTowerImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a BridgeTower image processor.
|
||||
@ -169,6 +174,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = BridgeTowerImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -23,7 +23,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
ImageInput,
|
||||
SizeDict,
|
||||
TensorType,
|
||||
@ -33,6 +32,7 @@ from ...image_processing_utils_fast import (
|
||||
)
|
||||
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling
|
||||
from ...utils import auto_docstring
|
||||
from .image_processing_bridgetower import BridgeTowerImageProcessorKwargs
|
||||
|
||||
|
||||
def make_pixel_mask(
|
||||
@ -85,17 +85,6 @@ def get_resize_output_image_size(
|
||||
return new_height, new_width
|
||||
|
||||
|
||||
class BridgeTowerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
Args:
|
||||
size_divisor (`int`, *optional*, defaults to 32):
|
||||
The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize`
|
||||
is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
size_divisor: Optional[int]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BICUBIC
|
||||
@ -110,14 +99,14 @@ class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
|
||||
do_normalize = True
|
||||
do_pad = True
|
||||
size_divisor = 32
|
||||
valid_kwargs = BridgeTowerFastImageProcessorKwargs
|
||||
valid_kwargs = BridgeTowerImageProcessorKwargs
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
|
||||
def __init__(self, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[BridgeTowerImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[BridgeTowerImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def resize(
|
||||
|
@ -16,17 +16,10 @@
|
||||
Processor class for BridgeTower.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin
|
||||
|
||||
|
||||
class BridgeTowerImagesKwargs(ImagesKwargs):
|
||||
size_divisor: Optional[int]
|
||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin
|
||||
|
||||
|
||||
class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: BridgeTowerImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"add_special_tokens": True,
|
||||
|
@ -92,8 +92,6 @@ class ChameleonProcessor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[ChameleonProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -27,18 +27,13 @@ import torch
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
|
||||
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict
|
||||
from ...processing_utils import Unpack
|
||||
from ...processing_utils import ImagesKwargs, Unpack
|
||||
from ...utils import TensorType, auto_docstring
|
||||
|
||||
|
||||
class Cohere2VisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
crop_to_patches (`bool`, *optional*, defaults to `False`):
|
||||
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
|
||||
|
@ -30,8 +30,10 @@ from transformers.models.aya_vision.modeling_aya_vision import (
|
||||
from transformers.models.got_ocr2.image_processing_got_ocr2_fast import GotOcr2ImageProcessorFast
|
||||
|
||||
from ...cache_utils import Cache
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_utils import ImageInput
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...processing_utils import Unpack
|
||||
from ...processing_utils import ImagesKwargs, Unpack
|
||||
from ...utils import TransformersKwargs, auto_docstring, logging
|
||||
from ...utils.generic import check_model_inputs
|
||||
from .configuration_cohere2_vision import Cohere2VisionConfig
|
||||
@ -301,6 +303,24 @@ def get_optimal_tiled_canvas(
|
||||
return best_grid
|
||||
|
||||
|
||||
class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
crop_to_patches (`bool`, *optional*, defaults to `False`):
|
||||
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
|
||||
`preprocess` method.
|
||||
min_patches (`int`, *optional*, defaults to 1):
|
||||
The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
|
||||
set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
|
||||
max_patches (`int`, *optional*, defaults to 12):
|
||||
The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
|
||||
set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
crop_to_patches: Optional[bool]
|
||||
min_patches: Optional[int]
|
||||
max_patches: Optional[int]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Cohere2VisionImageProcessorFast(GotOcr2ImageProcessorFast):
|
||||
size = {"height": 512, "width": 512}
|
||||
@ -308,6 +328,14 @@ class Cohere2VisionImageProcessorFast(GotOcr2ImageProcessorFast):
|
||||
max_patches = 12
|
||||
crop_to_patches = True
|
||||
patch_size = 16
|
||||
valid_kwargs = Cohere2VisionFastImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
|
||||
__all__ = [
|
||||
|
@ -19,16 +19,11 @@ import numpy as np
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
|
||||
|
||||
class Cohere2VisionImagesKwargs(ImagesKwargs, total=False):
|
||||
max_patches: Optional[int]
|
||||
|
||||
|
||||
class Cohere2VisionProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: Cohere2VisionImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding_side": "left",
|
||||
|
@ -90,8 +90,6 @@ class ColPaliProcessor(PaliGemmaProcessor):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[ColPaliProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -131,8 +131,6 @@ class ColPaliProcessor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[ColPaliProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -93,8 +93,6 @@ class ColQwen2Processor(ColPaliProcessor):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[ColQwen2ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -94,8 +94,6 @@ class ColQwen2Processor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[ColQwen2ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -53,6 +53,7 @@ from ...image_utils import (
|
||||
validate_kwargs,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
is_scipy_available,
|
||||
@ -774,6 +775,29 @@ def compute_segments(
|
||||
return segmentation, segments
|
||||
|
||||
|
||||
class ConditionalDetrImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return segmentation masks.
|
||||
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
|
||||
Annotations to transform according to the padding that is applied to the images.
|
||||
masks_path (`str` or `pathlib.Path`, *optional*):
|
||||
Path to the directory containing the segmentation masks.
|
||||
"""
|
||||
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
|
||||
masks_path: Optional[Union[str, pathlib.Path]]
|
||||
|
||||
|
||||
@requires(backends=("vision",))
|
||||
class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
@ -829,6 +853,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = ConditionalDetrImageProcessorKwargs
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
|
||||
def __init__(
|
||||
|
@ -15,7 +15,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature, get_size_dict
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
SizeDict,
|
||||
get_image_size_for_max_height_width,
|
||||
get_max_height_width,
|
||||
@ -37,6 +36,7 @@ from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring, logging
|
||||
from ...utils.import_utils import requires
|
||||
from .image_processing_conditional_detr import (
|
||||
ConditionalDetrImageProcessorKwargs,
|
||||
compute_segments,
|
||||
convert_segmentation_to_rle,
|
||||
get_size_with_aspect_ratio,
|
||||
@ -46,24 +46,6 @@ from .image_processing_conditional_detr import (
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class ConditionalDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return segmentation masks.
|
||||
"""
|
||||
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
|
||||
|
||||
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
||||
|
||||
|
||||
@ -278,9 +260,9 @@ class ConditionalDetrImageProcessorFast(BaseImageProcessorFast):
|
||||
size = {"shortest_edge": 800, "longest_edge": 1333}
|
||||
default_to_square = False
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = ConditionalDetrFastImageProcessorKwargs
|
||||
valid_kwargs = ConditionalDetrImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[ConditionalDetrFastImageProcessorKwargs]) -> None:
|
||||
def __init__(self, **kwargs: Unpack[ConditionalDetrImageProcessorKwargs]) -> None:
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
|
||||
@ -542,25 +524,8 @@ class ConditionalDetrImageProcessorFast(BaseImageProcessorFast):
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
|
||||
masks_path: Optional[Union[str, pathlib.Path]] = None,
|
||||
**kwargs: Unpack[ConditionalDetrFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[ConditionalDetrImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
r"""
|
||||
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
|
||||
List of annotations associated with the image or batch of images. If annotation is for object
|
||||
detection, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
|
||||
dictionary. An image can have no annotations, in which case the list should be empty.
|
||||
If annotation is for segmentation, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
|
||||
An image can have no segments, in which case the list should be empty.
|
||||
- "file_name" (`str`): The file name of the image.
|
||||
masks_path (`str` or `pathlib.Path`, *optional*):
|
||||
Path to the directory containing the segmentation masks.
|
||||
"""
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
logger.warning_once(
|
||||
@ -575,7 +540,7 @@ class ConditionalDetrImageProcessorFast(BaseImageProcessorFast):
|
||||
)
|
||||
kwargs["size"] = kwargs.pop("max_size")
|
||||
|
||||
return super().preprocess(images, annotations, masks_path, **kwargs)
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
|
@ -38,6 +38,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
from ...utils.import_utils import requires
|
||||
|
||||
@ -49,6 +50,16 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class ConvNextImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
crop_pct (`float`, *optional*):
|
||||
Percentage of the image to crop. Only has an effect if size < 384. Can be
|
||||
overridden by `crop_pct` in the`preprocess` method.
|
||||
"""
|
||||
|
||||
crop_pct: Optional[float]
|
||||
|
||||
|
||||
@requires(backends=("vision",))
|
||||
class ConvNextImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
@ -87,6 +98,7 @@ class ConvNextImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
valid_kwargs = ConvNextImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -22,7 +22,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -39,16 +38,7 @@ from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
)
|
||||
|
||||
|
||||
class ConvNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
crop_pct (`float`, *optional*):
|
||||
Percentage of the image to crop. Only has an effect if size < 384. Can be
|
||||
overridden by `crop_pct` in the`preprocess` method.
|
||||
"""
|
||||
|
||||
crop_pct: Optional[float]
|
||||
from .image_processing_convnext import ConvNextImageProcessorKwargs
|
||||
|
||||
|
||||
@auto_docstring
|
||||
@ -62,13 +52,13 @@ class ConvNextImageProcessorFast(BaseImageProcessorFast):
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
crop_pct = 224 / 256
|
||||
valid_kwargs = ConvNextFastImageProcessorKwargs
|
||||
valid_kwargs = ConvNextImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[ConvNextImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[ConvNextImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def resize(
|
||||
|
@ -246,9 +246,7 @@ class CsmProcessor(ProcessorMixin):
|
||||
|
||||
text_kwargs = output_kwargs["text_kwargs"]
|
||||
audio_kwargs = output_kwargs["audio_kwargs"]
|
||||
common_kwargs = output_kwargs["common_kwargs"]
|
||||
|
||||
return_tensors = common_kwargs.pop("return_tensors", None)
|
||||
return_tensors = text_kwargs.get("return_tensors", None)
|
||||
if return_tensors != "pt":
|
||||
raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")
|
||||
|
||||
|
@ -38,6 +38,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
|
||||
|
||||
@ -48,6 +49,16 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class DeepseekVLImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
|
||||
|
||||
class DeepseekVLImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a DEEPSEEK_VL image processor.
|
||||
@ -90,6 +101,8 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
|
||||
valid_kwargs = DeepseekVLImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
do_resize: bool = True,
|
||||
|
@ -24,25 +24,11 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
|
||||
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring
|
||||
|
||||
|
||||
class DeepseekVLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
from .image_processing_deepseek_vl import DeepseekVLImageProcessorKwargs
|
||||
|
||||
|
||||
@auto_docstring
|
||||
@ -56,9 +42,9 @@ class DeepseekVLImageProcessorFast(BaseImageProcessorFast):
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
do_pad = True
|
||||
valid_kwargs = DeepseekVLFastImageProcessorKwargs
|
||||
valid_kwargs = DeepseekVLImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DeepseekVLFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[DeepseekVLImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
if kwargs.get("image_mean") is None:
|
||||
background_color = (127, 127, 127)
|
||||
|
@ -39,6 +39,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
|
||||
|
||||
@ -49,6 +50,32 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
|
||||
Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
|
||||
method.
|
||||
high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
||||
Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
|
||||
overridden by the `high_res_resample` parameter in the `preprocess` method.
|
||||
high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
|
||||
Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
|
||||
channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
|
||||
high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
|
||||
Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
|
||||
number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
high_res_size: dict
|
||||
high_res_resample: "PILImageResampling"
|
||||
high_res_image_mean: list[float]
|
||||
high_res_image_std: list[float]
|
||||
|
||||
|
||||
class DeepseekVLHybridImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a DEEPSEEK_VL_HYBRID image processor.
|
||||
@ -102,6 +129,7 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "high_res_pixel_values"]
|
||||
valid_kwargs = DeepseekVLHybridImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -26,7 +26,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
get_size_dict,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
@ -41,32 +40,7 @@ from ...image_utils import (
|
||||
)
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring
|
||||
|
||||
|
||||
class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
|
||||
Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
|
||||
method.
|
||||
high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
||||
Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
|
||||
overridden by the `high_res_resample` parameter in the `preprocess` method.
|
||||
high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
|
||||
Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
|
||||
channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
|
||||
high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
|
||||
Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
|
||||
number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
high_res_size: dict
|
||||
high_res_resample: "PILImageResampling"
|
||||
high_res_image_mean: list[float]
|
||||
high_res_image_std: list[float]
|
||||
from .image_processing_deepseek_vl_hybrid import DeepseekVLHybridImageProcessorKwargs
|
||||
|
||||
|
||||
@auto_docstring
|
||||
@ -80,14 +54,14 @@ class DeepseekVLHybridImageProcessorFast(BaseImageProcessorFast):
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
do_pad = True
|
||||
valid_kwargs = DeepseekVLHybridFastImageProcessorKwargs
|
||||
valid_kwargs = DeepseekVLHybridImageProcessorKwargs
|
||||
high_res_image_mean = OPENAI_CLIP_MEAN
|
||||
high_res_image_std = OPENAI_CLIP_STD
|
||||
high_res_size = {"height": 1024, "width": 1024}
|
||||
high_res_resample = PILImageResampling.BICUBIC
|
||||
model_input_names = ["pixel_values", "high_res_pixel_values"]
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[DeepseekVLHybridImageProcessorKwargs]):
|
||||
if kwargs.get("image_mean") is None:
|
||||
background_color = (127, 127, 127)
|
||||
else:
|
||||
|
@ -22,7 +22,6 @@ from ...cache_utils import Cache
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
get_size_dict,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
@ -43,7 +42,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import Unpack
|
||||
from ...processing_utils import ImagesKwargs, Unpack
|
||||
from ...tokenization_utils_base import (
|
||||
PreTokenizedInput,
|
||||
TextInput,
|
||||
@ -430,6 +429,32 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneratio
|
||||
return model_inputs
|
||||
|
||||
|
||||
class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
|
||||
Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
|
||||
method.
|
||||
high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
||||
Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
|
||||
overridden by the `high_res_resample` parameter in the `preprocess` method.
|
||||
high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
|
||||
Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
|
||||
channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
|
||||
high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
|
||||
Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
|
||||
number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
high_res_size: dict
|
||||
high_res_resample: "PILImageResampling"
|
||||
high_res_image_mean: list[float]
|
||||
high_res_image_std: list[float]
|
||||
|
||||
|
||||
class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
|
||||
r"""
|
||||
Constructs a DEEPSEEK_VL_HYBRID image processor.
|
||||
@ -483,6 +508,7 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "high_res_pixel_values"]
|
||||
valid_kwargs = DeepseekVLHybridImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -727,32 +753,6 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
|
||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
|
||||
class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
|
||||
Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
|
||||
method.
|
||||
high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
||||
Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
|
||||
overridden by the `high_res_resample` parameter in the `preprocess` method.
|
||||
high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
|
||||
Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
|
||||
channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
|
||||
high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
|
||||
Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
|
||||
number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
high_res_size: dict
|
||||
high_res_resample: "PILImageResampling"
|
||||
high_res_image_mean: list[float]
|
||||
high_res_image_std: list[float]
|
||||
|
||||
|
||||
class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast):
|
||||
high_res_image_mean = OPENAI_CLIP_MEAN
|
||||
high_res_image_std = OPENAI_CLIP_STD
|
||||
@ -760,7 +760,7 @@ class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast):
|
||||
high_res_resample = PILImageResampling.BICUBIC
|
||||
model_input_names = ["pixel_values", "high_res_pixel_values"]
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[DeepseekVLHybridImageProcessorKwargs]):
|
||||
if kwargs.get("image_mean") is None:
|
||||
background_color = (127, 127, 127)
|
||||
else:
|
||||
|
@ -53,6 +53,7 @@ from ...image_utils import (
|
||||
validate_kwargs,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
is_scipy_available,
|
||||
@ -79,6 +80,30 @@ if is_scipy_available():
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
class DeformableDetrImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return segmentation masks.
|
||||
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
|
||||
Annotations to transform according to the padding that is applied to the images.
|
||||
masks_path (`str` or `pathlib.Path`, *optional*):
|
||||
Path to the directory containing the segmentation masks.
|
||||
"""
|
||||
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
|
||||
masks_path: Optional[Union[str, pathlib.Path]]
|
||||
|
||||
|
||||
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
||||
|
||||
|
||||
@ -827,6 +852,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = DeformableDetrImageProcessorKwargs
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
|
||||
def __init__(
|
||||
|
@ -14,7 +14,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature, get_size_dict
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
SizeDict,
|
||||
get_image_size_for_max_height_width,
|
||||
get_max_height_width,
|
||||
@ -35,29 +34,11 @@ from ...image_utils import (
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring, logging
|
||||
from ...utils.import_utils import requires
|
||||
from .image_processing_deformable_detr import get_size_with_aspect_ratio
|
||||
from .image_processing_deformable_detr import DeformableDetrImageProcessorKwargs, get_size_with_aspect_ratio
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class DeformableDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return segmentation masks.
|
||||
"""
|
||||
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
|
||||
|
||||
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
||||
|
||||
|
||||
@ -272,9 +253,9 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
||||
size = {"shortest_edge": 800, "longest_edge": 1333}
|
||||
default_to_square = False
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = DeformableDetrFastImageProcessorKwargs
|
||||
valid_kwargs = DeformableDetrImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs]) -> None:
|
||||
def __init__(self, **kwargs: Unpack[DeformableDetrImageProcessorKwargs]) -> None:
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
|
||||
@ -536,25 +517,8 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
|
||||
masks_path: Optional[Union[str, pathlib.Path]] = None,
|
||||
**kwargs: Unpack[DeformableDetrFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[DeformableDetrImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
r"""
|
||||
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
|
||||
List of annotations associated with the image or batch of images. If annotation is for object
|
||||
detection, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
|
||||
dictionary. An image can have no annotations, in which case the list should be empty.
|
||||
If annotation is for segmentation, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
|
||||
An image can have no segments, in which case the list should be empty.
|
||||
- "file_name" (`str`): The file name of the image.
|
||||
masks_path (`str` or `pathlib.Path`, *optional*):
|
||||
Path to the directory containing the segmentation masks.
|
||||
"""
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
logger.warning_once(
|
||||
@ -569,7 +533,7 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
||||
)
|
||||
kwargs["size"] = kwargs.pop("max_size")
|
||||
|
||||
return super().preprocess(images, annotations, masks_path, **kwargs)
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
|
@ -52,6 +52,7 @@ from ...image_utils import (
|
||||
validate_kwargs,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
is_scipy_available,
|
||||
@ -82,6 +83,29 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
||||
|
||||
|
||||
class DetrImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return segmentation masks.
|
||||
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
|
||||
Annotations to transform according to the padding that is applied to the images.
|
||||
masks_path (`str` or `pathlib.Path`, *optional*):
|
||||
Path to the directory containing the segmentation masks.
|
||||
"""
|
||||
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
|
||||
masks_path: Optional[Union[str, pathlib.Path]]
|
||||
|
||||
|
||||
# From the original repo: https://github.com/facebookresearch/detr/blob/3af9fa878e73b6894ce3596450a8d9b89d918ca9/datasets/transforms.py#L76
|
||||
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]:
|
||||
"""
|
||||
@ -811,6 +835,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = DetrImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -28,7 +28,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature, get_size_dict
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
SizeDict,
|
||||
get_image_size_for_max_height_width,
|
||||
get_max_height_width,
|
||||
@ -54,6 +53,7 @@ from ...utils import (
|
||||
)
|
||||
from ...utils.import_utils import requires
|
||||
from .image_processing_detr import (
|
||||
DetrImageProcessorKwargs,
|
||||
compute_segments,
|
||||
convert_segmentation_to_rle,
|
||||
get_size_with_aspect_ratio,
|
||||
@ -263,23 +263,6 @@ def prepare_coco_panoptic_annotation(
|
||||
return new_target
|
||||
|
||||
|
||||
class DetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return segmentation masks.
|
||||
"""
|
||||
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
@requires(backends=("torchvision", "torch"))
|
||||
class DetrImageProcessorFast(BaseImageProcessorFast):
|
||||
@ -294,9 +277,9 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
|
||||
size = {"shortest_edge": 800, "longest_edge": 1333}
|
||||
default_to_square = False
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = DetrFastImageProcessorKwargs
|
||||
valid_kwargs = DetrImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DetrFastImageProcessorKwargs]) -> None:
|
||||
def __init__(self, **kwargs: Unpack[DetrImageProcessorKwargs]) -> None:
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
|
||||
@ -558,25 +541,8 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
|
||||
masks_path: Optional[Union[str, pathlib.Path]] = None,
|
||||
**kwargs: Unpack[DetrFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[DetrImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
r"""
|
||||
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
|
||||
List of annotations associated with the image or batch of images. If annotation is for object
|
||||
detection, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
|
||||
dictionary. An image can have no annotations, in which case the list should be empty.
|
||||
If annotation is for segmentation, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
|
||||
An image can have no segments, in which case the list should be empty.
|
||||
- "file_name" (`str`): The file name of the image.
|
||||
masks_path (`str` or `pathlib.Path`, *optional*):
|
||||
Path to the directory containing the segmentation masks.
|
||||
"""
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
logger.warning_once(
|
||||
@ -591,7 +557,7 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
|
||||
)
|
||||
kwargs["size"] = kwargs.pop("max_size")
|
||||
|
||||
return super().preprocess(images, annotations, masks_path, **kwargs)
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
|
@ -111,9 +111,7 @@ class DiaProcessor(ProcessorMixin):
|
||||
|
||||
text_kwargs = output_kwargs["text_kwargs"]
|
||||
audio_kwargs = output_kwargs["audio_kwargs"]
|
||||
common_kwargs = output_kwargs["common_kwargs"]
|
||||
|
||||
return_tensors = common_kwargs.pop("return_tensors", None)
|
||||
return_tensors = text_kwargs.get("return_tensors", None)
|
||||
if return_tensors != "pt":
|
||||
raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")
|
||||
|
||||
|
@ -40,6 +40,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, logging
|
||||
from ...utils.import_utils import is_vision_available, requires
|
||||
|
||||
@ -51,6 +52,18 @@ if is_vision_available():
|
||||
import PIL
|
||||
|
||||
|
||||
class DonutImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`):
|
||||
Whether to resize the image using thumbnail method.
|
||||
do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`):
|
||||
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
|
||||
"""
|
||||
|
||||
do_thumbnail: Optional[bool]
|
||||
do_align_long_axis: Optional[bool]
|
||||
|
||||
|
||||
@requires(backends=("vision",))
|
||||
class DonutImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
@ -90,6 +103,7 @@ class DonutImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
valid_kwargs = DonutImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -19,7 +19,7 @@ from typing import Optional, Union
|
||||
import torch
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature
|
||||
from ...image_transforms import group_images_by_shape, reorder_images
|
||||
from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict
|
||||
from ...processing_utils import Unpack
|
||||
@ -28,24 +28,12 @@ from ...utils import (
|
||||
auto_docstring,
|
||||
logging,
|
||||
)
|
||||
from .image_processing_donut import DonutImageProcessorKwargs
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class DonutFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
Args:
|
||||
do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`):
|
||||
Whether to resize the image using thumbnail method.
|
||||
do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`):
|
||||
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
|
||||
"""
|
||||
|
||||
do_thumbnail: Optional[bool]
|
||||
do_align_long_axis: Optional[bool]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class DonutImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BILINEAR
|
||||
@ -58,9 +46,9 @@ class DonutImageProcessorFast(BaseImageProcessorFast):
|
||||
do_thumbnail = True
|
||||
do_align_long_axis = False
|
||||
do_pad = True
|
||||
valid_kwargs = DonutFastImageProcessorKwargs
|
||||
valid_kwargs = DonutImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DonutFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[DonutImageProcessorKwargs]):
|
||||
size = kwargs.pop("size", None)
|
||||
if isinstance(size, (tuple, list)):
|
||||
size = size[::-1]
|
||||
@ -68,7 +56,7 @@ class DonutImageProcessorFast(BaseImageProcessorFast):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[DonutFastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[DonutImageProcessorKwargs]) -> BatchFeature:
|
||||
if "size" in kwargs:
|
||||
size = kwargs.pop("size")
|
||||
if isinstance(size, (tuple, list)):
|
||||
|
@ -74,8 +74,6 @@ class DonutProcessor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[DonutProcessorKwargs],
|
||||
):
|
||||
"""
|
||||
|
@ -44,6 +44,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
filter_out_non_signature_kwargs,
|
||||
@ -63,6 +64,26 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class DPTImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
ensure_multiple_of (`int`, *optional*, defaults to 1):
|
||||
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
|
||||
by `ensure_multiple_of` in `preprocess`.
|
||||
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
|
||||
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
|
||||
be overridden by `keep_aspect_ratio` in `preprocess`.
|
||||
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
|
||||
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
|
||||
is used for background, and background itself is not included in all classes of a dataset (e.g.
|
||||
ADE20k). The background label will be replaced by 255.
|
||||
"""
|
||||
|
||||
ensure_multiple_of: Optional[int]
|
||||
size_divisor: Optional[int]
|
||||
keep_aspect_ratio: Optional[bool]
|
||||
do_reduce_labels: Optional[bool]
|
||||
|
||||
|
||||
def get_resize_output_image_size(
|
||||
input_image: np.ndarray,
|
||||
output_size: Union[int, Iterable[int]],
|
||||
@ -151,6 +172,7 @@ class DPTImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
valid_kwargs = DPTImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -28,7 +28,7 @@ import torch
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
|
||||
from ...image_processing_base import BatchFeature
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast
|
||||
from ...image_transforms import group_images_by_shape, reorder_images
|
||||
from ...image_utils import (
|
||||
IMAGENET_STANDARD_MEAN,
|
||||
@ -41,35 +41,13 @@ from ...image_utils import (
|
||||
)
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring, requires_backends
|
||||
from .image_processing_dpt import DPTImageProcessorKwargs
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ...modeling_outputs import DepthEstimatorOutput
|
||||
|
||||
|
||||
class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
ensure_multiple_of (`int`, *optional*, defaults to 1):
|
||||
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
|
||||
by `ensure_multiple_of` in `preprocess`.
|
||||
size_divisor (`int`, *optional*):
|
||||
If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
|
||||
DINOv2 paper, which uses the model in combination with DPT.
|
||||
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
|
||||
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
|
||||
be overridden by `keep_aspect_ratio` in `preprocess`.
|
||||
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
|
||||
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
|
||||
is used for background, and background itself is not included in all classes of a dataset (e.g.
|
||||
ADE20k). The background label will be replaced by 255.
|
||||
"""
|
||||
|
||||
ensure_multiple_of: Optional[int]
|
||||
size_divisor: Optional[int]
|
||||
keep_aspect_ratio: Optional[bool]
|
||||
do_reduce_labels: Optional[bool]
|
||||
|
||||
|
||||
def get_resize_output_image_size(
|
||||
input_image: "torch.Tensor",
|
||||
output_size: Union[int, Iterable[int]],
|
||||
@ -123,13 +101,13 @@ class DPTImageProcessorFast(BaseImageProcessorFast):
|
||||
do_normalize = True
|
||||
do_reduce_labels = None
|
||||
|
||||
valid_kwargs = DPTFastImageProcessorKwargs
|
||||
valid_kwargs = DPTImageProcessorKwargs
|
||||
do_pad = False
|
||||
rescale_factor = 1 / 255
|
||||
ensure_multiple_of = 1
|
||||
keep_aspect_ratio = False
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DPTFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[DPTImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def reduce_label(self, labels: list["torch.Tensor"]):
|
||||
@ -147,7 +125,7 @@ class DPTImageProcessorFast(BaseImageProcessorFast):
|
||||
self,
|
||||
images: ImageInput,
|
||||
segmentation_maps: Optional[ImageInput] = None,
|
||||
**kwargs: Unpack[DPTFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[DPTImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
r"""
|
||||
segmentation_maps (`ImageInput`, *optional*):
|
||||
@ -162,7 +140,7 @@ class DPTImageProcessorFast(BaseImageProcessorFast):
|
||||
do_convert_rgb: bool,
|
||||
input_data_format: ChannelDimension,
|
||||
device: Optional[Union[str, "torch.device"]] = None,
|
||||
**kwargs: Unpack[DPTFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[DPTImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Preprocess image-like inputs.
|
||||
|
@ -21,7 +21,7 @@ from typing import TYPE_CHECKING, Optional, Union
|
||||
import torch
|
||||
|
||||
from ...image_processing_base import BatchFeature
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast
|
||||
from ...image_transforms import group_images_by_shape, reorder_images
|
||||
from ...image_utils import (
|
||||
IMAGENET_STANDARD_MEAN,
|
||||
@ -35,6 +35,7 @@ from ...utils import (
|
||||
requires_backends,
|
||||
)
|
||||
from ..beit.image_processing_beit_fast import BeitImageProcessorFast
|
||||
from .image_processing_dpt import DPTImageProcessorKwargs
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -82,29 +83,6 @@ def get_resize_output_image_size(
|
||||
return SizeDict(height=new_height, width=new_width)
|
||||
|
||||
|
||||
class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
ensure_multiple_of (`int`, *optional*, defaults to 1):
|
||||
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
|
||||
by `ensure_multiple_of` in `preprocess`.
|
||||
size_divisor (`int`, *optional*):
|
||||
If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
|
||||
DINOv2 paper, which uses the model in combination with DPT.
|
||||
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
|
||||
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
|
||||
be overridden by `keep_aspect_ratio` in `preprocess`.
|
||||
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
|
||||
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
|
||||
is used for background, and background itself is not included in all classes of a dataset (e.g.
|
||||
ADE20k). The background label will be replaced by 255.
|
||||
"""
|
||||
|
||||
ensure_multiple_of: Optional[int]
|
||||
size_divisor: Optional[int]
|
||||
keep_aspect_ratio: Optional[bool]
|
||||
do_reduce_labels: Optional[bool]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class DPTImageProcessorFast(BeitImageProcessorFast):
|
||||
resample = PILImageResampling.BICUBIC
|
||||
@ -123,7 +101,7 @@ class DPTImageProcessorFast(BeitImageProcessorFast):
|
||||
do_center_crop = None
|
||||
do_reduce_labels = None
|
||||
|
||||
valid_kwargs = DPTFastImageProcessorKwargs
|
||||
valid_kwargs = DPTImageProcessorKwargs
|
||||
|
||||
def resize(
|
||||
self,
|
||||
|
@ -34,6 +34,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, logging, requires_backends
|
||||
|
||||
|
||||
@ -49,6 +50,15 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class EfficientLoFTRImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
do_grayscale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
|
||||
"""
|
||||
|
||||
do_grayscale: Optional[bool] = True
|
||||
|
||||
|
||||
# Copied from transformers.models.superpoint.image_processing_superpoint.is_grayscale
|
||||
def is_grayscale(
|
||||
image: np.ndarray,
|
||||
@ -155,6 +165,7 @@ class EfficientLoFTRImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
valid_kwargs = EfficientLoFTRImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -22,7 +22,6 @@ from PIL import Image, ImageDraw
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -40,6 +39,7 @@ from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
)
|
||||
from .image_processing_efficientloftr import EfficientLoFTRImageProcessorKwargs
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -108,15 +108,6 @@ def convert_to_grayscale(
|
||||
return F.rgb_to_grayscale(image, num_output_channels=3)
|
||||
|
||||
|
||||
class EfficientLoFTRFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
do_grayscale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
|
||||
"""
|
||||
|
||||
do_grayscale: Optional[bool] = True
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class EfficientLoFTRImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BILINEAR
|
||||
@ -126,13 +117,13 @@ class EfficientLoFTRImageProcessorFast(BaseImageProcessorFast):
|
||||
do_rescale = True
|
||||
rescale_factor = 1 / 255
|
||||
do_normalize = None
|
||||
valid_kwargs = EfficientLoFTRFastImageProcessorKwargs
|
||||
valid_kwargs = EfficientLoFTRImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[EfficientLoFTRFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[EfficientLoFTRImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientLoFTRFastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientLoFTRImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _prepare_images_structure(
|
||||
|
@ -33,6 +33,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
|
||||
|
||||
@ -43,6 +44,18 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class EfficientNetImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`):
|
||||
Whether to rescale the image between [-max_range/2, scale_range/2] instead of [0, scale_range].
|
||||
include_top (`bool`, *optional*, defaults to `self.include_top`):
|
||||
Normalize the image again with the standard deviation only for image classification if set to True.
|
||||
"""
|
||||
|
||||
rescale_offset: bool
|
||||
include_top: bool
|
||||
|
||||
|
||||
class EfficientNetImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a EfficientNet image processor.
|
||||
@ -83,6 +96,7 @@ class EfficientNetImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
valid_kwargs = EfficientNetImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -20,7 +20,7 @@ from typing import Optional, Union
|
||||
import torch
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature
|
||||
from ...image_transforms import group_images_by_shape, reorder_images
|
||||
from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict
|
||||
from ...processing_utils import Unpack
|
||||
@ -28,19 +28,7 @@ from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
)
|
||||
|
||||
|
||||
class EfficientNetFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
Args:
|
||||
rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`):
|
||||
Whether to rescale the image between [-max_range/2, scale_range/2] instead of [0, scale_range].
|
||||
include_top (`bool`, *optional*, defaults to `self.include_top`):
|
||||
Normalize the image again with the standard deviation only for image classification if set to True.
|
||||
"""
|
||||
|
||||
rescale_offset: bool
|
||||
include_top: bool
|
||||
from .image_processing_efficientnet import EfficientNetImageProcessorKwargs
|
||||
|
||||
|
||||
@auto_docstring
|
||||
@ -57,9 +45,9 @@ class EfficientNetImageProcessorFast(BaseImageProcessorFast):
|
||||
rescale_offset = False
|
||||
do_normalize = True
|
||||
include_top = True
|
||||
valid_kwargs = EfficientNetFastImageProcessorKwargs
|
||||
valid_kwargs = EfficientNetImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[EfficientNetFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[EfficientNetImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def rescale(
|
||||
@ -195,7 +183,7 @@ class EfficientNetImageProcessorFast(BaseImageProcessorFast):
|
||||
return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientNetFastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientNetImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
|
||||
|
@ -37,6 +37,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, is_vision_available, logging
|
||||
|
||||
|
||||
@ -46,6 +47,11 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Emu3ImageProcessorKwargs(ImagesKwargs):
|
||||
ratio: Optional[str]
|
||||
image_area: Optional[int]
|
||||
|
||||
|
||||
def smart_resize(
|
||||
height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280
|
||||
):
|
||||
@ -108,6 +114,7 @@ class Emu3ImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "image_sizes"]
|
||||
valid_kwargs = Emu3ImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -20,7 +20,7 @@ import numpy as np
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
|
||||
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from ...utils import is_vision_available
|
||||
|
||||
@ -33,14 +33,8 @@ class Emu3TextKwargs(TextKwargs, total=False):
|
||||
return_for_image_generation: bool
|
||||
|
||||
|
||||
class Emu3ImagesKwargs(ImagesKwargs, total=False):
|
||||
ratio: str
|
||||
image_area: int
|
||||
|
||||
|
||||
class Emu3ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
text_kwargs: Emu3TextKwargs
|
||||
images_kwargs: Emu3ImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"return_for_image_generation": False,
|
||||
@ -95,8 +89,6 @@ class Emu3Processor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[Emu3ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -36,6 +36,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import (
|
||||
IMAGENET_DEFAULT_MEAN,
|
||||
IMAGENET_DEFAULT_STD,
|
||||
@ -53,6 +54,21 @@ if is_torch_available():
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class EomtImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
do_split_image (`bool`, *optional*, defaults to `False`):
|
||||
Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the
|
||||
input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches.
|
||||
Otherwise, the input images will be padded to the target size.
|
||||
ignore_index (`int`, *optional*):
|
||||
Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
|
||||
denoted with 0 (background) will be replaced with `ignore_index`.
|
||||
"""
|
||||
|
||||
do_split_image: bool
|
||||
ignore_index: Optional[int] = None
|
||||
|
||||
|
||||
# Adapted from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks
|
||||
def convert_segmentation_map_to_binary_masks(
|
||||
segmentation_map: np.ndarray,
|
||||
|
@ -24,7 +24,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -43,6 +42,7 @@ from ...utils import (
|
||||
filter_out_non_signature_kwargs,
|
||||
)
|
||||
from .image_processing_eomt import (
|
||||
EomtImageProcessorKwargs,
|
||||
compute_segments,
|
||||
convert_segmentation_map_to_binary_masks,
|
||||
get_size_with_aspect_ratio,
|
||||
@ -50,25 +50,6 @@ from .image_processing_eomt import (
|
||||
)
|
||||
|
||||
|
||||
class EomtImageProcessorFastKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
do_split_image (`bool`, *optional*, defaults to `False`):
|
||||
Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the
|
||||
input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches.
|
||||
Otherwise, the input images will be padded to the target size.
|
||||
do_pad (`bool`, *optional*, defaults to `False`):
|
||||
Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
|
||||
number of patches in the batch. Padding will be applied to the bottom and right with zeros.
|
||||
ignore_index (`int`, *optional*):
|
||||
Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
|
||||
denoted with 0 (background) will be replaced with `ignore_index`.
|
||||
"""
|
||||
|
||||
do_split_image: bool
|
||||
do_pad: bool
|
||||
ignore_index: Optional[int] = None
|
||||
|
||||
|
||||
def get_target_size(size_dict: dict[str, int]) -> tuple[int, int]:
|
||||
"""Returns the height and width from a size dict."""
|
||||
target_height = size_dict["shortest_edge"]
|
||||
@ -102,9 +83,9 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
|
||||
do_split_image = False
|
||||
do_pad = False
|
||||
ignore_index = None
|
||||
valid_kwargs = EomtImageProcessorFastKwargs
|
||||
valid_kwargs = EomtImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[EomtImageProcessorFastKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[EomtImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def _split_image(self, images: torch.Tensor, size: dict, image_indices: int) -> tuple[list, list]:
|
||||
@ -153,7 +134,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
|
||||
images: ImageInput,
|
||||
segmentation_maps: Optional[list[torch.Tensor]] = None,
|
||||
instance_id_to_semantic_id: Optional[dict[int, int]] = None,
|
||||
**kwargs: Unpack[EomtImageProcessorFastKwargs],
|
||||
**kwargs: Unpack[EomtImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
r"""
|
||||
segmentation_maps (`ImageInput`, *optional*):
|
||||
@ -171,7 +152,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
|
||||
do_convert_rgb: bool,
|
||||
input_data_format: ChannelDimension,
|
||||
device: Optional[Union[str, "torch.device"]] = None,
|
||||
**kwargs: Unpack[EomtImageProcessorFastKwargs],
|
||||
**kwargs: Unpack[EomtImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Preprocess image-like inputs.
|
||||
|
@ -37,6 +37,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
from ...utils.import_utils import requires
|
||||
|
||||
@ -56,6 +57,89 @@ FLAVA_CODEBOOK_STD = [1.0, 1.0, 1.0]
|
||||
LOGIT_LAPLACE_EPS: float = 0.1
|
||||
|
||||
|
||||
class FlavaImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
return_image_mask (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
|
||||
input_size_patches (`int`, *optional*, defaults to 14):
|
||||
Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden
|
||||
by the `input_size_patches` parameter in `preprocess`.
|
||||
total_mask_patches (`int`, *optional*, defaults to 75):
|
||||
Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in
|
||||
`preprocess`.
|
||||
mask_group_min_patches (`int`, *optional*, defaults to 16):
|
||||
Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches`
|
||||
parameter in `preprocess`.
|
||||
mask_group_max_patches (`int`, *optional*):
|
||||
Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches`
|
||||
parameter in `preprocess`.
|
||||
mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3):
|
||||
Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter
|
||||
in `preprocess`.
|
||||
mask_group_max_aspect_ratio (`float`, *optional*):
|
||||
Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter
|
||||
in `preprocess`.
|
||||
return_codebook_pixels (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return the codebook pixel values.
|
||||
codebook_do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to resize the input for codebook to a certain. Can be overridden by the `codebook_do_resize`
|
||||
parameter in `preprocess`. `codebook_size`.
|
||||
codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
|
||||
Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
|
||||
`preprocess`.
|
||||
codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
|
||||
Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
|
||||
parameter in `preprocess`.
|
||||
codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
|
||||
Whether to crop the input for codebook at the center. If the input size is smaller than
|
||||
`codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
|
||||
overridden by the `codebook_do_center_crop` parameter in `preprocess`.
|
||||
codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
|
||||
Desired output size for codebook input when applying center-cropping. Can be overridden by the
|
||||
`codebook_crop_size` parameter in `preprocess`.
|
||||
codebook_do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
|
||||
overridden by the `codebook_do_rescale` parameter in `preprocess`.
|
||||
codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
|
||||
`codebook_rescale_factor` parameter in `preprocess`.
|
||||
codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
|
||||
Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
|
||||
`codebook_do_map_pixels` parameter in `preprocess`.
|
||||
codebook_do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
|
||||
be overridden by the `codebook_do_normalize` parameter in `preprocess`.
|
||||
codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
|
||||
The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
|
||||
by the `codebook_image_mean` parameter in `preprocess`.
|
||||
codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
|
||||
The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
|
||||
be overridden by the `codebook_image_std` parameter in `preprocess`.
|
||||
"""
|
||||
|
||||
# Mask related params
|
||||
return_image_mask: Optional[bool]
|
||||
input_size_patches: Optional[int]
|
||||
total_mask_patches: Optional[int]
|
||||
mask_group_min_patches: Optional[int]
|
||||
mask_group_max_patches: Optional[int]
|
||||
mask_group_min_aspect_ratio: Optional[float]
|
||||
mask_group_max_aspect_ratio: Optional[float]
|
||||
# Codebook related params
|
||||
return_codebook_pixels: Optional[bool]
|
||||
codebook_do_resize: Optional[bool]
|
||||
codebook_size: Optional[bool]
|
||||
codebook_resample: Optional[int]
|
||||
codebook_do_center_crop: Optional[bool]
|
||||
codebook_crop_size: Optional[int]
|
||||
codebook_do_rescale: Optional[bool]
|
||||
codebook_rescale_factor: Optional[Union[int, float]]
|
||||
codebook_do_map_pixels: Optional[bool]
|
||||
codebook_do_normalize: Optional[bool]
|
||||
codebook_image_mean: Optional[Union[float, Iterable[float]]]
|
||||
codebook_image_std: Optional[Union[float, Iterable[float]]]
|
||||
|
||||
|
||||
# Inspired from https://github.com/microsoft/unilm/blob/master/beit/masking_generator.py
|
||||
class FlavaMaskingGenerator:
|
||||
def __init__(
|
||||
@ -225,6 +309,7 @@ class FlavaImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
valid_kwargs = FlavaImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -16,7 +16,6 @@
|
||||
|
||||
import math
|
||||
import random
|
||||
from collections.abc import Iterable
|
||||
from functools import lru_cache
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
@ -26,7 +25,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
get_size_dict,
|
||||
)
|
||||
from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images
|
||||
@ -42,6 +40,7 @@ from .image_processing_flava import (
|
||||
FLAVA_IMAGE_MEAN,
|
||||
FLAVA_IMAGE_STD,
|
||||
LOGIT_LAPLACE_EPS,
|
||||
FlavaImageProcessorKwargs,
|
||||
)
|
||||
|
||||
|
||||
@ -121,90 +120,6 @@ class FlavaMaskingGenerator:
|
||||
return mask
|
||||
|
||||
|
||||
class FlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
Args:
|
||||
return_image_mask (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
|
||||
input_size_patches (`int`, *optional*, defaults to 14):
|
||||
Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden
|
||||
by the `input_size_patches` parameter in `preprocess`.
|
||||
total_mask_patches (`int`, *optional*, defaults to 75):
|
||||
Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in
|
||||
`preprocess`.
|
||||
mask_group_min_patches (`int`, *optional*, defaults to 16):
|
||||
Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches`
|
||||
parameter in `preprocess`.
|
||||
mask_group_max_patches (`int`, *optional*):
|
||||
Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches`
|
||||
parameter in `preprocess`.
|
||||
mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3):
|
||||
Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter
|
||||
in `preprocess`.
|
||||
mask_group_max_aspect_ratio (`float`, *optional*):
|
||||
Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter
|
||||
in `preprocess`.
|
||||
return_codebook_pixels (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return the codebook pixel values.
|
||||
codebook_do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to resize the input for codebook to a certain. Can be overridden by the `codebook_do_resize`
|
||||
parameter in `preprocess`. `codebook_size`.
|
||||
codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
|
||||
Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
|
||||
`preprocess`.
|
||||
codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
|
||||
Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
|
||||
parameter in `preprocess`.
|
||||
codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
|
||||
Whether to crop the input for codebook at the center. If the input size is smaller than
|
||||
`codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
|
||||
overridden by the `codebook_do_center_crop` parameter in `preprocess`.
|
||||
codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
|
||||
Desired output size for codebook input when applying center-cropping. Can be overridden by the
|
||||
`codebook_crop_size` parameter in `preprocess`.
|
||||
codebook_do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
|
||||
overridden by the `codebook_do_rescale` parameter in `preprocess`.
|
||||
codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
|
||||
`codebook_rescale_factor` parameter in `preprocess`.
|
||||
codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
|
||||
Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
|
||||
`codebook_do_map_pixels` parameter in `preprocess`.
|
||||
codebook_do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
|
||||
be overridden by the `codebook_do_normalize` parameter in `preprocess`.
|
||||
codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
|
||||
The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
|
||||
by the `codebook_image_mean` parameter in `preprocess`.
|
||||
codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
|
||||
The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
|
||||
be overridden by the `codebook_image_std` parameter in `preprocess`.
|
||||
"""
|
||||
|
||||
# Mask related params
|
||||
return_image_mask: Optional[bool]
|
||||
input_size_patches: Optional[int]
|
||||
total_mask_patches: Optional[int]
|
||||
mask_group_min_patches: Optional[int]
|
||||
mask_group_max_patches: Optional[int]
|
||||
mask_group_min_aspect_ratio: Optional[float]
|
||||
mask_group_max_aspect_ratio: Optional[float]
|
||||
# Codebook related params
|
||||
return_codebook_pixels: Optional[bool]
|
||||
codebook_do_resize: Optional[bool]
|
||||
codebook_size: Optional[bool]
|
||||
codebook_resample: Optional[int]
|
||||
codebook_do_center_crop: Optional[bool]
|
||||
codebook_crop_size: Optional[int]
|
||||
codebook_do_rescale: Optional[bool]
|
||||
codebook_rescale_factor: Optional[Union[int, float]]
|
||||
codebook_do_map_pixels: Optional[bool]
|
||||
codebook_do_normalize: Optional[bool]
|
||||
codebook_image_mean: Optional[Union[float, Iterable[float]]]
|
||||
codebook_image_std: Optional[Union[float, Iterable[float]]]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class FlavaImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BICUBIC
|
||||
@ -239,13 +154,13 @@ class FlavaImageProcessorFast(BaseImageProcessorFast):
|
||||
codebook_do_normalize = True
|
||||
codebook_image_mean = FLAVA_CODEBOOK_MEAN
|
||||
codebook_image_std = FLAVA_CODEBOOK_STD
|
||||
valid_kwargs = FlavaFastImageProcessorKwargs
|
||||
valid_kwargs = FlavaImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[FlavaFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[FlavaImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[FlavaImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
@classmethod
|
||||
|
@ -17,39 +17,8 @@ Image/Text processor class for FLAVA
|
||||
"""
|
||||
|
||||
import warnings
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin
|
||||
|
||||
|
||||
class FlavaImagesKwargs(ImagesKwargs):
|
||||
# Mask related params
|
||||
return_image_mask: Optional[bool]
|
||||
input_size_patches: Optional[int]
|
||||
total_mask_patches: Optional[int]
|
||||
mask_group_min_patches: Optional[int]
|
||||
mask_group_max_patches: Optional[int]
|
||||
mask_group_min_aspect_ratio: Optional[float]
|
||||
mask_group_max_aspect_ratio: Optional[float]
|
||||
# Codebook related params
|
||||
return_codebook_pixels: Optional[bool]
|
||||
codebook_do_resize: Optional[bool]
|
||||
codebook_size: Optional[bool]
|
||||
codebook_resample: Optional[int]
|
||||
codebook_do_center_crop: Optional[bool]
|
||||
codebook_crop_size: Optional[int]
|
||||
codebook_do_rescale: Optional[bool]
|
||||
codebook_rescale_factor: Optional[Union[int, float]]
|
||||
codebook_do_map_pixels: Optional[bool]
|
||||
codebook_do_normalize: Optional[bool]
|
||||
codebook_image_mean: Optional[Union[float, Iterable[float]]]
|
||||
codebook_image_std: Optional[Union[float, Iterable[float]]]
|
||||
|
||||
|
||||
class FlavaProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: FlavaImagesKwargs
|
||||
_defaults = {}
|
||||
from ...processing_utils import ProcessorMixin
|
||||
|
||||
|
||||
class FlavaProcessor(ProcessorMixin):
|
||||
@ -67,7 +36,6 @@ class FlavaProcessor(ProcessorMixin):
|
||||
attributes = ["image_processor", "tokenizer"]
|
||||
image_processor_class = "FlavaImageProcessor"
|
||||
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
|
||||
valid_processor_kwargs = FlavaProcessorKwargs
|
||||
|
||||
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
||||
feature_extractor = None
|
||||
|
@ -39,7 +39,6 @@ logger = logging.get_logger(__name__)
|
||||
class Florence2ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
_defaults = {
|
||||
"text_kwargs": {"padding": False, "return_mm_token_type_ids": False},
|
||||
"images_kwargs": {},
|
||||
}
|
||||
|
||||
|
||||
|
@ -71,7 +71,6 @@ class FuyuProcessorKwargs(ProcessingKwargs, total=False):
|
||||
"verbose": True,
|
||||
"return_mm_token_type_ids": False,
|
||||
},
|
||||
"images_kwargs": {},
|
||||
}
|
||||
|
||||
|
||||
@ -487,8 +486,6 @@ class FuyuProcessor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[FuyuProcessorKwargs],
|
||||
) -> "FuyuBatchFeature":
|
||||
"""
|
||||
|
@ -40,6 +40,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
|
||||
|
||||
@ -50,6 +51,24 @@ if is_vision_available():
|
||||
import PIL
|
||||
|
||||
|
||||
class Gemma3ImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
do_pan_and_scan (`bool`, *optional*):
|
||||
Whether to apply `pan_and_scan` to images.
|
||||
pan_and_scan_min_crop_size (`int`, *optional*):
|
||||
Minimum size of each crop in pan and scan.
|
||||
pan_and_scan_max_num_crops (`int`, *optional*):
|
||||
Maximum number of crops per image in pan and scan.
|
||||
pan_and_scan_min_ratio_to_activate (`float`, *optional*):
|
||||
Minimum aspect ratio to activate pan and scan.
|
||||
"""
|
||||
|
||||
do_pan_and_scan: Optional[bool]
|
||||
pan_and_scan_min_crop_size: Optional[int]
|
||||
pan_and_scan_max_num_crops: Optional[int]
|
||||
pan_and_scan_min_ratio_to_activate: Optional[float]
|
||||
|
||||
|
||||
class Gemma3ImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a SigLIP image processor.
|
||||
@ -91,6 +110,7 @@ class Gemma3ImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "num_crops"]
|
||||
valid_kwargs = Gemma3ImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -24,7 +24,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -35,29 +34,12 @@ from ...utils import (
|
||||
auto_docstring,
|
||||
logging,
|
||||
)
|
||||
from .image_processing_gemma3 import Gemma3ImageProcessorKwargs
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Gemma3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
do_pan_and_scan (`bool`, *optional*):
|
||||
Whether to apply `pan_and_scan` to images.
|
||||
pan_and_scan_min_crop_size (`int`, *optional*):
|
||||
Minimum size of each crop in pan and scan.
|
||||
pan_and_scan_max_num_crops (`int`, *optional*):
|
||||
Maximum number of crops per image in pan and scan.
|
||||
pan_and_scan_min_ratio_to_activate (`float`, *optional*):
|
||||
Minimum aspect ratio to activate pan and scan.
|
||||
"""
|
||||
|
||||
do_pan_and_scan: Optional[bool]
|
||||
pan_and_scan_min_crop_size: Optional[int]
|
||||
pan_and_scan_max_num_crops: Optional[int]
|
||||
pan_and_scan_min_ratio_to_activate: Optional[float]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Gemma3ImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BILINEAR
|
||||
@ -73,9 +55,9 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
|
||||
pan_and_scan_min_crop_size = None
|
||||
pan_and_scan_max_num_crops = None
|
||||
pan_and_scan_min_ratio_to_activate = None
|
||||
valid_kwargs = Gemma3FastImageProcessorKwargs
|
||||
valid_kwargs = Gemma3ImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[Gemma3FastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[Gemma3ImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def pan_and_scan_batched(
|
||||
@ -167,7 +149,7 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
**kwargs: Unpack[Gemma3FastImageProcessorKwargs],
|
||||
**kwargs: Unpack[Gemma3ImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
|
@ -20,21 +20,12 @@ import numpy as np
|
||||
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...image_utils import ImageInput, make_nested_list_of_images
|
||||
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from ...utils import to_py_obj
|
||||
|
||||
|
||||
class Gemma3ImagesKwargs(ImagesKwargs):
|
||||
do_pan_and_scan: Optional[bool]
|
||||
pan_and_scan_min_crop_size: Optional[int]
|
||||
pan_and_scan_max_num_crops: Optional[int]
|
||||
pan_and_scan_min_ratio_to_activate: Optional[float]
|
||||
do_convert_rgb: Optional[bool]
|
||||
|
||||
|
||||
class Gemma3ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: Gemma3ImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": False,
|
||||
@ -81,8 +72,6 @@ class Gemma3Processor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
videos=None,
|
||||
audio=None,
|
||||
**kwargs: Unpack[Gemma3ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
if text is None and images is None:
|
||||
|
@ -19,21 +19,13 @@ import numpy as np
|
||||
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...image_utils import ImageInput, make_nested_list_of_images
|
||||
from ...processing_utils import AudioKwargs, ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
|
||||
|
||||
class Gemma3nImagesKwargs(ImagesKwargs):
|
||||
do_convert_rgb: Optional[bool]
|
||||
|
||||
|
||||
class Gemma3nProcessorKwargs(ProcessingKwargs, total=False):
|
||||
audio_kwargs: AudioKwargs
|
||||
images_kwargs: Gemma3nImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": False,
|
||||
},
|
||||
"text_kwargs": {"padding": False},
|
||||
}
|
||||
|
||||
|
||||
@ -101,7 +93,6 @@ class Gemma3nProcessor(ProcessorMixin):
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
audio: Optional[Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]]] = None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[Gemma3nProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
if text is None and images is None and audio is None:
|
||||
|
@ -39,6 +39,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, logging
|
||||
from ...video_utils import VideoInput
|
||||
|
||||
@ -46,6 +47,21 @@ from ...video_utils import VideoInput
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Glm4vImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
patch_size (`int`, *optional*, defaults to 14):
|
||||
The spatial patch size of the vision encoder.
|
||||
temporal_patch_size (`int`, *optional*, defaults to 2):
|
||||
The temporal patch size of the vision encoder.
|
||||
merge_size (`int`, *optional*, defaults to 2):
|
||||
The merge size of the vision encoder to llm encoder.
|
||||
"""
|
||||
|
||||
patch_size: Optional[int]
|
||||
temporal_patch_size: Optional[int]
|
||||
merge_size: Optional[int]
|
||||
|
||||
|
||||
def smart_resize(
|
||||
num_frames: int,
|
||||
height: int,
|
||||
@ -120,6 +136,7 @@ class Glm4vImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "image_grid_thw"]
|
||||
valid_kwargs = Glm4vImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -24,7 +24,6 @@ from ...image_processing_utils import (
|
||||
)
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -41,27 +40,12 @@ from ...utils import (
|
||||
auto_docstring,
|
||||
logging,
|
||||
)
|
||||
from .image_processing_glm4v import smart_resize
|
||||
from .image_processing_glm4v import Glm4vImageProcessorKwargs, smart_resize
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Glm4vFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
patch_size (`int`, *optional*, defaults to 14):
|
||||
The spatial patch size of the vision encoder.
|
||||
temporal_patch_size (`int`, *optional*, defaults to 2):
|
||||
The temporal patch size of the vision encoder.
|
||||
merge_size (`int`, *optional*, defaults to 2):
|
||||
The merge size of the vision encoder to llm encoder.
|
||||
"""
|
||||
|
||||
patch_size: Optional[int]
|
||||
temporal_patch_size: Optional[int]
|
||||
merge_size: Optional[int]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
do_resize = True
|
||||
@ -75,10 +59,10 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
patch_size = 14
|
||||
temporal_patch_size = 2
|
||||
merge_size = 2
|
||||
valid_kwargs = Glm4vFastImageProcessorKwargs
|
||||
valid_kwargs = Glm4vImageProcessorKwargs
|
||||
model_input_names = ["pixel_values", "image_grid_thw"]
|
||||
|
||||
def __init__(self, **kwargs: Unpack[Glm4vFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[Glm4vImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
if self.size is not None and (
|
||||
self.size.get("shortest_edge", None) is None or self.size.get("longest_edge", None) is None
|
||||
@ -205,7 +189,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
**kwargs: Unpack[Glm4vFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[Glm4vImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
|
@ -32,7 +32,7 @@ from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutputWithPast
|
||||
from ...modeling_rope_utils import rope_config_validation
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||
from ...processing_utils import ImagesKwargs, Unpack
|
||||
from ...processing_utils import Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
|
||||
from ...utils.generic import check_model_inputs
|
||||
@ -52,7 +52,6 @@ from ..qwen2_5_vl.modeling_qwen2_5_vl import (
|
||||
Qwen2_5_VLVisionAttention,
|
||||
Qwen2_5_VLVisionBlock,
|
||||
)
|
||||
from ..qwen2_5_vl.processing_qwen2_5_vl import Qwen2_5_VLVideosProcessorKwargs
|
||||
from ..qwen2_vl.processing_qwen2_vl import (
|
||||
Qwen2_VLProcessor,
|
||||
Qwen2_VLProcessorKwargs,
|
||||
@ -1508,19 +1507,7 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
|
||||
return image_counts, video_counts
|
||||
|
||||
|
||||
class Glm4vVideosProcessorKwargs(Qwen2_5_VLVideosProcessorKwargs):
|
||||
pass
|
||||
|
||||
|
||||
class Glm4vImagesKwargs(ImagesKwargs):
|
||||
patch_size: Optional[int]
|
||||
temporal_patch_size: Optional[int]
|
||||
merge_size: Optional[int]
|
||||
|
||||
|
||||
class Glm4vProcessorKwargs(Qwen2_VLProcessorKwargs):
|
||||
images_kwargs: Glm4vImagesKwargs
|
||||
videos_kwargs: Glm4vVideosProcessorKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": False,
|
||||
|
@ -24,7 +24,7 @@ import numpy as np
|
||||
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
|
||||
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from ...utils import logging
|
||||
from ...video_utils import VideoInput
|
||||
@ -33,18 +33,7 @@ from ...video_utils import VideoInput
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Glm4vVideosProcessorKwargs(VideosKwargs, total=False):
|
||||
fps: Union[list[float], float]
|
||||
|
||||
|
||||
class Glm4vImagesKwargs(ImagesKwargs):
|
||||
patch_size: Optional[int]
|
||||
temporal_patch_size: Optional[int]
|
||||
merge_size: Optional[int]
|
||||
|
||||
|
||||
class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: Glm4vImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": False,
|
||||
@ -53,7 +42,6 @@ class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
|
||||
},
|
||||
"videos_kwargs": {"return_metadata": True},
|
||||
}
|
||||
videos_kwargs: Glm4vVideosProcessorKwargs
|
||||
|
||||
|
||||
class Glm4vProcessor(ProcessorMixin):
|
||||
|
@ -37,12 +37,11 @@ from .image_processing_glm4v import smart_resize
|
||||
|
||||
|
||||
class Glm4vVideoProcessorInitKwargs(VideosKwargs):
|
||||
max_image_size: dict[str, int] = None
|
||||
patch_size: Optional[int] = None
|
||||
temporal_patch_size: Optional[int] = None
|
||||
merge_size: Optional[int] = None
|
||||
image_mean: Optional[list[float]] = None
|
||||
image_std: Optional[list[float]] = None
|
||||
max_image_size: Optional[dict[str, int]]
|
||||
patch_size: Optional[int]
|
||||
temporal_patch_size: Optional[int]
|
||||
merge_size: Optional[int]
|
||||
max_duration: Optional[int]
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
|
@ -38,6 +38,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
|
||||
|
||||
@ -48,6 +49,24 @@ if is_vision_available():
logger = logging.get_logger(__name__)


class GotOcr2ImageProcessorKwargs(ImagesKwargs):
"""
crop_to_patches (`bool`, *optional*, defaults to `False`):
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
`preprocess` method.
min_patches (`int`, *optional*, defaults to 1):
The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
max_patches (`int`, *optional*, defaults to 12):
The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
"""

crop_to_patches: Optional[bool]
min_patches: Optional[int]
max_patches: Optional[int]


# Similar to image_processing_mllama.get_all_supported_aspect_ratios
@lru_cache(maxsize=10)
def get_all_supported_aspect_ratios(min_image_tiles: int, max_image_tiles: int) -> list[tuple[int, int]]:
@ -168,6 +187,7 @@ class GotOcr2ImageProcessor(BaseImageProcessor):
"""

model_input_names = ["pixel_values"]
valid_kwargs = GotOcr2ImageProcessorKwargs

def __init__(
self,
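For GOT-OCR2 the consolidation means the patch-cropping switches documented above are ordinary call-time kwargs for both the slow and the fast processor. A hedged usage sketch, where the checkpoint id and the values are illustrative rather than taken from this diff:

```python
import numpy as np
from PIL import Image

from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")  # illustrative checkpoint
image = Image.fromarray(np.zeros((384, 384, 3), dtype=np.uint8))  # dummy image

# crop_to_patches / min_patches / max_patches are keys of GotOcr2ImageProcessorKwargs,
# now validated through `valid_kwargs` instead of a Fast-only TypedDict.
batch = processor(image, crop_to_patches=True, min_patches=1, max_patches=12, return_tensors="pt")
print(batch["pixel_values"].shape)
```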
@ -22,7 +22,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -32,25 +31,7 @@ from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
)
|
||||
from .image_processing_got_ocr2 import get_optimal_tiled_canvas
|
||||
|
||||
|
||||
class GotOcr2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
crop_to_patches (`bool`, *optional*, defaults to `False`):
|
||||
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
|
||||
`preprocess` method.
|
||||
min_patches (`int`, *optional*, defaults to 1):
|
||||
The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
|
||||
set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
|
||||
max_patches (`int`, *optional*, defaults to 12):
|
||||
The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
|
||||
set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
crop_to_patches: Optional[bool]
|
||||
min_patches: Optional[int]
|
||||
max_patches: Optional[int]
|
||||
from .image_processing_got_ocr2 import GotOcr2ImageProcessorKwargs, get_optimal_tiled_canvas
|
||||
|
||||
|
||||
@auto_docstring
|
||||
@ -66,13 +47,13 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
|
||||
crop_to_patches = False
|
||||
min_patches = 1
|
||||
max_patches = 12
|
||||
valid_kwargs = GotOcr2FastImageProcessorKwargs
|
||||
valid_kwargs = GotOcr2ImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[GotOcr2FastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[GotOcr2ImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2FastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2ImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def crop_image_to_patches(
|
||||
|
@ -18,11 +18,10 @@ from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
|
||||
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from ...utils import is_vision_available, logging
|
||||
|
||||
|
||||
@ -37,13 +36,13 @@ class GotOcr2TextKwargs(TextKwargs, total=False):
|
||||
|
||||
|
||||
class GotOcr2ImagesKwargs(ImagesKwargs, total=False):
|
||||
crop_to_patches: Optional[bool]
|
||||
min_patches: Optional[int]
|
||||
max_patches: Optional[int]
|
||||
box: Optional[Union[list, tuple[float, float], tuple[float, float, float, float]]]
|
||||
color: Optional[str]
|
||||
num_image_tokens: Optional[int]
|
||||
multi_page: Optional[bool]
|
||||
crop_to_patches: Optional[bool]
|
||||
min_patches: Optional[int]
|
||||
max_patches: Optional[int]
|
||||
|
||||
|
||||
class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
@ -136,8 +135,6 @@ class GotOcr2Processor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[GotOcr2ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -49,8 +49,6 @@ class GraniteSpeechProcessor(ProcessorMixin):
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
|
||||
audio: Union["torch.Tensor", list["torch.Tensor"]] = None,
|
||||
device: str = "cpu",
|
||||
images=None,
|
||||
videos=None,
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
requires_backends(self, ["torch"])
|
||||
|
@ -51,6 +51,7 @@ from ...image_utils import (
|
||||
validate_kwargs,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import (
|
||||
ExplicitEnum,
|
||||
TensorType,
|
||||
@ -91,6 +92,29 @@ class AnnotationFormat(ExplicitEnum):
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


class GroundingDinoImageProcessorKwargs(ImagesKwargs):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
Whether to return segmentation masks.
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
Annotations to transform according to the padding that is applied to the images.
masks_path (`str` or `pathlib.Path`, *optional*):
Path to the directory containing the segmentation masks.
"""

format: Optional[Union[str, AnnotationFormat]]
do_convert_annotations: Optional[bool]
return_segmentation_masks: Optional[bool]
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
masks_path: Optional[Union[str, pathlib.Path]]


# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]:
"""
@ -865,6 +889,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
"""

model_input_names = ["pixel_values", "pixel_mask"]
valid_kwargs = GroundingDinoImageProcessorKwargs

# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
def __init__(
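Grounding DINO's annotation handling follows the same route: the detection-specific keys above live in the one TypedDict, so they can be passed straight to the image processor call. A hedged sketch with an illustrative checkpoint id and a minimal COCO-detection annotation:

```python
import numpy as np
from PIL import Image

from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")  # illustrative checkpoint
image = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))

# Minimal COCO-detection style annotation matching the docstring above.
annotations = {
    "image_id": 0,
    "annotations": [{"bbox": [10.0, 10.0, 100.0, 100.0], "category_id": 1, "area": 10000.0, "iscrowd": 0}],
}

batch = processor(
    images=image,
    annotations=annotations,
    format="coco_detection",
    do_convert_annotations=True,  # boxes become normalized (cx, cy, w, h)
    return_tensors="pt",
)
print(batch.keys())  # pixel_values, pixel_mask, labels
```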
@ -4,6 +4,26 @@
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_grounding_dino.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# coding=utf-8
|
||||
# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import pathlib
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
@ -14,7 +34,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature, get_size_dict
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
SizeDict,
|
||||
get_image_size_for_max_height_width,
|
||||
get_max_height_width,
|
||||
@ -35,7 +54,7 @@ from ...image_utils import (
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring, logging
|
||||
from ...utils.import_utils import requires
|
||||
from .image_processing_grounding_dino import get_size_with_aspect_ratio
|
||||
from .image_processing_grounding_dino import GroundingDinoImageProcessorKwargs, get_size_with_aspect_ratio
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -44,24 +63,6 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class GroundingDinoFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return segmentation masks.
|
||||
"""
|
||||
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
|
||||
|
||||
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
||||
|
||||
|
||||
@ -304,9 +305,9 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
|
||||
size = {"shortest_edge": 800, "longest_edge": 1333}
|
||||
default_to_square = False
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = GroundingDinoFastImageProcessorKwargs
|
||||
valid_kwargs = GroundingDinoImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[GroundingDinoFastImageProcessorKwargs]) -> None:
|
||||
def __init__(self, **kwargs: Unpack[GroundingDinoImageProcessorKwargs]) -> None:
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
|
||||
@ -568,25 +569,8 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
|
||||
masks_path: Optional[Union[str, pathlib.Path]] = None,
|
||||
**kwargs: Unpack[GroundingDinoFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[GroundingDinoImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
r"""
|
||||
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
|
||||
List of annotations associated with the image or batch of images. If annotation is for object
|
||||
detection, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
|
||||
dictionary. An image can have no annotations, in which case the list should be empty.
|
||||
If annotation is for segmentation, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
|
||||
An image can have no segments, in which case the list should be empty.
|
||||
- "file_name" (`str`): The file name of the image.
|
||||
masks_path (`str` or `pathlib.Path`, *optional*):
|
||||
Path to the directory containing the segmentation masks.
|
||||
"""
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
logger.warning_once(
|
||||
@ -601,7 +585,7 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
|
||||
)
|
||||
kwargs["size"] = kwargs.pop("max_size")
|
||||
|
||||
return super().preprocess(images, annotations, masks_path, **kwargs)
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
|
@ -1,3 +1,23 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
|
||||
import torch
|
||||
|
@ -16,13 +16,12 @@
|
||||
Processor class for Grounding DINO.
|
||||
"""
|
||||
|
||||
import pathlib
|
||||
import warnings
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
|
||||
from ...image_transforms import center_to_corners_format
|
||||
from ...image_utils import AnnotationFormat, ImageInput
|
||||
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
|
||||
from ...utils import TensorType, is_torch_available
|
||||
|
||||
@ -99,16 +98,7 @@ class DictWithDeprecationWarning(dict):
|
||||
return super().get(key, *args, **kwargs)
|
||||
|
||||
|
||||
class GroundingDinoImagesKwargs(ImagesKwargs, total=False):
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
masks_path: Optional[Union[str, pathlib.Path]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
|
||||
|
||||
class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: GroundingDinoImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"add_special_tokens": True,
|
||||
|
@ -28,6 +28,7 @@ from ...image_utils import (
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, is_torch_available
|
||||
|
||||
|
||||
@ -35,6 +36,20 @@ IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||
IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||
|
||||
|
||||
class IdeficsImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
transform (`Callable`, *optional*):
|
||||
A custom transform function that accepts a single image can be passed for training. For example,
|
||||
`torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is
|
||||
assumed - and then a preset of inference-specific transforms will be applied to the images
|
||||
image_size (`dict[str, int]`, *optional*):
|
||||
Resize to image size
|
||||
"""
|
||||
|
||||
transform: Optional[Callable]
|
||||
image_size: Optional[dict[str, int]]
|
||||
|
||||
|
||||
def convert_to_rgb(image):
|
||||
# `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
|
||||
# for transparent images. The call to `alpha_composite` handles this case
|
||||
@ -74,6 +89,7 @@ class IdeficsImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
valid_kwargs = IdeficsImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -16,13 +16,12 @@
|
||||
Processor class for IDEFICS.
|
||||
"""
|
||||
|
||||
from typing import Callable, Optional, Union
|
||||
from typing import Optional, Union
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import (
|
||||
ImagesKwargs,
|
||||
ProcessingKwargs,
|
||||
ProcessorMixin,
|
||||
TextKwargs,
|
||||
@ -40,13 +39,6 @@ if is_torch_available():
|
||||
IMAGE_TOKEN = "<image>"
|
||||
|
||||
|
||||
class IdeficsImagesKwargs(ImagesKwargs, total=False):
|
||||
transform: Optional[Callable]
|
||||
image_size: Optional[dict[str, int]]
|
||||
image_mean: Optional[Union[float, list[float]]]
|
||||
image_std: Optional[Union[float, list[float]]]
|
||||
|
||||
|
||||
class IdeficsTextKwargs(TextKwargs, total=False):
|
||||
add_eos_token: Optional[bool]
|
||||
add_end_of_utterance_token: Optional[bool]
|
||||
@ -54,14 +46,12 @@ class IdeficsTextKwargs(TextKwargs, total=False):
|
||||
|
||||
class IdeficsProcessorKwargs(ProcessingKwargs, total=False):
|
||||
text_kwargs: IdeficsTextKwargs
|
||||
images_kwargs: IdeficsImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"add_special_tokens": False,
|
||||
"padding": "longest",
|
||||
"add_eos_token": False,
|
||||
},
|
||||
"images_kwargs": {},
|
||||
"common_kwargs": {"return_tensors": "pt"},
|
||||
}
|
||||
|
||||
@ -198,8 +188,6 @@ class IdeficsProcessor(ProcessorMixin):
|
||||
list[list[TextInput]],
|
||||
list[list[PreTokenizedInput]],
|
||||
] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[IdeficsProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""This method takes batched or non-batched prompts made of text and images and converts them into prompts that
|
||||
|
@ -35,6 +35,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, is_vision_available, logging
|
||||
|
||||
|
||||
@ -46,6 +47,15 @@ if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
|
||||
class Idefics2ImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
do_image_splitting (`bool`, *optional*, defaults to `False`):
|
||||
Whether to split the image into a sequence 4 equal sub-images concatenated with the original image.
|
||||
"""
|
||||
|
||||
do_image_splitting: Optional[bool]
|
||||
|
||||
|
||||
def get_resize_output_image_size(image, size, input_data_format) -> tuple[int, int]:
|
||||
"""
|
||||
Get the output size of the image after resizing given a dictionary specifying the max and min sizes.
|
||||
@ -186,6 +196,7 @@ class Idefics2ImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_attention_mask"]
|
||||
valid_kwargs = Idefics2ImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -21,7 +21,6 @@ import torch
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
SizeDict,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
@ -35,7 +34,7 @@ from ...image_utils import (
|
||||
)
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring, is_torchvision_available, logging
|
||||
from .image_processing_idefics2 import convert_to_rgb
|
||||
from .image_processing_idefics2 import Idefics2ImageProcessorKwargs, convert_to_rgb
|
||||
|
||||
|
||||
if is_torchvision_available():
|
||||
@ -105,15 +104,6 @@ def make_pixel_mask(image: "torch.Tensor", output_size: tuple[int, int]) -> "tor
|
||||
return mask
|
||||
|
||||
|
||||
class Idefics2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
do_image_splitting (`bool`, *optional*, defaults to `False`):
|
||||
Whether to split the image into a sequence 4 equal sub-images concatenated with the original image.
|
||||
"""
|
||||
|
||||
do_image_splitting: Optional[bool]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Idefics2ImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BILINEAR
|
||||
@ -127,7 +117,7 @@ class Idefics2ImageProcessorFast(BaseImageProcessorFast):
|
||||
do_image_splitting = False
|
||||
size = {"shortest_edge": 378, "longest_edge": 980}
|
||||
model_input_names = ["pixel_values", "pixel_attention_mask"]
|
||||
valid_kwargs = Idefics2FastImageProcessorKwargs
|
||||
valid_kwargs = Idefics2ImageProcessorKwargs
|
||||
|
||||
def convert_to_rgb(self, image: ImageInput) -> ImageInput:
|
||||
"""
|
||||
@ -214,7 +204,7 @@ class Idefics2ImageProcessorFast(BaseImageProcessorFast):
|
||||
return image, pixel_mask
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics2FastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics2ImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _preprocess(
|
||||
|
@ -22,7 +22,6 @@ from typing import TYPE_CHECKING, Optional, Union
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...image_utils import ImageInput, is_valid_image, load_image
|
||||
from ...processing_utils import (
|
||||
ImagesKwargs,
|
||||
ProcessingKwargs,
|
||||
ProcessorMixin,
|
||||
Unpack,
|
||||
@ -46,20 +45,13 @@ def is_image_or_image_url(elem):
|
||||
return is_url(elem) or is_valid_image(elem)
|
||||
|
||||
|
||||
class Idefics2ImagesKwargs(ImagesKwargs, total=False):
|
||||
image_seq_len: Optional[int]
|
||||
|
||||
|
||||
class Idefics2ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: Idefics2ImagesKwargs
|
||||
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"add_special_tokens": True,
|
||||
"padding": False,
|
||||
"is_split_into_words": False,
|
||||
},
|
||||
"images_kwargs": {},
|
||||
}
|
||||
|
||||
|
||||
@ -123,8 +115,6 @@ class Idefics2Processor(ProcessorMixin):
|
||||
self,
|
||||
images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None,
|
||||
text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[Idefics2ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
@ -181,8 +171,6 @@ class Idefics2Processor(ProcessorMixin):
|
||||
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None)
|
||||
image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
|
||||
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||
|
||||
n_images_in_text = []
|
||||
@ -197,12 +185,11 @@ class Idefics2Processor(ProcessorMixin):
|
||||
# Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
|
||||
fake_image_token = self.fake_image_token
|
||||
image_token = self.image_token
|
||||
image_str = f"{fake_image_token}{image_token * image_seq_len}{fake_image_token}"
|
||||
image_str = f"{fake_image_token}{image_token * self.image_seq_len}{fake_image_token}"
|
||||
|
||||
if self.image_processor.do_image_splitting:
|
||||
# A single image token is split into 4 patches + 1 original image
|
||||
image_str = image_str * 5
|
||||
image_seq_len *= 5
|
||||
|
||||
prompt_strings = []
|
||||
for sample in text:
|
||||
|
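Note the behavioural nuance in this Idefics2 hunk: `image_seq_len` is no longer read from `images_kwargs`, so it cannot be overridden per call and is taken from the processor instance instead. A hedged sketch of how a caller would pin it now (the checkpoint id is illustrative, and kwarg forwarding through `from_pretrained` is assumed):

```python
from transformers import AutoProcessor

# Previously: processor(images=image, text=text, image_seq_len=64)
# After this change the value is fixed when the processor is created:
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=64)
print(processor.image_seq_len)  # every call expands <image> to this many placeholder tokens
```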
@ -35,6 +35,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, is_vision_available, logging
|
||||
|
||||
|
||||
@ -47,6 +48,22 @@ if is_vision_available():
from PIL import Image


class Idefics3ImageProcessorKwargs(ImagesKwargs):
"""
do_image_splitting (`bool`, *optional*, defaults to `True`):
Whether to split the image into sub-images concatenated with the original image. They are split into patches
such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`.
max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`):
Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge".
return_row_col_info (`bool`, *optional*, defaults to `False`):
Whether to return the row and column information of the images.
"""

do_image_splitting: Optional[bool]
max_image_size: Optional[dict[str, int]]
return_row_col_info: Optional[bool]


def _resize_output_size_rescale_to_max_len(
height: int, width: int, min_len: Optional[int] = 1, max_len: Optional[int] = None
) -> tuple[int, int]:
@ -291,6 +308,7 @@ class Idefics3ImageProcessor(BaseImageProcessor):
"""

model_input_names = ["pixel_values", "pixel_attention_mask"]
valid_kwargs = Idefics3ImageProcessorKwargs

def __init__(
self,
@ -22,7 +22,6 @@ import torch
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
SizeDict,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
@ -36,6 +35,7 @@ from ...image_utils import (
|
||||
)
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring, is_torchvision_available, logging
|
||||
from .image_processing_idefics3 import Idefics3ImageProcessorKwargs
|
||||
|
||||
|
||||
if is_torchvision_available():
|
||||
@ -169,22 +169,6 @@ def make_pixel_mask(image: "torch.Tensor", output_size: tuple[int, int]) -> "tor
|
||||
return mask
|
||||
|
||||
|
||||
class Idefics3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
do_image_splitting (`bool`, *optional*, defaults to `True`):
|
||||
Whether to split the image into sub-images concatenated with the original image. They are split into patches
|
||||
such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`.
|
||||
max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`):
|
||||
Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge".
|
||||
return_row_col_info (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return the row and column information of the images.
|
||||
"""
|
||||
|
||||
do_image_splitting: Optional[bool]
|
||||
max_image_size: Optional[dict[str, int]]
|
||||
return_row_col_info: Optional[bool]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Idefics3ImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.LANCZOS
|
||||
@ -199,7 +183,7 @@ class Idefics3ImageProcessorFast(BaseImageProcessorFast):
|
||||
do_image_splitting = True
|
||||
do_pad = True
|
||||
return_row_col_info = False
|
||||
valid_kwargs = Idefics3FastImageProcessorKwargs
|
||||
valid_kwargs = Idefics3ImageProcessorKwargs
|
||||
|
||||
def _prepare_images_structure(self, images: ImageInput, expected_ndims: int = 3) -> ImageInput:
|
||||
"""
|
||||
@ -367,7 +351,7 @@ class Idefics3ImageProcessorFast(BaseImageProcessorFast):
|
||||
return image, pixel_mask
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics3FastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics3ImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _preprocess(
|
||||
|
@ -24,7 +24,7 @@ import numpy as np
|
||||
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...image_utils import ImageInput, is_valid_image, load_image
|
||||
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import AddedToken, BatchEncoding, TextInput
|
||||
from ...utils import logging
|
||||
|
||||
@ -87,14 +87,7 @@ def get_image_prompt_string(
|
||||
)
|
||||
|
||||
|
||||
class Idefics3ImagesKwargs(ImagesKwargs, total=False):
|
||||
return_row_col_info: Optional[bool]
|
||||
max_image_size: Optional[dict[str, int]]
|
||||
|
||||
|
||||
class Idefics3ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: Idefics3ImagesKwargs
|
||||
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"add_special_tokens": True,
|
||||
@ -179,8 +172,6 @@ class Idefics3Processor(ProcessorMixin):
|
||||
self,
|
||||
images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None,
|
||||
text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
image_seq_len: Optional[int] = None,
|
||||
**kwargs: Unpack[Idefics3ProcessorKwargs],
|
||||
) -> BatchEncoding:
|
||||
|
@ -31,17 +31,34 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, is_vision_available, logging
from ...utils.import_utils import requires


if is_vision_available():
import PIL

if is_torch_available():
import torch

logger = logging.get_logger(__name__)


class ImageGPTImageProcessorKwargs(ImagesKwargs):
"""
clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*):
The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters`
in `preprocess`.
do_color_quantize (`bool`, *optional*, defaults to `True`):
Controls whether to apply color quantization to convert continuous pixel values to discrete cluster indices.
When True, each pixel is assigned to its nearest color cluster, enabling ImageGPT's discrete token modeling.
"""

clusters: Optional[Union[np.ndarray, list[list[int]], "torch.Tensor"]]
do_color_quantize: Optional[bool]


def squared_euclidean_distance(a, b):
b = b.T
a2 = np.sum(np.square(a), axis=1)
@ -83,6 +100,7 @@ class ImageGPTImageProcessor(BaseImageProcessor):
"""

model_input_names = ["pixel_values"]
valid_kwargs = ImageGPTImageProcessorKwargs

def __init__(
self,
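The `do_color_quantize`/`clusters` pair documented above controls a nearest-cluster assignment that turns pixels into discrete tokens. A standalone numpy sketch of that idea (not the library implementation itself):

```python
import numpy as np


def color_quantize(pixels: np.ndarray, clusters: np.ndarray) -> np.ndarray:
    """Map each RGB pixel in `pixels` (N, 3) to the index of its nearest cluster in `clusters` (K, 3)."""
    diff = pixels[:, None, :].astype(np.float32) - clusters[None, :, :].astype(np.float32)  # (N, K, 3)
    d = np.sum(diff**2, axis=-1)  # squared euclidean distance, (N, K)
    return np.argmin(d, axis=1)  # one discrete "color token" per pixel, (N,)


rng = np.random.default_rng(0)
pixels = rng.integers(0, 256, size=(32 * 32, 3))
clusters = rng.integers(0, 256, size=(512, 3))  # ImageGPT checkpoints typically ship 512 clusters
print(color_quantize(pixels, clusters).shape)  # (1024,)
```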
@ -23,7 +23,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
)
|
||||
from ...image_transforms import group_images_by_shape, reorder_images
|
||||
from ...image_utils import PILImageResampling
|
||||
@ -32,6 +31,7 @@ from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
)
|
||||
from .image_processing_imagegpt import ImageGPTImageProcessorKwargs
|
||||
|
||||
|
||||
def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
|
||||
@ -68,20 +68,6 @@ def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tenso
|
||||
return torch.argmin(d, dim=1)
|
||||
|
||||
|
||||
class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*):
|
||||
The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters`
|
||||
in `preprocess`.
|
||||
do_color_quantize (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to apply color quantization to convert continuous pixel values to discrete cluster indices.
|
||||
When True, each pixel is assigned to its nearest color cluster, enabling ImageGPT's discrete token modeling.
|
||||
"""
|
||||
|
||||
clusters: Optional[Union[np.ndarray, list[list[int]], torch.Tensor]]
|
||||
do_color_quantize: Optional[bool]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class ImageGPTImageProcessorFast(BaseImageProcessorFast):
|
||||
model_input_names = ["input_ids"]
|
||||
@ -92,12 +78,12 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast):
|
||||
image_std = [0.5, 0.5, 0.5]
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
valid_kwargs = ImageGPTFastImageProcessorKwargs
|
||||
valid_kwargs = ImageGPTImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, # keep as arg for backwards compatibility
|
||||
**kwargs: Unpack[ImageGPTFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[ImageGPTImageProcessorKwargs],
|
||||
):
|
||||
r"""
|
||||
clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*):
|
||||
|
@ -43,7 +43,6 @@ class InstructBlipProcessorKwargs(ProcessingKwargs, total=False):
|
||||
"return_length": False,
|
||||
"verbose": True,
|
||||
},
|
||||
"images_kwargs": {},
|
||||
}
|
||||
|
||||
|
||||
@ -85,8 +84,6 @@ class InstructBlipProcessor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[InstructBlipProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -19,19 +19,12 @@ import numpy as np
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_utils import ImageInput, concatenate_list, make_flat_list_of_images
|
||||
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from ...video_utils import VideoInput
|
||||
|
||||
|
||||
class InternVLImagesKwargs(ImagesKwargs, total=False):
|
||||
crop_to_patches: Optional[bool]
|
||||
min_patches: Optional[int]
|
||||
max_patches: Optional[int]
|
||||
|
||||
|
||||
class InternVLProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: InternVLImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding_side": "left",
|
||||
@ -159,7 +152,6 @@ class InternVLProcessor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
|
||||
audio=None,
|
||||
videos: Optional[VideoInput] = None,
|
||||
**kwargs: Unpack[InternVLProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
|
@ -43,7 +43,6 @@ class InternVLVideoProcessor(BaseVideoProcessor):
|
||||
initial_shift = True
|
||||
do_sample_frames = False # Set to False for BC, recommended to set `True` in new models
|
||||
valid_kwargs = InternVLVideoProcessorInitKwargs
|
||||
model_input_names = ["pixel_values_videos"]
|
||||
|
||||
def __init__(self, **kwargs: Unpack[InternVLVideoProcessorInitKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
@ -40,6 +40,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
|
||||
|
||||
@ -50,6 +51,16 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class JanusImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
|
||||
|
||||
class JanusImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a JANUS image processor.
|
||||
@ -92,6 +103,8 @@ class JanusImageProcessor(BaseImageProcessor):
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
|
||||
valid_kwargs = JanusImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
do_resize: bool = True,
|
||||
|
@ -22,7 +22,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -38,16 +37,7 @@ from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
)
|
||||
|
||||
|
||||
class JanusFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
from .image_processing_janus import JanusImageProcessorKwargs
|
||||
|
||||
|
||||
@auto_docstring
|
||||
@ -61,9 +51,9 @@ class JanusImageProcessorFast(BaseImageProcessorFast):
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
do_pad = True
|
||||
valid_kwargs = JanusFastImageProcessorKwargs
|
||||
valid_kwargs = JanusImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[JanusFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[JanusImageProcessorKwargs]):
|
||||
if kwargs.get("image_mean") is None:
|
||||
background_color = (127, 127, 127)
|
||||
else:
|
||||
|
@ -47,7 +47,7 @@ from ...image_utils import (
|
||||
)
|
||||
from ...modeling_outputs import ModelOutput
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
from ...processing_utils import ImagesKwargs, Unpack
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
TransformersKwargs,
|
||||
@ -1289,6 +1289,16 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
|
||||
return generated_tokens
|
||||
|
||||
|
||||
class JanusImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
|
||||
|
||||
class JanusImageProcessor(BlipImageProcessor):
|
||||
r"""
|
||||
Constructs a JANUS image processor.
|
||||
@ -1329,6 +1339,8 @@ class JanusImageProcessor(BlipImageProcessor):
|
||||
Whether to pad the image to square or not.
|
||||
"""
|
||||
|
||||
valid_kwargs = JanusImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
do_resize: bool = True,
|
||||
|
@ -81,8 +81,6 @@ class JanusProcessor(ProcessorMixin):
|
||||
self,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
images: Optional[ImageInput] = None,
|
||||
videos=None,
|
||||
audio=None,
|
||||
**kwargs: Unpack[JanusProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -136,8 +136,6 @@ class Kosmos2Processor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, list[TextInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[Kosmos2ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -34,6 +34,7 @@ from ...image_utils import (
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, is_torch_available, logging
|
||||
from ...utils.import_utils import requires_backends
|
||||
|
||||
@ -45,6 +46,19 @@ logger = logging.get_logger(__name__)
DEFAULT_FONT_PATH = "ybelkada/fonts"


class Kosmos2_5ImageProcessorKwargs(ImagesKwargs):
r"""
patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.
max_patches (`int`, *optional*, defaults to 4096):
The maximum number of patches to extract from the image as per the
[KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419).
"""

patch_size: Optional[dict[str, int]]
max_patches: Optional[int]


# Copied from transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches
def torch_extract_patches(image_tensor, patch_height, patch_width):
"""
@ -92,6 +106,7 @@ class Kosmos2_5ImageProcessor(BaseImageProcessor):
"""

model_input_names = ["flattened_patches"]
valid_kwargs = Kosmos2_5ImageProcessorKwargs

def __init__(
self,
@ -22,13 +22,13 @@ import torch
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
from ...image_utils import ChannelDimension, ImageInput, get_image_size
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring
|
||||
from .image_processing_kosmos2_5 import Kosmos2_5ImageProcessorKwargs
|
||||
|
||||
|
||||
# Similar to transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches but dealing with a batch of images directly.
|
||||
@ -56,19 +56,6 @@ def torch_extract_patches(image_tensor, patch_height, patch_width):
|
||||
return patches
|
||||
|
||||
|
||||
class Kosmos2_5FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
|
||||
The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.
|
||||
max_patches (`int`, *optional*, defaults to 4096):
|
||||
The maximum number of patches to extract from the image as per the
|
||||
[KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419).
|
||||
"""
|
||||
|
||||
patch_size: Optional[dict[str, int]]
|
||||
max_patches: Optional[int]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Kosmos2_5ImageProcessorFast(BaseImageProcessorFast):
|
||||
# To be checked against the slow image processor
|
||||
@ -78,13 +65,13 @@ class Kosmos2_5ImageProcessorFast(BaseImageProcessorFast):
|
||||
patch_size = {"height": 16, "width": 16}
|
||||
max_patches = 4096
|
||||
rescale_factor = None
|
||||
valid_kwargs = Kosmos2_5FastImageProcessorKwargs
|
||||
valid_kwargs = Kosmos2_5ImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[Kosmos2_5FastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[Kosmos2_5ImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[Kosmos2_5FastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[Kosmos2_5ImageProcessorKwargs]) -> BatchFeature:
|
||||
r"""
|
||||
patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
|
||||
The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.
|
||||
|
@ -20,7 +20,7 @@ from typing import Optional, Union
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
|
||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import TextInput
|
||||
from ...utils import is_torch_available
|
||||
|
||||
@ -29,14 +29,7 @@ if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
class Kosmos2_5ImagesKwargs(ImagesKwargs, total=False):
|
||||
max_patches: Optional[int]
|
||||
num_image_tokens: Optional[int]
|
||||
|
||||
|
||||
class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
text_kwargs: TextKwargs
|
||||
images_kwargs: Kosmos2_5ImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": True,
|
||||
@ -46,7 +39,6 @@ class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
},
|
||||
"images_kwargs": {
|
||||
"max_patches": 4096,
|
||||
"num_image_tokens": 2048,
|
||||
},
|
||||
"common_kwargs": {"return_tensors": "pt"},
|
||||
}
|
||||
@ -65,24 +57,25 @@ class Kosmos2_5Processor(ProcessorMixin):
An instance of [`Kosmos2_5ImageProcessor`]. The image processor is a required input.
tokenizer (Union[`T5TokenizerFast`, `T5Tokenizer`]):
An instance of ['T5TokenizerFast`] or ['T5Tokenizer`]. The tokenizer is a required input.
num_image_tokens (`int`, *optional*, defaults to 2048):
Number of image tokens used as a placeholder.
"""

attributes = ["image_processor", "tokenizer"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "PreTrainedTokenizerFast"

def __init__(self, image_processor, tokenizer):
def __init__(self, image_processor, tokenizer, num_image_tokens: int = 2048):
self.image_start_token = tokenizer.boi_token # "<image>" : fixed token for the start of image
self.image_end_token = tokenizer.eoi_token # "</image>" : fixed token for the end of image
self.image_token = tokenizer.image_token # "<s>" : within a <image> ... </image> pair, these <s> tokens indicate they are positions reserved for an image
self.num_image_tokens = num_image_tokens
super().__init__(image_processor, tokenizer)
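Since `num_image_tokens` moves from a per-call `images_kwargs` entry to a constructor argument, the placeholder length is now bound once when the processor is built. A hedged sketch of the difference (the checkpoint id is illustrative):

```python
from transformers import AutoProcessor

# Before: the count travelled with every call, e.g.
#   processor(images=image, text=text, num_image_tokens=2048)
# After this change it is set once at construction time:
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5", num_image_tokens=2048)
print(processor.num_image_tokens)  # "<image>" is expanded to this many <s> placeholders on every call
```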
def __call__(
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, list[TextInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[Kosmos2_5ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
@ -104,8 +97,6 @@ class Kosmos2_5Processor(ProcessorMixin):
|
||||
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
num_image_tokens = output_kwargs["images_kwargs"].setdefault("num_image_tokens", None)
|
||||
|
||||
encoding = BatchFeature()
|
||||
|
||||
if images is not None:
|
||||
@ -114,7 +105,7 @@ class Kosmos2_5Processor(ProcessorMixin):
|
||||
image_encoding.pop("cols")
|
||||
encoding.update(image_encoding)
|
||||
|
||||
prompt = f"{self.tokenizer.bos_token}{self.image_start_token}{self.image_token * num_image_tokens}{self.image_end_token}"
|
||||
prompt = f"{self.tokenizer.bos_token}{self.image_start_token}{self.image_token * self.num_image_tokens}{self.image_end_token}"
|
||||
|
||||
if text is not None:
|
||||
if isinstance(text, str):
|
||||
@ -124,7 +115,7 @@ class Kosmos2_5Processor(ProcessorMixin):
|
||||
input = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||
|
||||
batch_size, seq_len = input.input_ids.shape
|
||||
image_embeds_position_mask = [0, -1] + [1] * num_image_tokens + [-1]
|
||||
image_embeds_position_mask = [0, -1] + [1] * self.num_image_tokens + [-1]
|
||||
image_embeds_position_mask += [0] * (seq_len - len(image_embeds_position_mask))
|
||||
image_embeds_position_mask = (
|
||||
torch.LongTensor(image_embeds_position_mask).unsqueeze(0).repeat(batch_size, 1)
|
||||
|
@ -30,6 +30,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
filter_out_non_signature_kwargs,
|
||||
@ -51,6 +52,25 @@ if is_pytesseract_available():
logger = logging.get_logger(__name__)


class LayoutLMv2ImageProcessorKwargs(ImagesKwargs):
r"""
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
the `apply_ocr` parameter in the `preprocess` method.
ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
`preprocess` method.
"""

apply_ocr: Optional[bool]
ocr_lang: Optional[str]
tesseract_config: Optional[str]


def normalize_box(box, width, height):
return [
int(1000 * (box[0] / width)),
@ -125,6 +145,7 @@ class LayoutLMv2ImageProcessor(BaseImageProcessor):
"""

model_input_names = ["pixel_values"]
valid_kwargs = LayoutLMv2ImageProcessorKwargs

def __init__(
self,
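The OCR switches above are plain TypedDict keys, so both LayoutLMv2 processors accept them directly at call time. A hedged usage sketch (requires Tesseract installed when `apply_ocr=True`; the values are illustrative):

```python
import numpy as np
from PIL import Image

from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
page = Image.fromarray(np.full((1000, 754, 3), 255, dtype=np.uint8))  # blank "document" page

# apply_ocr / ocr_lang / tesseract_config are LayoutLMv2ImageProcessorKwargs keys.
encoding = processor(page, apply_ocr=True, ocr_lang="eng", tesseract_config="--psm 6")
print(encoding.keys())  # pixel_values plus words and boxes extracted by Tesseract
```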
@ -19,7 +19,7 @@ from typing import Optional, Union
|
||||
import torch
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature
|
||||
from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images
|
||||
from ...image_utils import ImageInput, PILImageResampling, SizeDict
|
||||
from ...processing_utils import Unpack
|
||||
@ -29,32 +29,12 @@ from ...utils import (
|
||||
logging,
|
||||
requires_backends,
|
||||
)
|
||||
from .image_processing_layoutlmv2 import apply_tesseract
|
||||
from .image_processing_layoutlmv2 import LayoutLMv2ImageProcessorKwargs, apply_tesseract
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class LayoutLMv2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
Args:
|
||||
apply_ocr (`bool`, *optional*, defaults to `True`):
|
||||
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
|
||||
the `apply_ocr` parameter in the `preprocess` method.
|
||||
ocr_lang (`str`, *optional*):
|
||||
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
|
||||
used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
|
||||
tesseract_config (`str`, *optional*):
|
||||
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
|
||||
Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
|
||||
`preprocess` method.
|
||||
"""
|
||||
|
||||
apply_ocr: Optional[bool]
|
||||
ocr_lang: Optional[str]
|
||||
tesseract_config: Optional[str]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class LayoutLMv2ImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BILINEAR
|
||||
@ -64,13 +44,13 @@ class LayoutLMv2ImageProcessorFast(BaseImageProcessorFast):
|
||||
apply_ocr = True
|
||||
ocr_lang = None
|
||||
tesseract_config = ""
|
||||
valid_kwargs = LayoutLMv2FastImageProcessorKwargs
|
||||
valid_kwargs = LayoutLMv2ImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[LayoutLMv2FastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[LayoutLMv2ImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv2FastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv2ImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _preprocess(
|
||||
|
@@ -34,6 +34,7 @@ from ...image_utils import (
     valid_images,
     validate_preprocess_arguments,
 )
+from ...processing_utils import ImagesKwargs
 from ...utils import (
     TensorType,
     filter_out_non_signature_kwargs,
@@ -55,6 +56,25 @@ if is_pytesseract_available():
 logger = logging.get_logger(__name__)


+class LayoutLMv3ImageProcessorKwargs(ImagesKwargs):
+    r"""
+    apply_ocr (`bool`, *optional*, defaults to `True`):
+        Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
+        the `apply_ocr` parameter in the `preprocess` method.
+    ocr_lang (`str`, *optional*):
+        The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
+        used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
+    tesseract_config (`str`, *optional*):
+        Any additional custom configuration flags that are forwarded to the `config` parameter when calling
+        Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
+        `preprocess` method.
+    """
+
+    apply_ocr: Optional[bool]
+    ocr_lang: Optional[str]
+    tesseract_config: Optional[str]
+
+
 def normalize_box(box, width, height):
     return [
         int(1000 * (box[0] / width)),
@@ -143,6 +163,7 @@ class LayoutLMv3ImageProcessor(BaseImageProcessor):
     """

     model_input_names = ["pixel_values"]
+    valid_kwargs = LayoutLMv3ImageProcessorKwargs

     def __init__(
         self,
@@ -19,7 +19,7 @@ from typing import Optional, Union
 import torch
 from torchvision.transforms.v2 import functional as F

-from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
+from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature
 from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images
 from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict
 from ...processing_utils import Unpack
@@ -29,32 +29,12 @@ from ...utils import (
     logging,
     requires_backends,
 )
-from .image_processing_layoutlmv3 import apply_tesseract
+from .image_processing_layoutlmv3 import LayoutLMv3ImageProcessorKwargs, apply_tesseract


 logger = logging.get_logger(__name__)


-class LayoutLMv3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
-    """
-    Args:
-        apply_ocr (`bool`, *optional*, defaults to `True`):
-            Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
-            the `apply_ocr` parameter in the `preprocess` method.
-        ocr_lang (`str`, *optional*):
-            The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
-            used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
-        tesseract_config (`str`, *optional*):
-            Any additional custom configuration flags that are forwarded to the `config` parameter when calling
-            Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
-            `preprocess` method.
-    """
-
-    apply_ocr: Optional[bool]
-    ocr_lang: Optional[str]
-    tesseract_config: Optional[str]
-
-
 @auto_docstring
 class LayoutLMv3ImageProcessorFast(BaseImageProcessorFast):
     resample = PILImageResampling.BILINEAR
@@ -67,13 +47,13 @@ class LayoutLMv3ImageProcessorFast(BaseImageProcessorFast):
     apply_ocr = True
     ocr_lang = None
     tesseract_config = ""
-    valid_kwargs = LayoutLMv3FastImageProcessorKwargs
+    valid_kwargs = LayoutLMv3ImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[LayoutLMv3FastImageProcessorKwargs]):
+    def __init__(self, **kwargs: Unpack[LayoutLMv3ImageProcessorKwargs]):
         super().__init__(**kwargs)

     @auto_docstring
-    def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv3FastImageProcessorKwargs]) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv3ImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)

     def _preprocess(
@ -22,7 +22,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -33,9 +32,7 @@ from ...image_utils import (
|
||||
PILImageResampling,
|
||||
SizeDict,
|
||||
)
|
||||
from ...processing_utils import (
|
||||
Unpack,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs, Unpack
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
@ -172,7 +169,7 @@ def pad_along_first_dim(
|
||||
return images, pixel_mask
|
||||
|
||||
|
||||
class Lfm2VlFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
class Lfm2VlImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
downsample_factor (`int`, *optional*, defaults to `2`):
|
||||
The downsampling factor for images used when resizing the image.
|
||||
@ -214,10 +211,10 @@ class Lfm2VlImageProcessorFast(BaseImageProcessorFast):
|
||||
return_row_col_info = False
|
||||
image_mean = IMAGENET_STANDARD_STD
|
||||
image_std = IMAGENET_STANDARD_MEAN
|
||||
valid_kwargs = Lfm2VlFastImageProcessorKwargs
|
||||
valid_kwargs = Lfm2VlImageProcessorKwargs
|
||||
model_input_names = ["pixel_values", "pixel_attention_mask", "spatial_shapes"]
|
||||
|
||||
def __init__(self, **kwargs: Unpack[Lfm2VlFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[Lfm2VlImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
max_thumbnail_image_patches = self.max_image_tokens * self.downsample_factor**2
|
||||
|
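Because Lfm2-VL options such as `downsample_factor` now live on an `ImagesKwargs` subclass, they are ordinary typed kwargs that can be overridden at load time or per `preprocess` call. A hedged sketch; the path below is a placeholder, not a real repo id:

```python
from transformers import AutoImageProcessor

# "path/to/lfm2-vl-checkpoint" is a placeholder for an actual Lfm2-VL checkpoint.
processor = AutoImageProcessor.from_pretrained("path/to/lfm2-vl-checkpoint", downsample_factor=2)
print(processor.downsample_factor)
```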
@@ -25,12 +25,11 @@ from torchvision.transforms.v2 import functional as F
 from ...image_processing_utils import BatchFeature
 from ...image_processing_utils_fast import (
     BaseImageProcessorFast,
-    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
 from ...image_utils import ImageInput, PILImageResampling, SizeDict
-from ...processing_utils import Unpack
+from ...processing_utils import ImagesKwargs, Unpack
 from ...utils import (
     TensorType,
     auto_docstring,
@@ -309,8 +308,8 @@ def get_best_fit(
     return optimal_canvas


-class Llama4ImageProcessorKwargs(DefaultFastImageProcessorKwargs):
-    """
+class Llama4ImageProcessorKwargs(ImagesKwargs):
+    r"""
     max_patches (`int`, *optional*, defaults to 16):
         The maximum number of patches to be extracted from the image.
         Can be overridden by the `max_patches` parameter in the `preprocess` method.
@@ -16,20 +16,14 @@

 from typing import Optional, Union

-from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput

 from ...image_processing_utils import BatchFeature
 from ...image_utils import ImageInput, make_flat_list_of_images


-class Llama4ImagesKwargs(ImagesKwargs, total=False):
-    max_patches: Optional[int]
-    resize_to_max_canvas: Optional[bool]
-
-
 class Llama4ProcessorKwargs(ProcessingKwargs, total=False):
-    images_kwargs: Llama4ImagesKwargs
     _defaults = {
         "text_kwargs": {
             "padding_side": "left",
@@ -139,8 +133,6 @@ class Llama4Processor(ProcessorMixin):
         self,
         images: Optional[ImageInput] = None,
         text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
-        audio=None,
-        videos=None,
         **kwargs: Unpack[Llama4ProcessorKwargs],
     ) -> BatchFeature:
         """
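With `Llama4ImagesKwargs` removed, image-specific options such as `max_patches` and `resize_to_max_canvas` are declared once on `Llama4ImageProcessorKwargs` (see the image-processor hunk above), and the processor presumably resolves them from the image processor's `valid_kwargs` rather than from a second TypedDict. A rough call sketch; the checkpoint name is illustrative and the image is a stand-in:

```python
from PIL import Image
from transformers import AutoProcessor

# Any Llama 4 multimodal checkpoint should work the same way.
processor = AutoProcessor.from_pretrained("meta-llama/Llama-4-Scout-17B-16E-Instruct")

image = Image.new("RGB", (336, 336))
inputs = processor(
    images=image,
    text="Describe the image.",
    max_patches=16,        # forwarded to the image processor's typed kwargs
    return_tensors="pt",
)
```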
Some files were not shown because too many files have changed in this diff.