Validate processing kwargs with @strict from huggingface_hub (#40793)

* initial design draft

* delete

* fix a few tests

* fix

* fix the rest of tests

* common-kwargs

* why the runner complains about typing with "|"?

* revert

* forgot to delete

* update

* fix last issues

* add more details in docs

* pin the latest hub release

* fix tests for new models

* also fast image processor

* fix copies

* image processing ast validated

* fix more tests

* typo and fix copies

* bump

* style

* fix some tests

* fix copies

* pin rc4 and mark all TypedDict as non-total

* delete typed dict adaptor

* address comments

* delete optionals

Authored by Raushan Turganbay on 2025-10-08 16:14:09 +02:00, committed by GitHub
parent 82ffeb28ad, commit 89a4115a6b
113 changed files with 647 additions and 456 deletions

View File

@@ -114,7 +114,7 @@ _deps = [
    "GitPython<3.1.19",
    "hf-doc-builder>=0.3.0",
    "hf_xet",
-    "huggingface-hub==1.0.0.rc2",
+    "huggingface-hub==1.0.0.rc4",
    "importlib_metadata",
    "ipadic>=1.0.0,<2.0",
    "jinja2>=3.1.0",

View File

@@ -23,7 +23,7 @@ deps = {
    "GitPython": "GitPython<3.1.19",
    "hf-doc-builder": "hf-doc-builder>=0.3.0",
    "hf_xet": "hf_xet",
-    "huggingface-hub": "huggingface-hub==1.0.0.rc2",
+    "huggingface-hub": "huggingface-hub==1.0.0.rc4",
    "importlib_metadata": "importlib_metadata",
    "ipadic": "ipadic>=1.0.0,<2.0",
    "jinja2": "jinja2>=3.1.0",

View File

@@ -18,6 +18,7 @@ from functools import lru_cache, partial
from typing import Any, Optional, Union
import numpy as np
+from huggingface_hub.dataclasses import validate_typed_dict
from .image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from .image_transforms import (
@@ -710,6 +711,10 @@ class BaseImageProcessorFast(BaseImageProcessor):
    def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
        # args are not validated, but their order in the `preprocess` and `_preprocess` signatures must be the same
        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names)
+        # Perform type validation on received kwargs
+        validate_typed_dict(self.valid_kwargs, kwargs)
        # Set default kwargs from self. This ensures that if a kwarg is not provided
        # by the user, it gets its default value from the instance, or is set to None.
        for kwarg_name in self._valid_kwargs_names:
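The two added lines above are the substance of this PR: after the existing key-name check, the incoming kwargs are now also type-checked against the processor's `valid_kwargs` TypedDict with the new hub-side helper. A minimal sketch of that call, assuming huggingface-hub 1.0.0rc4 or later and assuming that `validate_typed_dict(typed_dict_cls, values)` raises on a mismatch, as the call site implies; the TypedDict below is a toy stand-in, not a transformers class.

from typing import Optional, TypedDict, Union

from huggingface_hub.dataclasses import validate_typed_dict


class ToyImagesKwargs(TypedDict, total=False):
    do_resize: bool
    size: dict[str, int]
    image_mean: Optional[Union[float, list[float]]]


# Keys may be omitted entirely (total=False); keys that are present must match their annotation.
validate_typed_dict(ToyImagesKwargs, {"do_resize": True, "size": {"height": 224, "width": 224}})

try:
    validate_typed_dict(ToyImagesKwargs, {"do_resize": "yes"})  # wrong type for a bool-typed key
except Exception as err:  # the exact exception class depends on the hub release
    print(f"rejected: {err}")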

View File

@@ -38,7 +38,7 @@ from ...image_utils import (
)
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_utils import PreTrainedModel
-from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils import PreTokenizedInput, TextInput
from ...utils import TensorType, TransformersKwargs, auto_docstring, can_return_tuple, logging
from ..auto import CONFIG_MAPPING, AutoConfig, AutoTokenizer
@@ -904,7 +904,15 @@ class AriaImageProcessor(BaseImageProcessor):
        return num_patches
+class AriaImagesKwargs(ImagesKwargs, total=False):
+    split_image: bool
+    max_image_size: int
+    min_image_size: int
class AriaProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: AriaImagesKwargs
    _defaults = {
        "text_kwargs": {
            "padding": False,

View File

@@ -24,13 +24,21 @@ import numpy as np
from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput
-from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils import PreTokenizedInput, TextInput
from ...utils import TensorType
from ..auto import AutoTokenizer
+class AriaImagesKwargs(ImagesKwargs, total=False):
+    split_image: bool
+    max_image_size: int
+    min_image_size: int
class AriaProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: AriaImagesKwargs
    _defaults = {
        "text_kwargs": {
            "padding": False,

View File

@@ -55,7 +55,7 @@ if is_torch_available():
logger = logging.get_logger(__name__)
-class BeitImageProcessorKwargs(ImagesKwargs):
+class BeitImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
        Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
@@ -63,7 +63,7 @@ class BeitImageProcessorKwargs(ImagesKwargs):
        ADE20k). The background label will be replaced by 255.
    """
-    do_reduce_labels: Optional[bool]
+    do_reduce_labels: bool
@requires(backends=("vision",))
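The same pattern repeats across the remaining files: every kwargs TypedDict is declared with `total=False`, so a key may simply be omitted and no longer needs an `Optional[...]` wrapper just to mark it as not required; `Optional` survives only where `None` is itself a meaningful value (for example `ignore_index`). A small sketch of that distinction, using a toy class rather than one of the transformers classes.

from typing import Optional, TypedDict


class ToyKwargs(TypedDict, total=False):
    do_reduce_labels: bool       # may be omitted; if present, must be a bool
    ignore_index: Optional[int]  # may be omitted; if present, may also be None


empty: ToyKwargs = {}                            # fine: every key is optional
partial: ToyKwargs = {"do_reduce_labels": True}  # a subset of keys is fine
with_none: ToyKwargs = {"ignore_index": None}    # None only where Optional is kept

print(ToyKwargs.__total__)          # False
print(ToyKwargs.__optional_keys__)  # frozenset({'do_reduce_labels', 'ignore_index'})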

View File

@@ -123,8 +123,8 @@ def get_resize_output_image_size(
    return new_height, new_width
-class BridgeTowerImageProcessorKwargs(ImagesKwargs):
-    size_divisor: Optional[int]
+class BridgeTowerImageProcessorKwargs(ImagesKwargs, total=False):
+    size_divisor: int
class BridgeTowerImageProcessor(BaseImageProcessor):

View File

@@ -33,7 +33,7 @@ from ...processing_utils import ImagesKwargs, Unpack
from ...utils import TensorType, auto_docstring
-class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
+class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False):
    """
    crop_to_patches (`bool`, *optional*, defaults to `False`):
        Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
@@ -46,9 +46,9 @@ class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
        set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
    """
-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
+    crop_to_patches: bool
+    min_patches: int
+    max_patches: int
@lru_cache(maxsize=10)

View File

@@ -303,7 +303,7 @@ def get_optimal_tiled_canvas(
    return best_grid
-class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
+class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False):
    """
    crop_to_patches (`bool`, *optional*, defaults to `False`):
        Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
@@ -316,9 +316,9 @@ class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
        set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
    """
-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
+    crop_to_patches: bool
+    min_patches: int
+    max_patches: int
@auto_docstring

View File

@@ -729,7 +729,7 @@ def compute_segments(
    return segmentation, segments
-class ConditionalDetrImageProcessorKwargs(ImagesKwargs):
+class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
        Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@@ -745,9 +745,9 @@ class ConditionalDetrImageProcessorKwargs(ImagesKwargs):
        Path to the directory containing the segmentation masks.
    """
-    format: Optional[Union[str, AnnotationFormat]]
-    do_convert_annotations: Optional[bool]
-    return_segmentation_masks: Optional[bool]
+    format: Union[str, AnnotationFormat]
+    do_convert_annotations: bool
+    return_segmentation_masks: bool
    annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
    masks_path: Optional[Union[str, pathlib.Path]]

View File

@@ -50,14 +50,14 @@ if is_vision_available():
logger = logging.get_logger(__name__)
-class ConvNextImageProcessorKwargs(ImagesKwargs):
+class ConvNextImageProcessorKwargs(ImagesKwargs, total=False):
    """
    crop_pct (`float`, *optional*):
        Percentage of the image to crop. Only has an effect if size < 384. Can be
        overridden by `crop_pct` in the`preprocess` method.
    """
-    crop_pct: Optional[float]
+    crop_pct: float
@requires(backends=("vision",))

View File

@@ -49,7 +49,7 @@ if is_vision_available():
logger = logging.get_logger(__name__)
-class DeepseekVLImageProcessorKwargs(ImagesKwargs):
+class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width

View File

@@ -50,7 +50,7 @@ if is_vision_available():
logger = logging.get_logger(__name__)
-class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
+class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width
@@ -71,9 +71,9 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
    min_size: int
    high_res_size: dict
-    high_res_resample: "PILImageResampling"
-    high_res_image_mean: list[float]
-    high_res_image_std: list[float]
+    high_res_resample: Union["PILImageResampling", int]
+    high_res_image_mean: Union[float, list[float], tuple[float, ...]]
+    high_res_image_std: Union[float, list[float], tuple[float, ...]]
class DeepseekVLHybridImageProcessor(BaseImageProcessor):

View File

@@ -429,7 +429,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneratio
        return model_inputs
-class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
+class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width
@@ -450,9 +450,9 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
    min_size: int
    high_res_size: dict
-    high_res_resample: "PILImageResampling"
-    high_res_image_mean: list[float]
-    high_res_image_std: list[float]
+    high_res_resample: Union["PILImageResampling", int]
+    high_res_image_mean: Union[float, list[float], tuple[float, ...]]
+    high_res_image_std: Union[float, list[float], tuple[float, ...]]
class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):

View File

@@ -82,7 +82,7 @@ if is_scipy_available():
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-class DeformableDetrImageProcessorKwargs(ImagesKwargs):
+class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
        Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@@ -98,9 +98,9 @@ class DeformableDetrImageProcessorKwargs(ImagesKwargs):
        Path to the directory containing the segmentation masks.
    """
-    format: Optional[Union[str, AnnotationFormat]]
-    do_convert_annotations: Optional[bool]
-    return_segmentation_masks: Optional[bool]
+    format: Union[str, AnnotationFormat]
+    do_convert_annotations: bool
+    return_segmentation_masks: bool
    annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
    masks_path: Optional[Union[str, pathlib.Path]]

View File

@@ -84,7 +84,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
-class DetrImageProcessorKwargs(ImagesKwargs):
+class DetrImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
        Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@@ -100,9 +100,9 @@ class DetrImageProcessorKwargs(ImagesKwargs):
        Path to the directory containing the segmentation masks.
    """
-    format: Optional[Union[str, AnnotationFormat]]
-    do_convert_annotations: Optional[bool]
-    return_segmentation_masks: Optional[bool]
+    format: Union[str, AnnotationFormat]
+    do_convert_annotations: bool
+    return_segmentation_masks: bool
    annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
    masks_path: Optional[Union[str, pathlib.Path]]

View File

@@ -55,7 +55,9 @@ class DiaProcessorKwargs(ProcessingKwargs, total=False):
            "generation": True,
            "sampling_rate": 44100,
        },
-        "common_kwargs": {"return_tensors": "pt"},
+        "common_kwargs": {
+            "return_tensors": "pt",
+        },
    }

View File

@@ -52,7 +52,7 @@ if is_vision_available():
    import PIL
-class DonutImageProcessorKwargs(ImagesKwargs):
+class DonutImageProcessorKwargs(ImagesKwargs, total=False):
    """
    do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`):
        Whether to resize the image using thumbnail method.
@@ -60,8 +60,8 @@ class DonutImageProcessorKwargs(ImagesKwargs):
        Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
    """
-    do_thumbnail: Optional[bool]
-    do_align_long_axis: Optional[bool]
+    do_thumbnail: bool
+    do_align_long_axis: bool
@requires(backends=("vision",))

View File

@@ -64,7 +64,7 @@ if is_vision_available():
logger = logging.get_logger(__name__)
-class DPTImageProcessorKwargs(ImagesKwargs):
+class DPTImageProcessorKwargs(ImagesKwargs, total=False):
    """
    ensure_multiple_of (`int`, *optional*, defaults to 1):
        If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
@@ -78,10 +78,10 @@ class DPTImageProcessorKwargs(ImagesKwargs):
        ADE20k). The background label will be replaced by 255.
    """
-    ensure_multiple_of: Optional[int]
-    size_divisor: Optional[int]
-    keep_aspect_ratio: Optional[bool]
-    do_reduce_labels: Optional[bool]
+    ensure_multiple_of: int
+    size_divisor: int
+    keep_aspect_ratio: bool
+    do_reduce_labels: bool
def get_resize_output_image_size(

View File

@@ -50,13 +50,13 @@ if is_vision_available():
logger = logging.get_logger(__name__)
-class EfficientLoFTRImageProcessorKwargs(ImagesKwargs):
+class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    do_grayscale (`bool`, *optional*, defaults to `True`):
        Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
    """
-    do_grayscale: Optional[bool] = True
+    do_grayscale: bool
# Copied from transformers.models.superpoint.image_processing_superpoint.is_grayscale

View File

@@ -44,7 +44,7 @@ if is_vision_available():
logger = logging.get_logger(__name__)
-class EfficientNetImageProcessorKwargs(ImagesKwargs):
+class EfficientNetImageProcessorKwargs(ImagesKwargs, total=False):
    """
    rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`):
        Whether to rescale the image between [-max_range/2, scale_range/2] instead of [0, scale_range].

View File

@@ -47,9 +47,9 @@ if is_vision_available():
logger = logging.get_logger(__name__)
-class Emu3ImageProcessorKwargs(ImagesKwargs):
-    ratio: Optional[str]
-    image_area: Optional[int]
+class Emu3ImageProcessorKwargs(ImagesKwargs, total=False):
+    ratio: str
+    image_area: int
def smart_resize(

View File

@@ -55,7 +55,7 @@ if is_torch_available():
    import torch.nn.functional as F
-class EomtImageProcessorKwargs(ImagesKwargs):
+class EomtImageProcessorKwargs(ImagesKwargs, total=False):
    """
    do_split_image (`bool`, *optional*, defaults to `False`):
        Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the
@@ -67,7 +67,7 @@ class EomtImageProcessorKwargs(ImagesKwargs):
    """
    do_split_image: bool
-    ignore_index: Optional[int] = None
+    ignore_index: Optional[int]
# Adapted from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks

View File

@@ -57,7 +57,7 @@ FLAVA_CODEBOOK_STD = [1.0, 1.0, 1.0]
LOGIT_LAPLACE_EPS: float = 0.1
-class FlavaImageProcessorKwargs(ImagesKwargs):
+class FlavaImageProcessorKwargs(ImagesKwargs, total=False):
    """
    return_image_mask (`bool`, *optional*, defaults to `False`):
        Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
@@ -118,26 +118,26 @@ class FlavaImageProcessorKwargs(ImagesKwargs):
    """
    # Mask related params
-    return_image_mask: Optional[bool]
-    input_size_patches: Optional[int]
-    total_mask_patches: Optional[int]
-    mask_group_min_patches: Optional[int]
-    mask_group_max_patches: Optional[int]
-    mask_group_min_aspect_ratio: Optional[float]
-    mask_group_max_aspect_ratio: Optional[float]
+    return_image_mask: bool
+    input_size_patches: int
+    total_mask_patches: int
+    mask_group_min_patches: int
+    mask_group_max_patches: int
+    mask_group_min_aspect_ratio: float
+    mask_group_max_aspect_ratio: float
    # Codebook related params
-    return_codebook_pixels: Optional[bool]
-    codebook_do_resize: Optional[bool]
-    codebook_size: Optional[bool]
-    codebook_resample: Optional[int]
-    codebook_do_center_crop: Optional[bool]
-    codebook_crop_size: Optional[int]
-    codebook_do_rescale: Optional[bool]
-    codebook_rescale_factor: Optional[Union[int, float]]
-    codebook_do_map_pixels: Optional[bool]
-    codebook_do_normalize: Optional[bool]
-    codebook_image_mean: Optional[Union[float, Iterable[float]]]
-    codebook_image_std: Optional[Union[float, Iterable[float]]]
+    return_codebook_pixels: bool
+    codebook_do_resize: bool
+    codebook_size: bool
+    codebook_resample: int
+    codebook_do_center_crop: bool
+    codebook_crop_size: int
+    codebook_do_rescale: bool
+    codebook_rescale_factor: Union[int, float]
+    codebook_do_map_pixels: bool
+    codebook_do_normalize: bool
+    codebook_image_mean: Union[float, Iterable[float]]
+    codebook_image_std: Union[float, Iterable[float]]
# Inspired from https://github.com/microsoft/unilm/blob/master/beit/masking_generator.py

View File

@@ -51,7 +51,7 @@ if is_vision_available():
    import PIL
-class Gemma3ImageProcessorKwargs(ImagesKwargs):
+class Gemma3ImageProcessorKwargs(ImagesKwargs, total=False):
    """
    do_pan_and_scan (`bool`, *optional*):
        Whether to apply `pan_and_scan` to images.
@@ -63,10 +63,10 @@ class Gemma3ImageProcessorKwargs(ImagesKwargs):
        Minimum aspect ratio to activate pan and scan.
    """
-    do_pan_and_scan: Optional[bool]
-    pan_and_scan_min_crop_size: Optional[int]
-    pan_and_scan_max_num_crops: Optional[int]
-    pan_and_scan_min_ratio_to_activate: Optional[float]
+    do_pan_and_scan: bool
+    pan_and_scan_min_crop_size: int
+    pan_and_scan_max_num_crops: int
+    pan_and_scan_min_ratio_to_activate: float
class Gemma3ImageProcessor(BaseImageProcessor):

View File

@@ -47,7 +47,7 @@ from ...video_utils import VideoInput
logger = logging.get_logger(__name__)
-class Glm4vImageProcessorKwargs(ImagesKwargs):
+class Glm4vImageProcessorKwargs(ImagesKwargs, total=False):
    """
    patch_size (`int`, *optional*, defaults to 14):
        The spatial patch size of the vision encoder.
@@ -57,9 +57,9 @@ class Glm4vImageProcessorKwargs(ImagesKwargs):
        The merge size of the vision encoder to llm encoder.
    """
-    patch_size: Optional[int]
-    temporal_patch_size: Optional[int]
-    merge_size: Optional[int]
+    patch_size: int
+    temporal_patch_size: int
+    merge_size: int
def smart_resize(

View File

@@ -36,12 +36,12 @@ from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
from .image_processing_glm4v import smart_resize
-class Glm4vVideoProcessorInitKwargs(VideosKwargs):
-    max_image_size: Optional[dict[str, int]]
-    patch_size: Optional[int]
-    temporal_patch_size: Optional[int]
-    merge_size: Optional[int]
-    max_duration: Optional[int]
+class Glm4vVideoProcessorInitKwargs(VideosKwargs, total=False):
+    max_image_size: dict[str, int]
+    patch_size: int
+    temporal_patch_size: int
+    merge_size: int
+    max_duration: int
@add_start_docstrings(

View File

@@ -49,7 +49,7 @@ if is_vision_available():
logger = logging.get_logger(__name__)
-class GotOcr2ImageProcessorKwargs(ImagesKwargs):
+class GotOcr2ImageProcessorKwargs(ImagesKwargs, total=False):
    """
    crop_to_patches (`bool`, *optional*, defaults to `False`):
        Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
@@ -62,9 +62,9 @@ class GotOcr2ImageProcessorKwargs(ImagesKwargs):
        set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
    """
-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
+    crop_to_patches: bool
+    min_patches: int
+    max_patches: int
# Similar to image_processing_mllama.get_all_supported_aspect_ratios

View File

@@ -36,13 +36,13 @@ class GotOcr2TextKwargs(TextKwargs, total=False):
class GotOcr2ImagesKwargs(ImagesKwargs, total=False):
-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
+    crop_to_patches: bool
+    min_patches: int
+    max_patches: int
    box: Optional[Union[list, tuple[float, float], tuple[float, float, float, float]]]
    color: Optional[str]
-    num_image_tokens: Optional[int]
-    multi_page: Optional[bool]
+    num_image_tokens: int
+    multi_page: bool
class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False):

View File

@@ -93,7 +93,7 @@ class AnnotationFormat(ExplicitEnum):
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
-class GroundingDinoImageProcessorKwargs(ImagesKwargs):
+class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
        Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@@ -109,9 +109,9 @@ class GroundingDinoImageProcessorKwargs(ImagesKwargs):
        Path to the directory containing the segmentation masks.
    """
-    format: Optional[Union[str, AnnotationFormat]]
-    do_convert_annotations: Optional[bool]
-    return_segmentation_masks: Optional[bool]
+    format: Union[str, AnnotationFormat]
+    do_convert_annotations: bool
+    return_segmentation_masks: bool
    annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
    masks_path: Optional[Union[str, pathlib.Path]]

View File

@@ -36,7 +36,7 @@ IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]
IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]
-class IdeficsImageProcessorKwargs(ImagesKwargs):
+class IdeficsImageProcessorKwargs(ImagesKwargs, total=False):
    """
    transform (`Callable`, *optional*):
        A custom transform function that accepts a single image can be passed for training. For example,
@@ -47,7 +47,7 @@ class IdeficsImageProcessorKwargs(ImagesKwargs):
    """
    transform: Optional[Callable]
-    image_size: Optional[dict[str, int]]
+    image_size: dict[str, int]
def convert_to_rgb(image):

View File

@@ -47,13 +47,13 @@ if is_vision_available():
    from PIL import Image
-class Idefics2ImageProcessorKwargs(ImagesKwargs):
+class Idefics2ImageProcessorKwargs(ImagesKwargs, total=False):
    """
    do_image_splitting (`bool`, *optional*, defaults to `False`):
        Whether to split the image into a sequence 4 equal sub-images concatenated with the original image.
    """
-    do_image_splitting: Optional[bool]
+    do_image_splitting: bool
def get_resize_output_image_size(image, size, input_data_format) -> tuple[int, int]:

View File

@@ -48,7 +48,7 @@ if is_vision_available():
    from PIL import Image
-class Idefics3ImageProcessorKwargs(ImagesKwargs):
+class Idefics3ImageProcessorKwargs(ImagesKwargs, total=False):
    """
    do_image_splitting (`bool`, *optional*, defaults to `True`):
        Whether to split the image into sub-images concatenated with the original image. They are split into patches
@@ -59,9 +59,9 @@ class Idefics3ImageProcessorKwargs(ImagesKwargs):
        Whether to return the row and column information of the images.
    """
-    do_image_splitting: Optional[bool]
-    max_image_size: Optional[dict[str, int]]
-    return_row_col_info: Optional[bool]
+    do_image_splitting: bool
+    max_image_size: dict[str, int]
+    return_row_col_info: bool
def _resize_output_size_rescale_to_max_len(

View File

@@ -45,7 +45,7 @@ if is_torch_available():
logger = logging.get_logger(__name__)
-class ImageGPTImageProcessorKwargs(ImagesKwargs):
+class ImageGPTImageProcessorKwargs(ImagesKwargs, total=False):
    """
    clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*):
        The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters`
@@ -56,7 +56,7 @@ class ImageGPTImageProcessorKwargs(ImagesKwargs):
    """
    clusters: Optional[Union[np.ndarray, list[list[int]], "torch.Tensor"]]
-    do_color_quantize: Optional[bool]
+    do_color_quantize: bool
def squared_euclidean_distance(a, b):

View File

@@ -24,15 +24,11 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict
-from ...processing_utils import Unpack, VideosKwargs
from ...utils import TensorType
from ...video_processing_utils import BaseVideoProcessor
from ...video_utils import group_videos_by_shape, reorder_videos
-class InstructBlipVideoVideoProcessorInitKwargs(VideosKwargs): ...
class InstructBlipVideoVideoProcessor(BaseVideoProcessor):
    resample = PILImageResampling.BICUBIC
    image_mean = OPENAI_CLIP_MEAN
@@ -44,12 +40,8 @@ class InstructBlipVideoVideoProcessor(BaseVideoProcessor):
    do_normalize = True
    do_convert_rgb = True
    do_sample_frames = False # Set to False for BC, recommended to set `True` in new models
-    valid_kwargs = InstructBlipVideoVideoProcessorInitKwargs
    model_input_names = ["pixel_values"]
-    def __init__(self, **kwargs: Unpack[InstructBlipVideoVideoProcessorInitKwargs]):
-        super().__init__(**kwargs)
    def _preprocess(
        self,
        videos: list["torch.Tensor"],

View File

@@ -27,7 +27,7 @@ from ...video_processing_utils import BaseVideoProcessor
from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
-class InternVLVideoProcessorInitKwargs(VideosKwargs):
+class InternVLVideoProcessorInitKwargs(VideosKwargs, total=False):
    initial_shift: Union[bool, float, int]

View File

@@ -51,7 +51,7 @@ if is_vision_available():
logger = logging.get_logger(__name__)
-class JanusImageProcessorKwargs(ImagesKwargs):
+class JanusImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width

View File

@@ -1289,7 +1289,7 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
        return generated_tokens
-class JanusImageProcessorKwargs(ImagesKwargs):
+class JanusImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width

View File

@@ -33,15 +33,17 @@ BboxInput = Union[
    list[list[tuple[float, float, float]]],
]
+NestedList = list[Union[Optional[int], "NestedList"]]
class Kosmos2ImagesKwargs(ImagesKwargs, total=False):
-    bboxes: Optional[list[float]]
-    num_image_tokens: Optional[int]
+    bboxes: Optional[NestedList] # NOTE: hub validators can't accept `Sequence`
+    num_image_tokens: int
    first_image_token_id: Optional[int]
class Kosmos2TextKwargs(TextKwargs, total=False):
-    add_eos_token: Optional[bool]
+    add_eos_token: bool
class Kosmos2ProcessorKwargs(ProcessingKwargs, total=False):
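The recursive `NestedList` alias replaces the old flat `list[float]` annotation for `bboxes` because, per the inline note, the hub validator cannot accept abstract `Sequence` types, so the nested per-image box structure is spelled out with concrete `list`s. A short illustration of values that satisfy the alias; the exact bbox semantics follow the Kosmos-2 processor documentation, not this sketch.

from typing import Optional, Union

NestedList = list[Union[Optional[int], "NestedList"]]

# Arbitrarily nested lists whose leaves are ints or None all satisfy the alias:
bboxes: NestedList = [[None], [12, 27], [[5, 9], [40, 44]]]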

View File

@@ -46,7 +46,7 @@ logger = logging.get_logger(__name__)
DEFAULT_FONT_PATH = "ybelkada/fonts"
-class Kosmos2_5ImageProcessorKwargs(ImagesKwargs):
+class Kosmos2_5ImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
        The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.
@@ -55,8 +55,8 @@ class Kosmos2_5ImageProcessorKwargs(ImagesKwargs):
        [KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419).
    """
-    patch_size: Optional[dict[str, int]]
-    max_patches: Optional[int]
+    patch_size: dict[str, int]
+    max_patches: int
# Copied from transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches

View File

@@ -52,7 +52,7 @@ if is_pytesseract_available():
logger = logging.get_logger(__name__)
-class LayoutLMv2ImageProcessorKwargs(ImagesKwargs):
+class LayoutLMv2ImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    apply_ocr (`bool`, *optional*, defaults to `True`):
        Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
@@ -66,7 +66,7 @@ class LayoutLMv2ImageProcessorKwargs(ImagesKwargs):
        `preprocess` method.
    """
-    apply_ocr: Optional[bool]
+    apply_ocr: bool
    ocr_lang: Optional[str]
    tesseract_config: Optional[str]

View File

@@ -56,7 +56,7 @@ if is_pytesseract_available():
logger = logging.get_logger(__name__)
-class LayoutLMv3ImageProcessorKwargs(ImagesKwargs):
+class LayoutLMv3ImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    apply_ocr (`bool`, *optional*, defaults to `True`):
        Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
@@ -70,7 +70,7 @@ class LayoutLMv3ImageProcessorKwargs(ImagesKwargs):
        `preprocess` method.
    """
-    apply_ocr: Optional[bool]
+    apply_ocr: bool
    ocr_lang: Optional[str]
    tesseract_config: Optional[str]

View File

@@ -14,7 +14,7 @@
# limitations under the License.
import math
from functools import lru_cache
-from typing import Optional, Union
+from typing import Union
import torch
from torchvision.transforms.v2 import functional as F
@@ -169,24 +169,24 @@ def pad_along_first_dim(
    return images, pixel_mask
-class Lfm2VlImageProcessorKwargs(ImagesKwargs):
+class Lfm2VlImageProcessorKwargs(ImagesKwargs, total=False):
    """
    downsample_factor (`int`, *optional*, defaults to `2`):
        The downsampling factor for images used when resizing the image.
    """
-    downsample_factor: Optional[int]
-    do_image_splitting: Optional[bool]
-    min_tiles: Optional[int]
-    max_tiles: Optional[int]
-    use_thumbnail: Optional[bool]
-    min_image_tokens: Optional[int]
-    max_image_tokens: Optional[int]
-    encoder_patch_size: Optional[int]
-    tile_size: Optional[int]
-    max_pixels_tolerance: Optional[float]
-    do_pad: Optional[bool]
-    return_row_col_info: Optional[bool]
+    downsample_factor: int
+    do_image_splitting: bool
+    min_tiles: int
+    max_tiles: int
+    use_thumbnail: bool
+    min_image_tokens: int
+    max_image_tokens: int
+    encoder_patch_size: int
+    tile_size: int
+    max_pixels_tolerance: float
+    do_pad: bool
+    return_row_col_info: bool
@auto_docstring

View File

@@ -18,9 +18,9 @@ from typing import Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, make_nested_list_of_images
from ...processing_utils import (
-    ImagesKwargs,
    ProcessingKwargs,
    ProcessorMixin,
+    TextKwargs,
    Unpack,
)
from ...tokenization_utils_base import BatchEncoding, TextInput
@@ -30,25 +30,12 @@ from ...utils import logging
logger = logging.get_logger(__name__)
-class Lfm2VlImagesKwargs(ImagesKwargs, total=False):
-    downsample_factor: Optional[int]
-    do_image_splitting: Optional[bool]
-    min_tiles: Optional[int]
-    max_tiles: Optional[int]
-    use_thumbnail: Optional[bool]
-    min_image_tokens: Optional[int]
-    max_image_tokens: Optional[int]
-    encoder_patch_size: Optional[int]
-    tile_size: Optional[int]
-    max_pixels_tolerance: Optional[float]
-    patch_size: Optional[int]
-    do_pad: Optional[bool]
-    return_row_col_info: Optional[bool]
+class Lfm2VlTextKwargs(TextKwargs, total=False):
+    use_image_special_tokens: Optional[bool]
class Lfm2VlProcessorKwargs(ProcessingKwargs, total=False):
-    images_kwargs: Lfm2VlImagesKwargs
+    text_kwargs: Lfm2VlTextKwargs
    _defaults = {
        "images_kwargs": {
            "return_row_col_info": True,
@@ -75,8 +62,6 @@ class Lfm2VlProcessor(ProcessorMixin):
            An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
        chat_template (`str`, *optional*):
            A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
-        use_image_special_tokens (`bool`, *optional*, defaults to `True`):
-            Whether to use image special tokens or not when processing.
    """
    attributes = ["image_processor", "tokenizer"]
@@ -88,12 +73,10 @@ class Lfm2VlProcessor(ProcessorMixin):
        image_processor,
        tokenizer,
        chat_template: Optional[str] = None,
-        use_image_special_tokens: Optional[bool] = True,
        **kwargs,
    ):
        self.image_token = tokenizer.image_token
        self.image_token_id = tokenizer.image_token_id
-        self.use_image_special_tokens = use_image_special_tokens
        self.image_start_token = tokenizer.image_start_token
        self.image_end_token = tokenizer.image_end_token
        self.image_thumbnail_token = tokenizer.image_thumbnail
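The net effect for LFM2-VL is a small API migration: `use_image_special_tokens` is no longer stored on the processor at construction time but is declared in `Lfm2VlTextKwargs` and passed per call. A hedged sketch of the new call pattern; the repo id, prompt, and placeholder image are assumptions for illustration.

import numpy as np
from PIL import Image

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("LiquidAI/LFM2-VL-1.6B")  # assumed repo id
image = Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8))    # placeholder image

# Previously a constructor flag; now routed through text_kwargs on each call and type-checked there.
outputs = processor(
    text="Describe this image.",  # image placeholder tokens follow the LFM2-VL docs
    images=image,
    use_image_special_tokens=False,
    return_tensors="pt",
)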

View File

@@ -308,7 +308,7 @@ def get_best_fit(
    return optimal_canvas
-class Llama4ImageProcessorKwargs(ImagesKwargs):
+class Llama4ImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    max_patches (`int`, *optional*, defaults to 16):
        The maximum number of patches to be extracted from the image.
@@ -320,8 +320,8 @@ class Llama4ImageProcessorKwargs(ImagesKwargs):
        but never upsample, unless the image is smaller than the patch size.
    """
-    max_patches: Optional[int]
-    resize_to_max_canvas: Optional[bool]
+    max_patches: int
+    resize_to_max_canvas: bool
@auto_docstring

View File

@@ -59,7 +59,7 @@ if is_vision_available():
    from PIL import Image
-class LlavaNextImageProcessorKwargs(ImagesKwargs):
+class LlavaNextImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    image_grid_pinpoints (`list[list[int]]`, *optional*):
        A list of possible resolutions to use for processing high resolution images. The best resolution is selected
@@ -67,7 +67,7 @@ class LlavaNextImageProcessorKwargs(ImagesKwargs):
        method.
    """
-    image_grid_pinpoints: Optional[list[list[int]]]
+    image_grid_pinpoints: list[list[int]]
def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.ndarray]:

View File

@@ -58,7 +58,7 @@ if is_vision_available():
    from PIL import Image
-class LlavaOnevisionImageProcessorKwargs(ImagesKwargs):
+class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    image_grid_pinpoints (`list[list[int]]`, *optional*):
        A list of possible resolutions to use for processing high resolution images. The best resolution is selected
@@ -66,7 +66,7 @@ class LlavaOnevisionImageProcessorKwargs(ImagesKwargs):
        method.
    """
-    image_grid_pinpoints: Optional[list[list[int]]]
+    image_grid_pinpoints: list[list[int]]
# Copied from transformers.models.llava_next.image_processing_llava_next.divide_to_patches

View File

@@ -76,8 +76,7 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):
            batch_num_images = [1] * len(images)
        else:
            batch_num_images = [1]
-        kwargs["batch_num_images"] = batch_num_images
-        return super().preprocess(images, **kwargs)
+        return super().preprocess(images, batch_num_images, **kwargs)
    def _resize_for_patching(
        self,
@@ -202,6 +201,7 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):
    def _preprocess(
        self,
        images: list["torch.Tensor"],
+        batch_num_images: list[int],
        do_resize: bool,
        size: SizeDict,
        image_grid_pinpoints: list[list[int]],
@@ -214,7 +214,6 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):
        image_mean: Optional[Union[float, list[float]]],
        image_std: Optional[Union[float, list[float]]],
        do_pad: bool,
-        batch_num_images: list[int],
        disable_grouping: Optional[bool],
        return_tensors: Optional[Union[str, TensorType]],
        **kwargs,

View File

@@ -35,7 +35,7 @@ from transformers.models.llava_next_video.modeling_llava_next_video import (
from ...cache_utils import Cache
from ...image_processing_utils import BatchFeature
-from ...image_processing_utils_fast import group_images_by_shape, reorder_images
+from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
from ...image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
@@ -128,12 +128,12 @@ class LlavaOnevisionImageProcessorFast(LlavaNextImageProcessorFast):
            batch_num_images = [1] * len(images)
        else:
            batch_num_images = [1]
-        kwargs["batch_num_images"] = batch_num_images
-        return super().preprocess(images, **kwargs)
+        return BaseImageProcessorFast.preprocess(images, batch_num_images, **kwargs)
    def _preprocess(
        self,
        images: list["torch.Tensor"],
+        batch_num_images: list[int],
        do_resize: bool,
        size: SizeDict,
        image_grid_pinpoints: list[list[int]],
@@ -146,7 +146,6 @@ class LlavaOnevisionImageProcessorFast(LlavaNextImageProcessorFast):
        image_mean: Optional[Union[float, list[float]]],
        image_std: Optional[Union[float, list[float]]],
        do_pad: bool,
-        batch_num_images: list[int],
        disable_grouping: Optional[bool],
        return_tensors: Optional[Union[str, TensorType]],
        **kwargs,
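Both LLaVA-OneVision fast image processors now thread `batch_num_images` through as a positional argument instead of stuffing it into `kwargs`, since positional args deliberately bypass the new kwargs validation; the base class only requires that `preprocess` and `_preprocess` list the extra positionals in the same order. A toy sketch of that pattern, with stand-in classes rather than the transformers ones.

class ToyImageProcessorFast:
    def preprocess(self, images, *args, **kwargs):
        # kwargs would be validated against a TypedDict here; *args pass through untouched
        return self._preprocess(images, *args, **kwargs)


class ToyOnevisionImageProcessor(ToyImageProcessorFast):
    def preprocess(self, images, **kwargs):
        batch_num_images = [1] * len(images)  # derived internally, not a user-supplied kwarg
        return super().preprocess(images, batch_num_images, **kwargs)

    def _preprocess(self, images, batch_num_images, **kwargs):
        return {"batch_num_images": batch_num_images, "kwargs": kwargs}


print(ToyOnevisionImageProcessor().preprocess(["img0", "img1"], do_resize=True))
# {'batch_num_images': [1, 1], 'kwargs': {'do_resize': True}}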

View File

@@ -61,7 +61,7 @@ if is_torch_available():
    from torch import nn
-class Mask2FormerImageProcessorKwargs(ImagesKwargs):
+class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    ignore_index (`int`, *optional*):
        Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
@@ -74,9 +74,9 @@ class Mask2FormerImageProcessorKwargs(ImagesKwargs):
        The number of labels in the segmentation map.
    """
-    size_divisor: Optional[int]
+    size_divisor: int
    ignore_index: Optional[int]
-    do_reduce_labels: Optional[bool]
+    do_reduce_labels: bool
    num_labels: Optional[int]

View File

@@ -67,7 +67,7 @@ if is_torch_available():
    from torch import nn
-class MaskFormerImageProcessorKwargs(ImagesKwargs):
+class MaskFormerImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    ignore_index (`int`, *optional*):
        Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
@@ -80,9 +80,9 @@ class MaskFormerImageProcessorKwargs(ImagesKwargs):
        The number of labels in the segmentation map.
    """
-    size_divisor: Optional[int]
+    size_divisor: int
    ignore_index: Optional[int]
-    do_reduce_labels: Optional[bool]
+    do_reduce_labels: bool
    num_labels: Optional[int]

View File

@@ -50,13 +50,13 @@ if is_vision_available():
logger = logging.get_logger(__name__)
-class MllamaImageProcessorKwargs(ImagesKwargs):
+class MllamaImageProcessorKwargs(ImagesKwargs, total=False):
    """
    max_image_tiles (`int`, *optional*):
        The maximum number of tiles allowed.
    """
-    max_image_tiles: Optional[int]
+    max_image_tiles: int
@lru_cache(maxsize=10)

View File

@@ -258,9 +258,7 @@ class MllamaProcessor(ProcessorMixin):
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
-        images_kwargs = output_kwargs["images_kwargs"]
        data = {}
        if text is not None:
@@ -306,7 +304,7 @@ class MllamaProcessor(ProcessorMixin):
            )
        if images is not None:
-            image_features = self.image_processor(images, **images_kwargs)
+            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
            num_tiles = image_features.pop("num_tiles")
            data.update(image_features)

View File

@@ -51,7 +51,7 @@ from ...utils.import_utils import requires
logger = logging.get_logger(__name__)
-class MobileNetV2ImageProcessorKwargs(ImagesKwargs):
+class MobileNetV2ImageProcessorKwargs(ImagesKwargs, total=False):
    """
    do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
        Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
@@ -59,7 +59,7 @@ class MobileNetV2ImageProcessorKwargs(ImagesKwargs):
        ADE20k). The background label will be replaced by 255.
    """
-    do_reduce_labels: Optional[bool]
+    do_reduce_labels: bool
@requires(backends=("vision",))

View File

@@ -53,7 +53,7 @@ if is_torch_available():
logger = logging.get_logger(__name__)
-class MobileVitImageProcessorKwargs(ImagesKwargs):
+class MobileVitImageProcessorKwargs(ImagesKwargs, total=False):
    """
    do_flip_channel_order (`bool`, *optional*, defaults to `self.do_flip_channel_order`):
        Whether to flip the color channels from RGB to BGR or vice versa.
@@ -63,8 +63,8 @@ class MobileVitImageProcessorKwargs(ImagesKwargs):
        ADE20k). The background label will be replaced by 255.
    """
-    do_flip_channel_order: Optional[bool]
-    do_reduce_labels: Optional[bool]
+    do_flip_channel_order: bool
+    do_reduce_labels: bool
@requires(backends=("vision",))

View File

@@ -52,7 +52,7 @@ if is_vision_available():
    import PIL
-class NougatImageProcessorKwargs(ImagesKwargs):
+class NougatImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    do_crop_margin (`bool`, *optional*, defaults to `True`):
        Whether to crop the image margins.
@@ -62,9 +62,9 @@ class NougatImageProcessorKwargs(ImagesKwargs):
        Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
    """
-    do_crop_margin: Optional[bool]
-    do_thumbnail: Optional[bool]
-    do_align_long_axis: Optional[bool]
+    do_crop_margin: bool
+    do_thumbnail: bool
+    do_align_long_axis: bool
class NougatImageProcessor(BaseImageProcessor):

View File

@@ -64,7 +64,7 @@ if is_torch_available():
    from torch import nn
-class OneFormerImageProcessorKwargs(ImagesKwargs):
+class OneFormerImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    repo_path (`str`, *optional*, defaults to `shi-labs/oneformer_demo`):
        Path to a local directory or Hugging Face Hub repository containing model metadata.
@@ -85,7 +85,7 @@ class OneFormerImageProcessorKwargs(ImagesKwargs):
    num_text: Optional[int]
    num_labels: Optional[int]
    ignore_index: Optional[int]
-    do_reduce_labels: Optional[bool]
+    do_reduce_labels: bool
# Copied from transformers.models.detr.image_processing_detr.max_across_indices

View File

@ -44,7 +44,7 @@ if is_vision_available():
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class Ovis2ImageProcessorKwargs(ImagesKwargs): class Ovis2ImageProcessorKwargs(ImagesKwargs, total=False):
""" """
crop_to_patches (`bool`, *optional*, defaults to `False`): crop_to_patches (`bool`, *optional*, defaults to `False`):
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
@ -61,10 +61,10 @@ class Ovis2ImageProcessorKwargs(ImagesKwargs):
`preprocess` method. `preprocess` method.
""" """
crop_to_patches: Optional[bool] crop_to_patches: bool
min_patches: Optional[int] min_patches: int
max_patches: Optional[int] max_patches: int
use_covering_area_grid: Optional[bool] use_covering_area_grid: bool
# Similar to image_processing_mllama.get_all_supported_aspect_ratios # Similar to image_processing_mllama.get_all_supported_aspect_ratios

View File

@ -42,7 +42,7 @@ from ...utils import (
) )
class PerceptionLMImageProcessorKwargs(ImagesKwargs): class PerceptionLMImageProcessorKwargs(ImagesKwargs, total=False):
r""" r"""
vision_input_type (`str`, *optional*, defaults to `"thumb+tile"`): vision_input_type (`str`, *optional*, defaults to `"thumb+tile"`):
Vision processing strategy. `"thumb+tile"` uses both thumbnails and multiple tiles for Vision processing strategy. `"thumb+tile"` uses both thumbnails and multiple tiles for
@ -54,8 +54,8 @@ class PerceptionLMImageProcessorKwargs(ImagesKwargs):
""" """
vision_input_type: Optional[str] vision_input_type: Optional[str]
tile_size: Optional[int] tile_size: int
max_num_tiles: Optional[int] max_num_tiles: int
@auto_docstring @auto_docstring
@ -68,7 +68,7 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast):
do_rescale = True do_rescale = True
do_normalize = True do_normalize = True
do_convert_rgb = True do_convert_rgb = True
vision_input_type = "thumb+tail" vision_input_type = "thumb+tile"
tile_size = 448 tile_size = 448
max_num_tiles = 36 max_num_tiles = 36
size = {"width": 448, "height": 448} # for backward compatibility in tests size = {"width": 448, "height": 448} # for backward compatibility in tests

View File

@ -35,7 +35,7 @@ from ...utils import (
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class Phi4MultimodalImageProcessorKwargs(ImagesKwargs): class Phi4MultimodalImageProcessorKwargs(ImagesKwargs, total=False):
r""" r"""
patch_size (`int`, *optional*): patch_size (`int`, *optional*):
The size of the patch. The size of the patch.
@ -43,8 +43,8 @@ class Phi4MultimodalImageProcessorKwargs(ImagesKwargs):
The maximum number of crops per image. The maximum number of crops per image.
""" """
patch_size: Optional[int] patch_size: int
dynamic_hd: Optional[int] dynamic_hd: int
@auto_docstring @auto_docstring

View File

@ -49,7 +49,7 @@ logger = logging.get_logger(__name__)
DEFAULT_FONT_PATH = "ybelkada/fonts" DEFAULT_FONT_PATH = "ybelkada/fonts"
class Pix2StructImageProcessorKwargs(ImagesKwargs): class Pix2StructImageProcessorKwargs(ImagesKwargs, total=False):
""" """
max_patches (`int`, *optional*): max_patches (`int`, *optional*):
Maximum number of patches to extract. Maximum number of patches to extract.
@ -57,7 +57,7 @@ class Pix2StructImageProcessorKwargs(ImagesKwargs):
Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`. Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
""" """
max_patches: Optional[int] max_patches: int
header_text: Optional[Union[list[str], str]] header_text: Optional[Union[list[str], str]]

View File

@ -50,13 +50,13 @@ if is_vision_available():
import PIL import PIL
class PixtralImageProcessorKwargs(ImagesKwargs): class PixtralImageProcessorKwargs(ImagesKwargs, total=False):
""" """
patch_size (`dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`): patch_size (`Union[dict[str, int], int]` *optional*, defaults to `{"height": 16, "width": 16}`):
Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method.
""" """
patch_size: Optional[dict[str, int]] patch_size: Union[dict[str, int], int]
# Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white. # Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white.

View File

@ -48,13 +48,13 @@ if is_vision_available():
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class PoolFormerImageProcessorKwargs(ImagesKwargs): class PoolFormerImageProcessorKwargs(ImagesKwargs, total=False):
r""" r"""
crop_pct (`float`, *optional*, defaults to `self.crop_pct`): crop_pct (`float`, *optional*, defaults to `self.crop_pct`):
Percentage of the image to crop. Only has an effect if `do_resize` is set to `True`. Percentage of the image to crop. Only has an effect if `do_resize` is set to `True`.
""" """
crop_pct: Optional[float] crop_pct: float
class PoolFormerImageProcessor(BaseImageProcessor): class PoolFormerImageProcessor(BaseImageProcessor):

View File

@ -54,7 +54,7 @@ if is_torch_available():
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class PromptDepthAnythingImageProcessorKwargs(ImagesKwargs): class PromptDepthAnythingImageProcessorKwargs(ImagesKwargs, total=False):
r""" r"""
keep_aspect_ratio (`bool`, *optional*): keep_aspect_ratio (`bool`, *optional*):
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved.
@ -64,10 +64,10 @@ class PromptDepthAnythingImageProcessorKwargs(ImagesKwargs):
Scale factor to convert the prompt depth to meters. Scale factor to convert the prompt depth to meters.
""" """
keep_aspect_ratio: Optional[bool] keep_aspect_ratio: bool
ensure_multiple_of: Optional[int] ensure_multiple_of: int
size_divisor: Optional[int] size_divisor: int
prompt_scale_to_meter: Optional[float] prompt_scale_to_meter: float
def _constrain_to_multiple_of(val, multiple, min_val=0, max_val=None): def _constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):

View File

@ -32,17 +32,17 @@ from ...video_utils import VideoInput
# Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni # Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni
# and does not use them in video processor class # and does not use them in video processor class
class Qwen2_5_OmniVideosKwargs(VideosKwargs): class Qwen2_5_OmniVideosKwargs(VideosKwargs, total=False):
min_pixels: Optional[int] min_pixels: int
max_pixels: Optional[int] max_pixels: int
patch_size: Optional[int] patch_size: int
temporal_patch_size: Optional[int] temporal_patch_size: int
merge_size: Optional[int] merge_size: int
min_frames: Optional[int] min_frames: int
max_frames: Optional[int] max_frames: int
use_audio_in_video: Optional[bool] use_audio_in_video: bool
seconds_per_chunk: Optional[float] seconds_per_chunk: float
position_id_per_seconds: Optional[int] position_id_per_seconds: Union[int, float]
class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False): class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False):

View File

@ -52,7 +52,7 @@ from ...video_utils import VideoInput, make_batched_videos
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class Qwen2VLImageProcessorKwargs(ImagesKwargs): class Qwen2VLImageProcessorKwargs(ImagesKwargs, total=False):
r""" r"""
min_pixels (`int`, *optional*, defaults to `56 * 56`): min_pixels (`int`, *optional*, defaults to `56 * 56`):
The min pixels of the image to resize the image. The min pixels of the image to resize the image.
@ -66,11 +66,11 @@ class Qwen2VLImageProcessorKwargs(ImagesKwargs):
The merge size of the vision encoder to llm encoder. The merge size of the vision encoder to llm encoder.
""" """
min_pixels: Optional[int] min_pixels: int
max_pixels: Optional[int] max_pixels: int
patch_size: Optional[int] patch_size: int
temporal_patch_size: Optional[int] temporal_patch_size: int
merge_size: Optional[int] merge_size: int
def smart_resize( def smart_resize(

View File

@ -41,14 +41,14 @@ from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
from .image_processing_qwen2_vl import smart_resize from .image_processing_qwen2_vl import smart_resize
class Qwen2VLVideoProcessorInitKwargs(VideosKwargs): class Qwen2VLVideoProcessorInitKwargs(VideosKwargs, total=False):
min_pixels: Optional[int] min_pixels: int
max_pixels: Optional[int] max_pixels: int
patch_size: Optional[int] patch_size: int
temporal_patch_size: Optional[int] temporal_patch_size: int
merge_size: Optional[int] merge_size: int
min_frames: Optional[int] min_frames: int
max_frames: Optional[int] max_frames: int
@add_start_docstrings( @add_start_docstrings(

View File

@ -20,7 +20,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import re import re
from typing import Optional from typing import Union
import numpy as np import numpy as np
@ -34,17 +34,17 @@ from ...video_utils import VideoInput, make_batched_videos
# Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni # Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni
# and does not use them in video processor class # and does not use them in video processor class
class Qwen3OmniMoeVideosKwargs(VideosKwargs): class Qwen3OmniMoeVideosKwargs(VideosKwargs, total=False):
min_pixels: Optional[int] min_pixels: int
max_pixels: Optional[int] max_pixels: int
patch_size: Optional[int] patch_size: int
temporal_patch_size: Optional[int] temporal_patch_size: int
merge_size: Optional[int] merge_size: int
min_frames: Optional[int] min_frames: int
max_frames: Optional[int] max_frames: int
use_audio_in_video: Optional[bool] use_audio_in_video: bool
seconds_per_chunk: Optional[float] seconds_per_chunk: float
position_id_per_seconds: Optional[int] position_id_per_seconds: Union[int, float]
class Qwen3OmniMoeProcessorKwargs(ProcessingKwargs, total=False): class Qwen3OmniMoeProcessorKwargs(ProcessingKwargs, total=False):

View File

@ -64,12 +64,12 @@ def smart_resize(
return h_bar, w_bar return h_bar, w_bar
class Qwen3VLVideoProcessorInitKwargs(VideosKwargs): class Qwen3VLVideoProcessorInitKwargs(VideosKwargs, total=False):
patch_size: Optional[int] patch_size: int
temporal_patch_size: Optional[int] temporal_patch_size: int
merge_size: Optional[int] merge_size: int
min_frames: Optional[int] min_frames: int
max_frames: Optional[int] max_frames: int
@add_start_docstrings( @add_start_docstrings(

View File

@ -68,7 +68,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION,) SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION,)
class RTDetrImageProcessorKwargs(ImagesKwargs): class RTDetrImageProcessorKwargs(ImagesKwargs, total=False):
r""" r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic". Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@ -84,9 +84,9 @@ class RTDetrImageProcessorKwargs(ImagesKwargs):
Path to the directory containing the segmentation masks. Path to the directory containing the segmentation masks.
""" """
format: Optional[Union[str, AnnotationFormat]] format: Union[str, AnnotationFormat]
do_convert_annotations: Optional[bool] do_convert_annotations: bool
return_segmentation_masks: Optional[bool] return_segmentation_masks: bool
annotations: Optional[Union[AnnotationType, list[AnnotationType]]] annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
masks_path: Optional[Union[str, pathlib.Path]] masks_path: Optional[Union[str, pathlib.Path]]

View File

@ -58,7 +58,7 @@ if is_torchvision_available():
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class SamImageProcessorKwargs(ImagesKwargs): class SamImageProcessorKwargs(ImagesKwargs, total=False):
r""" r"""
mask_size (`dict[str, int]`, *optional*): mask_size (`dict[str, int]`, *optional*):
The size `{"longest_edge": int}` to resize the segmentation maps to. The size `{"longest_edge": int}` to resize the segmentation maps to.
@ -67,8 +67,8 @@ class SamImageProcessorKwargs(ImagesKwargs):
map size provided for preprocessing. map size provided for preprocessing.
""" """
mask_size: Optional[dict[str, int]] mask_size: dict[str, int]
mask_pad_size: Optional[dict[str, int]] mask_pad_size: dict[str, int]
class SamImageProcessor(BaseImageProcessor): class SamImageProcessor(BaseImageProcessor):

View File

@ -31,14 +31,14 @@ if is_torch_available():
import torch import torch
class SamImagesKwargs(ImagesKwargs): class SamImagesKwargs(ImagesKwargs, total=False):
segmentation_maps: Optional[ImageInput] segmentation_maps: Optional[ImageInput]
input_points: Optional[list[list[float]]] input_points: Optional[list[list[float]]]
input_labels: Optional[list[list[int]]] input_labels: Optional[list[list[int]]]
input_boxes: Optional[list[list[list[float]]]] input_boxes: Optional[list[list[list[float]]]]
point_pad_value: Optional[int] point_pad_value: int
mask_size: Optional[dict[str, int]] mask_size: dict[str, int]
mask_pad_size: Optional[dict[str, int]] mask_pad_size: dict[str, int]
class SamProcessorKwargs(ProcessingKwargs, total=False): class SamProcessorKwargs(ProcessingKwargs, total=False):

View File

@ -43,13 +43,13 @@ from ...processing_utils import ImagesKwargs, Unpack
from ...utils import TensorType, auto_docstring from ...utils import TensorType, auto_docstring
class Sam2FastImageProcessorKwargs(ImagesKwargs): class Sam2FastImageProcessorKwargs(ImagesKwargs, total=False):
r""" r"""
mask_size (`dict[str, int]`, *optional*): mask_size (`dict[str, int]`, *optional*):
The size `{"height": int, "width": int}` to resize the segmentation maps to. The size `{"height": int, "width": int}` to resize the segmentation maps to.
""" """
mask_size: Optional[dict[str, int]] mask_size: dict[str, int]
def _compute_stability_score(masks: "torch.Tensor", mask_threshold: float, stability_score_offset: int): def _compute_stability_score(masks: "torch.Tensor", mask_threshold: float, stability_score_offset: int):

View File

@ -70,13 +70,13 @@ from .configuration_sam2 import (
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class Sam2FastImageProcessorKwargs(ImagesKwargs): class Sam2FastImageProcessorKwargs(ImagesKwargs, total=False):
r""" r"""
mask_size (`dict[str, int]`, *optional*): mask_size (`dict[str, int]`, *optional*):
The size `{"height": int, "width": int}` to resize the segmentation maps to. The size `{"height": int, "width": int}` to resize the segmentation maps to.
""" """
mask_size: Optional[dict[str, int]] mask_size: dict[str, int]
@auto_docstring @auto_docstring

View File

@ -31,14 +31,14 @@ if is_torch_available():
import torch import torch
class SamHQImagesKwargs(ImagesKwargs): class SamHQImagesKwargs(ImagesKwargs, total=False):
segmentation_maps: Optional[ImageInput] segmentation_maps: Optional[ImageInput]
input_points: Optional[list[list[float]]] input_points: Optional[list[list[float]]]
input_labels: Optional[list[list[int]]] input_labels: Optional[list[list[int]]]
input_boxes: Optional[list[list[list[float]]]] input_boxes: Optional[list[list[list[float]]]]
point_pad_value: Optional[int] point_pad_value: Optional[int]
mask_size: Optional[dict[str, int]] mask_size: dict[str, int]
mask_pad_size: Optional[dict[str, int]] mask_pad_size: dict[str, int]
class SamHQProcessorKwargs(ProcessingKwargs, total=False): class SamHQProcessorKwargs(ProcessingKwargs, total=False):

View File

@ -55,7 +55,7 @@ if is_torch_available():
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class SegformerImageProcessorKwargs(ImagesKwargs): class SegformerImageProcessorKwargs(ImagesKwargs, total=False):
r""" r"""
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
@ -63,7 +63,7 @@ class SegformerImageProcessorKwargs(ImagesKwargs):
ADE20k). The background label will be replaced by 255. ADE20k). The background label will be replaced by 255.
""" """
do_reduce_labels: Optional[bool] do_reduce_labels: bool
@requires(backends=("vision",)) @requires(backends=("vision",))

View File

@ -48,7 +48,7 @@ if is_vision_available():
from PIL import Image from PIL import Image
class Siglip2ImageProcessorKwargs(ImagesKwargs): class Siglip2ImageProcessorKwargs(ImagesKwargs, total=False):
""" """
patch_size (`int`, *optional*, defaults to 16): patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch the image will be split to. The size (resolution) of each patch the image will be split to.
@ -57,8 +57,8 @@ class Siglip2ImageProcessorKwargs(ImagesKwargs):
and then padded in "patch" dimension to match this number exactly. and then padded in "patch" dimension to match this number exactly.
""" """
patch_size: Optional[int] patch_size: int
max_num_patches: Optional[int] max_num_patches: int
@lru_cache(maxsize=256) @lru_cache(maxsize=256)

View File

@ -53,7 +53,7 @@ if is_vision_available():
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class SmolVLMImageProcessorKwargs(ImagesKwargs): class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False):
""" """
do_image_splitting (`bool`, *optional*, defaults to `True`): do_image_splitting (`bool`, *optional*, defaults to `True`):
Whether to split the image into sub-images concatenated with the original image. They are split into patches Whether to split the image into sub-images concatenated with the original image. They are split into patches
@ -64,9 +64,9 @@ class SmolVLMImageProcessorKwargs(ImagesKwargs):
Whether to return the row and column information of the images. Whether to return the row and column information of the images.
""" """
do_image_splitting: Optional[bool] do_image_splitting: bool
max_image_size: Optional[dict[str, int]] max_image_size: dict[str, int]
return_row_col_info: Optional[bool] return_row_col_info: bool
MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum

View File

@ -90,8 +90,8 @@ def get_resize_output_image_size(
return height, width return height, width
class SmolVLMVideoProcessorInitKwargs(VideosKwargs): class SmolVLMVideoProcessorInitKwargs(VideosKwargs, total=False):
max_image_size: Optional[dict[str, int]] max_image_size: dict[str, int]
class SmolVLMVideoProcessor(BaseVideoProcessor): class SmolVLMVideoProcessor(BaseVideoProcessor):

View File

@ -46,13 +46,13 @@ if is_vision_available():
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class SuperPointImageProcessorKwargs(ImagesKwargs): class SuperPointImageProcessorKwargs(ImagesKwargs, total=False):
r""" r"""
do_grayscale (`bool`, *optional*, defaults to `True`): do_grayscale (`bool`, *optional*, defaults to `True`):
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
""" """
do_grayscale: Optional[bool] = True do_grayscale: bool
def is_grayscale( def is_grayscale(

View File

@ -38,8 +38,8 @@ from ...utils.deprecation import deprecate_kwarg
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class Swin2SRImageProcessorKwargs(ImagesKwargs): class Swin2SRImageProcessorKwargs(ImagesKwargs, total=False):
size_divisor: Optional[int] size_divisor: int
class Swin2SRImageProcessor(BaseImageProcessor): class Swin2SRImageProcessor(BaseImageProcessor):

View File

@ -49,8 +49,8 @@ if is_vision_available():
import PIL import PIL
class TextNetImageProcessorKwargs(ImagesKwargs): class TextNetImageProcessorKwargs(ImagesKwargs, total=False):
size_divisor: Optional[int] size_divisor: int
class TextNetImageProcessor(BaseImageProcessor): class TextNetImageProcessor(BaseImageProcessor):

View File

@ -50,7 +50,7 @@ if is_vision_available():
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class TvpImageProcessorKwargs(ImagesKwargs): class TvpImageProcessorKwargs(ImagesKwargs, total=False):
r""" r"""
do_flip_channel_order (`bool`, *optional*): do_flip_channel_order (`bool`, *optional*):
Whether to flip the channel order of the image from RGB to BGR. Whether to flip the channel order of the image from RGB to BGR.
@ -60,7 +60,7 @@ class TvpImageProcessorKwargs(ImagesKwargs):
Padding mode to use — `'constant'`, `'edge'`, `'reflect'`, or `'symmetric'`. Padding mode to use — `'constant'`, `'edge'`, `'reflect'`, or `'symmetric'`.
""" """
do_flip_channel_order: Optional[bool] do_flip_channel_order: bool
constant_values: Optional[Union[float, list[float]]] constant_values: Optional[Union[float, list[float]]]
pad_mode: Optional[str] pad_mode: Optional[str]

View File

@ -31,7 +31,7 @@ logger = logging.get_logger(__name__)
class UdopTextKwargs(TextKwargs, total=False): class UdopTextKwargs(TextKwargs, total=False):
word_labels: Optional[Union[list[int], list[list[int]]]] word_labels: Optional[Union[list[int], list[list[int]]]]
boxes: Union[list[list[int]], list[list[list[int]]]] boxes: Optional[Union[list[list[int]], list[list[list[int]]]]]
class UdopProcessorKwargs(ProcessingKwargs, total=False): class UdopProcessorKwargs(ProcessingKwargs, total=False):

View File

@ -47,8 +47,8 @@ if is_vision_available():
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class ViltImageProcessorKwargs(ImagesKwargs): class ViltImageProcessorKwargs(ImagesKwargs, total=False):
size_divisor: Optional[int] size_divisor: int
def max_across_indices(values: Iterable[Any]) -> list[Any]: def max_across_indices(values: Iterable[Any]) -> list[Any]:

View File

@ -41,8 +41,8 @@ from ...utils.deprecation import deprecate_kwarg
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class VitMatteImageProcessorKwargs(ImagesKwargs): class VitMatteImageProcessorKwargs(ImagesKwargs, total=False):
size_divisor: Optional[int] size_divisor: int
class VitMatteImageProcessor(BaseImageProcessor): class VitMatteImageProcessor(BaseImageProcessor):

View File

@ -81,7 +81,7 @@ logger = logging.get_logger(__name__)
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
class YolosImageProcessorKwargs(ImagesKwargs): class YolosImageProcessorKwargs(ImagesKwargs, total=False):
r""" r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic". Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@ -97,9 +97,9 @@ class YolosImageProcessorKwargs(ImagesKwargs):
Path to the directory containing the segmentation masks. Path to the directory containing the segmentation masks.
""" """
format: Optional[Union[str, AnnotationFormat]] format: Union[str, AnnotationFormat]
do_convert_annotations: Optional[bool] do_convert_annotations: bool
return_segmentation_masks: Optional[bool] return_segmentation_masks: bool
annotations: Optional[Union[AnnotationType, list[AnnotationType]]] annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
masks_path: Optional[Union[str, pathlib.Path]] masks_path: Optional[Union[str, pathlib.Path]]

View File

@ -62,7 +62,7 @@ if is_torch_available():
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class ZoeDepthImageProcessorKwargs(ImagesKwargs): class ZoeDepthImageProcessorKwargs(ImagesKwargs, total=False):
""" """
keep_aspect_ratio (`bool`, *optional*, defaults to `True`): keep_aspect_ratio (`bool`, *optional*, defaults to `True`):
If `True`, the image is resized by choosing the smaller of the height and width scaling factors and using it If `True`, the image is resized by choosing the smaller of the height and width scaling factors and using it
@ -77,8 +77,8 @@ class ZoeDepthImageProcessorKwargs(ImagesKwargs):
Can be overridden by `ensure_multiple_of` in `preprocess`. Can be overridden by `ensure_multiple_of` in `preprocess`.
""" """
keep_aspect_ratio: Optional[bool] keep_aspect_ratio: bool
ensure_multiple_of: Optional[int] ensure_multiple_of: int
def get_resize_output_image_size( def get_resize_output_image_size(

View File

@ -25,10 +25,11 @@ import typing
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Any, Optional, TypedDict, TypeVar, Union from typing import Annotated, Any, Literal, Optional, TypedDict, TypeVar, Union
import numpy as np import numpy as np
import typing_extensions import typing_extensions
from huggingface_hub.dataclasses import validate_typed_dict
from huggingface_hub.errors import EntryNotFoundError from huggingface_hub.errors import EntryNotFoundError
from .audio_utils import AudioInput, load_audio from .audio_utils import AudioInput, load_audio
@ -36,13 +37,23 @@ from .dynamic_module_utils import custom_object_save
from .feature_extraction_utils import BatchFeature from .feature_extraction_utils import BatchFeature
from .image_utils import ChannelDimension, ImageInput, is_vision_available from .image_utils import ChannelDimension, ImageInput, is_vision_available
from .utils.chat_template_utils import render_jinja_template from .utils.chat_template_utils import render_jinja_template
from .video_utils import VideoInput, VideoMetadata from .utils.type_validators import (
device_validator,
image_size_validator,
padding_validator,
positive_any_number,
positive_int,
resampling_validator,
tensor_type_validator,
truncation_validator,
video_metadata_validator,
)
from .video_utils import VideoInput, VideoMetadataType
if is_vision_available(): if is_vision_available():
from .image_utils import PILImageResampling from .image_utils import PILImageResampling
from .tokenization_utils_base import ( from .tokenization_utils_base import (
PaddingStrategy, PaddingStrategy,
PreTokenizedInput, PreTokenizedInput,
@ -72,8 +83,6 @@ from .utils.deprecation import deprecate_kwarg
if is_torch_available(): if is_torch_available():
import torch
from .modeling_utils import PreTrainedAudioTokenizerBase from .modeling_utils import PreTrainedAudioTokenizerBase
@ -137,18 +146,22 @@ class TextKwargs(TypedDict, total=False):
The side on which padding will be applied. The side on which padding will be applied.
return_mm_token_type_ids (`bool`, *optional*): return_mm_token_type_ids (`bool`, *optional*):
Whether to return multimodal token type ids indicating mm placeholder token positions. Whether to return multimodal token type ids indicating mm placeholder token positions.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
""" """
text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]]
text_target: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] text_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]]
text_pair_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] text_pair_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]]
add_special_tokens: Optional[bool] add_special_tokens: Optional[bool]
padding: Union[bool, str, PaddingStrategy] padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()]
truncation: Union[bool, str, TruncationStrategy] truncation: Annotated[Optional[Union[bool, str, TruncationStrategy]], truncation_validator()]
max_length: Optional[int] max_length: Annotated[Optional[int], positive_int()]
stride: Optional[int] stride: Annotated[Optional[int], positive_int()]
is_split_into_words: Optional[bool] is_split_into_words: Optional[bool]
pad_to_multiple_of: Optional[int] pad_to_multiple_of: Annotated[Optional[int], positive_int()]
return_token_type_ids: Optional[bool] return_token_type_ids: Optional[bool]
return_attention_mask: Optional[bool] return_attention_mask: Optional[bool]
return_overflowing_tokens: Optional[bool] return_overflowing_tokens: Optional[bool]
@ -156,9 +169,9 @@ class TextKwargs(TypedDict, total=False):
return_offsets_mapping: Optional[bool] return_offsets_mapping: Optional[bool]
return_length: Optional[bool] return_length: Optional[bool]
verbose: Optional[bool] verbose: Optional[bool]
padding_side: Optional[str] padding_side: Optional[Literal["left", "right"]]
return_mm_token_type_ids: Optional[bool] return_mm_token_type_ids: Optional[bool]
return_tensors: Optional[Union[str, TensorType]] return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()]
class ImagesKwargs(TypedDict, total=False): class ImagesKwargs(TypedDict, total=False):
@ -175,6 +188,8 @@ class ImagesKwargs(TypedDict, total=False):
Resize the shorter side of the input to `size["shortest_edge"]`. Resize the shorter side of the input to `size["shortest_edge"]`.
crop_size (`dict[str, int]`, *optional*): crop_size (`dict[str, int]`, *optional*):
Desired output size when applying center-cropping. Desired output size when applying center-cropping.
do_convert_rgb (`bool`):
Whether to convert the image to RGB format.
resample (`PILImageResampling`, *optional*): resample (`PILImageResampling`, *optional*):
Resampling filter to use if resizing the image. Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*): do_rescale (`bool`, *optional*):
@ -183,9 +198,9 @@ class ImagesKwargs(TypedDict, total=False):
Scale factor to use if rescaling the image. Scale factor to use if rescaling the image.
do_normalize (`bool`, *optional*): do_normalize (`bool`, *optional*):
Whether to normalize the image. Whether to normalize the image.
image_mean (`float` or `list[float]`, *optional*): image_mean (`float` or `list[float] or tuple[float, float, float]`, *optional*):
Mean to use if normalizing the image. Mean to use if normalizing the image.
image_std (`float` or `list[float]`, *optional*): image_std (`float` or `list[float] or tuple[float, float, float]`, *optional*):
Standard deviation to use if normalizing the image. Standard deviation to use if normalizing the image.
do_pad (`bool`, *optional*): do_pad (`bool`, *optional*):
Whether to pad the images in the batch. Whether to pad the images in the batch.
@ -199,28 +214,32 @@ class ImagesKwargs(TypedDict, total=False):
The channel dimension format for the input image. The channel dimension format for the input image.
device (`Union[str, torch.Tensor]`, *optional*): device (`Union[str, torch.Tensor]`, *optional*):
The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing. The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
disable_grouping (`bool`, *optional*): disable_grouping (`bool`, *optional*):
Whether to group images by shapes when processing or not, only relevant for fast image processing. Whether to group images by shapes when processing or not, only relevant for fast image processing.
""" """
do_convert_rgb: Optional[bool] do_convert_rgb: Optional[bool]
do_resize: Optional[bool] do_resize: Optional[bool]
size: Optional[dict[str, int]] size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()]
crop_size: Optional[dict[str, int]] crop_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()]
resample: Optional[Union["PILImageResampling", int]] resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()]
do_rescale: Optional[bool] do_rescale: Optional[bool]
rescale_factor: Optional[float] rescale_factor: Optional[float]
do_normalize: Optional[bool] do_normalize: Optional[bool]
image_mean: Optional[Union[float, list[float]]] image_mean: Optional[Union[float, list[float], tuple[float, ...]]]
image_std: Optional[Union[float, list[float]]] image_std: Optional[Union[float, list[float], tuple[float, ...]]]
do_pad: Optional[bool] do_pad: Optional[bool]
pad_size: Optional[dict[str, int]] pad_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()]
do_center_crop: Optional[bool] do_center_crop: Optional[bool]
data_format: Optional[ChannelDimension] data_format: Optional[Union[str, ChannelDimension]]
input_data_format: Optional[Union[str, ChannelDimension]] input_data_format: Optional[Union[str, ChannelDimension]]
device: Optional[Union[str, "torch.device"]] device: Annotated[Optional[str], device_validator()]
return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()]
disable_grouping: Optional[bool] disable_grouping: Optional[bool]
return_tensors: Optional[Union[str, TensorType]]
class VideosKwargs(TypedDict, total=False): class VideosKwargs(TypedDict, total=False):
@ -244,9 +263,9 @@ class VideosKwargs(TypedDict, total=False):
Scale factor to use if rescaling the video. Scale factor to use if rescaling the video.
do_normalize (`bool`, *optional*): do_normalize (`bool`, *optional*):
Whether to normalize the video. Whether to normalize the video.
image_mean (`float` or `list[float]`, *optional*): image_mean (`float` or `list[float] or tuple[float, float, float]`, *optional*):
Mean to use if normalizing the video. Mean to use if normalizing the video.
image_std (`float` or `list[float]`, *optional*): image_std (`float` or `list[float] or tuple[float, float, float]`, *optional*):
Standard deviation to use if normalizing the video. Standard deviation to use if normalizing the video.
do_center_crop (`bool`, *optional*): do_center_crop (`bool`, *optional*):
Whether to center crop the video. Whether to center crop the video.
@ -268,32 +287,36 @@ class VideosKwargs(TypedDict, total=False):
The channel dimension format for the input video. The channel dimension format for the input video.
device (`Union[str, torch.Tensor]`, *optional*): device (`Union[str, torch.Tensor]`, *optional*):
The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing. The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing.
return_metadata (`ChannelDimension` or `str`, *optional*): return_metadata (`bool`, *optional*):
Whether to return video metadata or not. Whether to return video metadata or not.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
""" """
do_convert_rgb: Optional[bool] do_convert_rgb: Optional[bool]
do_resize: Optional[bool] do_resize: Optional[bool]
size: Optional[dict[str, int]] size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()]
default_to_square: Optional[bool] default_to_square: Optional[bool]
resample: Optional["PILImageResampling"] resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()]
do_rescale: Optional[bool] do_rescale: Optional[bool]
rescale_factor: Optional[float] rescale_factor: Optional[float]
do_normalize: Optional[bool] do_normalize: Optional[bool]
image_mean: Optional[Union[float, list[float]]] image_mean: Optional[Union[float, list[float], tuple[float, ...]]]
image_std: Optional[Union[float, list[float]]] image_std: Optional[Union[float, list[float], tuple[float, ...]]]
do_center_crop: Optional[bool] do_center_crop: Optional[bool]
do_pad: Optional[bool] do_pad: Optional[bool]
crop_size: Optional[dict[str, int]] crop_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()]
data_format: Optional[ChannelDimension] data_format: Optional[Union[str, ChannelDimension]]
input_data_format: Optional[Union[str, ChannelDimension]] input_data_format: Optional[Union[str, ChannelDimension]]
device: Optional[Union[str, "torch.device"]] device: Annotated[Optional[str], device_validator()]
do_sample_frames: Optional[bool] do_sample_frames: Optional[bool]
video_metadata: Optional[Union[VideoMetadata, dict]] video_metadata: Annotated[Optional[VideoMetadataType], video_metadata_validator()]
fps: Optional[Union[int, float]] fps: Annotated[Optional[Union[int, float]], positive_any_number()]
num_frames: Optional[int] num_frames: Annotated[Optional[int], positive_int()]
return_metadata: Optional[bool] return_metadata: Optional[bool]
return_tensors: Optional[Union[str, TensorType]] return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()]
class AudioKwargs(TypedDict, total=False): class AudioKwargs(TypedDict, total=False):
@ -324,16 +347,20 @@ class AudioKwargs(TypedDict, total=False):
If set, will pad the sequence to a multiple of the provided value. If set, will pad the sequence to a multiple of the provided value.
return_attention_mask (`bool`, *optional*): return_attention_mask (`bool`, *optional*):
Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`. Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
""" """
sampling_rate: Optional[int] sampling_rate: Annotated[Optional[int], positive_int()]
raw_speech: Optional[Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]]] raw_speech: Optional[Union["np.ndarray", list[float], list["np.ndarray"], list[list[float]]]]
padding: Optional[Union[bool, str, PaddingStrategy]] padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()]
max_length: Optional[int] max_length: Annotated[Optional[int], positive_int()]
truncation: Optional[bool] truncation: Annotated[Optional[Union[bool, str, TruncationStrategy]], truncation_validator()]
pad_to_multiple_of: Optional[int] pad_to_multiple_of: Annotated[Optional[int], positive_int()]
return_attention_mask: Optional[bool] return_attention_mask: Optional[bool]
return_tensors: Optional[Union[str, TensorType]] return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()]
class ProcessingKwargs(TypedDict, total=False): class ProcessingKwargs(TypedDict, total=False):
@ -1361,6 +1388,18 @@ class ProcessorMixin(PushToHubMixin):
f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored."
) )
for key, typed_dict_obj in ModelProcessorKwargs.__annotations__.items():
if key in map_preprocessor_kwargs:
preprocessor = getattr(self, map_preprocessor_kwargs[key], None)
if preprocessor is None or getattr(preprocessor, "valid_kwargs", None) is None:
continue
preprocessor_typed_dict_obj = getattr(preprocessor, "valid_kwargs")
typed_dict_obj = TypedDict(
"merged_typed_dict",
{**preprocessor_typed_dict_obj.__annotations__, **typed_dict_obj.__annotations__},
total=False,
)
validate_typed_dict(typed_dict_obj, output_kwargs[key])
return output_kwargs return output_kwargs
@classmethod @classmethod
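
Note: a minimal sketch of the merge-then-validate pattern added above. The class names below are hypothetical; only the functional TypedDict(...) merge and the validate_typed_dict(typed_dict, kwargs) call are taken from this diff, and the assumption is simply that validation passes when every kwarg matches its annotated type.

# Hypothetical stand-ins for a preprocessor's `valid_kwargs` and a model's processor kwargs.
from typing import Optional, TypedDict

from huggingface_hub.dataclasses import validate_typed_dict


class BaseImagesKwargs(TypedDict, total=False):
    do_resize: Optional[bool]
    size: Optional[dict[str, int]]


class ModelImagesKwargs(TypedDict, total=False):
    crop_to_patches: Optional[bool]


# Fold the preprocessor annotations into the model-level ones before validating,
# so model-specific kwargs are checked together with the common ones.
MergedImagesKwargs = TypedDict(
    "MergedImagesKwargs",
    {**BaseImagesKwargs.__annotations__, **ModelImagesKwargs.__annotations__},
    total=False,
)

validate_typed_dict(MergedImagesKwargs, {"do_resize": True, "crop_to_patches": False})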

View File

@ -0,0 +1,115 @@
from collections.abc import Sequence
from typing import Optional, Union
from ..tokenization_utils_base import PaddingStrategy, TruncationStrategy
from ..video_utils import VideoMetadataType
from .generic import TensorType
from .import_utils import is_vision_available
if is_vision_available():
from ..image_utils import PILImageResampling
def positive_any_number(value: Optional[Union[int, float]] = None):
if value is not None and (not isinstance(value, (int, float)) or not value >= 0):
raise ValueError(f"Value must be a positive integer or float, got {value}")
def positive_int(value: Optional[int] = None):
if value is not None and (not isinstance(value, int) or not value >= 0):
raise ValueError(f"Value must be a positive integer, got {value}")
def padding_validator(value: Optional[Union[bool, str, PaddingStrategy]] = None):
possible_names = ["longest", "max_length", "do_not_pad"]
if value is None:
pass
elif not isinstance(value, (bool, str, PaddingStrategy)):
raise ValueError("Value for padding must be either a boolean, a string or a `PaddingStrategy`")
elif isinstance(value, str) and value not in possible_names:
raise ValueError(f"If padding is a string, the value must be one of {possible_names}")
def truncation_validator(value: Optional[Union[bool, str, TruncationStrategy]] = None):
possible_names = ["only_first", "only_second", "longest_first", "do_not_truncate"]
if value is None:
pass
elif not isinstance(value, (bool, str, TruncationStrategy)):
raise ValueError("Value for truncation must be either a boolean, a string or a `TruncationStrategy`")
elif isinstance(value, str) and value not in possible_names:
raise ValueError(f"If truncation is a string, value must be one of {possible_names}")
def image_size_validator(value: Optional[Union[int, Sequence[int], dict[str, int]]] = None):
possible_keys = ["height", "width", "longest_edge", "shortest_edge", "max_height", "max_width"]
if value is None:
pass
elif isinstance(value, dict) and any(k not in possible_keys for k in value.keys()):
raise ValueError(f"Value for size must be a dict with keys {possible_keys} but got size={value}")
def device_validator(value: Optional[Union[str, int]] = None):
possible_names = ["cpu", "cuda", "xla", "xpu", "mps", "meta"]
if value is None:
pass
elif isinstance(value, int) and value < 0:
raise ValueError(
f"If device is an integer, the value must be a non-negative integer but got device={value}"
)
elif isinstance(value, str) and value.split(":")[0] not in possible_names:
raise ValueError(f"If device is a string, the value must be one of {possible_names} but got device={value}")
elif not isinstance(value, (int, str)):
raise ValueError(
f"Device must be either an integer device ID or a string (e.g., 'cpu', 'cuda:0'), but got device={value}"
)
def resampling_validator(value: Optional[Union[int, "PILImageResampling"]] = None):
if value is None:
pass
elif isinstance(value, int) and value not in list(range(6)):
raise ValueError(
f"The resampling should be one of {list(range(6))} when provided as an integer, but got resampling={value}"
)
elif is_vision_available() and not isinstance(value, (PILImageResampling, int)):
raise ValueError(f"The resampling should be an integer or `PIL.Image.Resampling`, but got resampling={value}")
def video_metadata_validator(value: Optional[VideoMetadataType] = None):
if value is None:
return
valid_keys = ["total_num_frames", "fps", "width", "height", "duration", "video_backend", "frames_indices"]
def check_dict_keys(d: dict) -> bool:
return all(key in valid_keys for key in d.keys())
if isinstance(value, Sequence) and isinstance(value[0], Sequence) and isinstance(value[0][0], dict):
for sublist in value:
for item in sublist:
if not check_dict_keys(item):
raise ValueError(
f"Invalid keys found in video metadata. Valid keys: {valid_keys} got: {list(item.keys())}"
)
elif isinstance(value, Sequence) and isinstance(value[0], dict):
for item in value:
if not check_dict_keys(item):
raise ValueError(
f"Invalid keys found in video metadata. Valid keys: {valid_keys} got: {list(item.keys())}"
)
elif isinstance(value, dict):
if not check_dict_keys(value):
raise ValueError(
f"Invalid keys found in video metadata. Valid keys: {valid_keys}, got: {list(value.keys())}"
)
def tensor_type_validator(value: Optional[Union[str, TensorType]] = None):
possible_names = ["pt", "np", "mlx"]
if value is None:
pass
elif not isinstance(value, str) or value not in possible_names:
raise ValueError(f"The tensor type should be one of {possible_names} but got tensor_type={value}")
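
Note: a quick way to sanity-check the validators in this file is to call them directly. Each one takes a single value, raises ValueError on invalid input, and returns None otherwise (illustrative values below).

# Direct calls against the validators defined above.
from transformers.utils.type_validators import device_validator, positive_int, tensor_type_validator

positive_int(8)               # ok
device_validator("cuda:0")    # ok
tensor_type_validator("pt")   # ok

try:
    positive_int(-1)
except ValueError as err:
    print(err)  # Value must be a positive integer, got -1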

View File

@ -21,6 +21,7 @@ from functools import partial
from typing import Any, Callable, Optional, Union from typing import Any, Callable, Optional, Union
import numpy as np import numpy as np
from huggingface_hub.dataclasses import validate_typed_dict
from .dynamic_module_utils import custom_object_save from .dynamic_module_utils import custom_object_save
from .image_processing_utils import ( from .image_processing_utils import (
@ -358,6 +359,10 @@ class BaseVideoProcessor(BaseImageProcessorFast):
captured_kwargs=kwargs.keys(), captured_kwargs=kwargs.keys(),
valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"], valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"],
) )
# Perform type validation on received kwargs
validate_typed_dict(self.valid_kwargs, kwargs)
# Set default kwargs from self. This ensures that if a kwarg is not provided # Set default kwargs from self. This ensures that if a kwarg is not provided
# by the user, it gets its default value from the instance, or is set to None. # by the user, it gets its default value from the instance, or is set to None.
for kwarg_name in self.valid_kwargs.__annotations__: for kwarg_name in self.valid_kwargs.__annotations__:
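
Note: a usage-level illustration of the check added above (the checkpoint id is a placeholder and the exact exception depends on huggingface_hub's validation): a wrongly typed kwarg is now rejected before any frames are processed.

import numpy as np

from transformers import AutoVideoProcessor

processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
video = np.zeros((8, 224, 224, 3), dtype=np.uint8)  # 8 dummy RGB frames

processor(video, do_resize=True, return_tensors="pt")   # kwargs match their annotated types
processor(video, do_resize="yes", return_tensors="pt")  # wrong type, expected to raise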

View File

@ -112,6 +112,11 @@ class VideoMetadata(Mapping):
setattr(self, key, value) setattr(self, key, value)
VideoMetadataType = Union[
VideoMetadata, dict, list[Union[dict, VideoMetadata]], list[list[Union[dict, VideoMetadata]]]
]
def is_valid_video_frame(frame): def is_valid_video_frame(frame):
return isinstance(frame, PIL.Image.Image) or ( return isinstance(frame, PIL.Image.Image) or (
(is_numpy_array(frame) or is_torch_tensor(frame)) and frame.ndim == 3 (is_numpy_array(frame) or is_torch_tensor(frame)) and frame.ndim == 3
@ -217,7 +222,7 @@ def make_batched_videos(videos) -> list[Union[np.ndarray, "torch.Tensor", "URL",
return flat_videos_list return flat_videos_list
def make_batched_metadata(videos: VideoInput, video_metadata: Union[VideoMetadata, dict]): def make_batched_metadata(videos: VideoInput, video_metadata: VideoMetadataType) -> list[VideoMetadata]:
if video_metadata is None: if video_metadata is None:
# Create default metadata and fill attributes we can infer from given video # Create default metadata and fill attributes we can infer from given video
video_metadata = [ video_metadata = [
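
Note: for reference, the shapes admitted by the new VideoMetadataType alias (a sketch; the keys shown are the ones accepted by video_metadata_validator).

# One metadata mapping for a single video, one per video, or nested per batch of videos.
single_video = {"fps": 30, "total_num_frames": 120}
per_video = [{"fps": 30, "duration": 4.0}, {"fps": 24, "duration": 10.0}]
per_batch = [[{"fps": 30}], [{"fps": 24}, {"fps": 25}]]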

View File

@ -176,8 +176,8 @@ class Cohere2VisionProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_inputs[0], image_inputs[0],
return_tensors="pt", return_tensors="pt",
input_data_format="channels_last", input_data_format="channels_last",
image_mean=0, image_mean=(0.0, 0.0, 0.0, 0.0),
image_std=1, image_std=(1.0, 1.0, 1.0, 1.0),
).pixel_values ).pixel_values
self.assertEqual(tuple(encoded_images.shape), (10, 4, 30, 30)) self.assertEqual(tuple(encoded_images.shape), (10, 4, 30, 30))
@ -186,7 +186,7 @@ class Cohere2VisionProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_inputs, image_inputs,
return_tensors="pt", return_tensors="pt",
input_data_format="channels_last", input_data_format="channels_last",
image_mean=0, image_mean=(0.0, 0.0, 0.0, 0.0),
image_std=1, image_std=(1.0, 1.0, 1.0, 1.0),
).pixel_values ).pixel_values
self.assertEqual(tuple(encoded_images.shape), (70, 4, 30, 30)) self.assertEqual(tuple(encoded_images.shape), (70, 4, 30, 30))

View File

@ -133,7 +133,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def test_image_processor_defaults_preserved_by_image_kwargs(self): def test_image_processor_defaults_preserved_by_image_kwargs(self):
""" """
We use do_rescale=True, rescale_factor=-1 to ensure that image_processor kwargs are preserved in the processor. We use do_rescale=True, rescale_factor=-1.0 to ensure that image_processor kwargs are preserved in the processor.
We then check that the mean of the pixel_values is less than or equal to 0 after processing. We then check that the mean of the pixel_values is less than or equal to 0 after processing.
Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied. Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied.
""" """
@ -141,7 +141,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.skipTest(f"image_processor attribute not present in {self.processor_class}") self.skipTest(f"image_processor attribute not present in {self.processor_class}")
processor_components = self.prepare_components() processor_components = self.prepare_components()
processor_components["image_processor"] = self.get_component( processor_components["image_processor"] = self.get_component(
"image_processor", do_rescale=True, rescale_factor=-1 "image_processor", do_rescale=True, rescale_factor=-1.0
) )
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
@ -179,7 +179,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt") inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1.0, return_tensors="pt")
self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
def test_unstructured_kwargs(self): def test_unstructured_kwargs(self):
@ -194,7 +194,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
text=input_str, text=input_str,
return_tensors="pt", return_tensors="pt",
do_rescale=True, do_rescale=True,
rescale_factor=-1, rescale_factor=-1.0,
padding="max_length", padding="max_length",
max_length=76, max_length=76,
) )
@ -213,7 +213,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
images=image_input, images=image_input,
return_tensors="pt", return_tensors="pt",
do_rescale=True, do_rescale=True,
rescale_factor=-1, rescale_factor=-1.0,
padding="longest", padding="longest",
max_length=76, max_length=76,
) )
@ -231,7 +231,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
_ = processor( _ = processor(
images=image_input, images=image_input,
images_kwargs={"do_rescale": True, "rescale_factor": -1}, images_kwargs={"do_rescale": True, "rescale_factor": -1.0},
do_rescale=True, do_rescale=True,
return_tensors="pt", return_tensors="pt",
) )
@ -248,7 +248,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
# Define the kwargs for each modality # Define the kwargs for each modality
all_kwargs = { all_kwargs = {
"common_kwargs": {"return_tensors": "pt"}, "common_kwargs": {"return_tensors": "pt"},
"images_kwargs": {"do_rescale": True, "rescale_factor": -1}, "images_kwargs": {"do_rescale": True, "rescale_factor": -1.0},
"text_kwargs": {"padding": "max_length", "max_length": 76}, "text_kwargs": {"padding": "max_length", "max_length": 76},
} }
@ -268,7 +268,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
# Define the kwargs for each modality # Define the kwargs for each modality
all_kwargs = { all_kwargs = {
"common_kwargs": {"return_tensors": "pt"}, "common_kwargs": {"return_tensors": "pt"},
"images_kwargs": {"do_rescale": True, "rescale_factor": -1}, "images_kwargs": {"do_rescale": True, "rescale_factor": -1.0},
"text_kwargs": {"padding": "max_length", "max_length": 76}, "text_kwargs": {"padding": "max_length", "max_length": 76},
} }

View File

@ -132,7 +132,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def test_image_processor_defaults_preserved_by_image_kwargs(self):
"""
-We use do_rescale=True, rescale_factor=-1 to ensure that image_processor kwargs are preserved in the processor.
+We use do_rescale=True, rescale_factor=-1.0 to ensure that image_processor kwargs are preserved in the processor.
We then check that the mean of the pixel_values is less than or equal to 0 after processing.
Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied.
"""
@@ -140,7 +140,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
processor_components = self.prepare_components()
processor_components["image_processor"] = self.get_component(
-    "image_processor", do_rescale=True, rescale_factor=-1
+    "image_processor", do_rescale=True, rescale_factor=-1.0
)
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
@@ -178,7 +178,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
image_input = self.prepare_image_inputs()
-inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt")
+inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1.0, return_tensors="pt")
self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
def test_unstructured_kwargs(self):
@@ -193,7 +193,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
text=input_str,
return_tensors="pt",
do_rescale=True,
-rescale_factor=-1,
+rescale_factor=-1.0,
padding="max_length",
max_length=76,
)
@@ -212,7 +212,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
images=image_input,
return_tensors="pt",
do_rescale=True,
-rescale_factor=-1,
+rescale_factor=-1.0,
padding="longest",
max_length=76,
)
@@ -230,7 +230,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
with self.assertRaises(ValueError):
_ = processor(
images=image_input,
-images_kwargs={"do_rescale": True, "rescale_factor": -1},
+images_kwargs={"do_rescale": True, "rescale_factor": -1.0},
do_rescale=True,
return_tensors="pt",
)
@@ -247,7 +247,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
-"images_kwargs": {"do_rescale": True, "rescale_factor": -1},
+"images_kwargs": {"do_rescale": True, "rescale_factor": -1.0},
"text_kwargs": {"padding": "max_length", "max_length": 76},
}
@@ -267,7 +267,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
-"images_kwargs": {"do_rescale": True, "rescale_factor": -1},
+"images_kwargs": {"do_rescale": True, "rescale_factor": -1.0},
"text_kwargs": {"padding": "max_length", "max_length": 76},
}
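Note on the pattern above: the tests now pass rescale_factor as a float (-1.0) instead of an int (-1), which suggests the processing kwargs are type-checked more strictly against their declared annotations. The snippet below is only an illustrative sketch of that kind of check; RescaleKwargs and check_kwargs_types are hypothetical names invented for this example and are not the library's actual validation helper.

```python
# Minimal sketch (not the real implementation): reject kwargs whose runtime type
# does not exactly match the TypedDict annotation, so -1 (int) fails a float field.
from typing import TypedDict, get_type_hints


class RescaleKwargs(TypedDict, total=False):  # hypothetical, for illustration only
    do_rescale: bool
    rescale_factor: float


def check_kwargs_types(typed_dict_cls, kwargs):
    """Raise TypeError for any kwarg whose type does not match its annotation."""
    hints = get_type_hints(typed_dict_cls)
    for name, value in kwargs.items():
        expected = hints.get(name)
        if expected is not None and type(value) is not expected:
            raise TypeError(f"{name}={value!r} should be {expected.__name__}, got {type(value).__name__}")


check_kwargs_types(RescaleKwargs, {"do_rescale": True, "rescale_factor": -1.0})  # passes
try:
    check_kwargs_types(RescaleKwargs, {"do_rescale": True, "rescale_factor": -1})
except TypeError as err:
    print(err)  # rescale_factor=-1 should be float, got int
```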

View File

@@ -236,8 +236,8 @@ class ViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_inputs[0],
return_tensors="pt",
input_data_format="channels_last",
-image_mean=0,
-image_std=1,
+image_mean=(0.0, 0.0, 0.0, 0.0),
+image_std=(1.0, 1.0, 1.0, 1.0),
).pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
@@ -247,8 +247,8 @@ class ViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_inputs,
return_tensors="pt",
input_data_format="channels_last",
-image_mean=0,
-image_std=1,
+image_mean=(0.0, 0.0, 0.0, 0.0),
+image_std=(1.0, 1.0, 1.0, 1.0),
).pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)

View File

@@ -250,8 +250,8 @@ class Glm4vVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
video_inputs[0],
return_tensors="pt",
input_data_format="channels_last",
-image_mean=0,
-image_std=1,
+image_mean=(0.0, 0.0, 0.0, 0.0),
+image_std=(1.0, 1.0, 1.0, 1.0),
)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
@@ -261,8 +261,8 @@ class Glm4vVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
video_inputs,
return_tensors="pt",
input_data_format="channels_last",
-image_mean=0,
-image_std=1,
+image_mean=(0.0, 0.0, 0.0, 0.0),
+image_std=(1.0, 1.0, 1.0, 1.0),
)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
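In both the ViT and Glm4v tests above, image_mean and image_std are now explicit tuples of floats rather than bare integers. These kwargs typically feed a per-channel normalization of the form (x - mean) / std; the sketch below only illustrates how such tuples broadcast over a channels_last array. The four-entry tuples come from the diff above, while the array shapes and variable names are illustrative assumptions.

```python
# Illustrative numpy sketch: per-channel normalization with channels_last data,
# assuming the tuple length matches the number of channels in the test input.
import numpy as np

image = np.random.randint(0, 256, size=(32, 32, 4)).astype(np.float32)  # H x W x C
image_mean = np.array((0.0, 0.0, 0.0, 0.0), dtype=np.float32)
image_std = np.array((1.0, 1.0, 1.0, 1.0), dtype=np.float32)

# Broadcasting aligns the trailing channel axis, so each channel uses its own mean/std.
normalized = (image - image_mean) / image_std
assert normalized.shape == image.shape
```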

View File

@@ -444,7 +444,7 @@ class JanusProcessorTest(ProcessorTesterMixin, unittest.TestCase):
tokenize=True,
return_dict=True,
do_rescale=True,
-rescale_factor=-1,
+rescale_factor=-1.0,
return_tensors="np",
)
self.assertLessEqual(out_dict[self.images_input_name][0][0].mean(), 0)
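The assertLessEqual check above, like the docstring earlier in this diff, rests on simple arithmetic: the input pixel values lie in [0, 255], so rescaling them by a negative factor such as -1.0 can only produce values less than or equal to zero, and therefore a non-positive mean. A quick standalone check of that reasoning (variable names are illustrative):

```python
# Rescaling values in [0, 255] by -1.0 yields values <= 0, so the mean is <= 0.
import numpy as np

pixels = np.random.randint(0, 256, size=(3, 8, 8)).astype(np.float32)
rescaled = pixels * -1.0
assert rescaled.mean() <= 0
```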

View File

@@ -100,7 +100,7 @@ class Lfm2VlProcessorTest(ProcessorTesterMixin, unittest.TestCase):
"{{'<|im_start|>assistant\n' }}"
"{% endif %}"
)
-return {"chat_template": chat_template, "use_image_special_tokens": True}
+return {"chat_template": chat_template}
# Override as Lfm2VL needs images/video to be an explicitly nested batch
def prepare_image_inputs(self, batch_size=None):

View File

@@ -386,7 +386,7 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
images=image_input,
return_tensors="pt",
do_rescale=True,
-rescale_factor=-1,
+rescale_factor=-1.0,
padding="longest",
max_length=76,
)

Some files were not shown because too many files have changed in this diff.