Mirror of https://github.com/huggingface/transformers.git, synced 2025-10-20 17:13:56 +08:00

🚨 [unbloating] unify TypedDict usage in processing (#40931)

* just squash commits into one
* fix style

Committed by GitHub. Parent: 42bcc81ba2. Commit: 5339f72b9b.
@@ -292,7 +292,7 @@ The `@auto_docstring` decorator automatically generates docstrings by:
8. Unrolling kwargs typed with the unpack operator. For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the `TypedDict` and adds each parameter to the function's docstring.
Currently only supported for [`FastImageProcessorKwargs`].
Currently only supported for [`ImagesKwargs`].
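For reference, a minimal sketch of the pattern point 8 describes. The names `ResizeKwargs` and `preprocess` are illustrative only, not part of the library, and `Unpack` needs Python 3.11+ (older versions can import it from `typing_extensions`): a `TypedDict` documents its fields in its docstring, and a decorator such as `@auto_docstring` can read those docs off the `Unpack[...]` annotation.

```python
from typing import Optional, TypedDict, Unpack  # Unpack: Python 3.11+, else typing_extensions


class ResizeKwargs(TypedDict, total=False):
    """
    do_resize (`bool`, *optional*):
        Whether to resize the input image.
    size (`dict[str, int]`, *optional*):
        Target `{"height": ..., "width": ...}` after resizing.
    """

    do_resize: Optional[bool]
    size: Optional[dict[str, int]]


def preprocess(images, **kwargs: Unpack[ResizeKwargs]):
    # A decorator can inspect the `Unpack[ResizeKwargs]` annotation, pull each
    # documented field from the TypedDict docstring, and append it to this
    # function's own docstring. That is the unrolling described in point 8.
    return kwargs
```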
## Best practices
@@ -20,7 +20,8 @@ import numpy as np
from .image_processing_base import BatchFeature, ImageProcessingMixin
from .image_transforms import center_crop, normalize, rescale
from .image_utils import ChannelDimension, get_image_size
from .image_utils import ChannelDimension, ImageInput, get_image_size
from .processing_utils import ImagesKwargs, Unpack
from .utils import logging
from .utils.import_utils import requires

@@ -36,6 +37,8 @@ INIT_SERVICE_KWARGS = [
@requires(backends=("vision",))
class BaseImageProcessor(ImageProcessingMixin):
    valid_kwargs = ImagesKwargs

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

@@ -46,9 +49,9 @@ class BaseImageProcessor(ImageProcessingMixin):
        """
        return False

    def __call__(self, images, **kwargs) -> BatchFeature:
    def __call__(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
        """Preprocess an image or a batch of images."""
        return self.preprocess(images, **kwargs)
        return self.preprocess(images, *args, **kwargs)

    def preprocess(self, images, **kwargs) -> BatchFeature:
        raise NotImplementedError("Each image processor must implement its own preprocess method")
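As a rough illustration of the slow-path contract shown above (the class name is hypothetical, and it assumes `transformers` with the vision extra, i.e. Pillow, is installed): a subclass only needs to provide `preprocess`, while the `ImagesKwargs`-typed `__call__` is inherited.

```python
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature


class ToyImageProcessor(BaseImageProcessor):
    # Each image processor must implement its own `preprocess`; `__call__` is
    # inherited and simply forwards `images` plus the `ImagesKwargs`-typed kwargs.
    def preprocess(self, images, **kwargs) -> BatchFeature:
        return BatchFeature(data={"pixel_values": images}, tensor_type=kwargs.get("return_tensors"))


processor = ToyImageProcessor()
print(processor([[0, 1], [2, 3]])["pixel_values"])  # goes through __call__ -> preprocess
```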
@@ -15,7 +15,7 @@
from collections.abc import Iterable
from copy import deepcopy
from functools import lru_cache, partial
from typing import Any, Optional, TypedDict, Union
from typing import Any, Optional, Union

import numpy as np

@@ -40,7 +40,7 @@ from .image_utils import (
    validate_kwargs,
    validate_preprocess_arguments,
)
from .processing_utils import Unpack
from .processing_utils import ImagesKwargs, Unpack
from .utils import (
    TensorType,
    auto_docstring,

@@ -163,28 +163,6 @@ def divide_to_patches(
    return patches


class DefaultFastImageProcessorKwargs(TypedDict, total=False):
    do_resize: Optional[bool]
    size: Optional[dict[str, int]]
    default_to_square: Optional[bool]
    resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]]
    do_center_crop: Optional[bool]
    crop_size: Optional[dict[str, int]]
    do_rescale: Optional[bool]
    rescale_factor: Optional[Union[int, float]]
    do_normalize: Optional[bool]
    image_mean: Optional[Union[float, list[float]]]
    image_std: Optional[Union[float, list[float]]]
    do_pad: Optional[bool]
    pad_size: Optional[dict[str, int]]
    do_convert_rgb: Optional[bool]
    return_tensors: Optional[Union[str, TensorType]]
    data_format: Optional[ChannelDimension]
    input_data_format: Optional[Union[str, ChannelDimension]]
    device: Optional["torch.device"]
    disable_grouping: Optional[bool]


@auto_docstring
class BaseImageProcessorFast(BaseImageProcessor):
    resample = None

@@ -206,10 +184,10 @@ class BaseImageProcessorFast(BaseImageProcessor):
    input_data_format = None
    device = None
    model_input_names = ["pixel_values"]
    valid_kwargs = DefaultFastImageProcessorKwargs
    valid_kwargs = ImagesKwargs
    unused_kwargs = None

    def __init__(self, **kwargs: Unpack[DefaultFastImageProcessorKwargs]):
    def __init__(self, **kwargs: Unpack[ImagesKwargs]):
        super().__init__(**kwargs)
        kwargs = self.filter_out_unused_kwargs(kwargs)
        size = kwargs.pop("size", self.size)

@@ -728,11 +706,8 @@ class BaseImageProcessorFast(BaseImageProcessor):
            data_format=data_format,
        )

    def __call__(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature:
        return self.preprocess(images, *args, **kwargs)

    @auto_docstring
    def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature:
    def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
        # args are not validated, but their order in the `preprocess` and `_preprocess` signatures must be the same
        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names)
        # Set default kwargs from self. This ensures that if a kwarg is not provided

@@ -765,7 +740,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
        do_convert_rgb: bool,
        input_data_format: ChannelDimension,
        device: Optional[Union[str, "torch.device"]] = None,
        **kwargs: Unpack[DefaultFastImageProcessorKwargs],
        **kwargs: Unpack[ImagesKwargs],
    ) -> BatchFeature:
        """
        Preprocess image-like inputs.
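For context, a usage sketch of the fast path after this change (the checkpoint name is only an example, and downloading it from the Hub is assumed to work): the kwargs accepted at call time are exactly the fields of `ImagesKwargs`, and anything else is rejected by `validate_kwargs`.

```python
from PIL import Image

from transformers import AutoImageProcessor

# Any checkpoint with a fast image processor works; this one is just an example.
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224", use_fast=True)

image = Image.new("RGB", (640, 480))
# `do_resize`, `size` and `return_tensors` are all fields of `ImagesKwargs`.
batch = processor(image, do_resize=True, size={"height": 224, "width": 224}, return_tensors="pt")
print(batch["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])
```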
@@ -959,8 +959,6 @@ class AriaProcessor(ProcessorMixin):
        self,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
        images: Optional[ImageInput] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[AriaProcessorKwargs],
    ) -> BatchFeature:
        """

@@ -85,8 +85,6 @@ class AriaProcessor(ProcessorMixin):
        self,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
        images: Optional[ImageInput] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[AriaProcessorKwargs],
    ) -> BatchFeature:
        """
@@ -19,18 +19,11 @@ import numpy as np

from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput, make_flat_list_of_images
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput


class AyaVisionImagesKwargs(ImagesKwargs, total=False):
    crop_to_patches: Optional[bool]
    min_patches: Optional[int]
    max_patches: Optional[int]


class AyaVisionProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: AyaVisionImagesKwargs
    _defaults = {
        "text_kwargs": {
            "padding_side": "left",

@@ -140,8 +133,6 @@ class AyaVisionProcessor(ProcessorMixin):
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[AyaVisionProcessorKwargs],
    ) -> BatchFeature:
        """
@@ -33,6 +33,7 @@ from ...image_utils import (
    valid_images,
    validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import (
    TensorType,
    filter_out_non_signature_kwargs,

@@ -54,6 +55,17 @@ if is_torch_available():
logger = logging.get_logger(__name__)


class BeitImageProcessorKwargs(ImagesKwargs):
    r"""
    do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
        Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
        is used for background, and background itself is not included in all classes of a dataset (e.g.
        ADE20k). The background label will be replaced by 255.
    """

    do_reduce_labels: Optional[bool]


@requires(backends=("vision",))
class BeitImageProcessor(BaseImageProcessor):
    r"""

@@ -99,6 +111,7 @@ class BeitImageProcessor(BaseImageProcessor):
    """

    model_input_names = ["pixel_values"]
    valid_kwargs = BeitImageProcessorKwargs

    @filter_out_non_signature_kwargs(extra=INIT_SERVICE_KWARGS)
    def __init__(
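A short usage sketch of what the extra key buys (the constructor arguments shown are illustrative, not required): `do_reduce_labels` is declared on `BeitImageProcessorKwargs`, so it is validated and documented like any other image kwarg.

```python
import numpy as np

from transformers import BeitImageProcessor

processor = BeitImageProcessor(do_resize=True, size={"height": 224, "width": 224})
image = np.zeros((256, 256, 3), dtype=np.uint8)

# `do_reduce_labels` is a declared field of `BeitImageProcessorKwargs`.
out = processor(image, do_reduce_labels=False, return_tensors="np")
print(out["pixel_values"].shape)  # (1, 3, 224, 224)
```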
@@ -22,7 +22,6 @@ from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
    BaseImageProcessorFast,
    DefaultFastImageProcessorKwargs,
    group_images_by_shape,
    reorder_images,
)

@@ -40,17 +39,7 @@ from ...utils import (
    TensorType,
    auto_docstring,
)


class BeitFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    r"""
    do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
        Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
        is used for background, and background itself is not included in all classes of a dataset (e.g.
        ADE20k). The background label will be replaced by 255.
    """

    do_reduce_labels: Optional[bool]
from .image_processing_beit import BeitImageProcessorKwargs


@auto_docstring

@@ -66,9 +55,9 @@ class BeitImageProcessorFast(BaseImageProcessorFast):
    do_rescale = True
    do_normalize = True
    do_reduce_labels = False
    valid_kwargs = BeitFastImageProcessorKwargs
    valid_kwargs = BeitImageProcessorKwargs

    def __init__(self, **kwargs: Unpack[BeitFastImageProcessorKwargs]):
    def __init__(self, **kwargs: Unpack[BeitImageProcessorKwargs]):
        super().__init__(**kwargs)

    def reduce_label(self, labels: list["torch.Tensor"]):

@@ -86,7 +75,7 @@ class BeitImageProcessorFast(BaseImageProcessorFast):
        self,
        images: ImageInput,
        segmentation_maps: Optional[ImageInput] = None,
        **kwargs: Unpack[BeitFastImageProcessorKwargs],
        **kwargs: Unpack[BeitImageProcessorKwargs],
    ) -> BatchFeature:
        r"""
        segmentation_maps (`ImageInput`, *optional*):

@@ -101,7 +90,7 @@ class BeitImageProcessorFast(BaseImageProcessorFast):
        do_convert_rgb: bool,
        input_data_format: ChannelDimension,
        device: Optional[Union[str, "torch.device"]] = None,
        **kwargs: Unpack[BeitFastImageProcessorKwargs],
        **kwargs: Unpack[BeitImageProcessorKwargs],
    ) -> BatchFeature:
        """
        Preprocess image-like inputs.
@@ -36,7 +36,6 @@ class BlipProcessorKwargs(ProcessingKwargs, total=False):
            "return_length": False,
            "verbose": True,
        },
        "images_kwargs": {},
    }

@@ -67,8 +66,6 @@ class BlipProcessor(ProcessorMixin):
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[BlipProcessorKwargs],
    ) -> BatchEncoding:
        """
@@ -41,7 +41,6 @@ class Blip2ProcessorKwargs(ProcessingKwargs, total=False):
            "return_length": False,
            "verbose": True,
        },
        "images_kwargs": {},
    }

@@ -81,8 +80,6 @@ class Blip2Processor(ProcessorMixin):
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[Blip2ProcessorKwargs],
    ) -> BatchEncoding:
        """
@ -35,6 +35,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
|
||||
|
||||
@ -122,6 +123,10 @@ def get_resize_output_image_size(
|
||||
return new_height, new_width
|
||||
|
||||
|
||||
class BridgeTowerImageProcessorKwargs(ImagesKwargs):
|
||||
size_divisor: Optional[int]
|
||||
|
||||
|
||||
class BridgeTowerImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a BridgeTower image processor.
|
||||
@ -169,6 +174,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = BridgeTowerImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -23,7 +23,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
ImageInput,
|
||||
SizeDict,
|
||||
TensorType,
|
||||
@ -33,6 +32,7 @@ from ...image_processing_utils_fast import (
|
||||
)
|
||||
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling
|
||||
from ...utils import auto_docstring
|
||||
from .image_processing_bridgetower import BridgeTowerImageProcessorKwargs
|
||||
|
||||
|
||||
def make_pixel_mask(
|
||||
@ -85,17 +85,6 @@ def get_resize_output_image_size(
|
||||
return new_height, new_width
|
||||
|
||||
|
||||
class BridgeTowerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
Args:
|
||||
size_divisor (`int`, *optional*, defaults to 32):
|
||||
The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize`
|
||||
is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
size_divisor: Optional[int]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BICUBIC
|
||||
@ -110,14 +99,14 @@ class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
|
||||
do_normalize = True
|
||||
do_pad = True
|
||||
size_divisor = 32
|
||||
valid_kwargs = BridgeTowerFastImageProcessorKwargs
|
||||
valid_kwargs = BridgeTowerImageProcessorKwargs
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
|
||||
def __init__(self, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[BridgeTowerImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[BridgeTowerImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def resize(
|
||||
|
@ -16,17 +16,10 @@
|
||||
Processor class for BridgeTower.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin
|
||||
|
||||
|
||||
class BridgeTowerImagesKwargs(ImagesKwargs):
|
||||
size_divisor: Optional[int]
|
||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin
|
||||
|
||||
|
||||
class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: BridgeTowerImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"add_special_tokens": True,
|
||||
|
@ -92,8 +92,6 @@ class ChameleonProcessor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[ChameleonProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -27,18 +27,13 @@ import torch
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
|
||||
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict
|
||||
from ...processing_utils import Unpack
|
||||
from ...processing_utils import ImagesKwargs, Unpack
|
||||
from ...utils import TensorType, auto_docstring
|
||||
|
||||
|
||||
class Cohere2VisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
crop_to_patches (`bool`, *optional*, defaults to `False`):
|
||||
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
|
||||
|
@ -30,8 +30,10 @@ from transformers.models.aya_vision.modeling_aya_vision import (
|
||||
from transformers.models.got_ocr2.image_processing_got_ocr2_fast import GotOcr2ImageProcessorFast
|
||||
|
||||
from ...cache_utils import Cache
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_utils import ImageInput
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...processing_utils import Unpack
|
||||
from ...processing_utils import ImagesKwargs, Unpack
|
||||
from ...utils import TransformersKwargs, auto_docstring, logging
|
||||
from ...utils.generic import check_model_inputs
|
||||
from .configuration_cohere2_vision import Cohere2VisionConfig
|
||||
@ -301,6 +303,24 @@ def get_optimal_tiled_canvas(
|
||||
return best_grid
|
||||
|
||||
|
||||
class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
crop_to_patches (`bool`, *optional*, defaults to `False`):
|
||||
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
|
||||
`preprocess` method.
|
||||
min_patches (`int`, *optional*, defaults to 1):
|
||||
The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
|
||||
set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
|
||||
max_patches (`int`, *optional*, defaults to 12):
|
||||
The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
|
||||
set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
crop_to_patches: Optional[bool]
|
||||
min_patches: Optional[int]
|
||||
max_patches: Optional[int]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Cohere2VisionImageProcessorFast(GotOcr2ImageProcessorFast):
|
||||
size = {"height": 512, "width": 512}
|
||||
@ -308,6 +328,14 @@ class Cohere2VisionImageProcessorFast(GotOcr2ImageProcessorFast):
|
||||
max_patches = 12
|
||||
crop_to_patches = True
|
||||
patch_size = 16
|
||||
valid_kwargs = Cohere2VisionFastImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
|
||||
__all__ = [
|
||||
|
@ -19,16 +19,11 @@ import numpy as np
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
|
||||
|
||||
class Cohere2VisionImagesKwargs(ImagesKwargs, total=False):
|
||||
max_patches: Optional[int]
|
||||
|
||||
|
||||
class Cohere2VisionProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: Cohere2VisionImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding_side": "left",
|
||||
|
@ -90,8 +90,6 @@ class ColPaliProcessor(PaliGemmaProcessor):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[ColPaliProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -131,8 +131,6 @@ class ColPaliProcessor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[ColPaliProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -93,8 +93,6 @@ class ColQwen2Processor(ColPaliProcessor):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[ColQwen2ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -94,8 +94,6 @@ class ColQwen2Processor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[ColQwen2ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -53,6 +53,7 @@ from ...image_utils import (
|
||||
validate_kwargs,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
is_scipy_available,
|
||||
@ -774,6 +775,29 @@ def compute_segments(
|
||||
return segmentation, segments
|
||||
|
||||
|
||||
class ConditionalDetrImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return segmentation masks.
|
||||
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
|
||||
Annotations to transform according to the padding that is applied to the images.
|
||||
masks_path (`str` or `pathlib.Path`, *optional*):
|
||||
Path to the directory containing the segmentation masks.
|
||||
"""
|
||||
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
|
||||
masks_path: Optional[Union[str, pathlib.Path]]
|
||||
|
||||
|
||||
@requires(backends=("vision",))
|
||||
class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
@ -829,6 +853,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = ConditionalDetrImageProcessorKwargs
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
|
||||
def __init__(
|
||||
|
@ -15,7 +15,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature, get_size_dict
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
SizeDict,
|
||||
get_image_size_for_max_height_width,
|
||||
get_max_height_width,
|
||||
@ -37,6 +36,7 @@ from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring, logging
|
||||
from ...utils.import_utils import requires
|
||||
from .image_processing_conditional_detr import (
|
||||
ConditionalDetrImageProcessorKwargs,
|
||||
compute_segments,
|
||||
convert_segmentation_to_rle,
|
||||
get_size_with_aspect_ratio,
|
||||
@ -46,24 +46,6 @@ from .image_processing_conditional_detr import (
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class ConditionalDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return segmentation masks.
|
||||
"""
|
||||
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
|
||||
|
||||
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
||||
|
||||
|
||||
@ -278,9 +260,9 @@ class ConditionalDetrImageProcessorFast(BaseImageProcessorFast):
|
||||
size = {"shortest_edge": 800, "longest_edge": 1333}
|
||||
default_to_square = False
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = ConditionalDetrFastImageProcessorKwargs
|
||||
valid_kwargs = ConditionalDetrImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[ConditionalDetrFastImageProcessorKwargs]) -> None:
|
||||
def __init__(self, **kwargs: Unpack[ConditionalDetrImageProcessorKwargs]) -> None:
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
|
||||
@ -542,25 +524,8 @@ class ConditionalDetrImageProcessorFast(BaseImageProcessorFast):
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
|
||||
masks_path: Optional[Union[str, pathlib.Path]] = None,
|
||||
**kwargs: Unpack[ConditionalDetrFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[ConditionalDetrImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
r"""
|
||||
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
|
||||
List of annotations associated with the image or batch of images. If annotation is for object
|
||||
detection, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
|
||||
dictionary. An image can have no annotations, in which case the list should be empty.
|
||||
If annotation is for segmentation, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
|
||||
An image can have no segments, in which case the list should be empty.
|
||||
- "file_name" (`str`): The file name of the image.
|
||||
masks_path (`str` or `pathlib.Path`, *optional*):
|
||||
Path to the directory containing the segmentation masks.
|
||||
"""
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
logger.warning_once(
|
||||
@ -575,7 +540,7 @@ class ConditionalDetrImageProcessorFast(BaseImageProcessorFast):
|
||||
)
|
||||
kwargs["size"] = kwargs.pop("max_size")
|
||||
|
||||
return super().preprocess(images, annotations, masks_path, **kwargs)
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
|
@ -38,6 +38,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
from ...utils.import_utils import requires
|
||||
|
||||
@ -49,6 +50,16 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class ConvNextImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
crop_pct (`float`, *optional*):
|
||||
Percentage of the image to crop. Only has an effect if size < 384. Can be
|
||||
overridden by `crop_pct` in the`preprocess` method.
|
||||
"""
|
||||
|
||||
crop_pct: Optional[float]
|
||||
|
||||
|
||||
@requires(backends=("vision",))
|
||||
class ConvNextImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
@ -87,6 +98,7 @@ class ConvNextImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
valid_kwargs = ConvNextImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -22,7 +22,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -39,16 +38,7 @@ from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
)
|
||||
|
||||
|
||||
class ConvNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
crop_pct (`float`, *optional*):
|
||||
Percentage of the image to crop. Only has an effect if size < 384. Can be
|
||||
overridden by `crop_pct` in the`preprocess` method.
|
||||
"""
|
||||
|
||||
crop_pct: Optional[float]
|
||||
from .image_processing_convnext import ConvNextImageProcessorKwargs
|
||||
|
||||
|
||||
@auto_docstring
|
||||
@ -62,13 +52,13 @@ class ConvNextImageProcessorFast(BaseImageProcessorFast):
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
crop_pct = 224 / 256
|
||||
valid_kwargs = ConvNextFastImageProcessorKwargs
|
||||
valid_kwargs = ConvNextImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[ConvNextImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[ConvNextImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def resize(
|
||||
|
@ -246,9 +246,7 @@ class CsmProcessor(ProcessorMixin):
|
||||
|
||||
text_kwargs = output_kwargs["text_kwargs"]
|
||||
audio_kwargs = output_kwargs["audio_kwargs"]
|
||||
common_kwargs = output_kwargs["common_kwargs"]
|
||||
|
||||
return_tensors = common_kwargs.pop("return_tensors", None)
|
||||
return_tensors = text_kwargs.get("return_tensors", None)
|
||||
if return_tensors != "pt":
|
||||
raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")
|
||||
|
||||
|
@ -38,6 +38,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
|
||||
|
||||
@ -48,6 +49,16 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class DeepseekVLImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
|
||||
|
||||
class DeepseekVLImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a DEEPSEEK_VL image processor.
|
||||
@ -90,6 +101,8 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
|
||||
valid_kwargs = DeepseekVLImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
do_resize: bool = True,
|
||||
|
@ -24,25 +24,11 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
|
||||
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring
|
||||
|
||||
|
||||
class DeepseekVLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
from .image_processing_deepseek_vl import DeepseekVLImageProcessorKwargs
|
||||
|
||||
|
||||
@auto_docstring
|
||||
@ -56,9 +42,9 @@ class DeepseekVLImageProcessorFast(BaseImageProcessorFast):
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
do_pad = True
|
||||
valid_kwargs = DeepseekVLFastImageProcessorKwargs
|
||||
valid_kwargs = DeepseekVLImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DeepseekVLFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[DeepseekVLImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
if kwargs.get("image_mean") is None:
|
||||
background_color = (127, 127, 127)
|
||||
|
@ -39,6 +39,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
|
||||
|
||||
@ -49,6 +50,32 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
|
||||
Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
|
||||
method.
|
||||
high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
||||
Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
|
||||
overridden by the `high_res_resample` parameter in the `preprocess` method.
|
||||
high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
|
||||
Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
|
||||
channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
|
||||
high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
|
||||
Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
|
||||
number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
high_res_size: dict
|
||||
high_res_resample: "PILImageResampling"
|
||||
high_res_image_mean: list[float]
|
||||
high_res_image_std: list[float]
|
||||
|
||||
|
||||
class DeepseekVLHybridImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a DEEPSEEK_VL_HYBRID image processor.
|
||||
@ -102,6 +129,7 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "high_res_pixel_values"]
|
||||
valid_kwargs = DeepseekVLHybridImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -26,7 +26,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
get_size_dict,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
@ -41,32 +40,7 @@ from ...image_utils import (
|
||||
)
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring
|
||||
|
||||
|
||||
class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
|
||||
Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
|
||||
method.
|
||||
high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
||||
Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
|
||||
overridden by the `high_res_resample` parameter in the `preprocess` method.
|
||||
high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
|
||||
Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
|
||||
channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
|
||||
high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
|
||||
Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
|
||||
number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
high_res_size: dict
|
||||
high_res_resample: "PILImageResampling"
|
||||
high_res_image_mean: list[float]
|
||||
high_res_image_std: list[float]
|
||||
from .image_processing_deepseek_vl_hybrid import DeepseekVLHybridImageProcessorKwargs
|
||||
|
||||
|
||||
@auto_docstring
|
||||
@ -80,14 +54,14 @@ class DeepseekVLHybridImageProcessorFast(BaseImageProcessorFast):
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
do_pad = True
|
||||
valid_kwargs = DeepseekVLHybridFastImageProcessorKwargs
|
||||
valid_kwargs = DeepseekVLHybridImageProcessorKwargs
|
||||
high_res_image_mean = OPENAI_CLIP_MEAN
|
||||
high_res_image_std = OPENAI_CLIP_STD
|
||||
high_res_size = {"height": 1024, "width": 1024}
|
||||
high_res_resample = PILImageResampling.BICUBIC
|
||||
model_input_names = ["pixel_values", "high_res_pixel_values"]
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[DeepseekVLHybridImageProcessorKwargs]):
|
||||
if kwargs.get("image_mean") is None:
|
||||
background_color = (127, 127, 127)
|
||||
else:
|
||||
|
@ -22,7 +22,6 @@ from ...cache_utils import Cache
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
get_size_dict,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
@ -43,7 +42,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import Unpack
|
||||
from ...processing_utils import ImagesKwargs, Unpack
|
||||
from ...tokenization_utils_base import (
|
||||
PreTokenizedInput,
|
||||
TextInput,
|
||||
@ -430,6 +429,32 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneratio
|
||||
return model_inputs
|
||||
|
||||
|
||||
class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
|
||||
Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
|
||||
method.
|
||||
high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
||||
Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
|
||||
overridden by the `high_res_resample` parameter in the `preprocess` method.
|
||||
high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
|
||||
Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
|
||||
channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
|
||||
high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
|
||||
Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
|
||||
number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
high_res_size: dict
|
||||
high_res_resample: "PILImageResampling"
|
||||
high_res_image_mean: list[float]
|
||||
high_res_image_std: list[float]
|
||||
|
||||
|
||||
class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
|
||||
r"""
|
||||
Constructs a DEEPSEEK_VL_HYBRID image processor.
|
||||
@ -483,6 +508,7 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "high_res_pixel_values"]
|
||||
valid_kwargs = DeepseekVLHybridImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -727,32 +753,6 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
|
||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
|
||||
class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
|
||||
Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
|
||||
method.
|
||||
high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
||||
Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
|
||||
overridden by the `high_res_resample` parameter in the `preprocess` method.
|
||||
high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
|
||||
Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
|
||||
channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
|
||||
high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
|
||||
Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
|
||||
number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
high_res_size: dict
|
||||
high_res_resample: "PILImageResampling"
|
||||
high_res_image_mean: list[float]
|
||||
high_res_image_std: list[float]
|
||||
|
||||
|
||||
class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast):
|
||||
high_res_image_mean = OPENAI_CLIP_MEAN
|
||||
high_res_image_std = OPENAI_CLIP_STD
|
||||
@ -760,7 +760,7 @@ class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast):
|
||||
high_res_resample = PILImageResampling.BICUBIC
|
||||
model_input_names = ["pixel_values", "high_res_pixel_values"]
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[DeepseekVLHybridImageProcessorKwargs]):
|
||||
if kwargs.get("image_mean") is None:
|
||||
background_color = (127, 127, 127)
|
||||
else:
|
||||
|
@ -53,6 +53,7 @@ from ...image_utils import (
|
||||
validate_kwargs,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
is_scipy_available,
|
||||
@ -79,6 +80,30 @@ if is_scipy_available():
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
class DeformableDetrImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return segmentation masks.
|
||||
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
|
||||
Annotations to transform according to the padding that is applied to the images.
|
||||
masks_path (`str` or `pathlib.Path`, *optional*):
|
||||
Path to the directory containing the segmentation masks.
|
||||
"""
|
||||
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
|
||||
masks_path: Optional[Union[str, pathlib.Path]]
|
||||
|
||||
|
||||
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
||||
|
||||
|
||||
@ -827,6 +852,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = DeformableDetrImageProcessorKwargs
|
||||
|
||||
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
|
||||
def __init__(
|
||||
|
@ -14,7 +14,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature, get_size_dict
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
SizeDict,
|
||||
get_image_size_for_max_height_width,
|
||||
get_max_height_width,
|
||||
@ -35,29 +34,11 @@ from ...image_utils import (
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring, logging
|
||||
from ...utils.import_utils import requires
|
||||
from .image_processing_deformable_detr import get_size_with_aspect_ratio
|
||||
from .image_processing_deformable_detr import DeformableDetrImageProcessorKwargs, get_size_with_aspect_ratio
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class DeformableDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return segmentation masks.
|
||||
"""
|
||||
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
|
||||
|
||||
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
||||
|
||||
|
||||
@ -272,9 +253,9 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
||||
size = {"shortest_edge": 800, "longest_edge": 1333}
|
||||
default_to_square = False
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = DeformableDetrFastImageProcessorKwargs
|
||||
valid_kwargs = DeformableDetrImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs]) -> None:
|
||||
def __init__(self, **kwargs: Unpack[DeformableDetrImageProcessorKwargs]) -> None:
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
|
||||
@ -536,25 +517,8 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
|
||||
masks_path: Optional[Union[str, pathlib.Path]] = None,
|
||||
**kwargs: Unpack[DeformableDetrFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[DeformableDetrImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
r"""
|
||||
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
|
||||
List of annotations associated with the image or batch of images. If annotation is for object
|
||||
detection, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
|
||||
dictionary. An image can have no annotations, in which case the list should be empty.
|
||||
If annotation is for segmentation, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
|
||||
An image can have no segments, in which case the list should be empty.
|
||||
- "file_name" (`str`): The file name of the image.
|
||||
masks_path (`str` or `pathlib.Path`, *optional*):
|
||||
Path to the directory containing the segmentation masks.
|
||||
"""
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
logger.warning_once(
|
||||
@ -569,7 +533,7 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
|
||||
)
|
||||
kwargs["size"] = kwargs.pop("max_size")
|
||||
|
||||
return super().preprocess(images, annotations, masks_path, **kwargs)
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
|
@ -52,6 +52,7 @@ from ...image_utils import (
|
||||
validate_kwargs,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
is_scipy_available,
|
||||
@ -82,6 +83,29 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
||||
|
||||
|
||||
class DetrImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return segmentation masks.
|
||||
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
|
||||
Annotations to transform according to the padding that is applied to the images.
|
||||
masks_path (`str` or `pathlib.Path`, *optional*):
|
||||
Path to the directory containing the segmentation masks.
|
||||
"""
|
||||
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
|
||||
masks_path: Optional[Union[str, pathlib.Path]]
|
||||
|
||||
|
||||
# From the original repo: https://github.com/facebookresearch/detr/blob/3af9fa878e73b6894ce3596450a8d9b89d918ca9/datasets/transforms.py#L76
|
||||
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]:
|
||||
"""
|
||||
@ -811,6 +835,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = DetrImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -28,7 +28,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature, get_size_dict
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
SizeDict,
|
||||
get_image_size_for_max_height_width,
|
||||
get_max_height_width,
|
||||
@ -54,6 +53,7 @@ from ...utils import (
|
||||
)
|
||||
from ...utils.import_utils import requires
|
||||
from .image_processing_detr import (
|
||||
DetrImageProcessorKwargs,
|
||||
compute_segments,
|
||||
convert_segmentation_to_rle,
|
||||
get_size_with_aspect_ratio,
|
||||
@ -263,23 +263,6 @@ def prepare_coco_panoptic_annotation(
|
||||
return new_target
|
||||
|
||||
|
||||
class DetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return segmentation masks.
|
||||
"""
|
||||
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
@requires(backends=("torchvision", "torch"))
|
||||
class DetrImageProcessorFast(BaseImageProcessorFast):
|
||||
@ -294,9 +277,9 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
|
||||
size = {"shortest_edge": 800, "longest_edge": 1333}
|
||||
default_to_square = False
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = DetrFastImageProcessorKwargs
|
||||
valid_kwargs = DetrImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DetrFastImageProcessorKwargs]) -> None:
|
||||
def __init__(self, **kwargs: Unpack[DetrImageProcessorKwargs]) -> None:
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
|
||||
@ -558,25 +541,8 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
|
||||
masks_path: Optional[Union[str, pathlib.Path]] = None,
|
||||
**kwargs: Unpack[DetrFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[DetrImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
r"""
|
||||
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
|
||||
List of annotations associated with the image or batch of images. If annotation is for object
|
||||
detection, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
|
||||
dictionary. An image can have no annotations, in which case the list should be empty.
|
||||
If annotation is for segmentation, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
|
||||
An image can have no segments, in which case the list should be empty.
|
||||
- "file_name" (`str`): The file name of the image.
|
||||
masks_path (`str` or `pathlib.Path`, *optional*):
|
||||
Path to the directory containing the segmentation masks.
|
||||
"""
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
logger.warning_once(
|
||||
@ -591,7 +557,7 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
|
||||
)
|
||||
kwargs["size"] = kwargs.pop("max_size")
|
||||
|
||||
return super().preprocess(images, annotations, masks_path, **kwargs)
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
|
@ -111,9 +111,7 @@ class DiaProcessor(ProcessorMixin):
|
||||
|
||||
text_kwargs = output_kwargs["text_kwargs"]
|
||||
audio_kwargs = output_kwargs["audio_kwargs"]
|
||||
common_kwargs = output_kwargs["common_kwargs"]
|
||||
|
||||
return_tensors = common_kwargs.pop("return_tensors", None)
|
||||
return_tensors = text_kwargs.get("return_tensors", None)
|
||||
if return_tensors != "pt":
|
||||
raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")
|
||||
|
||||
|
@ -40,6 +40,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, logging
|
||||
from ...utils.import_utils import is_vision_available, requires
|
||||
|
||||
@ -51,6 +52,18 @@ if is_vision_available():
|
||||
import PIL
|
||||
|
||||
|
||||
class DonutImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`):
|
||||
Whether to resize the image using thumbnail method.
|
||||
do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`):
|
||||
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
|
||||
"""
|
||||
|
||||
do_thumbnail: Optional[bool]
|
||||
do_align_long_axis: Optional[bool]
|
||||
|
||||
|
||||
@requires(backends=("vision",))
|
||||
class DonutImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
@ -90,6 +103,7 @@ class DonutImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
valid_kwargs = DonutImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -19,7 +19,7 @@ from typing import Optional, Union
|
||||
import torch
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature
|
||||
from ...image_transforms import group_images_by_shape, reorder_images
|
||||
from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict
|
||||
from ...processing_utils import Unpack
|
||||
@ -28,24 +28,12 @@ from ...utils import (
|
||||
auto_docstring,
|
||||
logging,
|
||||
)
|
||||
from .image_processing_donut import DonutImageProcessorKwargs
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class DonutFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
Args:
|
||||
do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`):
|
||||
Whether to resize the image using thumbnail method.
|
||||
do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`):
|
||||
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
|
||||
"""
|
||||
|
||||
do_thumbnail: Optional[bool]
|
||||
do_align_long_axis: Optional[bool]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class DonutImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BILINEAR
|
||||
@ -58,9 +46,9 @@ class DonutImageProcessorFast(BaseImageProcessorFast):
|
||||
do_thumbnail = True
|
||||
do_align_long_axis = False
|
||||
do_pad = True
|
||||
valid_kwargs = DonutFastImageProcessorKwargs
|
||||
valid_kwargs = DonutImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DonutFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[DonutImageProcessorKwargs]):
|
||||
size = kwargs.pop("size", None)
|
||||
if isinstance(size, (tuple, list)):
|
||||
size = size[::-1]
|
||||
@ -68,7 +56,7 @@ class DonutImageProcessorFast(BaseImageProcessorFast):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[DonutFastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[DonutImageProcessorKwargs]) -> BatchFeature:
|
||||
if "size" in kwargs:
|
||||
size = kwargs.pop("size")
|
||||
if isinstance(size, (tuple, list)):
|
||||
|
@ -74,8 +74,6 @@ class DonutProcessor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[DonutProcessorKwargs],
|
||||
):
|
||||
"""
|
||||
|
@ -44,6 +44,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
filter_out_non_signature_kwargs,
|
||||
@ -63,6 +64,26 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class DPTImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
ensure_multiple_of (`int`, *optional*, defaults to 1):
|
||||
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
|
||||
by `ensure_multiple_of` in `preprocess`.
|
||||
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
|
||||
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
|
||||
be overridden by `keep_aspect_ratio` in `preprocess`.
|
||||
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
|
||||
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
|
||||
is used for background, and background itself is not included in all classes of a dataset (e.g.
|
||||
ADE20k). The background label will be replaced by 255.
|
||||
"""
|
||||
|
||||
ensure_multiple_of: Optional[int]
|
||||
size_divisor: Optional[int]
|
||||
keep_aspect_ratio: Optional[bool]
|
||||
do_reduce_labels: Optional[bool]
|
||||
|
||||
|
||||
def get_resize_output_image_size(
|
||||
input_image: np.ndarray,
|
||||
output_size: Union[int, Iterable[int]],
|
||||
@ -151,6 +172,7 @@ class DPTImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
valid_kwargs = DPTImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -28,7 +28,7 @@ import torch
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
|
||||
from ...image_processing_base import BatchFeature
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast
|
||||
from ...image_transforms import group_images_by_shape, reorder_images
|
||||
from ...image_utils import (
|
||||
IMAGENET_STANDARD_MEAN,
|
||||
@ -41,35 +41,13 @@ from ...image_utils import (
|
||||
)
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring, requires_backends
|
||||
from .image_processing_dpt import DPTImageProcessorKwargs
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ...modeling_outputs import DepthEstimatorOutput
|
||||
|
||||
|
||||
class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
ensure_multiple_of (`int`, *optional*, defaults to 1):
|
||||
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
|
||||
by `ensure_multiple_of` in `preprocess`.
|
||||
size_divisor (`int`, *optional*):
|
||||
If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
|
||||
DINOv2 paper, which uses the model in combination with DPT.
|
||||
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
|
||||
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
|
||||
be overridden by `keep_aspect_ratio` in `preprocess`.
|
||||
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
|
||||
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
|
||||
is used for background, and background itself is not included in all classes of a dataset (e.g.
|
||||
ADE20k). The background label will be replaced by 255.
|
||||
"""
|
||||
|
||||
ensure_multiple_of: Optional[int]
|
||||
size_divisor: Optional[int]
|
||||
keep_aspect_ratio: Optional[bool]
|
||||
do_reduce_labels: Optional[bool]
|
||||
|
||||
|
||||
def get_resize_output_image_size(
|
||||
input_image: "torch.Tensor",
|
||||
output_size: Union[int, Iterable[int]],
|
||||
@ -123,13 +101,13 @@ class DPTImageProcessorFast(BaseImageProcessorFast):
|
||||
do_normalize = True
|
||||
do_reduce_labels = None
|
||||
|
||||
valid_kwargs = DPTFastImageProcessorKwargs
|
||||
valid_kwargs = DPTImageProcessorKwargs
|
||||
do_pad = False
|
||||
rescale_factor = 1 / 255
|
||||
ensure_multiple_of = 1
|
||||
keep_aspect_ratio = False
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DPTFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[DPTImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def reduce_label(self, labels: list["torch.Tensor"]):
|
||||
@ -147,7 +125,7 @@ class DPTImageProcessorFast(BaseImageProcessorFast):
|
||||
self,
|
||||
images: ImageInput,
|
||||
segmentation_maps: Optional[ImageInput] = None,
|
||||
**kwargs: Unpack[DPTFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[DPTImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
r"""
|
||||
segmentation_maps (`ImageInput`, *optional*):
|
||||
@ -162,7 +140,7 @@ class DPTImageProcessorFast(BaseImageProcessorFast):
|
||||
do_convert_rgb: bool,
|
||||
input_data_format: ChannelDimension,
|
||||
device: Optional[Union[str, "torch.device"]] = None,
|
||||
**kwargs: Unpack[DPTFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[DPTImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Preprocess image-like inputs.
|
||||
|
@ -21,7 +21,7 @@ from typing import TYPE_CHECKING, Optional, Union
|
||||
import torch
|
||||
|
||||
from ...image_processing_base import BatchFeature
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast
|
||||
from ...image_transforms import group_images_by_shape, reorder_images
|
||||
from ...image_utils import (
|
||||
IMAGENET_STANDARD_MEAN,
|
||||
@ -35,6 +35,7 @@ from ...utils import (
|
||||
requires_backends,
|
||||
)
|
||||
from ..beit.image_processing_beit_fast import BeitImageProcessorFast
|
||||
from .image_processing_dpt import DPTImageProcessorKwargs
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -82,29 +83,6 @@ def get_resize_output_image_size(
|
||||
return SizeDict(height=new_height, width=new_width)
|
||||
|
||||
|
||||
class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
ensure_multiple_of (`int`, *optional*, defaults to 1):
|
||||
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
|
||||
by `ensure_multiple_of` in `preprocess`.
|
||||
size_divisor (`int`, *optional*):
|
||||
If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
|
||||
DINOv2 paper, which uses the model in combination with DPT.
|
||||
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
|
||||
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
|
||||
be overridden by `keep_aspect_ratio` in `preprocess`.
|
||||
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
|
||||
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
|
||||
is used for background, and background itself is not included in all classes of a dataset (e.g.
|
||||
ADE20k). The background label will be replaced by 255.
|
||||
"""
|
||||
|
||||
ensure_multiple_of: Optional[int]
|
||||
size_divisor: Optional[int]
|
||||
keep_aspect_ratio: Optional[bool]
|
||||
do_reduce_labels: Optional[bool]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class DPTImageProcessorFast(BeitImageProcessorFast):
|
||||
resample = PILImageResampling.BICUBIC
|
||||
@ -123,7 +101,7 @@ class DPTImageProcessorFast(BeitImageProcessorFast):
|
||||
do_center_crop = None
|
||||
do_reduce_labels = None
|
||||
|
||||
valid_kwargs = DPTFastImageProcessorKwargs
|
||||
valid_kwargs = DPTImageProcessorKwargs
|
||||
|
||||
def resize(
|
||||
self,
|
||||
|
@ -34,6 +34,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, logging, requires_backends
|
||||
|
||||
|
||||
@ -49,6 +50,15 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class EfficientLoFTRImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
do_grayscale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
|
||||
"""
|
||||
|
||||
do_grayscale: Optional[bool] = True
|
||||
|
||||
|
||||
# Copied from transformers.models.superpoint.image_processing_superpoint.is_grayscale
|
||||
def is_grayscale(
|
||||
image: np.ndarray,
|
||||
@ -155,6 +165,7 @@ class EfficientLoFTRImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
valid_kwargs = EfficientLoFTRImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -22,7 +22,6 @@ from PIL import Image, ImageDraw
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -40,6 +39,7 @@ from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
)
|
||||
from .image_processing_efficientloftr import EfficientLoFTRImageProcessorKwargs
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -108,15 +108,6 @@ def convert_to_grayscale(
|
||||
return F.rgb_to_grayscale(image, num_output_channels=3)
|
||||
|
||||
|
||||
class EfficientLoFTRFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
do_grayscale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
|
||||
"""
|
||||
|
||||
do_grayscale: Optional[bool] = True
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class EfficientLoFTRImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BILINEAR
|
||||
@ -126,13 +117,13 @@ class EfficientLoFTRImageProcessorFast(BaseImageProcessorFast):
|
||||
do_rescale = True
|
||||
rescale_factor = 1 / 255
|
||||
do_normalize = None
|
||||
valid_kwargs = EfficientLoFTRFastImageProcessorKwargs
|
||||
valid_kwargs = EfficientLoFTRImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[EfficientLoFTRFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[EfficientLoFTRImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientLoFTRFastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientLoFTRImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _prepare_images_structure(
|
||||
|
@ -33,6 +33,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
|
||||
|
||||
@ -43,6 +44,18 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class EfficientNetImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`):
|
||||
Whether to rescale the image between [-max_range/2, scale_range/2] instead of [0, scale_range].
|
||||
include_top (`bool`, *optional*, defaults to `self.include_top`):
|
||||
Normalize the image again with the standard deviation only for image classification if set to True.
|
||||
"""
|
||||
|
||||
rescale_offset: bool
|
||||
include_top: bool
|
||||
|
||||
|
||||
class EfficientNetImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a EfficientNet image processor.
|
||||
@ -83,6 +96,7 @@ class EfficientNetImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
valid_kwargs = EfficientNetImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -20,7 +20,7 @@ from typing import Optional, Union
|
||||
import torch
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature
|
||||
from ...image_transforms import group_images_by_shape, reorder_images
|
||||
from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict
|
||||
from ...processing_utils import Unpack
|
||||
@ -28,19 +28,7 @@ from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
)
|
||||
|
||||
|
||||
class EfficientNetFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
Args:
|
||||
rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`):
|
||||
Whether to rescale the image between [-max_range/2, scale_range/2] instead of [0, scale_range].
|
||||
include_top (`bool`, *optional*, defaults to `self.include_top`):
|
||||
Normalize the image again with the standard deviation only for image classification if set to True.
|
||||
"""
|
||||
|
||||
rescale_offset: bool
|
||||
include_top: bool
|
||||
from .image_processing_efficientnet import EfficientNetImageProcessorKwargs
|
||||
|
||||
|
||||
@auto_docstring
|
||||
@ -57,9 +45,9 @@ class EfficientNetImageProcessorFast(BaseImageProcessorFast):
|
||||
rescale_offset = False
|
||||
do_normalize = True
|
||||
include_top = True
|
||||
valid_kwargs = EfficientNetFastImageProcessorKwargs
|
||||
valid_kwargs = EfficientNetImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[EfficientNetFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[EfficientNetImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def rescale(
|
||||
@ -195,7 +183,7 @@ class EfficientNetImageProcessorFast(BaseImageProcessorFast):
|
||||
return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientNetFastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientNetImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
|
||||
|
@ -37,6 +37,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, is_vision_available, logging
|
||||
|
||||
|
||||
@ -46,6 +47,11 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Emu3ImageProcessorKwargs(ImagesKwargs):
|
||||
ratio: Optional[str]
|
||||
image_area: Optional[int]
|
||||
|
||||
|
||||
def smart_resize(
|
||||
height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280
|
||||
):
|
||||
@ -108,6 +114,7 @@ class Emu3ImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "image_sizes"]
|
||||
valid_kwargs = Emu3ImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -20,7 +20,7 @@ import numpy as np
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
|
||||
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from ...utils import is_vision_available
|
||||
|
||||
@ -33,14 +33,8 @@ class Emu3TextKwargs(TextKwargs, total=False):
|
||||
return_for_image_generation: bool
|
||||
|
||||
|
||||
class Emu3ImagesKwargs(ImagesKwargs, total=False):
|
||||
ratio: str
|
||||
image_area: int
|
||||
|
||||
|
||||
class Emu3ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
text_kwargs: Emu3TextKwargs
|
||||
images_kwargs: Emu3ImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"return_for_image_generation": False,
|
||||
@ -95,8 +89,6 @@ class Emu3Processor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[Emu3ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -36,6 +36,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import (
|
||||
IMAGENET_DEFAULT_MEAN,
|
||||
IMAGENET_DEFAULT_STD,
|
||||
@ -53,6 +54,21 @@ if is_torch_available():
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class EomtImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
do_split_image (`bool`, *optional*, defaults to `False`):
|
||||
Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the
|
||||
input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches.
|
||||
Otherwise, the input images will be padded to the target size.
|
||||
ignore_index (`int`, *optional*):
|
||||
Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
|
||||
denoted with 0 (background) will be replaced with `ignore_index`.
|
||||
"""
|
||||
|
||||
do_split_image: bool
|
||||
ignore_index: Optional[int] = None
|
||||
|
||||
|
||||
# Adapted from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks
|
||||
def convert_segmentation_map_to_binary_masks(
|
||||
segmentation_map: np.ndarray,
|
||||
|
@ -24,7 +24,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -43,6 +42,7 @@ from ...utils import (
|
||||
filter_out_non_signature_kwargs,
|
||||
)
|
||||
from .image_processing_eomt import (
|
||||
EomtImageProcessorKwargs,
|
||||
compute_segments,
|
||||
convert_segmentation_map_to_binary_masks,
|
||||
get_size_with_aspect_ratio,
|
||||
@ -50,25 +50,6 @@ from .image_processing_eomt import (
|
||||
)
|
||||
|
||||
|
||||
class EomtImageProcessorFastKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
do_split_image (`bool`, *optional*, defaults to `False`):
|
||||
Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the
|
||||
input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches.
|
||||
Otherwise, the input images will be padded to the target size.
|
||||
do_pad (`bool`, *optional*, defaults to `False`):
|
||||
Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
|
||||
number of patches in the batch. Padding will be applied to the bottom and right with zeros.
|
||||
ignore_index (`int`, *optional*):
|
||||
Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
|
||||
denoted with 0 (background) will be replaced with `ignore_index`.
|
||||
"""
|
||||
|
||||
do_split_image: bool
|
||||
do_pad: bool
|
||||
ignore_index: Optional[int] = None
|
||||
|
||||
|
||||
def get_target_size(size_dict: dict[str, int]) -> tuple[int, int]:
|
||||
"""Returns the height and width from a size dict."""
|
||||
target_height = size_dict["shortest_edge"]
|
||||
@ -102,9 +83,9 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
|
||||
do_split_image = False
|
||||
do_pad = False
|
||||
ignore_index = None
|
||||
valid_kwargs = EomtImageProcessorFastKwargs
|
||||
valid_kwargs = EomtImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[EomtImageProcessorFastKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[EomtImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def _split_image(self, images: torch.Tensor, size: dict, image_indices: int) -> tuple[list, list]:
|
||||
@ -153,7 +134,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
|
||||
images: ImageInput,
|
||||
segmentation_maps: Optional[list[torch.Tensor]] = None,
|
||||
instance_id_to_semantic_id: Optional[dict[int, int]] = None,
|
||||
**kwargs: Unpack[EomtImageProcessorFastKwargs],
|
||||
**kwargs: Unpack[EomtImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
r"""
|
||||
segmentation_maps (`ImageInput`, *optional*):
|
||||
@ -171,7 +152,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
|
||||
do_convert_rgb: bool,
|
||||
input_data_format: ChannelDimension,
|
||||
device: Optional[Union[str, "torch.device"]] = None,
|
||||
**kwargs: Unpack[EomtImageProcessorFastKwargs],
|
||||
**kwargs: Unpack[EomtImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Preprocess image-like inputs.
|
||||
|
@ -37,6 +37,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
from ...utils.import_utils import requires
|
||||
|
||||
@ -56,6 +57,89 @@ FLAVA_CODEBOOK_STD = [1.0, 1.0, 1.0]
|
||||
LOGIT_LAPLACE_EPS: float = 0.1
|
||||
|
||||
|
||||
class FlavaImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
return_image_mask (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
|
||||
input_size_patches (`int`, *optional*, defaults to 14):
|
||||
Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden
|
||||
by the `input_size_patches` parameter in `preprocess`.
|
||||
total_mask_patches (`int`, *optional*, defaults to 75):
|
||||
Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in
|
||||
`preprocess`.
|
||||
mask_group_min_patches (`int`, *optional*, defaults to 16):
|
||||
Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches`
|
||||
parameter in `preprocess`.
|
||||
mask_group_max_patches (`int`, *optional*):
|
||||
Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches`
|
||||
parameter in `preprocess`.
|
||||
mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3):
|
||||
Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter
|
||||
in `preprocess`.
|
||||
mask_group_max_aspect_ratio (`float`, *optional*):
|
||||
Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter
|
||||
in `preprocess`.
|
||||
return_codebook_pixels (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return the codebook pixel values.
|
||||
codebook_do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to resize the input for codebook to a certain. Can be overridden by the `codebook_do_resize`
|
||||
parameter in `preprocess`. `codebook_size`.
|
||||
codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
|
||||
Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
|
||||
`preprocess`.
|
||||
codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
|
||||
Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
|
||||
parameter in `preprocess`.
|
||||
codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
|
||||
Whether to crop the input for codebook at the center. If the input size is smaller than
|
||||
`codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
|
||||
overridden by the `codebook_do_center_crop` parameter in `preprocess`.
|
||||
codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
|
||||
Desired output size for codebook input when applying center-cropping. Can be overridden by the
|
||||
`codebook_crop_size` parameter in `preprocess`.
|
||||
codebook_do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
|
||||
overridden by the `codebook_do_rescale` parameter in `preprocess`.
|
||||
codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
|
||||
`codebook_rescale_factor` parameter in `preprocess`.
|
||||
codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
|
||||
Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
|
||||
`codebook_do_map_pixels` parameter in `preprocess`.
|
||||
codebook_do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
|
||||
be overridden by the `codebook_do_normalize` parameter in `preprocess`.
|
||||
codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
|
||||
The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
|
||||
by the `codebook_image_mean` parameter in `preprocess`.
|
||||
codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
|
||||
The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
|
||||
be overridden by the `codebook_image_std` parameter in `preprocess`.
|
||||
"""
|
||||
|
||||
# Mask related params
|
||||
return_image_mask: Optional[bool]
|
||||
input_size_patches: Optional[int]
|
||||
total_mask_patches: Optional[int]
|
||||
mask_group_min_patches: Optional[int]
|
||||
mask_group_max_patches: Optional[int]
|
||||
mask_group_min_aspect_ratio: Optional[float]
|
||||
mask_group_max_aspect_ratio: Optional[float]
|
||||
# Codebook related params
|
||||
return_codebook_pixels: Optional[bool]
|
||||
codebook_do_resize: Optional[bool]
|
||||
codebook_size: Optional[bool]
|
||||
codebook_resample: Optional[int]
|
||||
codebook_do_center_crop: Optional[bool]
|
||||
codebook_crop_size: Optional[int]
|
||||
codebook_do_rescale: Optional[bool]
|
||||
codebook_rescale_factor: Optional[Union[int, float]]
|
||||
codebook_do_map_pixels: Optional[bool]
|
||||
codebook_do_normalize: Optional[bool]
|
||||
codebook_image_mean: Optional[Union[float, Iterable[float]]]
|
||||
codebook_image_std: Optional[Union[float, Iterable[float]]]
|
||||
|
||||
|
||||
# Inspired from https://github.com/microsoft/unilm/blob/master/beit/masking_generator.py
|
||||
class FlavaMaskingGenerator:
|
||||
def __init__(
|
||||
@ -225,6 +309,7 @@ class FlavaImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
valid_kwargs = FlavaImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -16,7 +16,6 @@
|
||||
|
||||
import math
|
||||
import random
|
||||
from collections.abc import Iterable
|
||||
from functools import lru_cache
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
@ -26,7 +25,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
get_size_dict,
|
||||
)
|
||||
from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images
|
||||
@ -42,6 +40,7 @@ from .image_processing_flava import (
|
||||
FLAVA_IMAGE_MEAN,
|
||||
FLAVA_IMAGE_STD,
|
||||
LOGIT_LAPLACE_EPS,
|
||||
FlavaImageProcessorKwargs,
|
||||
)
|
||||
|
||||
|
||||
@ -121,90 +120,6 @@ class FlavaMaskingGenerator:
|
||||
return mask
|
||||
|
||||
|
||||
class FlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
Args:
|
||||
return_image_mask (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
|
||||
input_size_patches (`int`, *optional*, defaults to 14):
|
||||
Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden
|
||||
by the `input_size_patches` parameter in `preprocess`.
|
||||
total_mask_patches (`int`, *optional*, defaults to 75):
|
||||
Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in
|
||||
`preprocess`.
|
||||
mask_group_min_patches (`int`, *optional*, defaults to 16):
|
||||
Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches`
|
||||
parameter in `preprocess`.
|
||||
mask_group_max_patches (`int`, *optional*):
|
||||
Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches`
|
||||
parameter in `preprocess`.
|
||||
mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3):
|
||||
Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter
|
||||
in `preprocess`.
|
||||
mask_group_max_aspect_ratio (`float`, *optional*):
|
||||
Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter
|
||||
in `preprocess`.
|
||||
return_codebook_pixels (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return the codebook pixel values.
|
||||
codebook_do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to resize the input for codebook to a certain. Can be overridden by the `codebook_do_resize`
|
||||
parameter in `preprocess`. `codebook_size`.
|
||||
codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
|
||||
Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
|
||||
`preprocess`.
|
||||
codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
|
||||
Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
|
||||
parameter in `preprocess`.
|
||||
codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
|
||||
Whether to crop the input for codebook at the center. If the input size is smaller than
|
||||
`codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
|
||||
overridden by the `codebook_do_center_crop` parameter in `preprocess`.
|
||||
codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
|
||||
Desired output size for codebook input when applying center-cropping. Can be overridden by the
|
||||
`codebook_crop_size` parameter in `preprocess`.
|
||||
codebook_do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
|
||||
overridden by the `codebook_do_rescale` parameter in `preprocess`.
|
||||
codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
|
||||
`codebook_rescale_factor` parameter in `preprocess`.
|
||||
codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
|
||||
Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
|
||||
`codebook_do_map_pixels` parameter in `preprocess`.
|
||||
codebook_do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
|
||||
be overridden by the `codebook_do_normalize` parameter in `preprocess`.
|
||||
codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
|
||||
The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
|
||||
by the `codebook_image_mean` parameter in `preprocess`.
|
||||
codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
|
||||
The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
|
||||
be overridden by the `codebook_image_std` parameter in `preprocess`.
|
||||
"""
|
||||
|
||||
# Mask related params
|
||||
return_image_mask: Optional[bool]
|
||||
input_size_patches: Optional[int]
|
||||
total_mask_patches: Optional[int]
|
||||
mask_group_min_patches: Optional[int]
|
||||
mask_group_max_patches: Optional[int]
|
||||
mask_group_min_aspect_ratio: Optional[float]
|
||||
mask_group_max_aspect_ratio: Optional[float]
|
||||
# Codebook related params
|
||||
return_codebook_pixels: Optional[bool]
|
||||
codebook_do_resize: Optional[bool]
|
||||
codebook_size: Optional[bool]
|
||||
codebook_resample: Optional[int]
|
||||
codebook_do_center_crop: Optional[bool]
|
||||
codebook_crop_size: Optional[int]
|
||||
codebook_do_rescale: Optional[bool]
|
||||
codebook_rescale_factor: Optional[Union[int, float]]
|
||||
codebook_do_map_pixels: Optional[bool]
|
||||
codebook_do_normalize: Optional[bool]
|
||||
codebook_image_mean: Optional[Union[float, Iterable[float]]]
|
||||
codebook_image_std: Optional[Union[float, Iterable[float]]]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class FlavaImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BICUBIC
|
||||
@ -239,13 +154,13 @@ class FlavaImageProcessorFast(BaseImageProcessorFast):
|
||||
codebook_do_normalize = True
|
||||
codebook_image_mean = FLAVA_CODEBOOK_MEAN
|
||||
codebook_image_std = FLAVA_CODEBOOK_STD
|
||||
valid_kwargs = FlavaFastImageProcessorKwargs
|
||||
valid_kwargs = FlavaImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[FlavaFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[FlavaImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[FlavaImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
@classmethod
|
||||
|
@ -17,39 +17,8 @@ Image/Text processor class for FLAVA
|
||||
"""
|
||||
|
||||
import warnings
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin
|
||||
|
||||
|
||||
class FlavaImagesKwargs(ImagesKwargs):
|
||||
# Mask related params
|
||||
return_image_mask: Optional[bool]
|
||||
input_size_patches: Optional[int]
|
||||
total_mask_patches: Optional[int]
|
||||
mask_group_min_patches: Optional[int]
|
||||
mask_group_max_patches: Optional[int]
|
||||
mask_group_min_aspect_ratio: Optional[float]
|
||||
mask_group_max_aspect_ratio: Optional[float]
|
||||
# Codebook related params
|
||||
return_codebook_pixels: Optional[bool]
|
||||
codebook_do_resize: Optional[bool]
|
||||
codebook_size: Optional[bool]
|
||||
codebook_resample: Optional[int]
|
||||
codebook_do_center_crop: Optional[bool]
|
||||
codebook_crop_size: Optional[int]
|
||||
codebook_do_rescale: Optional[bool]
|
||||
codebook_rescale_factor: Optional[Union[int, float]]
|
||||
codebook_do_map_pixels: Optional[bool]
|
||||
codebook_do_normalize: Optional[bool]
|
||||
codebook_image_mean: Optional[Union[float, Iterable[float]]]
|
||||
codebook_image_std: Optional[Union[float, Iterable[float]]]
|
||||
|
||||
|
||||
class FlavaProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: FlavaImagesKwargs
|
||||
_defaults = {}
|
||||
from ...processing_utils import ProcessorMixin
|
||||
|
||||
|
||||
class FlavaProcessor(ProcessorMixin):
|
||||
@ -67,7 +36,6 @@ class FlavaProcessor(ProcessorMixin):
|
||||
attributes = ["image_processor", "tokenizer"]
|
||||
image_processor_class = "FlavaImageProcessor"
|
||||
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
|
||||
valid_processor_kwargs = FlavaProcessorKwargs
|
||||
|
||||
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
||||
feature_extractor = None
|
||||
|
@ -39,7 +39,6 @@ logger = logging.get_logger(__name__)
|
||||
class Florence2ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
_defaults = {
|
||||
"text_kwargs": {"padding": False, "return_mm_token_type_ids": False},
|
||||
"images_kwargs": {},
|
||||
}
|
||||
|
||||
|
||||
|
@ -71,7 +71,6 @@ class FuyuProcessorKwargs(ProcessingKwargs, total=False):
|
||||
"verbose": True,
|
||||
"return_mm_token_type_ids": False,
|
||||
},
|
||||
"images_kwargs": {},
|
||||
}
|
||||
|
||||
|
||||
@ -487,8 +486,6 @@ class FuyuProcessor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[FuyuProcessorKwargs],
|
||||
) -> "FuyuBatchFeature":
|
||||
"""
|
||||
|
@ -40,6 +40,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
|
||||
|
||||
@ -50,6 +51,24 @@ if is_vision_available():
|
||||
import PIL
|
||||
|
||||
|
||||
class Gemma3ImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
do_pan_and_scan (`bool`, *optional*):
|
||||
Whether to apply `pan_and_scan` to images.
|
||||
pan_and_scan_min_crop_size (`int`, *optional*):
|
||||
Minimum size of each crop in pan and scan.
|
||||
pan_and_scan_max_num_crops (`int`, *optional*):
|
||||
Maximum number of crops per image in pan and scan.
|
||||
pan_and_scan_min_ratio_to_activate (`float`, *optional*):
|
||||
Minimum aspect ratio to activate pan and scan.
|
||||
"""
|
||||
|
||||
do_pan_and_scan: Optional[bool]
|
||||
pan_and_scan_min_crop_size: Optional[int]
|
||||
pan_and_scan_max_num_crops: Optional[int]
|
||||
pan_and_scan_min_ratio_to_activate: Optional[float]
|
||||
|
||||
|
||||
class Gemma3ImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a SigLIP image processor.
|
||||
@ -91,6 +110,7 @@ class Gemma3ImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "num_crops"]
|
||||
valid_kwargs = Gemma3ImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -24,7 +24,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -35,29 +34,12 @@ from ...utils import (
|
||||
auto_docstring,
|
||||
logging,
|
||||
)
|
||||
from .image_processing_gemma3 import Gemma3ImageProcessorKwargs
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Gemma3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
do_pan_and_scan (`bool`, *optional*):
|
||||
Whether to apply `pan_and_scan` to images.
|
||||
pan_and_scan_min_crop_size (`int`, *optional*):
|
||||
Minimum size of each crop in pan and scan.
|
||||
pan_and_scan_max_num_crops (`int`, *optional*):
|
||||
Maximum number of crops per image in pan and scan.
|
||||
pan_and_scan_min_ratio_to_activate (`float`, *optional*):
|
||||
Minimum aspect ratio to activate pan and scan.
|
||||
"""
|
||||
|
||||
do_pan_and_scan: Optional[bool]
|
||||
pan_and_scan_min_crop_size: Optional[int]
|
||||
pan_and_scan_max_num_crops: Optional[int]
|
||||
pan_and_scan_min_ratio_to_activate: Optional[float]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Gemma3ImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BILINEAR
|
||||
@ -73,9 +55,9 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
|
||||
pan_and_scan_min_crop_size = None
|
||||
pan_and_scan_max_num_crops = None
|
||||
pan_and_scan_min_ratio_to_activate = None
|
||||
valid_kwargs = Gemma3FastImageProcessorKwargs
|
||||
valid_kwargs = Gemma3ImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[Gemma3FastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[Gemma3ImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def pan_and_scan_batched(
|
||||
@ -167,7 +149,7 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
**kwargs: Unpack[Gemma3FastImageProcessorKwargs],
|
||||
**kwargs: Unpack[Gemma3ImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
|
@ -20,21 +20,12 @@ import numpy as np
|
||||
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...image_utils import ImageInput, make_nested_list_of_images
|
||||
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from ...utils import to_py_obj
|
||||
|
||||
|
||||
class Gemma3ImagesKwargs(ImagesKwargs):
|
||||
do_pan_and_scan: Optional[bool]
|
||||
pan_and_scan_min_crop_size: Optional[int]
|
||||
pan_and_scan_max_num_crops: Optional[int]
|
||||
pan_and_scan_min_ratio_to_activate: Optional[float]
|
||||
do_convert_rgb: Optional[bool]
|
||||
|
||||
|
||||
class Gemma3ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: Gemma3ImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": False,
|
||||
@ -81,8 +72,6 @@ class Gemma3Processor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
videos=None,
|
||||
audio=None,
|
||||
**kwargs: Unpack[Gemma3ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
if text is None and images is None:
|
||||
|
@ -19,21 +19,13 @@ import numpy as np
|
||||
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...image_utils import ImageInput, make_nested_list_of_images
|
||||
from ...processing_utils import AudioKwargs, ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
|
||||
|
||||
class Gemma3nImagesKwargs(ImagesKwargs):
|
||||
do_convert_rgb: Optional[bool]
|
||||
|
||||
|
||||
class Gemma3nProcessorKwargs(ProcessingKwargs, total=False):
|
||||
audio_kwargs: AudioKwargs
|
||||
images_kwargs: Gemma3nImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": False,
|
||||
},
|
||||
"text_kwargs": {"padding": False},
|
||||
}
|
||||
|
||||
|
||||
@ -101,7 +93,6 @@ class Gemma3nProcessor(ProcessorMixin):
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
audio: Optional[Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]]] = None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[Gemma3nProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
if text is None and images is None and audio is None:
|
||||
|
@ -39,6 +39,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, logging
|
||||
from ...video_utils import VideoInput
|
||||
|
||||
@ -46,6 +47,21 @@ from ...video_utils import VideoInput
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Glm4vImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
patch_size (`int`, *optional*, defaults to 14):
|
||||
The spatial patch size of the vision encoder.
|
||||
temporal_patch_size (`int`, *optional*, defaults to 2):
|
||||
The temporal patch size of the vision encoder.
|
||||
merge_size (`int`, *optional*, defaults to 2):
|
||||
The merge size of the vision encoder to llm encoder.
|
||||
"""
|
||||
|
||||
patch_size: Optional[int]
|
||||
temporal_patch_size: Optional[int]
|
||||
merge_size: Optional[int]
|
||||
|
||||
|
||||
def smart_resize(
|
||||
num_frames: int,
|
||||
height: int,
|
||||
@ -120,6 +136,7 @@ class Glm4vImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "image_grid_thw"]
|
||||
valid_kwargs = Glm4vImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -24,7 +24,6 @@ from ...image_processing_utils import (
|
||||
)
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -41,27 +40,12 @@ from ...utils import (
|
||||
auto_docstring,
|
||||
logging,
|
||||
)
|
||||
from .image_processing_glm4v import smart_resize
|
||||
from .image_processing_glm4v import Glm4vImageProcessorKwargs, smart_resize
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Glm4vFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
patch_size (`int`, *optional*, defaults to 14):
|
||||
The spatial patch size of the vision encoder.
|
||||
temporal_patch_size (`int`, *optional*, defaults to 2):
|
||||
The temporal patch size of the vision encoder.
|
||||
merge_size (`int`, *optional*, defaults to 2):
|
||||
The merge size of the vision encoder to llm encoder.
|
||||
"""
|
||||
|
||||
patch_size: Optional[int]
|
||||
temporal_patch_size: Optional[int]
|
||||
merge_size: Optional[int]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
do_resize = True
|
||||
@ -75,10 +59,10 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
patch_size = 14
|
||||
temporal_patch_size = 2
|
||||
merge_size = 2
|
||||
valid_kwargs = Glm4vFastImageProcessorKwargs
|
||||
valid_kwargs = Glm4vImageProcessorKwargs
|
||||
model_input_names = ["pixel_values", "image_grid_thw"]
|
||||
|
||||
def __init__(self, **kwargs: Unpack[Glm4vFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[Glm4vImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
if self.size is not None and (
|
||||
self.size.get("shortest_edge", None) is None or self.size.get("longest_edge", None) is None
|
||||
@ -205,7 +189,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
**kwargs: Unpack[Glm4vFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[Glm4vImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
|
@ -32,7 +32,7 @@ from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutputWithPast
|
||||
from ...modeling_rope_utils import rope_config_validation
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||
from ...processing_utils import ImagesKwargs, Unpack
|
||||
from ...processing_utils import Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
|
||||
from ...utils.generic import check_model_inputs
|
||||
@ -52,7 +52,6 @@ from ..qwen2_5_vl.modeling_qwen2_5_vl import (
|
||||
Qwen2_5_VLVisionAttention,
|
||||
Qwen2_5_VLVisionBlock,
|
||||
)
|
||||
from ..qwen2_5_vl.processing_qwen2_5_vl import Qwen2_5_VLVideosProcessorKwargs
|
||||
from ..qwen2_vl.processing_qwen2_vl import (
|
||||
Qwen2_VLProcessor,
|
||||
Qwen2_VLProcessorKwargs,
|
||||
@ -1508,19 +1507,7 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
|
||||
return image_counts, video_counts
|
||||
|
||||
|
||||
class Glm4vVideosProcessorKwargs(Qwen2_5_VLVideosProcessorKwargs):
|
||||
pass
|
||||
|
||||
|
||||
class Glm4vImagesKwargs(ImagesKwargs):
|
||||
patch_size: Optional[int]
|
||||
temporal_patch_size: Optional[int]
|
||||
merge_size: Optional[int]
|
||||
|
||||
|
||||
class Glm4vProcessorKwargs(Qwen2_VLProcessorKwargs):
|
||||
images_kwargs: Glm4vImagesKwargs
|
||||
videos_kwargs: Glm4vVideosProcessorKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": False,
|
||||
|
@ -24,7 +24,7 @@ import numpy as np
|
||||
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
|
||||
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from ...utils import logging
|
||||
from ...video_utils import VideoInput
|
||||
@ -33,18 +33,7 @@ from ...video_utils import VideoInput
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Glm4vVideosProcessorKwargs(VideosKwargs, total=False):
|
||||
fps: Union[list[float], float]
|
||||
|
||||
|
||||
class Glm4vImagesKwargs(ImagesKwargs):
|
||||
patch_size: Optional[int]
|
||||
temporal_patch_size: Optional[int]
|
||||
merge_size: Optional[int]
|
||||
|
||||
|
||||
class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: Glm4vImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": False,
|
||||
@ -53,7 +42,6 @@ class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
|
||||
},
|
||||
"videos_kwargs": {"return_metadata": True},
|
||||
}
|
||||
videos_kwargs: Glm4vVideosProcessorKwargs
|
||||
|
||||
|
||||
class Glm4vProcessor(ProcessorMixin):
|
||||
|
@ -37,12 +37,11 @@ from .image_processing_glm4v import smart_resize
|
||||
|
||||
|
||||
class Glm4vVideoProcessorInitKwargs(VideosKwargs):
|
||||
max_image_size: dict[str, int] = None
|
||||
patch_size: Optional[int] = None
|
||||
temporal_patch_size: Optional[int] = None
|
||||
merge_size: Optional[int] = None
|
||||
image_mean: Optional[list[float]] = None
|
||||
image_std: Optional[list[float]] = None
|
||||
max_image_size: Optional[dict[str, int]]
|
||||
patch_size: Optional[int]
|
||||
temporal_patch_size: Optional[int]
|
||||
merge_size: Optional[int]
|
||||
max_duration: Optional[int]
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
|
@ -38,6 +38,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
|
||||
|
||||
@ -48,6 +49,24 @@ if is_vision_available():
logger = logging.get_logger(__name__)


class GotOcr2ImageProcessorKwargs(ImagesKwargs):
"""
crop_to_patches (`bool`, *optional*, defaults to `False`):
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
`preprocess` method.
min_patches (`int`, *optional*, defaults to 1):
The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
max_patches (`int`, *optional*, defaults to 12):
The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
"""

crop_to_patches: Optional[bool]
min_patches: Optional[int]
max_patches: Optional[int]


# Similar to image_processing_mllama.get_all_supported_aspect_ratios
@lru_cache(maxsize=10)
def get_all_supported_aspect_ratios(min_image_tiles: int, max_image_tiles: int) -> list[tuple[int, int]]:
@ -168,6 +187,7 @@ class GotOcr2ImageProcessor(BaseImageProcessor):
"""

model_input_names = ["pixel_values"]
valid_kwargs = GotOcr2ImageProcessorKwargs

def __init__(
self,
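For GOT-OCR2 the consolidation means the patch-cropping switches documented above are ordinary call-time kwargs for both the slow and the fast processor. A hedged usage sketch, where the checkpoint id and the values are illustrative rather than taken from this diff:

```python
import numpy as np
from PIL import Image

from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")  # illustrative checkpoint
image = Image.fromarray(np.zeros((384, 384, 3), dtype=np.uint8))  # dummy image

# crop_to_patches / min_patches / max_patches are keys of GotOcr2ImageProcessorKwargs,
# now validated through `valid_kwargs` instead of a Fast-only TypedDict.
batch = processor(image, crop_to_patches=True, min_patches=1, max_patches=12, return_tensors="pt")
print(batch["pixel_values"].shape)
```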
@ -22,7 +22,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -32,25 +31,7 @@ from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
)
|
||||
from .image_processing_got_ocr2 import get_optimal_tiled_canvas
|
||||
|
||||
|
||||
class GotOcr2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
crop_to_patches (`bool`, *optional*, defaults to `False`):
|
||||
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
|
||||
`preprocess` method.
|
||||
min_patches (`int`, *optional*, defaults to 1):
|
||||
The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
|
||||
set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
|
||||
max_patches (`int`, *optional*, defaults to 12):
|
||||
The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
|
||||
set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
crop_to_patches: Optional[bool]
|
||||
min_patches: Optional[int]
|
||||
max_patches: Optional[int]
|
||||
from .image_processing_got_ocr2 import GotOcr2ImageProcessorKwargs, get_optimal_tiled_canvas
|
||||
|
||||
|
||||
@auto_docstring
|
||||
@ -66,13 +47,13 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
|
||||
crop_to_patches = False
|
||||
min_patches = 1
|
||||
max_patches = 12
|
||||
valid_kwargs = GotOcr2FastImageProcessorKwargs
|
||||
valid_kwargs = GotOcr2ImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[GotOcr2FastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[GotOcr2ImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2FastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2ImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def crop_image_to_patches(
|
||||
|
@ -18,11 +18,10 @@ from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
|
||||
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from ...utils import is_vision_available, logging
|
||||
|
||||
|
||||
@ -37,13 +36,13 @@ class GotOcr2TextKwargs(TextKwargs, total=False):
|
||||
|
||||
|
||||
class GotOcr2ImagesKwargs(ImagesKwargs, total=False):
|
||||
crop_to_patches: Optional[bool]
|
||||
min_patches: Optional[int]
|
||||
max_patches: Optional[int]
|
||||
box: Optional[Union[list, tuple[float, float], tuple[float, float, float, float]]]
|
||||
color: Optional[str]
|
||||
num_image_tokens: Optional[int]
|
||||
multi_page: Optional[bool]
|
||||
crop_to_patches: Optional[bool]
|
||||
min_patches: Optional[int]
|
||||
max_patches: Optional[int]
|
||||
|
||||
|
||||
class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
@ -136,8 +135,6 @@ class GotOcr2Processor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[GotOcr2ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -49,8 +49,6 @@ class GraniteSpeechProcessor(ProcessorMixin):
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
|
||||
audio: Union["torch.Tensor", list["torch.Tensor"]] = None,
|
||||
device: str = "cpu",
|
||||
images=None,
|
||||
videos=None,
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
requires_backends(self, ["torch"])
|
||||
|
@ -51,6 +51,7 @@ from ...image_utils import (
|
||||
validate_kwargs,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import (
|
||||
ExplicitEnum,
|
||||
TensorType,
|
||||
@ -91,6 +92,29 @@ class AnnotationFormat(ExplicitEnum):
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


class GroundingDinoImageProcessorKwargs(ImagesKwargs):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
Whether to return segmentation masks.
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
Annotations to transform according to the padding that is applied to the images.
masks_path (`str` or `pathlib.Path`, *optional*):
Path to the directory containing the segmentation masks.
"""

format: Optional[Union[str, AnnotationFormat]]
do_convert_annotations: Optional[bool]
return_segmentation_masks: Optional[bool]
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
masks_path: Optional[Union[str, pathlib.Path]]


# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]:
"""
@ -865,6 +889,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
"""

model_input_names = ["pixel_values", "pixel_mask"]
valid_kwargs = GroundingDinoImageProcessorKwargs

# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
def __init__(
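Grounding DINO's annotation handling follows the same route: the detection-specific keys above live in the one TypedDict, so they can be passed straight to the image processor call. A hedged sketch with an illustrative checkpoint id and a minimal COCO-detection annotation:

```python
import numpy as np
from PIL import Image

from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")  # illustrative checkpoint
image = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))

# Minimal COCO-detection style annotation matching the docstring above.
annotations = {
    "image_id": 0,
    "annotations": [{"bbox": [10.0, 10.0, 100.0, 100.0], "category_id": 1, "area": 10000.0, "iscrowd": 0}],
}

batch = processor(
    images=image,
    annotations=annotations,
    format="coco_detection",
    do_convert_annotations=True,  # boxes become normalized (cx, cy, w, h)
    return_tensors="pt",
)
print(batch.keys())  # pixel_values, pixel_mask, labels
```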
@ -4,6 +4,26 @@
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_grounding_dino.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# coding=utf-8
|
||||
# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import pathlib
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
@ -14,7 +34,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature, get_size_dict
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
SizeDict,
|
||||
get_image_size_for_max_height_width,
|
||||
get_max_height_width,
|
||||
@ -35,7 +54,7 @@ from ...image_utils import (
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring, logging
|
||||
from ...utils.import_utils import requires
|
||||
from .image_processing_grounding_dino import get_size_with_aspect_ratio
|
||||
from .image_processing_grounding_dino import GroundingDinoImageProcessorKwargs, get_size_with_aspect_ratio
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -44,24 +63,6 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class GroundingDinoFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||
do_convert_annotations (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the
|
||||
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return segmentation masks.
|
||||
"""
|
||||
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
|
||||
|
||||
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
||||
|
||||
|
||||
@ -304,9 +305,9 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
|
||||
size = {"shortest_edge": 800, "longest_edge": 1333}
|
||||
default_to_square = False
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
valid_kwargs = GroundingDinoFastImageProcessorKwargs
|
||||
valid_kwargs = GroundingDinoImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[GroundingDinoFastImageProcessorKwargs]) -> None:
|
||||
def __init__(self, **kwargs: Unpack[GroundingDinoImageProcessorKwargs]) -> None:
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
|
||||
@ -568,25 +569,8 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
|
||||
masks_path: Optional[Union[str, pathlib.Path]] = None,
|
||||
**kwargs: Unpack[GroundingDinoFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[GroundingDinoImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
r"""
|
||||
annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
|
||||
List of annotations associated with the image or batch of images. If annotation is for object
|
||||
detection, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
|
||||
dictionary. An image can have no annotations, in which case the list should be empty.
|
||||
If annotation is for segmentation, the annotations should be a dictionary with the following keys:
|
||||
- "image_id" (`int`): The image id.
|
||||
- "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
|
||||
An image can have no segments, in which case the list should be empty.
|
||||
- "file_name" (`str`): The file name of the image.
|
||||
masks_path (`str` or `pathlib.Path`, *optional*):
|
||||
Path to the directory containing the segmentation masks.
|
||||
"""
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
|
||||
logger.warning_once(
|
||||
@ -601,7 +585,7 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
|
||||
)
|
||||
kwargs["size"] = kwargs.pop("max_size")
|
||||
|
||||
return super().preprocess(images, annotations, masks_path, **kwargs)
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
|
@ -1,3 +1,23 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
|
||||
import torch
|
||||
|
@ -16,13 +16,12 @@
|
||||
Processor class for Grounding DINO.
|
||||
"""
|
||||
|
||||
import pathlib
|
||||
import warnings
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
|
||||
from ...image_transforms import center_to_corners_format
|
||||
from ...image_utils import AnnotationFormat, ImageInput
|
||||
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
|
||||
from ...utils import TensorType, is_torch_available
|
||||
|
||||
@ -99,16 +98,7 @@ class DictWithDeprecationWarning(dict):
|
||||
return super().get(key, *args, **kwargs)
|
||||
|
||||
|
||||
class GroundingDinoImagesKwargs(ImagesKwargs, total=False):
|
||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
|
||||
return_segmentation_masks: Optional[bool]
|
||||
masks_path: Optional[Union[str, pathlib.Path]]
|
||||
do_convert_annotations: Optional[bool]
|
||||
format: Optional[Union[str, AnnotationFormat]]
|
||||
|
||||
|
||||
class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: GroundingDinoImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"add_special_tokens": True,
|
||||
|
@ -28,6 +28,7 @@ from ...image_utils import (
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, is_torch_available
|
||||
|
||||
|
||||
@ -35,6 +36,20 @@ IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||
IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||
|
||||
|
||||
class IdeficsImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
transform (`Callable`, *optional*):
|
||||
A custom transform function that accepts a single image can be passed for training. For example,
|
||||
`torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is
|
||||
assumed - and then a preset of inference-specific transforms will be applied to the images
|
||||
image_size (`dict[str, int]`, *optional*):
|
||||
Resize to image size
|
||||
"""
|
||||
|
||||
transform: Optional[Callable]
|
||||
image_size: Optional[dict[str, int]]
|
||||
|
||||
|
||||
def convert_to_rgb(image):
|
||||
# `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
|
||||
# for transparent images. The call to `alpha_composite` handles this case
|
||||
@ -74,6 +89,7 @@ class IdeficsImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
valid_kwargs = IdeficsImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -16,13 +16,12 @@
|
||||
Processor class for IDEFICS.
|
||||
"""
|
||||
|
||||
from typing import Callable, Optional, Union
|
||||
from typing import Optional, Union
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import (
|
||||
ImagesKwargs,
|
||||
ProcessingKwargs,
|
||||
ProcessorMixin,
|
||||
TextKwargs,
|
||||
@ -40,13 +39,6 @@ if is_torch_available():
|
||||
IMAGE_TOKEN = "<image>"
|
||||
|
||||
|
||||
class IdeficsImagesKwargs(ImagesKwargs, total=False):
|
||||
transform: Optional[Callable]
|
||||
image_size: Optional[dict[str, int]]
|
||||
image_mean: Optional[Union[float, list[float]]]
|
||||
image_std: Optional[Union[float, list[float]]]
|
||||
|
||||
|
||||
class IdeficsTextKwargs(TextKwargs, total=False):
|
||||
add_eos_token: Optional[bool]
|
||||
add_end_of_utterance_token: Optional[bool]
|
||||
@ -54,14 +46,12 @@ class IdeficsTextKwargs(TextKwargs, total=False):
|
||||
|
||||
class IdeficsProcessorKwargs(ProcessingKwargs, total=False):
|
||||
text_kwargs: IdeficsTextKwargs
|
||||
images_kwargs: IdeficsImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"add_special_tokens": False,
|
||||
"padding": "longest",
|
||||
"add_eos_token": False,
|
||||
},
|
||||
"images_kwargs": {},
|
||||
"common_kwargs": {"return_tensors": "pt"},
|
||||
}
|
||||
|
||||
@ -198,8 +188,6 @@ class IdeficsProcessor(ProcessorMixin):
|
||||
list[list[TextInput]],
|
||||
list[list[PreTokenizedInput]],
|
||||
] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[IdeficsProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""This method takes batched or non-batched prompts made of text and images and converts them into prompts that
|
||||
|
@ -35,6 +35,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, is_vision_available, logging
|
||||
|
||||
|
||||
@ -46,6 +47,15 @@ if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
|
||||
class Idefics2ImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
do_image_splitting (`bool`, *optional*, defaults to `False`):
|
||||
Whether to split the image into a sequence 4 equal sub-images concatenated with the original image.
|
||||
"""
|
||||
|
||||
do_image_splitting: Optional[bool]
|
||||
|
||||
|
||||
def get_resize_output_image_size(image, size, input_data_format) -> tuple[int, int]:
|
||||
"""
|
||||
Get the output size of the image after resizing given a dictionary specifying the max and min sizes.
|
||||
@ -186,6 +196,7 @@ class Idefics2ImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_attention_mask"]
|
||||
valid_kwargs = Idefics2ImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -21,7 +21,6 @@ import torch
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
SizeDict,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
@ -35,7 +34,7 @@ from ...image_utils import (
|
||||
)
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring, is_torchvision_available, logging
|
||||
from .image_processing_idefics2 import convert_to_rgb
|
||||
from .image_processing_idefics2 import Idefics2ImageProcessorKwargs, convert_to_rgb
|
||||
|
||||
|
||||
if is_torchvision_available():
|
||||
@ -105,15 +104,6 @@ def make_pixel_mask(image: "torch.Tensor", output_size: tuple[int, int]) -> "tor
|
||||
return mask
|
||||
|
||||
|
||||
class Idefics2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
do_image_splitting (`bool`, *optional*, defaults to `False`):
|
||||
Whether to split the image into a sequence 4 equal sub-images concatenated with the original image.
|
||||
"""
|
||||
|
||||
do_image_splitting: Optional[bool]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Idefics2ImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BILINEAR
|
||||
@ -127,7 +117,7 @@ class Idefics2ImageProcessorFast(BaseImageProcessorFast):
|
||||
do_image_splitting = False
|
||||
size = {"shortest_edge": 378, "longest_edge": 980}
|
||||
model_input_names = ["pixel_values", "pixel_attention_mask"]
|
||||
valid_kwargs = Idefics2FastImageProcessorKwargs
|
||||
valid_kwargs = Idefics2ImageProcessorKwargs
|
||||
|
||||
def convert_to_rgb(self, image: ImageInput) -> ImageInput:
|
||||
"""
|
||||
@ -214,7 +204,7 @@ class Idefics2ImageProcessorFast(BaseImageProcessorFast):
|
||||
return image, pixel_mask
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics2FastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics2ImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _preprocess(
|
||||
|
@ -22,7 +22,6 @@ from typing import TYPE_CHECKING, Optional, Union
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...image_utils import ImageInput, is_valid_image, load_image
|
||||
from ...processing_utils import (
|
||||
ImagesKwargs,
|
||||
ProcessingKwargs,
|
||||
ProcessorMixin,
|
||||
Unpack,
|
||||
@ -46,20 +45,13 @@ def is_image_or_image_url(elem):
|
||||
return is_url(elem) or is_valid_image(elem)
|
||||
|
||||
|
||||
class Idefics2ImagesKwargs(ImagesKwargs, total=False):
|
||||
image_seq_len: Optional[int]
|
||||
|
||||
|
||||
class Idefics2ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: Idefics2ImagesKwargs
|
||||
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"add_special_tokens": True,
|
||||
"padding": False,
|
||||
"is_split_into_words": False,
|
||||
},
|
||||
"images_kwargs": {},
|
||||
}
|
||||
|
||||
|
||||
@ -123,8 +115,6 @@ class Idefics2Processor(ProcessorMixin):
|
||||
self,
|
||||
images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None,
|
||||
text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[Idefics2ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
@ -181,8 +171,6 @@ class Idefics2Processor(ProcessorMixin):
|
||||
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None)
|
||||
image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
|
||||
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||
|
||||
n_images_in_text = []
|
||||
@ -197,12 +185,11 @@ class Idefics2Processor(ProcessorMixin):
|
||||
# Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
|
||||
fake_image_token = self.fake_image_token
|
||||
image_token = self.image_token
|
||||
image_str = f"{fake_image_token}{image_token * image_seq_len}{fake_image_token}"
|
||||
image_str = f"{fake_image_token}{image_token * self.image_seq_len}{fake_image_token}"
|
||||
|
||||
if self.image_processor.do_image_splitting:
|
||||
# A single image token is split into 4 patches + 1 original image
|
||||
image_str = image_str * 5
|
||||
image_seq_len *= 5
|
||||
|
||||
prompt_strings = []
|
||||
for sample in text:
|
||||
|
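Note the behavioural nuance in this Idefics2 hunk: `image_seq_len` is no longer read from `images_kwargs`, so it cannot be overridden per call and is taken from the processor instance instead. A hedged sketch of how a caller would pin it now (the checkpoint id is illustrative, and kwarg forwarding through `from_pretrained` is assumed):

```python
from transformers import AutoProcessor

# Previously: processor(images=image, text=text, image_seq_len=64)
# After this change the value is fixed when the processor is created:
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=64)
print(processor.image_seq_len)  # every call expands <image> to this many placeholder tokens
```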
@ -35,6 +35,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, is_vision_available, logging
|
||||
|
||||
|
||||
@ -47,6 +48,22 @@ if is_vision_available():
from PIL import Image


class Idefics3ImageProcessorKwargs(ImagesKwargs):
"""
do_image_splitting (`bool`, *optional*, defaults to `True`):
Whether to split the image into sub-images concatenated with the original image. They are split into patches
such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`.
max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`):
Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge".
return_row_col_info (`bool`, *optional*, defaults to `False`):
Whether to return the row and column information of the images.
"""

do_image_splitting: Optional[bool]
max_image_size: Optional[dict[str, int]]
return_row_col_info: Optional[bool]


def _resize_output_size_rescale_to_max_len(
height: int, width: int, min_len: Optional[int] = 1, max_len: Optional[int] = None
) -> tuple[int, int]:
@ -291,6 +308,7 @@ class Idefics3ImageProcessor(BaseImageProcessor):
"""

model_input_names = ["pixel_values", "pixel_attention_mask"]
valid_kwargs = Idefics3ImageProcessorKwargs

def __init__(
self,
@ -22,7 +22,6 @@ import torch
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
SizeDict,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
@ -36,6 +35,7 @@ from ...image_utils import (
|
||||
)
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring, is_torchvision_available, logging
|
||||
from .image_processing_idefics3 import Idefics3ImageProcessorKwargs
|
||||
|
||||
|
||||
if is_torchvision_available():
|
||||
@ -169,22 +169,6 @@ def make_pixel_mask(image: "torch.Tensor", output_size: tuple[int, int]) -> "tor
|
||||
return mask
|
||||
|
||||
|
||||
class Idefics3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
do_image_splitting (`bool`, *optional*, defaults to `True`):
|
||||
Whether to split the image into sub-images concatenated with the original image. They are split into patches
|
||||
such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`.
|
||||
max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`):
|
||||
Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge".
|
||||
return_row_col_info (`bool`, *optional*, defaults to `False`):
|
||||
Whether to return the row and column information of the images.
|
||||
"""
|
||||
|
||||
do_image_splitting: Optional[bool]
|
||||
max_image_size: Optional[dict[str, int]]
|
||||
return_row_col_info: Optional[bool]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Idefics3ImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.LANCZOS
|
||||
@ -199,7 +183,7 @@ class Idefics3ImageProcessorFast(BaseImageProcessorFast):
|
||||
do_image_splitting = True
|
||||
do_pad = True
|
||||
return_row_col_info = False
|
||||
valid_kwargs = Idefics3FastImageProcessorKwargs
|
||||
valid_kwargs = Idefics3ImageProcessorKwargs
|
||||
|
||||
def _prepare_images_structure(self, images: ImageInput, expected_ndims: int = 3) -> ImageInput:
|
||||
"""
|
||||
@ -367,7 +351,7 @@ class Idefics3ImageProcessorFast(BaseImageProcessorFast):
|
||||
return image, pixel_mask
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics3FastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics3ImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _preprocess(
|
||||
|
@ -24,7 +24,7 @@ import numpy as np
|
||||
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...image_utils import ImageInput, is_valid_image, load_image
|
||||
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import AddedToken, BatchEncoding, TextInput
|
||||
from ...utils import logging
|
||||
|
||||
@ -87,14 +87,7 @@ def get_image_prompt_string(
|
||||
)
|
||||
|
||||
|
||||
class Idefics3ImagesKwargs(ImagesKwargs, total=False):
|
||||
return_row_col_info: Optional[bool]
|
||||
max_image_size: Optional[dict[str, int]]
|
||||
|
||||
|
||||
class Idefics3ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: Idefics3ImagesKwargs
|
||||
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"add_special_tokens": True,
|
||||
@ -179,8 +172,6 @@ class Idefics3Processor(ProcessorMixin):
|
||||
self,
|
||||
images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None,
|
||||
text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
image_seq_len: Optional[int] = None,
|
||||
**kwargs: Unpack[Idefics3ProcessorKwargs],
|
||||
) -> BatchEncoding:
|
||||
|
@ -31,17 +31,34 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, is_vision_available, logging
from ...utils.import_utils import requires


if is_vision_available():
import PIL

if is_torch_available():
import torch

logger = logging.get_logger(__name__)


class ImageGPTImageProcessorKwargs(ImagesKwargs):
"""
clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*):
The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters`
in `preprocess`.
do_color_quantize (`bool`, *optional*, defaults to `True`):
Controls whether to apply color quantization to convert continuous pixel values to discrete cluster indices.
When True, each pixel is assigned to its nearest color cluster, enabling ImageGPT's discrete token modeling.
"""

clusters: Optional[Union[np.ndarray, list[list[int]], "torch.Tensor"]]
do_color_quantize: Optional[bool]


def squared_euclidean_distance(a, b):
b = b.T
a2 = np.sum(np.square(a), axis=1)
@ -83,6 +100,7 @@ class ImageGPTImageProcessor(BaseImageProcessor):
"""

model_input_names = ["pixel_values"]
valid_kwargs = ImageGPTImageProcessorKwargs

def __init__(
self,
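The `do_color_quantize`/`clusters` pair documented above controls a nearest-cluster assignment that turns pixels into discrete tokens. A standalone numpy sketch of that idea (not the library implementation itself):

```python
import numpy as np


def color_quantize(pixels: np.ndarray, clusters: np.ndarray) -> np.ndarray:
    """Map each RGB pixel in `pixels` (N, 3) to the index of its nearest cluster in `clusters` (K, 3)."""
    diff = pixels[:, None, :].astype(np.float32) - clusters[None, :, :].astype(np.float32)  # (N, K, 3)
    d = np.sum(diff**2, axis=-1)  # squared euclidean distance, (N, K)
    return np.argmin(d, axis=1)  # one discrete "color token" per pixel, (N,)


rng = np.random.default_rng(0)
pixels = rng.integers(0, 256, size=(32 * 32, 3))
clusters = rng.integers(0, 256, size=(512, 3))  # ImageGPT checkpoints typically ship 512 clusters
print(color_quantize(pixels, clusters).shape)  # (1024,)
```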
@ -23,7 +23,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
)
|
||||
from ...image_transforms import group_images_by_shape, reorder_images
|
||||
from ...image_utils import PILImageResampling
|
||||
@ -32,6 +31,7 @@ from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
)
|
||||
from .image_processing_imagegpt import ImageGPTImageProcessorKwargs
|
||||
|
||||
|
||||
def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
|
||||
@ -68,20 +68,6 @@ def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tenso
|
||||
return torch.argmin(d, dim=1)
|
||||
|
||||
|
||||
class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*):
|
||||
The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters`
|
||||
in `preprocess`.
|
||||
do_color_quantize (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to apply color quantization to convert continuous pixel values to discrete cluster indices.
|
||||
When True, each pixel is assigned to its nearest color cluster, enabling ImageGPT's discrete token modeling.
|
||||
"""
|
||||
|
||||
clusters: Optional[Union[np.ndarray, list[list[int]], torch.Tensor]]
|
||||
do_color_quantize: Optional[bool]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class ImageGPTImageProcessorFast(BaseImageProcessorFast):
|
||||
model_input_names = ["input_ids"]
|
||||
@ -92,12 +78,12 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast):
|
||||
image_std = [0.5, 0.5, 0.5]
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
valid_kwargs = ImageGPTFastImageProcessorKwargs
|
||||
valid_kwargs = ImageGPTImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, # keep as arg for backwards compatibility
|
||||
**kwargs: Unpack[ImageGPTFastImageProcessorKwargs],
|
||||
**kwargs: Unpack[ImageGPTImageProcessorKwargs],
|
||||
):
|
||||
r"""
|
||||
clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*):
|
||||
|
@ -43,7 +43,6 @@ class InstructBlipProcessorKwargs(ProcessingKwargs, total=False):
|
||||
"return_length": False,
|
||||
"verbose": True,
|
||||
},
|
||||
"images_kwargs": {},
|
||||
}
|
||||
|
||||
|
||||
@ -85,8 +84,6 @@ class InstructBlipProcessor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[InstructBlipProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -19,19 +19,12 @@ import numpy as np
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_utils import ImageInput, concatenate_list, make_flat_list_of_images
|
||||
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from ...video_utils import VideoInput
|
||||
|
||||
|
||||
class InternVLImagesKwargs(ImagesKwargs, total=False):
|
||||
crop_to_patches: Optional[bool]
|
||||
min_patches: Optional[int]
|
||||
max_patches: Optional[int]
|
||||
|
||||
|
||||
class InternVLProcessorKwargs(ProcessingKwargs, total=False):
|
||||
images_kwargs: InternVLImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding_side": "left",
|
||||
@ -159,7 +152,6 @@ class InternVLProcessor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
|
||||
audio=None,
|
||||
videos: Optional[VideoInput] = None,
|
||||
**kwargs: Unpack[InternVLProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
|
@ -43,7 +43,6 @@ class InternVLVideoProcessor(BaseVideoProcessor):
|
||||
initial_shift = True
|
||||
do_sample_frames = False # Set to False for BC, recommended to set `True` in new models
|
||||
valid_kwargs = InternVLVideoProcessorInitKwargs
|
||||
model_input_names = ["pixel_values_videos"]
|
||||
|
||||
def __init__(self, **kwargs: Unpack[InternVLVideoProcessorInitKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
@ -40,6 +40,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
|
||||
|
||||
|
||||
@ -50,6 +51,16 @@ if is_vision_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class JanusImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
|
||||
|
||||
class JanusImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a JANUS image processor.
|
||||
@ -92,6 +103,8 @@ class JanusImageProcessor(BaseImageProcessor):
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
|
||||
valid_kwargs = JanusImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
do_resize: bool = True,
|
||||
|
@ -22,7 +22,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -38,16 +37,7 @@ from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
)
|
||||
|
||||
|
||||
class JanusFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
from .image_processing_janus import JanusImageProcessorKwargs
|
||||
|
||||
|
||||
@auto_docstring
|
||||
@ -61,9 +51,9 @@ class JanusImageProcessorFast(BaseImageProcessorFast):
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
do_pad = True
|
||||
valid_kwargs = JanusFastImageProcessorKwargs
|
||||
valid_kwargs = JanusImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[JanusFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[JanusImageProcessorKwargs]):
|
||||
if kwargs.get("image_mean") is None:
|
||||
background_color = (127, 127, 127)
|
||||
else:
|
||||
|
@ -47,7 +47,7 @@ from ...image_utils import (
|
||||
)
|
||||
from ...modeling_outputs import ModelOutput
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
from ...processing_utils import ImagesKwargs, Unpack
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
TransformersKwargs,
|
||||
@ -1289,6 +1289,16 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
|
||||
return generated_tokens
|
||||
|
||||
|
||||
class JanusImageProcessorKwargs(ImagesKwargs):
|
||||
r"""
|
||||
min_size (`int`, *optional*, defaults to 14):
|
||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||
falls below this value after resizing.
|
||||
"""
|
||||
|
||||
min_size: int
|
||||
|
||||
|
||||
class JanusImageProcessor(BlipImageProcessor):
|
||||
r"""
|
||||
Constructs a JANUS image processor.
|
||||
@ -1329,6 +1339,8 @@ class JanusImageProcessor(BlipImageProcessor):
|
||||
Whether to pad the image to square or not.
|
||||
"""
|
||||
|
||||
valid_kwargs = JanusImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
do_resize: bool = True,
|
||||
|
@ -81,8 +81,6 @@ class JanusProcessor(ProcessorMixin):
|
||||
self,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
images: Optional[ImageInput] = None,
|
||||
videos=None,
|
||||
audio=None,
|
||||
**kwargs: Unpack[JanusProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -136,8 +136,6 @@ class Kosmos2Processor(ProcessorMixin):
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, list[TextInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[Kosmos2ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
|
@ -34,6 +34,7 @@ from ...image_utils import (
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, is_torch_available, logging
|
||||
from ...utils.import_utils import requires_backends
|
||||
|
||||
@ -45,6 +46,19 @@ logger = logging.get_logger(__name__)
DEFAULT_FONT_PATH = "ybelkada/fonts"


class Kosmos2_5ImageProcessorKwargs(ImagesKwargs):
r"""
patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.
max_patches (`int`, *optional*, defaults to 4096):
The maximum number of patches to extract from the image as per the
[KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419).
"""

patch_size: Optional[dict[str, int]]
max_patches: Optional[int]


# Copied from transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches
def torch_extract_patches(image_tensor, patch_height, patch_width):
"""
@ -92,6 +106,7 @@ class Kosmos2_5ImageProcessor(BaseImageProcessor):
"""

model_input_names = ["flattened_patches"]
valid_kwargs = Kosmos2_5ImageProcessorKwargs

def __init__(
self,
@ -22,13 +22,13 @@ import torch
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
from ...image_utils import ChannelDimension, ImageInput, get_image_size
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring
|
||||
from .image_processing_kosmos2_5 import Kosmos2_5ImageProcessorKwargs
|
||||
|
||||
|
||||
# Similar to transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches but dealing with a batch of images directly.
|
||||
@ -56,19 +56,6 @@ def torch_extract_patches(image_tensor, patch_height, patch_width):
|
||||
return patches
|
||||
|
||||
|
||||
class Kosmos2_5FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
r"""
|
||||
patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
|
||||
The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.
|
||||
max_patches (`int`, *optional*, defaults to 4096):
|
||||
The maximum number of patches to extract from the image as per the
|
||||
[KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419).
|
||||
"""
|
||||
|
||||
patch_size: Optional[dict[str, int]]
|
||||
max_patches: Optional[int]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Kosmos2_5ImageProcessorFast(BaseImageProcessorFast):
|
||||
# To be checked against the slow image processor
|
||||
@ -78,13 +65,13 @@ class Kosmos2_5ImageProcessorFast(BaseImageProcessorFast):
|
||||
patch_size = {"height": 16, "width": 16}
|
||||
max_patches = 4096
|
||||
rescale_factor = None
|
||||
valid_kwargs = Kosmos2_5FastImageProcessorKwargs
|
||||
valid_kwargs = Kosmos2_5ImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[Kosmos2_5FastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[Kosmos2_5ImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[Kosmos2_5FastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[Kosmos2_5ImageProcessorKwargs]) -> BatchFeature:
|
||||
r"""
|
||||
patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
|
||||
The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.
|
||||
|
@ -20,7 +20,7 @@ from typing import Optional, Union
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
|
||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import TextInput
|
||||
from ...utils import is_torch_available
|
||||
|
||||
@ -29,14 +29,7 @@ if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
class Kosmos2_5ImagesKwargs(ImagesKwargs, total=False):
|
||||
max_patches: Optional[int]
|
||||
num_image_tokens: Optional[int]
|
||||
|
||||
|
||||
class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
text_kwargs: TextKwargs
|
||||
images_kwargs: Kosmos2_5ImagesKwargs
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": True,
|
||||
@ -46,7 +39,6 @@ class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False):
|
||||
},
|
||||
"images_kwargs": {
|
||||
"max_patches": 4096,
|
||||
"num_image_tokens": 2048,
|
||||
},
|
||||
"common_kwargs": {"return_tensors": "pt"},
|
||||
}
|
||||
@ -65,24 +57,25 @@ class Kosmos2_5Processor(ProcessorMixin):
An instance of [`Kosmos2_5ImageProcessor`]. The image processor is a required input.
tokenizer (Union[`T5TokenizerFast`, `T5Tokenizer`]):
An instance of ['T5TokenizerFast`] or ['T5Tokenizer`]. The tokenizer is a required input.
num_image_tokens (`int`, *optional*, defaults to 2048):
Number of image tokens used as a placeholder.
"""

attributes = ["image_processor", "tokenizer"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "PreTrainedTokenizerFast"

def __init__(self, image_processor, tokenizer):
def __init__(self, image_processor, tokenizer, num_image_tokens: int = 2048):
self.image_start_token = tokenizer.boi_token # "<image>" : fixed token for the start of image
self.image_end_token = tokenizer.eoi_token # "</image>" : fixed token for the end of image
self.image_token = tokenizer.image_token # "<s>" : within a <image> ... </image> pair, these <s> tokens indicate they are positions reserved for an image
self.num_image_tokens = num_image_tokens
super().__init__(image_processor, tokenizer)
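Since `num_image_tokens` moves from a per-call `images_kwargs` entry to a constructor argument, the placeholder length is now bound once when the processor is built. A hedged sketch of the difference (the checkpoint id is illustrative):

```python
from transformers import AutoProcessor

# Before: the count travelled with every call, e.g.
#   processor(images=image, text=text, num_image_tokens=2048)
# After this change it is set once at construction time:
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5", num_image_tokens=2048)
print(processor.num_image_tokens)  # "<image>" is expanded to this many <s> placeholders on every call
```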
def __call__(
|
||||
self,
|
||||
images: Optional[ImageInput] = None,
|
||||
text: Union[TextInput, list[TextInput]] = None,
|
||||
audio=None,
|
||||
videos=None,
|
||||
**kwargs: Unpack[Kosmos2_5ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
@ -104,8 +97,6 @@ class Kosmos2_5Processor(ProcessorMixin):
|
||||
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
num_image_tokens = output_kwargs["images_kwargs"].setdefault("num_image_tokens", None)
|
||||
|
||||
encoding = BatchFeature()
|
||||
|
||||
if images is not None:
|
||||
@ -114,7 +105,7 @@ class Kosmos2_5Processor(ProcessorMixin):
|
||||
image_encoding.pop("cols")
|
||||
encoding.update(image_encoding)
|
||||
|
||||
prompt = f"{self.tokenizer.bos_token}{self.image_start_token}{self.image_token * num_image_tokens}{self.image_end_token}"
|
||||
prompt = f"{self.tokenizer.bos_token}{self.image_start_token}{self.image_token * self.num_image_tokens}{self.image_end_token}"
|
||||
|
||||
if text is not None:
|
||||
if isinstance(text, str):
|
||||
@ -124,7 +115,7 @@ class Kosmos2_5Processor(ProcessorMixin):
|
||||
input = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||
|
||||
batch_size, seq_len = input.input_ids.shape
|
||||
image_embeds_position_mask = [0, -1] + [1] * num_image_tokens + [-1]
|
||||
image_embeds_position_mask = [0, -1] + [1] * self.num_image_tokens + [-1]
|
||||
image_embeds_position_mask += [0] * (seq_len - len(image_embeds_position_mask))
|
||||
image_embeds_position_mask = (
|
||||
torch.LongTensor(image_embeds_position_mask).unsqueeze(0).repeat(batch_size, 1)
|
||||
|
@ -30,6 +30,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
filter_out_non_signature_kwargs,
|
||||
@ -51,6 +52,25 @@ if is_pytesseract_available():
logger = logging.get_logger(__name__)


class LayoutLMv2ImageProcessorKwargs(ImagesKwargs):
r"""
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
the `apply_ocr` parameter in the `preprocess` method.
ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
`preprocess` method.
"""

apply_ocr: Optional[bool]
ocr_lang: Optional[str]
tesseract_config: Optional[str]


def normalize_box(box, width, height):
return [
int(1000 * (box[0] / width)),
@ -125,6 +145,7 @@ class LayoutLMv2ImageProcessor(BaseImageProcessor):
"""

model_input_names = ["pixel_values"]
valid_kwargs = LayoutLMv2ImageProcessorKwargs

def __init__(
self,
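The OCR switches above are plain TypedDict keys, so both LayoutLMv2 processors accept them directly at call time. A hedged usage sketch (requires Tesseract installed when `apply_ocr=True`; the values are illustrative):

```python
import numpy as np
from PIL import Image

from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
page = Image.fromarray(np.full((1000, 754, 3), 255, dtype=np.uint8))  # blank "document" page

# apply_ocr / ocr_lang / tesseract_config are LayoutLMv2ImageProcessorKwargs keys.
encoding = processor(page, apply_ocr=True, ocr_lang="eng", tesseract_config="--psm 6")
print(encoding.keys())  # pixel_values plus words and boxes extracted by Tesseract
```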
@ -19,7 +19,7 @@ from typing import Optional, Union
|
||||
import torch
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature
|
||||
from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images
|
||||
from ...image_utils import ImageInput, PILImageResampling, SizeDict
|
||||
from ...processing_utils import Unpack
|
||||
@ -29,32 +29,12 @@ from ...utils import (
|
||||
logging,
|
||||
requires_backends,
|
||||
)
|
||||
from .image_processing_layoutlmv2 import apply_tesseract
|
||||
from .image_processing_layoutlmv2 import LayoutLMv2ImageProcessorKwargs, apply_tesseract
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class LayoutLMv2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
Args:
|
||||
apply_ocr (`bool`, *optional*, defaults to `True`):
|
||||
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
|
||||
the `apply_ocr` parameter in the `preprocess` method.
|
||||
ocr_lang (`str`, *optional*):
|
||||
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
|
||||
used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
|
||||
tesseract_config (`str`, *optional*):
|
||||
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
|
||||
Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
|
||||
`preprocess` method.
|
||||
"""
|
||||
|
||||
apply_ocr: Optional[bool]
|
||||
ocr_lang: Optional[str]
|
||||
tesseract_config: Optional[str]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class LayoutLMv2ImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BILINEAR
|
||||
@ -64,13 +44,13 @@ class LayoutLMv2ImageProcessorFast(BaseImageProcessorFast):
|
||||
apply_ocr = True
|
||||
ocr_lang = None
|
||||
tesseract_config = ""
|
||||
valid_kwargs = LayoutLMv2FastImageProcessorKwargs
|
||||
valid_kwargs = LayoutLMv2ImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[LayoutLMv2FastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[LayoutLMv2ImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv2FastImageProcessorKwargs]) -> BatchFeature:
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv2ImageProcessorKwargs]) -> BatchFeature:
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _preprocess(
|
||||
|
@@ -34,6 +34,7 @@ from ...image_utils import (
     valid_images,
     validate_preprocess_arguments,
 )
+from ...processing_utils import ImagesKwargs
 from ...utils import (
     TensorType,
     filter_out_non_signature_kwargs,
@@ -55,6 +56,25 @@ if is_pytesseract_available():
 logger = logging.get_logger(__name__)


+class LayoutLMv3ImageProcessorKwargs(ImagesKwargs):
+    r"""
+    apply_ocr (`bool`, *optional*, defaults to `True`):
+        Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
+        the `apply_ocr` parameter in the `preprocess` method.
+    ocr_lang (`str`, *optional*):
+        The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
+        used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
+    tesseract_config (`str`, *optional*):
+        Any additional custom configuration flags that are forwarded to the `config` parameter when calling
+        Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
+        `preprocess` method.
+    """
+
+    apply_ocr: Optional[bool]
+    ocr_lang: Optional[str]
+    tesseract_config: Optional[str]
+
+
 def normalize_box(box, width, height):
     return [
         int(1000 * (box[0] / width)),
@@ -143,6 +163,7 @@ class LayoutLMv3ImageProcessor(BaseImageProcessor):
     """

     model_input_names = ["pixel_values"]
+    valid_kwargs = LayoutLMv3ImageProcessorKwargs

     def __init__(
         self,
@@ -19,7 +19,7 @@ from typing import Optional, Union
 import torch
 from torchvision.transforms.v2 import functional as F

-from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
+from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature
 from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images
 from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict
 from ...processing_utils import Unpack
@@ -29,32 +29,12 @@ from ...utils import (
     logging,
     requires_backends,
 )
-from .image_processing_layoutlmv3 import apply_tesseract
+from .image_processing_layoutlmv3 import LayoutLMv3ImageProcessorKwargs, apply_tesseract


 logger = logging.get_logger(__name__)


-class LayoutLMv3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
-    """
-    Args:
-        apply_ocr (`bool`, *optional*, defaults to `True`):
-            Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
-            the `apply_ocr` parameter in the `preprocess` method.
-        ocr_lang (`str`, *optional*):
-            The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
-            used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
-        tesseract_config (`str`, *optional*):
-            Any additional custom configuration flags that are forwarded to the `config` parameter when calling
-            Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
-            `preprocess` method.
-    """
-
-    apply_ocr: Optional[bool]
-    ocr_lang: Optional[str]
-    tesseract_config: Optional[str]
-
-
 @auto_docstring
 class LayoutLMv3ImageProcessorFast(BaseImageProcessorFast):
     resample = PILImageResampling.BILINEAR
@@ -67,13 +47,13 @@ class LayoutLMv3ImageProcessorFast(BaseImageProcessorFast):
     apply_ocr = True
     ocr_lang = None
     tesseract_config = ""
-    valid_kwargs = LayoutLMv3FastImageProcessorKwargs
+    valid_kwargs = LayoutLMv3ImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[LayoutLMv3FastImageProcessorKwargs]):
+    def __init__(self, **kwargs: Unpack[LayoutLMv3ImageProcessorKwargs]):
         super().__init__(**kwargs)

     @auto_docstring
-    def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv3FastImageProcessorKwargs]) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv3ImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)

     def _preprocess(
@ -22,7 +22,6 @@ from torchvision.transforms.v2 import functional as F
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
@ -33,9 +32,7 @@ from ...image_utils import (
|
||||
PILImageResampling,
|
||||
SizeDict,
|
||||
)
|
||||
from ...processing_utils import (
|
||||
Unpack,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs, Unpack
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
@ -172,7 +169,7 @@ def pad_along_first_dim(
|
||||
return images, pixel_mask
|
||||
|
||||
|
||||
class Lfm2VlFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
class Lfm2VlImageProcessorKwargs(ImagesKwargs):
|
||||
"""
|
||||
downsample_factor (`int`, *optional*, defaults to `2`):
|
||||
The downsampling factor for images used when resizing the image.
|
||||
@ -214,10 +211,10 @@ class Lfm2VlImageProcessorFast(BaseImageProcessorFast):
|
||||
return_row_col_info = False
|
||||
image_mean = IMAGENET_STANDARD_STD
|
||||
image_std = IMAGENET_STANDARD_MEAN
|
||||
valid_kwargs = Lfm2VlFastImageProcessorKwargs
|
||||
valid_kwargs = Lfm2VlImageProcessorKwargs
|
||||
model_input_names = ["pixel_values", "pixel_attention_mask", "spatial_shapes"]
|
||||
|
||||
def __init__(self, **kwargs: Unpack[Lfm2VlFastImageProcessorKwargs]):
|
||||
def __init__(self, **kwargs: Unpack[Lfm2VlImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
max_thumbnail_image_patches = self.max_image_tokens * self.downsample_factor**2
|
||||
|
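Because Lfm2-VL options such as `downsample_factor` now live on an `ImagesKwargs` subclass, they are ordinary typed kwargs that can be overridden at load time or per `preprocess` call. A hedged sketch; the path below is a placeholder, not a real repo id:

```python
from transformers import AutoImageProcessor

# "path/to/lfm2-vl-checkpoint" is a placeholder for an actual Lfm2-VL checkpoint.
processor = AutoImageProcessor.from_pretrained("path/to/lfm2-vl-checkpoint", downsample_factor=2)
print(processor.downsample_factor)
```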
@@ -25,12 +25,11 @@ from torchvision.transforms.v2 import functional as F
 from ...image_processing_utils import BatchFeature
 from ...image_processing_utils_fast import (
     BaseImageProcessorFast,
-    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
 from ...image_utils import ImageInput, PILImageResampling, SizeDict
-from ...processing_utils import Unpack
+from ...processing_utils import ImagesKwargs, Unpack
 from ...utils import (
     TensorType,
     auto_docstring,
@@ -309,8 +308,8 @@ def get_best_fit(
     return optimal_canvas


-class Llama4ImageProcessorKwargs(DefaultFastImageProcessorKwargs):
-    """
+class Llama4ImageProcessorKwargs(ImagesKwargs):
+    r"""
     max_patches (`int`, *optional*, defaults to 16):
         The maximum number of patches to be extracted from the image.
         Can be overridden by the `max_patches` parameter in the `preprocess` method.
@@ -16,20 +16,14 @@

 from typing import Optional, Union

-from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput

 from ...image_processing_utils import BatchFeature
 from ...image_utils import ImageInput, make_flat_list_of_images


-class Llama4ImagesKwargs(ImagesKwargs, total=False):
-    max_patches: Optional[int]
-    resize_to_max_canvas: Optional[bool]
-
-
 class Llama4ProcessorKwargs(ProcessingKwargs, total=False):
-    images_kwargs: Llama4ImagesKwargs
     _defaults = {
         "text_kwargs": {
             "padding_side": "left",
@@ -139,8 +133,6 @@ class Llama4Processor(ProcessorMixin):
         self,
         images: Optional[ImageInput] = None,
         text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
-        audio=None,
-        videos=None,
         **kwargs: Unpack[Llama4ProcessorKwargs],
     ) -> BatchFeature:
         """
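With `Llama4ImagesKwargs` removed, image-specific options such as `max_patches` and `resize_to_max_canvas` are declared once on `Llama4ImageProcessorKwargs` (see the image-processor hunk above), and the processor presumably resolves them from the image processor's `valid_kwargs` rather than from a second TypedDict. A rough call sketch; the checkpoint name is illustrative and the image is a stand-in:

```python
from PIL import Image
from transformers import AutoProcessor

# Any Llama 4 multimodal checkpoint should work the same way.
processor = AutoProcessor.from_pretrained("meta-llama/Llama-4-Scout-17B-16E-Instruct")

image = Image.new("RGB", (336, 336))
inputs = processor(
    images=image,
    text="Describe the image.",
    max_patches=16,        # forwarded to the image processor's typed kwargs
    return_tensors="pt",
)
```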
Some files were not shown because too many files have changed in this diff.