mirror of https://github.com/huggingface/transformers.git
synced 2025-10-20 17:13:56 +08:00
[processors] Unbloating simple processors (#40377)
* modularize processor - step 1
* typos
* why raise error, super call check it also
* tiny update
* fix copies
* fix style and test
* lost an import / fix copies
* fix tests
* oops deleted accidentally
committed by GitHub
parent c52889bd51
commit 08edec9f7d
@@ -16,11 +16,7 @@
Image/Text processor class for ALIGN
"""

from typing import Optional, Union

from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin


class AlignProcessorKwargs(ProcessingKwargs, total=False):

@@ -66,74 +62,10 @@ class AlignProcessor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "EfficientNetImageProcessor"
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
    valid_processor_kwargs = AlignProcessorKwargs

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[AlignProcessorKwargs],
    ) -> BatchEncoding:
        """
        Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
        arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` arguments to
        EfficientNetImageProcessor's [`~EfficientNetImageProcessor.__call__`] if `images` is not `None`. Please refer
        to the docstring of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `list[str]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """
        if text is None and images is None:
            raise ValueError("You must specify either text or images.")

        output_kwargs = self._merge_kwargs(
            AlignProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        # then, we can pass correct kwargs to each processor
        if text is not None:
            encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])

        if images is not None:
            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])

        # BC for explicit return_tensors
        if "return_tensors" in output_kwargs["common_kwargs"]:
            return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None)

        if text is not None and images is not None:
            encoding["pixel_values"] = image_features.pixel_values
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)


__all__ = ["AlignProcessor"]
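For context, not part of this diff: caller-facing behavior is unchanged by the removal of the bespoke `__call__`. A minimal smoke test of the slimmed-down processor, assuming the public `kakaobrain/align-base` checkpoint and a dummy image:

import numpy as np
from transformers import AlignProcessor

processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
image = np.zeros((224, 224, 3), dtype=np.uint8)  # dummy RGB image
inputs = processor(images=image, text="a photo of a cat", padding=True, return_tensors="pt")
# Expect input_ids / attention_mask from the tokenizer and pixel_values from
# the image processor, the same fields the removed __call__ produced.
print(sorted(inputs.keys()))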
@@ -16,18 +16,10 @@
Image/Text processor class for AltCLIP
"""

from typing import Optional, Union

from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
from ...processing_utils import ProcessorMixin
from ...utils.deprecation import deprecate_kwarg


class AltClipProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {}


class AltCLIPProcessor(ProcessorMixin):
    r"""
    Constructs a AltCLIP processor which wraps a CLIP image processor and a XLM-Roberta tokenizer into a single

@@ -49,80 +41,7 @@ class AltCLIPProcessor(ProcessorMixin):

    @deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor")
    def __init__(self, image_processor=None, tokenizer=None):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[AltClipProcessorKwargs],
    ) -> BatchEncoding:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not
        `None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:

            images (`ImageInput`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """

        if text is None and images is None:
            raise ValueError("You must specify either text or images.")

        if text is None and images is None:
            raise ValueError("You must specify either text or images.")
        output_kwargs = self._merge_kwargs(
            AltClipProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if text is not None:
            encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
        if images is not None:
            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])

        # BC for explicit return_tensors
        if "return_tensors" in output_kwargs["common_kwargs"]:
            return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None)

        if text is not None and images is not None:
            encoding["pixel_values"] = image_features.pixel_values
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)


__all__ = ["AltCLIPProcessor"]
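The `deprecate_kwarg` decorator imported above renames a keyword on the fly and warns. A simplified stand-in for the idea (not the actual `transformers.utils.deprecation` implementation):

import functools
import warnings

def deprecate_kwarg(old_name, version, new_name):
    # Simplified sketch: remap `old_name` to `new_name` with a FutureWarning.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if old_name in kwargs:
                warnings.warn(
                    f"`{old_name}` is deprecated and will be removed in {version}; use `{new_name}` instead.",
                    FutureWarning,
                )
                kwargs.setdefault(new_name, kwargs.pop(old_name))
            return func(*args, **kwargs)
        return wrapper
    return decorator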
@@ -16,10 +16,7 @@
Processor class for BridgeTower.
"""

from typing import Union

from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin


class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False):

@@ -60,35 +57,10 @@ class BridgeTowerProcessor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "BridgeTowerImageProcessor"
    tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast")
    valid_processor_kwargs = BridgeTowerProcessorKwargs

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        images,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[BridgeTowerProcessorKwargs],
    ) -> BatchEncoding:
        """
        This method uses [`BridgeTowerImageProcessor.__call__`] method to prepare image(s) for the model, and
        [`RobertaTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        """
        output_kwargs = self._merge_kwargs(
            BridgeTowerProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
        # add pixel_values + pixel_mask
        encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"])
        encoding.update(encoding_image_processor)

        return encoding


__all__ = ["BridgeTowerProcessor"]
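A rough mental model of the `_merge_kwargs` call above (illustrative only; the real helper in `processing_utils` validates keys against the schema and handles audio/video groups too):

TEXT_KEYS = {"padding", "truncation", "max_length"}  # illustrative subset
IMAGE_KEYS = {"do_resize", "size", "do_normalize"}   # illustrative subset

def merge_kwargs_sketch(defaults: dict, **user_kwargs) -> dict:
    # Start from the schema's declared defaults (the `_defaults` dict on a
    # ProcessingKwargs subclass), then route flat user kwargs into groups.
    merged = {
        "text_kwargs": dict(defaults.get("text_kwargs", {})),
        "images_kwargs": dict(defaults.get("images_kwargs", {})),
        "common_kwargs": {},
    }
    for key, value in user_kwargs.items():
        if key in TEXT_KEYS:
            merged["text_kwargs"][key] = value
        elif key in IMAGE_KEYS:
            merged["images_kwargs"][key] = value
        else:
            merged["common_kwargs"][key] = value  # e.g. return_tensors
    return merged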
@@ -16,11 +16,22 @@
Processor class for Bros.
"""

from typing import Optional, Union
from ...processing_utils import ProcessingKwargs, ProcessorMixin

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType

class BrosProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {
            "add_special_tokens": True,
            "padding": False,
            "stride": 0,
            "return_overflowing_tokens": False,
            "return_special_tokens_mask": False,
            "return_offsets_mapping": False,
            "return_length": False,
            "verbose": True,
        },
    }


class BrosProcessor(ProcessorMixin):

@@ -37,6 +48,7 @@ class BrosProcessor(ProcessorMixin):

    attributes = ["tokenizer"]
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
    valid_processor_kwargs = BrosProcessorKwargs

    def __init__(self, tokenizer=None, **kwargs):
        if tokenizer is None:

@@ -44,50 +56,5 @@ class BrosProcessor(ProcessorMixin):

        super().__init__(tokenizer)

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchEncoding:
        """
        This method uses [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        """
        encoding = self.tokenizer(
            text=text,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            return_tensors=return_tensors,
            **kwargs,
        )

        return encoding


__all__ = ["BrosProcessor"]
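The long explicit signature removed above is exactly what the `_defaults` block added at the top of the file replaces: defaults become data instead of parameters. A toy illustration of the trade (hypothetical helper names):

DEFAULTS = {"add_special_tokens": True, "padding": False, "stride": 0}

def call_before(tokenizer, text, add_special_tokens=True, padding=False, stride=0):
    # Before: every default is a parameter that must be threaded through.
    return tokenizer(text=text, add_special_tokens=add_special_tokens, padding=padding, stride=stride)

def call_after(tokenizer, text, **overrides):
    # After: defaults are declared once and merged with user overrides.
    return tokenizer(text=text, **{**DEFAULTS, **overrides})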
@@ -99,7 +99,7 @@ class ChameleonProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.
@@ -17,15 +17,8 @@ Image/Text processor class for Chinese-CLIP
"""

import warnings
from typing import Optional, Union

from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput


class ChineseClipProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {}
from ...processing_utils import ProcessorMixin


class ChineseCLIPProcessor(ProcessorMixin):

@@ -58,79 +51,10 @@ class ChineseCLIPProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        images: Optional[ImageInput] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[ChineseClipProcessorKwargs],
    ) -> BatchEncoding:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """

        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")
        output_kwargs = self._merge_kwargs(
            ChineseClipProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if text is not None:
            encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
        if images is not None:
            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])

        # BC for explicit return_tensors
        if "return_tensors" in output_kwargs["common_kwargs"]:
            return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None)

        if text is not None and images is not None:
            encoding["pixel_values"] = image_features.pixel_values
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)

    @property
    def feature_extractor_class(self):
        warnings.warn(
@@ -16,8 +16,16 @@
Audio/Text processor class for CLAP
"""

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from typing import Optional, Union

from ...audio_utils import AudioInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import logging
from ...utils.deprecation import deprecate_kwarg


logger = logging.get_logger(__name__)


class ClapProcessor(ProcessorMixin):

@@ -40,61 +48,28 @@ class ClapProcessor(ProcessorMixin):
    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)

    def __call__(self, text=None, audios=None, return_tensors=None, **kwargs):
    @deprecate_kwarg("audios", version="v4.59.0", new_name="audio")
    def __call__(
        self,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
        audios: Optional[AudioInput] = None,
        audio: Optional[AudioInput] = None,
        **kwargs: Unpack[ProcessingKwargs],
    ):
        """
        Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text`
        and `kwargs` arguments to RobertaTokenizerFast's [`~RobertaTokenizerFast.__call__`] if `text` is not `None` to
        encode the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to
        ClapFeatureExtractor's [`~ClapFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the
        docstring of the above two methods for more information.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            audios (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case
                of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels,
                and T the sample length of the audio.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **audio_features** -- Audio features to be fed to a model. Returned when `audios` is not `None`.
        Forwards the `audio` and `sampling_rate` arguments to [`~ClapFeatureExtractor.__call__`] and the `text`
        argument to [`~RobertaTokenizerFast.__call__`]. Please refer to the docstring of the above two methods for more
        information.
        """
        sampling_rate = kwargs.pop("sampling_rate", None)

        if text is None and audios is None:
            raise ValueError("You have to specify either text or audios. Both cannot be none.")

        if text is not None:
            encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)

        if audios is not None:
            audio_features = self.feature_extractor(
                audios, sampling_rate=sampling_rate, return_tensors=return_tensors, **kwargs
        # The `deprecate_kwarg` will not work if the inputs are passed as arguments, so we check
        # again that the correct naming is used
        if audios is not None and audio is None:
            logger.warning(
                "Using `audios` keyword argument is deprecated when calling ClapProcessor, instead use `audio`."
            )
            audio = audios

        if text is not None and audios is not None:
            encoding.update(audio_features)
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**audio_features), tensor_type=return_tensors)
        return super().__call__(text=text, audio=audio, **kwargs)


__all__ = ["ClapProcessor"]
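Caller-facing sketch of the new shim, assuming the public `laion/clap-htsat-unfused` checkpoint and a dummy waveform; `audios=` still works but triggers the warning above:

import numpy as np
from transformers import ClapProcessor

processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")
waveform = np.zeros(48_000, dtype=np.float32)  # dummy 1-second clip at 48 kHz
inputs = processor(text="a dog barking", audio=waveform, sampling_rate=48_000, return_tensors="pt")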
@@ -19,7 +19,6 @@ Image/Text processor class for CLIP
import warnings

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding


class CLIPProcessor(ProcessorMixin):

@@ -51,71 +50,9 @@ class CLIPProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """
        tokenizer_kwargs, image_processor_kwargs = {}, {}
        if kwargs:
            tokenizer_kwargs = {k: v for k, v in kwargs.items() if k not in self.image_processor._valid_processor_keys}
            image_processor_kwargs = {
                k: v for k, v in kwargs.items() if k in self.image_processor._valid_processor_keys
            }

        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        if text is not None:
            encoding = self.tokenizer(text, return_tensors=return_tensors, **tokenizer_kwargs)

        if images is not None:
            image_features = self.image_processor(images, return_tensors=return_tensors, **image_processor_kwargs)

        if text is not None and images is not None:
            encoding["pixel_values"] = image_features.pixel_values
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)

    @property
    def feature_extractor_class(self):
        warnings.warn(
@@ -51,10 +51,6 @@ class CLIPSegProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

@@ -62,7 +58,7 @@ class CLIPSegProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        ViTImageProcessor's [`~ViTImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of
        the above two methods for more information.
@@ -18,6 +18,10 @@ Processor class for CLVP
"""

from ...processing_utils import ProcessorMixin
from ...utils import logging


logger = logging.get_logger(__name__)


class ClvpProcessor(ProcessorMixin):

@@ -36,11 +40,6 @@ class ClvpProcessor(ProcessorMixin):

    feature_extractor_class = "ClvpFeatureExtractor"
    tokenizer_class = "ClvpTokenizer"
    model_input_names = [
        "input_ids",
        "input_features",
        "attention_mask",
    ]

    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)

@@ -51,27 +50,13 @@ class ClvpProcessor(ProcessorMixin):
        argument to [`~ClvpTokenizer.__call__`]. Please refer to the docstring of the above two methods for more
        information.
        """

        raw_speech = kwargs.pop("raw_speech", None)
        sampling_rate = kwargs.pop("sampling_rate", None)
        text = kwargs.pop("text", None)

        if raw_speech is None and text is None:
            raise ValueError("You need to specify either an `raw_speech` or `text` input to process.")

        if raw_speech is not None:
            inputs = self.feature_extractor(raw_speech, sampling_rate=sampling_rate, **kwargs)
        if text is not None:
            encodings = self.tokenizer(text, **kwargs)

        if text is None:
            return inputs
        elif raw_speech is None:
            return encodings
        else:
            inputs["input_ids"] = encodings["input_ids"]
            inputs["attention_mask"] = encodings["attention_mask"]
            return inputs
            logger.warning(
                "Using `raw_speech` keyword argument is deprecated when calling ClvpProcessor, instead use `audio`."
            )
            kwargs["audio"] = raw_speech
        return super().__call__(*args, **kwargs)


__all__ = ["ClvpProcessor"]
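Same shim as CLAP, here for the legacy `raw_speech` name. An illustrative call (checkpoint name and sampling rate assumed from the CLVP docs, not part of this diff):

from transformers import ClvpProcessor

processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
# processor(raw_speech=waveform, sampling_rate=22050)  # deprecated spelling, warns
# processor(audio=waveform, sampling_rate=22050)       # preferred spelling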
@@ -106,10 +106,6 @@ class ColPaliProcessor(ProcessorMixin):
        query_prefix: str = "Question: ",
    ):
        super().__init__(image_processor, tokenizer, chat_template=chat_template)
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        if not hasattr(image_processor, "image_seq_length"):
            raise ValueError("Image processor is missing an `image_seq_length` attribute.")
@@ -249,7 +249,7 @@ class DeepseekVLProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
        of the above two methods for more information.

@@ -78,7 +78,7 @@ class DeepseekVLProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
        of the above two methods for more information.
@@ -926,7 +926,7 @@ class DeepseekVLHybridProcessor(DeepseekVLProcessor):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        DeepseekVLHybridImageProcessor's [`~DeepseekVLHybridImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
        of the above two methods for more information.

@@ -78,7 +78,7 @@ class DeepseekVLHybridProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        DeepseekVLHybridImageProcessor's [`~DeepseekVLHybridImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
        of the above two methods for more information.
@@ -65,10 +65,6 @@ class DonutProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor
@@ -102,7 +102,7 @@ class Emu3Processor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Emu3TokenizerFast's [`~Emu3TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.
@@ -17,12 +17,39 @@ Image/Text processor class for FLAVA
"""

import warnings
from collections.abc import Iterable
from typing import Optional, Union

from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin


class FlavaImagesKwargs(ImagesKwargs):
    # Mask related params
    return_image_mask: Optional[bool]
    input_size_patches: Optional[int]
    total_mask_patches: Optional[int]
    mask_group_min_patches: Optional[int]
    mask_group_max_patches: Optional[int]
    mask_group_min_aspect_ratio: Optional[float]
    mask_group_max_aspect_ratio: Optional[float]
    # Codebook related params
    return_codebook_pixels: Optional[bool]
    codebook_do_resize: Optional[bool]
    codebook_size: Optional[bool]
    codebook_resample: Optional[int]
    codebook_do_center_crop: Optional[bool]
    codebook_crop_size: Optional[int]
    codebook_do_rescale: Optional[bool]
    codebook_rescale_factor: Optional[Union[int, float]]
    codebook_do_map_pixels: Optional[bool]
    codebook_do_normalize: Optional[bool]
    codebook_image_mean: Optional[Union[float, Iterable[float]]]
    codebook_image_std: Optional[Union[float, Iterable[float]]]


class FlavaProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: FlavaImagesKwargs
    _defaults = {}


class FlavaProcessor(ProcessorMixin):

@@ -40,6 +67,7 @@ class FlavaProcessor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "FlavaImageProcessor"
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
    valid_processor_kwargs = FlavaProcessorKwargs

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        feature_extractor = None

@@ -52,82 +80,9 @@ class FlavaProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_image_mask: Optional[bool] = None,
        return_codebook_pixels: Optional[bool] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ):
        """
        This method uses [`FlavaImageProcessor.__call__`] method to prepare image(s) for the model, and
        [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        """

        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        if text is not None:
            encoding = self.tokenizer(
                text=text,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=pad_to_multiple_of,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
                return_tensors=return_tensors,
                **kwargs,
            )
        if images is not None:
            image_features = self.image_processor(
                images,
                return_image_mask=return_image_mask,
                return_codebook_pixels=return_codebook_pixels,
                return_tensors=return_tensors,
                **kwargs,
            )

        if text is not None and images is not None:
            encoding.update(image_features)
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)

    @property
    def feature_extractor_class(self):
        warnings.warn(
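With `FlavaImagesKwargs` in the schema, the FLAVA-only flags removed from the signature above still flow through the shared call path as plain kwargs. Illustrative usage, assuming the public `facebook/flava-full` checkpoint and a dummy image:

import numpy as np
from transformers import FlavaProcessor

processor = FlavaProcessor.from_pretrained("facebook/flava-full")
image = np.zeros((224, 224, 3), dtype=np.uint8)
inputs = processor(images=image, text="a caption", return_image_mask=True,
                   return_codebook_pixels=True, return_tensors="pt")
print(sorted(inputs.keys()))  # tokenizer fields plus the mask/codebook tensors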
@@ -357,7 +357,7 @@ class Florence2Processor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -158,7 +158,7 @@ class Florence2Processor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.
@@ -16,20 +16,7 @@
Image/Text processor class for GIT
"""

from typing import Optional, Union

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import logging


class GitProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {}


logger = logging.get_logger(__name__)
from ...processing_utils import ProcessorMixin


class GitProcessor(ProcessorMixin):

@@ -54,65 +41,5 @@ class GitProcessor(ProcessorMixin):
        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[GitProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """
        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        output_kwargs = self._merge_kwargs(
            GitProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        data = {}
        if text is not None:
            text_features = self.tokenizer(text, **output_kwargs["text_kwargs"])
            data.update(text_features)
        if images is not None:
            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
            data.update(image_features)

        return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors"))


__all__ = ["GitProcessor"]
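The removed GIT `__call__` already had the merged-dict shape the shared implementation produces; a toy equivalent of that final merge (hypothetical callables, not the transformers API):

def toy_git_call(tokenizer, image_processor, text=None, images=None):
    # Merge whichever modalities are present into one flat dict; the real
    # code wraps the result in BatchFeature(data, tensor_type=...).
    data = {}
    if text is not None:
        data.update(tokenizer(text))
    if images is not None:
        data.update(image_processor(images))
    return data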
@@ -144,7 +144,7 @@ class GotOcr2Processor(ProcessorMixin):
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text if `text`
        is not `None`, otherwise encode default OCR queries which depends on the `format`, `box`, `color`, `multi_page` and
        `crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwrags` arguments to
        `crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to
        GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`.

        Args:
@@ -20,7 +20,6 @@ import pathlib
import warnings
from typing import TYPE_CHECKING, Optional, Union

from ...image_processing_utils import BatchFeature
from ...image_transforms import center_to_corners_format
from ...image_utils import AnnotationFormat, ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack

@@ -144,6 +143,7 @@ class GroundingDinoProcessor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "GroundingDinoImageProcessor"
    tokenizer_class = "AutoTokenizer"
    valid_processor_kwargs = GroundingDinoProcessorKwargs

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)

@@ -152,8 +152,6 @@ class GroundingDinoProcessor(ProcessorMixin):
        self,
        images: Optional[ImageInput] = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[GroundingDinoProcessorKwargs],
    ) -> BatchEncoding:
        """

@@ -170,33 +168,9 @@ class GroundingDinoProcessor(ProcessorMixin):
            - A merged candidate labels string to be detected on the image, separated by "." (e.g. "a cat. a dog.").
            - A batch of merged candidate labels text to be detected on the batch of images (e.g. ["a cat. a dog.", "a car. a person."]).
        """
        if images is None and text is None:
            raise ValueError("You must specify either text or images.")

        output_kwargs = self._merge_kwargs(
            GroundingDinoProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        # Get only text
        if images is not None:
            encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"])
        else:
            encoding_image_processor = BatchFeature()

        if text is not None:
            text = self._preprocess_input_text(text)
            text_encoding = self.tokenizer(
                text=text,
                **output_kwargs["text_kwargs"],
            )
        else:
            text_encoding = BatchEncoding()

        text_encoding.update(encoding_image_processor)

        return text_encoding
        return super().__call__(images=images, text=text, **kwargs)

    def _preprocess_input_text(self, text):
        """
@@ -214,11 +214,6 @@ class IdeficsProcessor(ProcessorMixin):
    tokenizer_class = "LlamaTokenizerFast"

    def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor
        self.image_token_id = (
@@ -90,11 +90,6 @@ class Idefics2Processor(ProcessorMixin):
    def __init__(
        self, image_processor, tokenizer=None, image_seq_len: int = 64, chat_template: Optional[str] = None, **kwargs
    ):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        if not hasattr(tokenizer, "image_token"):
            self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True).content
            self.image_token = AddedToken("<image>", normalized=False, special=True).content
@@ -138,11 +138,6 @@ class Idefics3Processor(ProcessorMixin):
    def __init__(
        self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: Optional[str] = None, **kwargs
    ):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True).content
        self.image_token = AddedToken("<image>", normalized=False, special=True).content
        self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True).content
@@ -165,7 +165,7 @@ class InternVLProcessor(ProcessorMixin):
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text if `text`
        is not `None`, otherwise encode default OCR queries which depends on the `format`, `box`, `color`, `multi_page` and
        `crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwrags` arguments to
        `crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to
        GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`.

        Args:
@@ -88,7 +88,7 @@ class JanusProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        JanusImageProcessor's [`~JanusImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
        of the above two methods for more information.
@ -13,10 +13,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ...audio_utils import AudioInput, make_list_of_audio
|
||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin
|
||||
|
||||
|
||||
class KyutaiSpeechToTextProcessorKwargs(ProcessingKwargs, total=False):
|
||||
@ -38,53 +36,7 @@ class KyutaiSpeechToTextProcessor(ProcessorMixin):
|
||||
|
||||
feature_extractor_class = "KyutaiSpeechToTextFeatureExtractor"
|
||||
tokenizer_class = "PreTrainedTokenizerFast"
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
audio: Optional[AudioInput] = None,
|
||||
**kwargs: Unpack[KyutaiSpeechToTextProcessorKwargs],
|
||||
):
|
||||
r"""
|
||||
Main method to prepare audio to be fed as input to the model. This method forwards the `audio`
|
||||
arguments to KyutaiSpeechToTextFeatureExtractor's [`~KyutaiSpeechToTextFeatureExtractor.__call__`]. Please refer
|
||||
to the docstring of the above method for more information.
|
||||
|
||||
Args:
|
||||
audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
|
||||
The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
|
||||
tensor.
|
||||
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
||||
If set, will return tensors of a particular framework. Acceptable values are:
|
||||
- `'tf'`: Return TensorFlow `tf.constant` objects.
|
||||
- `'pt'`: Return PyTorch `torch.Tensor` objects.
|
||||
- `'np'`: Return NumPy `np.ndarray` objects.
|
||||
- `'jax'`: Return JAX `jnp.ndarray` objects.
|
||||
Returns:
|
||||
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
|
||||
|
||||
- **input_values** -- List of audio values to be fed to a model. Returned when `audio` is not `None`.
|
||||
- **padding_mask** -- List of indices specifying which input values should be ignored by the model.
|
||||
"""
|
||||
|
||||
if audio is None:
|
||||
raise ValueError("`audio` is required.")
|
||||
|
||||
output_kwargs = self._merge_kwargs(
|
||||
KyutaiSpeechToTextProcessorKwargs,
|
||||
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
audio_kwargs = output_kwargs["audio_kwargs"]
|
||||
|
||||
# ensure audio in correct format
|
||||
audio = make_list_of_audio(audio)
|
||||
|
||||
inputs = self.feature_extractor(
|
||||
audio,
|
||||
**audio_kwargs,
|
||||
)
|
||||
|
||||
return inputs
|
||||
valid_processor_kwargs = KyutaiSpeechToTextProcessorKwargs
|
||||
|
||||
|
||||
__all__ = ["KyutaiSpeechToTextProcessor"]
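With its hand-written `__call__` gone, `KyutaiSpeechToTextProcessor` inherits the generic `ProcessorMixin.__call__` (added further down in this diff), and `valid_processor_kwargs` tells the base class which kwargs schema to merge against. A minimal usage sketch; the checkpoint name and sampling rate are illustrative assumptions, not taken from this commit:

```python
import numpy as np
from transformers import KyutaiSpeechToTextProcessor

# Hypothetical checkpoint id; substitute a real Kyutai STT repo.
processor = KyutaiSpeechToTextProcessor.from_pretrained("kyutai/stt-2.6b-en")
audio = np.zeros(24_000, dtype=np.float32)  # 1 s of silence at an assumed 24 kHz

# The inherited __call__ merges kwargs against KyutaiSpeechToTextProcessorKwargs
# and routes `audio` to the feature extractor, as the deleted body did by hand.
inputs = processor(audio=audio, return_tensors="pt")
print(inputs.keys())  # expected: input_values and padding_mask, per the old docstring
```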
@@ -59,10 +59,6 @@ class LayoutLMv2Processor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

@@ -59,10 +59,6 @@ class LayoutLMv3Processor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

@@ -58,10 +58,6 @@ class LayoutXLMProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
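The `None` checks deleted here (and in the analogous hunks below for MgpStr, OneFormer, OwlViT, PaliGemma, TrOCR, ViLT, and others) are redundant because `ProcessorMixin.__init__` already validates the attributes it receives. A simplified sketch of that base-class behavior; the real implementation also resolves optional attributes and type-checks each argument:

```python
# Simplified sketch of the validation the base class performs, so subclasses
# no longer need their own `raise ValueError` guards.
class ProcessorMixinSketch:
    attributes = ["image_processor", "tokenizer"]

    def __init__(self, *args):
        if len(args) != len(self.attributes) or any(arg is None for arg in args):
            raise ValueError(f"You need to specify all of: {', '.join(self.attributes)}.")
        for attribute_name, arg in zip(self.attributes, args):
            setattr(self, attribute_name, arg)
```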
@@ -101,7 +101,7 @@ class LlavaProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -111,7 +111,7 @@ class LlavaNextProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -123,9 +123,9 @@ class LlavaNextVideoProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. To prepare the video(s),
        this method forwards the `videos` and `kwrags` arguments to LlavaNextVideoImageProcessor's
        this method forwards the `videos` and `kwargs` arguments to LlavaNextVideoImageProcessor's
        [`~LlavaNextVideoImageProcessor.__call__`] if `videos` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -121,7 +121,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -67,10 +67,6 @@ class MgpstrProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        self.char_tokenizer = tokenizer
        self.bpe_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

@@ -60,33 +60,9 @@ class MusicgenProcessor(ProcessorMixin):
        if self._in_target_context_manager:
            return self.current_processor(*args, **kwargs)

        audio = kwargs.pop("audio", None)
        sampling_rate = kwargs.pop("sampling_rate", None)
        text = kwargs.pop("text", None)
        if len(args) > 0:
            audio = args[0]
            args = args[1:]

        if audio is None and text is None:
            raise ValueError("You need to specify either an `audio` or `text` input to process.")

        if text is not None:
            inputs = self.tokenizer(text, **kwargs)

        if audio is not None:
            audio_inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)

        if audio is None:
            return inputs

        elif text is None:
            return audio_inputs

        else:
            inputs["input_values"] = audio_inputs["input_values"]
            if "padding_mask" in audio_inputs:
                inputs["padding_mask"] = audio_inputs["padding_mask"]
            return inputs
            kwargs["audio"] = args[0]
        return super().__call__(*args, **kwargs)

    def batch_decode(self, *args, **kwargs):
        """
@@ -51,49 +51,16 @@ class MusicgenMelodyProcessor(ProcessorMixin):
    def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
        return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps)

    def __call__(self, audio=None, text=None, **kwargs):
    def __call__(self, *args, **kwargs):
        """
        Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `audio`
        and `kwargs` arguments to MusicgenMelodyFeatureExtractor's [`~MusicgenMelodyFeatureExtractor.__call__`] if `audio` is not
        `None` to pre-process the audio. It also forwards the `text` and `kwargs` arguments to
        PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`. Please refer to the docstring of the above two methods for more information.

        Args:
            audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case
                of a NumPy array/PyTorch tensor, each audio should be a mono-stereo signal of shape (T), where T is the sample length of the audio.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            kwargs (*optional*):
                Remaining dictionary of keyword arguments that will be passed to the feature extractor and/or the
                tokenizer.
        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **input_features** -- Audio input features to be fed to a model. Returned when `audio` is not `None`.
            - **attention_mask** -- List of token indices specifying which tokens should be attended to by the model when `text` is not `None`.
            When only `audio` is specified, returns the timestamps attention mask.
        Forwards the `audio` argument to EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`] and the `text`
        argument to [`~T5Tokenizer.__call__`]. Please refer to the docstring of the above two methods for more
        information.
        """

        sampling_rate = kwargs.pop("sampling_rate", None)

        if audio is None and text is None:
            raise ValueError("You need to specify either an `audio` or `text` input to process.")

        if text is not None:
            inputs = self.tokenizer(text, **kwargs)
        if audio is not None:
            audio_inputs = self.feature_extractor(audio, sampling_rate=sampling_rate, **kwargs)

        if text is None:
            return audio_inputs
        elif audio is None:
            return inputs
        else:
            inputs["input_features"] = audio_inputs["input_features"]
            return inputs
        if len(args) > 0:
            kwargs["audio"] = args[0]
        return super().__call__(*args, **kwargs)

    # Copied from transformers.models.musicgen.processing_musicgen.MusicgenProcessor.batch_decode with padding_mask->attention_mask
    def batch_decode(self, *args, **kwargs):

@@ -48,11 +48,6 @@ class OneFormerProcessor(ProcessorMixin):
    def __init__(
        self, image_processor=None, tokenizer=None, max_seq_length: int = 77, task_seq_length: int = 77, **kwargs
    ):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        self.max_seq_length = max_seq_length
        self.task_seq_length = task_seq_length

@@ -85,7 +85,7 @@ class Ovis2Processor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        Ovis2ImageProcessor's [`~Ovis2ImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -86,7 +86,7 @@ class Owlv2Processor(ProcessorMixin):
        """
        Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
        `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -82,10 +82,6 @@ class OwlViTProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

@@ -100,7 +96,7 @@ class OwlViTProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
        `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -125,10 +125,6 @@ class PaliGemmaProcessor(ProcessorMixin):
        chat_template=None,
        **kwargs,
    ):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        if not hasattr(image_processor, "image_seq_length"):
            raise ValueError("Image processor is missing an `image_seq_length` attribute.")

@@ -161,7 +157,7 @@ class PaliGemmaProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to GemmaTokenizerFast's [`~GemmaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -86,7 +86,7 @@ class Phi4MultimodalProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forards the `text`
        and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        Phi4MultimodalImageProcessorFast's [`~Phi4MultimodalImageProcessorFast.__call__`] if `images` is not `None`. Please refer to the doctsring
        of the above two methods for more information.

@@ -127,7 +127,7 @@ class PixtralProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -893,7 +893,7 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

        Args:

@@ -105,7 +105,7 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

        Args:

@@ -87,7 +87,7 @@ class Qwen2AudioProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to
        the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
        WhisperFeatureExtractor's [`~WhisperFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -100,7 +100,7 @@ class Qwen2VLProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

        Args:

@@ -16,7 +16,26 @@
Audio/Text processor class for SeamlessM4T
"""

from ...processing_utils import ProcessorMixin
from typing import Optional, Union

from ...audio_utils import AudioInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import logging
from ...utils.deprecation import deprecate_kwarg


logger = logging.get_logger(__name__)


class SeamlessM4TTextKwargs(TextKwargs):
    src_lang: Optional[str]
    tgt_lang: Optional[str]


class SeamlessM4TProcessorKwargs(ProcessingKwargs, total=False):
    text_kwargs: SeamlessM4TTextKwargs
    _defaults = {}


class SeamlessM4TProcessor(ProcessorMixin):

@@ -37,15 +56,23 @@ class SeamlessM4TProcessor(ProcessorMixin):

    feature_extractor_class = "SeamlessM4TFeatureExtractor"
    tokenizer_class = ("SeamlessM4TTokenizer", "SeamlessM4TTokenizerFast")
    valid_processor_kwargs = SeamlessM4TProcessorKwargs

    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)

    def __call__(self, text=None, audios=None, src_lang=None, tgt_lang=None, **kwargs):
    @deprecate_kwarg("audios", version="v4.59.0", new_name="audio")
    def __call__(
        self,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
        audios: Optional[AudioInput] = None,
        audio: Optional[AudioInput] = None,
        **kwargs: Unpack[ProcessingKwargs],
    ):
        """
        Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text`
        and `kwargs` arguments to SeamlessM4TTokenizerFast's [`~SeamlessM4TTokenizerFast.__call__`] if `text` is not
        `None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to
        `None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
        SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.__call__`] if `audios` is not `None`. Please refer
        to the docstring of the above two methods for more information.

@@ -58,14 +85,6 @@ class SeamlessM4TProcessor(ProcessorMixin):
                The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case
                of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels,
                and T the sample length of the audio.
            src_lang (`str`, *optional*):
                The language code of the input texts/audios. If not specified, the last `src_lang` specified will be
                used.
            tgt_lang (`str`, *optional*):
                The code of the target language. If not specified, the last `tgt_lang` specified will be used.
            kwargs (*optional*):
                Remaining dictionary of keyword arguments that will be passed to the feature extractor and/or the
                tokenizer.
        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

@@ -75,26 +94,16 @@ class SeamlessM4TProcessor(ProcessorMixin):
            `None`).
            - **input_features** -- Audio input features to be fed to a model. Returned when `audios` is not `None`.
        """
        sampling_rate = kwargs.pop("sampling_rate", None)

        if text is None and audios is None:
            raise ValueError("You have to specify either text or audios. Both cannot be none.")
        elif text is not None and audios is not None:
        if text is not None and audios is not None:
            raise ValueError(
                "Text and audios are mututally exclusive when passed to `SeamlessM4T`. Specify one or another."
            )
        elif text is not None:
            if tgt_lang is not None:
                self.tokenizer.tgt_lang = tgt_lang
            if src_lang is not None:
                self.tokenizer.src_lang = src_lang
            encoding = self.tokenizer(text, **kwargs)

            return encoding

        else:
            encoding = self.feature_extractor(audios, sampling_rate=sampling_rate, **kwargs)
            return encoding
        if audio is None and audios is not None:
            logger.warning(
                "Passing `audios` as keyword argument is deprecated and will be removed in v4.63, please pass `audio` instead."
            )
            audio = audios
        return super().__call__(text=text, audio=audio, **kwargs)


__all__ = ["SeamlessM4TProcessor"]
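The `audios` keyword keeps working during a deprecation window: the `@deprecate_kwarg` decorator handles the rename, and the explicit warning above covers direct `audios=` callers. A minimal sketch of the same shim using only the standard library; names and the warning text are illustrative:

```python
import warnings


def seamless_call_sketch(text=None, audio=None, audios=None):
    # Migration shim: accept the deprecated `audios` alias but steer callers
    # toward `audio`, mirroring the fallback in the rewritten __call__ above.
    if audio is None and audios is not None:
        warnings.warn("`audios` is deprecated, please pass `audio` instead.", FutureWarning)
        audio = audios
    if text is not None and audio is not None:
        raise ValueError("Text and audio are mutually exclusive for SeamlessM4T.")
    return {"text": text, "audio": audio}
```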
@@ -16,13 +16,7 @@
Image/Text processor class for SigLIP.
"""

from typing import Optional, Union

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType


class SiglipProcessor(ProcessorMixin):

@@ -46,79 +40,5 @@ class SiglipProcessor(ProcessorMixin):
    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        images: Optional[ImageInput] = None,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to SiglipTokenizer's [`~SiglipTokenizer.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` argument to
        SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:
                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`, *optional*):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """

        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        if text is not None:
            encoding = self.tokenizer(
                text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
            )

        if images is not None:
            image_features = self.image_processor(images, return_tensors=return_tensors)

        if text is not None and images is not None:
            encoding.update(image_features)
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchFeature(data=dict(**image_features), tensor_type=return_tensors)


__all__ = ["SiglipProcessor"]
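One subtlety of deleting this `__call__`: the old SigLIP signature defaulted `return_tensors` to `TensorType.PYTORCH`, while the generic path takes its defaults from the merged kwargs. Unless that default is preserved elsewhere (e.g. in a `_defaults` entry not shown in this diff), callers relying on the implicit PyTorch tensors may want to pass it explicitly, as in this illustrative call:

```python
import numpy as np
from PIL import Image
from transformers import SiglipProcessor

processor = SiglipProcessor.from_pretrained("google/siglip-base-patch16-224")
image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))

# Pass return_tensors explicitly rather than relying on the removed default.
inputs = processor(text=["a photo"], images=image, padding="max_length", return_tensors="pt")
print(sorted(inputs.keys()))  # typically ['input_ids', 'pixel_values']
```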
@@ -16,12 +16,9 @@
Image/Text processor class for SigLIP2.
"""

from typing import Optional, Union
from typing import Optional

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin


class Siglip2ImagesKwargs(ImagesKwargs, total=False):

@@ -63,89 +60,10 @@ class Siglip2Processor(ProcessorMixin):

    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"
    valid_processor_kwargs = Siglip2ProcessorKwargs

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        images: Optional[Union[ImageInput, list[ImageInput], list[list[ImageInput]]]] = None,
        text: Optional[Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[Siglip2ProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to GemmaTokenizerFast's [`~GemmaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` argument to
        Siglip2ImageProcessor's [`~Siglip2ImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `max_length`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*, defaults to 64):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`, *optional*, defaults to `True`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'pt'`):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_attention_mask** -- Attention mask for the pixel values. Returned when `images` is not `None`.
            - **spatial_shapes** -- The number of horizontal and vertical patches per image.
              Returned when `images` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            Siglip2ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        if text is not None:
            encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])

        if images is not None:
            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])

        if text is not None and images is not None:
            encoding.update(image_features)
            return encoding
        elif text is not None:
            return encoding
        else:
            return_tensors = output_kwargs["common_kwargs"]["return_tensors"]
            return BatchFeature(data=dict(**image_features), tensor_type=return_tensors)


__all__ = ["Siglip2Processor"]
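The deleted body above also documents the shape `_merge_kwargs` produces: one dict per modality plus `common_kwargs`, with `return_tensors` living in the common bucket. An illustrative snapshot (keys per the deleted code; the exact contents depend on the kwargs schema and its `_defaults`):

```python
# Illustrative only: what a merged kwargs dict may look like for SigLIP2.
output_kwargs = {
    "text_kwargs": {"padding": "max_length", "max_length": 64, "truncation": True},
    "images_kwargs": {"return_tensors": "pt"},
    "videos_kwargs": {},
    "audio_kwargs": {},
    "common_kwargs": {"return_tensors": "pt"},
}
# The image-only branch read the tensor type from the common bucket:
tensor_type = output_kwargs["common_kwargs"]["return_tensors"]
assert tensor_type == "pt"
```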
@@ -60,10 +60,6 @@ class TrOCRProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

@@ -16,8 +16,18 @@
Processor class for TVP.
"""

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from ...processing_utils import ProcessingKwargs, ProcessorMixin


class TvpProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {
            "truncation": True,
            "padding": "max_length",
            "pad_to_max_length": True,
            "return_token_type_ids": False,
        },
    }


class TvpProcessor(ProcessorMixin):

@@ -39,74 +49,8 @@ class TvpProcessor(ProcessorMixin):
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    def __call__(self, text=None, videos=None, return_tensors=None, **kwargs):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
        TvpImageProcessor's [`~TvpImageProcessor.__call__`] if `videos` is not `None`. Please refer to the docstring of
        the above two methods for more information.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            videos (`list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, `list[list[PIL.Image.Image]]`, `list[list[np.ndarray]]`,:
                `list[list[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list
                of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors,
                each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of
                channels.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `videos` is not `None`.
        """

        max_text_length = kwargs.pop("max_text_length", None)

        if text is None and videos is None:
            raise ValueError("You have to specify either text or videos. Both cannot be none.")

        encoding = {}
        if text is not None:
            textual_input = self.tokenizer.batch_encode_plus(
                text,
                truncation=True,
                padding="max_length",
                max_length=max_text_length,
                pad_to_max_length=True,
                return_tensors=return_tensors,
                return_token_type_ids=False,
                **kwargs,
            )
            encoding.update(textual_input)

        if videos is not None:
            image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs)
            encoding.update(image_features)

        return BatchEncoding(data=encoding, tensor_type=return_tensors)
        self.video_processor = image_processor

    def post_process_video_grounding(self, logits, video_durations):
        """

@@ -100,7 +100,7 @@ class VideoLlavaProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        VideoLlavaImageProcessor's [`~VideoLlavaImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -17,11 +17,23 @@ Processor class for ViLT.
"""

import warnings
from typing import Optional, Union

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ...processing_utils import ProcessingKwargs, ProcessorMixin


class ViltProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {
            "add_special_tokens": True,
            "padding": False,
            "stride": 0,
            "return_overflowing_tokens": False,
            "return_special_tokens_mask": False,
            "return_offsets_mapping": False,
            "return_length": False,
            "verbose": True,
        },
    }


class ViltProcessor(ProcessorMixin):

@@ -41,6 +53,7 @@ class ViltProcessor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "ViltImageProcessor"
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
    valid_processor_kwargs = ViltProcessorKwargs

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        feature_extractor = None

@@ -53,64 +66,9 @@ class ViltProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

    def __call__(
        self,
        images,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchEncoding:
        """
        This method uses [`ViltImageProcessor.__call__`] method to prepare image(s) for the model, and
        [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        """
        encoding = self.tokenizer(
            text=text,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            return_tensors=return_tensors,
            **kwargs,
        )
        # add pixel_values + pixel_mask
        encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)
        encoding.update(encoding_image_processor)

        return encoding

    @property
    def feature_extractor_class(self):
        warnings.warn(

@@ -17,11 +17,8 @@ Processor class for VisionTextDualEncoder
"""

import warnings
from typing import Optional, Union

from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin


class VisionTextDualEncoderProcessorKwargs(ProcessingKwargs, total=False):

@@ -59,82 +56,9 @@ class VisionTextDualEncoderProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You have to specify an image_processor.")
        if tokenizer is None:
            raise ValueError("You have to specify a tokenizer.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[VisionTextDualEncoderProcessorKwargs],
    ) -> BatchEncoding:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to VisionTextDualEncoderTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not
        `None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """

        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        output_kwargs = self._merge_kwargs(
            VisionTextDualEncoderProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if text is not None:
            encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])

        if images is not None:
            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])

        if text is not None and images is not None:
            encoding["pixel_values"] = image_features.pixel_values
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(
                data=dict(**image_features),
                tensor_type=output_kwargs["common_kwargs"].get("return_tensors"),
            )

    @property
    def feature_extractor_class(self):
        warnings.warn(

@@ -19,7 +19,6 @@ Image/Text processor class for XCLIP
import warnings

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding


class XCLIPProcessor(ProcessorMixin):

@@ -51,68 +50,10 @@ class XCLIPProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.video_processor = self.image_processor
        self.current_processor = self.image_processor

    def __call__(self, text=None, videos=None, return_tensors=None, **kwargs):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
        VideoMAEImageProcessor's [`~VideoMAEImageProcessor.__call__`] if `videos` is not `None`. Please refer to the
        docstring of the above two methods for more information.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            videos (`list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, `list[list[PIL.Image.Image]]`, `list[list[np.ndarray]]`,:
                `list[list[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list
                of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors,
                each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of
                channels.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `videos` is not `None`.
        """

        if text is None and videos is None:
            raise ValueError("You have to specify either text or videos. Both cannot be none.")

        if text is not None:
            encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)

        if videos is not None:
            image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs)

        if text is not None and videos is not None:
            encoding["pixel_values"] = image_features.pixel_values
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)

    @property
    def feature_extractor_class(self):
        warnings.warn(

@@ -31,12 +31,12 @@ import numpy as np
import typing_extensions
from huggingface_hub.errors import EntryNotFoundError

from .audio_utils import load_audio
from .audio_utils import AudioInput, load_audio
from .dynamic_module_utils import custom_object_save
from .feature_extraction_utils import BatchFeature
from .image_utils import ChannelDimension, is_vision_available
from .image_utils import ChannelDimension, ImageInput, is_vision_available
from .utils.chat_template_utils import render_jinja_template
from .video_utils import VideoMetadata
from .video_utils import VideoInput, VideoMetadata


if is_vision_available():

@@ -335,7 +335,8 @@ class CommonKwargs(TypedDict, total=False):
class ProcessingKwargs(TypedDict, total=False):
    """
    Base class for kwargs passing to processors.
    A model should have its own `ModelProcessorKwargs` class that inherits from `ProcessingKwargs` to provide:
    In case a model has specific kwargs that are not present in the base class or default values for existing keys,
    it should have its own `ModelProcessorKwargs` class that inherits from `ProcessingKwargs` to provide:
    1) Additional typed keys and that this model requires to process inputs.
    2) Default values for existing keys under a `_defaults` attribute.
    New keys have to be defined as follows to ensure type hinting is done correctly.
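A sketch of the pattern this docstring describes, following the model-specific kwargs classes seen earlier in the diff (TVP, ViLT, SigLIP2); the model name and extra key are illustrative:

```python
from transformers.processing_utils import ImagesKwargs, ProcessingKwargs


class MyModelImagesKwargs(ImagesKwargs, total=False):
    # 1) an additional typed key this hypothetical model needs
    crop_to_patches: bool


class MyModelProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: MyModelImagesKwargs
    # 2) default values for existing keys
    _defaults = {
        "images_kwargs": {"crop_to_patches": True},
        "text_kwargs": {"padding": False},
    }
```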
@@ -370,6 +371,8 @@ class ProcessingKwargs(TypedDict, total=False):

    """

    _defaults = {}

    common_kwargs: CommonKwargs = {
        **CommonKwargs.__annotations__,
    }

@@ -499,6 +502,7 @@ class ProcessorMixin(PushToHubMixin):
    feature_extractor_class = None
    tokenizer_class = None
    _auto_class = None
    valid_processor_kwargs = ProcessingKwargs

    # args have to match the attributes class attribute
    def __init__(self, *args, **kwargs):

@@ -539,6 +543,68 @@ class ProcessorMixin(PushToHubMixin):
            self.check_argument_for_proper_class(attribute_name, arg)
            setattr(self, attribute_name, arg)

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
        videos: Optional[VideoInput] = None,
        audio: Optional[AudioInput] = None,
        **kwargs: Unpack[ProcessingKwargs],
    ):
        """
        Main method to prepare for model inputs. This method forwards the each modality argument to its own processor
        along with `kwargs`. Please refer to the docstring of the each processor attributes for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
            audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
                tensor.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] object with processed inputs in a dict format.
        """
        if images is None and text is None and videos is None and audio is None:
            raise ValueError(f"You need to provide at least one input to call {self.__class__.__name__}")

        kwargs = self._merge_kwargs(
            self.valid_processor_kwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs if hasattr(self, "tokenizer") else {},
            **kwargs,
        )

        attribute_to_kwargs = {
            "tokenizer": (text, "text_kwargs"),
            "image_processor": (images, "images_kwargs"),
            "video_processor": (videos, "videos_kwargs"),
            "feature_extractor": (audio, "audio_kwargs"),
        }
        outputs = {}
        for attribute_name in self.attributes:
            attribute = getattr(self, attribute_name, None)
            input_data, input_kwargs = attribute_to_kwargs[attribute_name]
            if input_data is not None and attribute is not None:
                attribute_output = attribute(input_data, **kwargs[input_kwargs])
                outputs.update(attribute_output)

        return BatchFeature(outputs)

    def check_argument_for_proper_class(self, argument_name, argument):
        """
        Checks the passed argument's class against the expected transformers class. In case of an unexpected
|
||||
|
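To make the dispatch concrete, a usage sketch (the checkpoint id is illustrative, not from this PR): each modality passed to `__call__` is routed to the matching sub-processor, and the merged result comes back as a single `BatchFeature`.

from PIL import Image
from transformers import AutoProcessor

# illustrative checkpoint id; any processor relying on the shared base __call__ behaves the same
processor = AutoProcessor.from_pretrained("my-org/my-multimodal-checkpoint")
image = Image.new("RGB", (224, 224))

inputs = processor(text="a photo of a cat", images=image, return_tensors="pt")
print(inputs.keys())  # e.g. dict_keys(['input_ids', 'attention_mask', 'pixel_values'])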
@ -184,7 +184,7 @@ class ChineseCLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase):

inputs = processor(text=input_str, images=image_input)

self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"])
self.assertSetEqual(set(inputs.keys()), {"input_ids", "token_type_ids", "attention_mask", "pixel_values"})

# test if it raises when no input is passed
with pytest.raises(ValueError):

@ -186,21 +186,21 @@ class FlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):

inputs = processor(text=input_str, images=image_input)

self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"])
self.assertSetEqual(set(inputs.keys()), {"input_ids", "token_type_ids", "attention_mask", "pixel_values"})

# add extra args
inputs = processor(text=input_str, images=image_input, return_codebook_pixels=True, return_image_mask=True)

self.assertListEqual(
list(inputs.keys()),
[
self.assertSetEqual(
set(inputs.keys()),
{
"input_ids",
"token_type_ids",
"attention_mask",
"pixel_values",
"codebook_pixel_values",
"bool_masked_pos",
],
},
)

# test if it raises when no input is passed

@ -112,7 +112,7 @@ class GitProcessorTest(ProcessorTesterMixin, unittest.TestCase):

inputs = processor(text=input_str, images=image_input)

self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"])
self.assertSetEqual(set(inputs.keys()), {"input_ids", "attention_mask", "pixel_values"})

# test if it raises when no input is passed
with pytest.raises(ValueError):

@ -236,8 +236,8 @@ class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase):

inputs = processor(text=input_str, images=image_input)

self.assertListEqual(
list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values", "pixel_mask"]
self.assertSetEqual(
set(inputs.keys()), {"input_ids", "token_type_ids", "attention_mask", "pixel_values", "pixel_mask"}
)

# test if it raises when no input is passed
@ -38,6 +38,7 @@ class JanusProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor = self.processor_class.from_pretrained(
"deepseek-community/Janus-Pro-1B",
extra_special_tokens=special_image_tokens,
**self.prepare_processor_dict(),
)
# Set use of the default system prompt to False, since it is applied based on input modality.
# Disabling it avoids any issues in the test irrespective of inputs.
@ -149,7 +149,7 @@ class VisionTextDualEncoderProcessorTest(ProcessorTesterMixin, unittest.TestCase):

inputs = processor(text=input_str, images=image_input)

self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"])
self.assertListEqual(list(inputs.keys()), ["pixel_values", "input_ids", "token_type_ids", "attention_mask"])

# test if it raises when no input is passed
with self.assertRaises(ValueError):