[processors] Unbloating simple processors (#40377)

* modularize processor - step 1

* typos

* why raise error? the super call checks it too

* tiny update

* fix copies

* fix style and test

* lost an import / fix copies

* fix tests

* oops deleted accidentally
Raushan Turganbay
2025-09-10 10:37:19 +02:00
committed by GitHub
parent c52889bd51
commit 08edec9f7d
66 changed files with 286 additions and 1303 deletions
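For callers, the refactor is meant to be behavior-preserving: each processor keeps the same `__call__` surface, now served by the shared base class. A quick smoke-test sketch of that claim, assuming the `kakaobrain/align-base` checkpoint, network access, and installed deps (none of which are part of this diff):

import numpy as np
from PIL import Image
from transformers import AlignProcessor

# Dummy RGB image so the sketch runs without any local files.
image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))

processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
inputs = processor(text="a photo of a cat", images=image, return_tensors="pt")
# Same keys as before the refactor: input_ids, attention_mask, pixel_values.
print(sorted(inputs.keys()))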

View File

@@ -16,11 +16,7 @@
Image/Text processor class for ALIGN
"""
from typing import Optional, Union
from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin
class AlignProcessorKwargs(ProcessingKwargs, total=False):
@@ -66,74 +62,10 @@ class AlignProcessor(ProcessorMixin):
attributes = ["image_processor", "tokenizer"]
image_processor_class = "EfficientNetImageProcessor"
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
valid_processor_kwargs = AlignProcessorKwargs
def __init__(self, image_processor, tokenizer):
super().__init__(image_processor, tokenizer)
def __call__(
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[AlignProcessorKwargs],
) -> BatchEncoding:
"""
Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` arguments to
EfficientNetImageProcessor's [`~EfficientNetImageProcessor.__call__`] if `images` is not `None`. Please refer
to the docstring of the above two methods for more information.
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
text (`str`, `list[str]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if text is None and images is None:
raise ValueError("You must specify either text or images.")
output_kwargs = self._merge_kwargs(
AlignProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
# then, we can pass correct kwargs to each processor
if text is not None:
encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
if images is not None:
image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
# BC for explicit return_tensors
if "return_tensors" in output_kwargs["common_kwargs"]:
return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None)
if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif text is not None:
return encoding
else:
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
__all__ = ["AlignProcessor"]

View File

@@ -16,18 +16,10 @@
Image/Text processor class for AltCLIP
"""
from typing import Optional, Union
from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
from ...processing_utils import ProcessorMixin
from ...utils.deprecation import deprecate_kwarg
class AltClipProcessorKwargs(ProcessingKwargs, total=False):
_defaults = {}
class AltCLIPProcessor(ProcessorMixin):
r"""
Constructs an AltCLIP processor which wraps a CLIP image processor and an XLM-Roberta tokenizer into a single
@@ -49,80 +41,7 @@ class AltCLIPProcessor(ProcessorMixin):
@deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor")
def __init__(self, image_processor=None, tokenizer=None):
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
def __call__(
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[AltClipProcessorKwargs],
) -> BatchEncoding:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not
`None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.
Args:
images (`ImageInput`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if text is None and images is None:
raise ValueError("You must specify either text or images.")
output_kwargs = self._merge_kwargs(
AltClipProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
if text is not None:
encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
if images is not None:
image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
# BC for explicit return_tensors
if "return_tensors" in output_kwargs["common_kwargs"]:
return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None)
if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif text is not None:
return encoding
else:
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
__all__ = ["AltCLIPProcessor"]

View File

@@ -16,10 +16,7 @@
Processor class for BridgeTower.
"""
from typing import Union
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin
class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False):
@@ -60,35 +57,10 @@ class BridgeTowerProcessor(ProcessorMixin):
attributes = ["image_processor", "tokenizer"]
image_processor_class = "BridgeTowerImageProcessor"
tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast")
valid_processor_kwargs = BridgeTowerProcessorKwargs
def __init__(self, image_processor, tokenizer):
super().__init__(image_processor, tokenizer)
def __call__(
self,
images,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[BridgeTowerProcessorKwargs],
) -> BatchEncoding:
"""
This method uses [`BridgeTowerImageProcessor.__call__`] method to prepare image(s) for the model, and
[`RobertaTokenizerFast.__call__`] to prepare text for the model.
Please refer to the docstring of the above two methods for more information.
"""
output_kwargs = self._merge_kwargs(
BridgeTowerProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
# add pixel_values + pixel_mask
encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"])
encoding.update(encoding_image_processor)
return encoding
__all__ = ["BridgeTowerProcessor"]

View File

@@ -16,11 +16,22 @@
Processor class for Bros.
"""
from typing import Optional, Union
from ...processing_utils import ProcessingKwargs, ProcessorMixin
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class BrosProcessorKwargs(ProcessingKwargs, total=False):
_defaults = {
"text_kwargs": {
"add_special_tokens": True,
"padding": False,
"stride": 0,
"return_overflowing_tokens": False,
"return_special_tokens_mask": False,
"return_offsets_mapping": False,
"return_length": False,
"verbose": True,
},
}
class BrosProcessor(ProcessorMixin):
@@ -37,6 +48,7 @@ class BrosProcessor(ProcessorMixin):
attributes = ["tokenizer"]
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
valid_processor_kwargs = BrosProcessorKwargs
def __init__(self, tokenizer=None, **kwargs):
if tokenizer is None:
@@ -44,50 +56,5 @@ class BrosProcessor(ProcessorMixin):
super().__init__(tokenizer)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> BatchEncoding:
"""
This method uses [`BertTokenizerFast.__call__`] to prepare text for the model.
Please refer to the docstring of the above two methods for more information.
"""
encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
return encoding
__all__ = ["BrosProcessor"]

View File

@@ -99,7 +99,7 @@ class ChameleonProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@@ -17,15 +17,8 @@ Image/Text processor class for Chinese-CLIP
"""
import warnings
from typing import Optional, Union
from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
class ChineseClipProcessorKwargs(ProcessingKwargs, total=False):
_defaults = {}
from ...processing_utils import ProcessorMixin
class ChineseCLIPProcessor(ProcessorMixin):
@@ -58,79 +51,10 @@ class ChineseCLIPProcessor(ProcessorMixin):
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
images: Optional[ImageInput] = None,
audio=None,
videos=None,
**kwargs: Unpack[ChineseClipProcessorKwargs],
) -> BatchEncoding:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.
Args:
text (`str`, `list[str]`, `list[list[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")
output_kwargs = self._merge_kwargs(
ChineseClipProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
if text is not None:
encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
if images is not None:
image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
# BC for explicit return_tensors
if "return_tensors" in output_kwargs["common_kwargs"]:
return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None)
if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif text is not None:
return encoding
else:
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
@property
def feature_extractor_class(self):
warnings.warn(

View File

@@ -16,8 +16,16 @@
Audio/Text processor class for CLAP
"""
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from typing import Optional, Union
from ...audio_utils import AudioInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import logging
from ...utils.deprecation import deprecate_kwarg
logger = logging.get_logger(__name__)
class ClapProcessor(ProcessorMixin):
@@ -40,61 +48,28 @@ class ClapProcessor(ProcessorMixin):
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
def __call__(self, text=None, audios=None, return_tensors=None, **kwargs):
@deprecate_kwarg("audios", version="v4.59.0", new_name="audio")
def __call__(
self,
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
audios: Optional[AudioInput] = None,
audio: Optional[AudioInput] = None,
**kwargs: Unpack[ProcessingKwargs],
):
"""
Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text`
and `kwargs` arguments to RobertaTokenizerFast's [`~RobertaTokenizerFast.__call__`] if `text` is not `None` to
encode the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to
ClapFeatureExtractor's [`~ClapFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the
docstring of the above two methods for more information.
Args:
text (`str`, `list[str]`, `list[list[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
audios (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case
of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels,
and T the sample length of the audio.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **audio_features** -- Audio features to be fed to a model. Returned when `audios` is not `None`.
Forwards the `audio` and `sampling_rate` arguments to [`~ClapFeatureExtractor.__call__`] and the `text`
argument to [`~RobertaTokenizerFast.__call__`]. Please refer to the docstring of the above two methods for more
information.
"""
sampling_rate = kwargs.pop("sampling_rate", None)
if text is None and audios is None:
raise ValueError("You have to specify either text or audios. Both cannot be none.")
if text is not None:
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
if audios is not None:
audio_features = self.feature_extractor(
audios, sampling_rate=sampling_rate, return_tensors=return_tensors, **kwargs
# The `deprecate_kwarg` will not work if the inputs are passed as arguments, so we check
# again that the correct naming is used
if audios is not None and audio is None:
logger.warning(
"Using `audios` keyword argument is deprecated when calling ClapProcessor, instead use `audio`."
)
audio = audios
if text is not None and audios is not None:
encoding.update(audio_features)
return encoding
elif text is not None:
return encoding
else:
return BatchEncoding(data=dict(**audio_features), tensor_type=return_tensors)
return super().__call__(text=text, audio=audio, **kwargs)
__all__ = ["ClapProcessor"]

View File

@@ -19,7 +19,6 @@ Image/Text processor class for CLIP
import warnings
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
class CLIPProcessor(ProcessorMixin):
@@ -51,71 +50,9 @@ class CLIPProcessor(ProcessorMixin):
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.
Args:
text (`str`, `list[str]`, `list[list[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
tokenizer_kwargs, image_processor_kwargs = {}, {}
if kwargs:
tokenizer_kwargs = {k: v for k, v in kwargs.items() if k not in self.image_processor._valid_processor_keys}
image_processor_kwargs = {
k: v for k, v in kwargs.items() if k in self.image_processor._valid_processor_keys
}
if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")
if text is not None:
encoding = self.tokenizer(text, return_tensors=return_tensors, **tokenizer_kwargs)
if images is not None:
image_features = self.image_processor(images, return_tensors=return_tensors, **image_processor_kwargs)
if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif text is not None:
return encoding
else:
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
@property
def feature_extractor_class(self):
warnings.warn(

View File

@@ -51,10 +51,6 @@ class CLIPSegProcessor(ProcessorMixin):
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
@@ -62,7 +58,7 @@ class CLIPSegProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
ViTImageProcessor's [`~ViTImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of
the above two methods for more information.

View File

@@ -18,6 +18,10 @@ Processor class for CLVP
"""
from ...processing_utils import ProcessorMixin
from ...utils import logging
logger = logging.get_logger(__name__)
class ClvpProcessor(ProcessorMixin):
@@ -36,11 +40,6 @@ class ClvpProcessor(ProcessorMixin):
feature_extractor_class = "ClvpFeatureExtractor"
tokenizer_class = "ClvpTokenizer"
model_input_names = [
"input_ids",
"input_features",
"attention_mask",
]
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
@@ -51,27 +50,13 @@ class ClvpProcessor(ProcessorMixin):
argument to [`~ClvpTokenizer.__call__`]. Please refer to the docstring of the above two methods for more
information.
"""
raw_speech = kwargs.pop("raw_speech", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None)
if raw_speech is None and text is None:
raise ValueError("You need to specify either an `raw_speech` or `text` input to process.")
if raw_speech is not None:
inputs = self.feature_extractor(raw_speech, sampling_rate=sampling_rate, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif raw_speech is None:
return encodings
else:
inputs["input_ids"] = encodings["input_ids"]
inputs["attention_mask"] = encodings["attention_mask"]
return inputs
logger.warning(
"Using `raw_speech` keyword argument is deprecated when calling ClvpProcessor, instead use `audio`."
)
kwargs["audio"] = raw_speech
return super().__call__(*args, **kwargs)
__all__ = ["ClvpProcessor"]

View File

@@ -106,10 +106,6 @@ class ColPaliProcessor(ProcessorMixin):
query_prefix: str = "Question: ",
):
super().__init__(image_processor, tokenizer, chat_template=chat_template)
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
if not hasattr(image_processor, "image_seq_length"):
raise ValueError("Image processor is missing an `image_seq_length` attribute.")

View File

@@ -249,7 +249,7 @@ class DeepseekVLProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@@ -78,7 +78,7 @@ class DeepseekVLProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@@ -926,7 +926,7 @@ class DeepseekVLHybridProcessor(DeepseekVLProcessor):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
DeepseekVLHybridImageProcessor's [`~DeepseekVLHybridImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@@ -78,7 +78,7 @@ class DeepseekVLHybridProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
DeepseekVLHybridImageProcessor's [`~DeepseekVLHybridImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@@ -65,10 +65,6 @@ class DonutProcessor(ProcessorMixin):
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor

View File

@@ -102,7 +102,7 @@ class Emu3Processor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to Emu3TokenizerFast's [`~Emu3TokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@@ -17,12 +17,39 @@ Image/Text processor class for FLAVA
"""
import warnings
from collections.abc import Iterable
from typing import Optional, Union
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin
class FlavaImagesKwargs(ImagesKwargs):
# Mask related params
return_image_mask: Optional[bool]
input_size_patches: Optional[int]
total_mask_patches: Optional[int]
mask_group_min_patches: Optional[int]
mask_group_max_patches: Optional[int]
mask_group_min_aspect_ratio: Optional[float]
mask_group_max_aspect_ratio: Optional[float]
# Codebook related params
return_codebook_pixels: Optional[bool]
codebook_do_resize: Optional[bool]
codebook_size: Optional[bool]
codebook_resample: Optional[int]
codebook_do_center_crop: Optional[bool]
codebook_crop_size: Optional[int]
codebook_do_rescale: Optional[bool]
codebook_rescale_factor: Optional[Union[int, float]]
codebook_do_map_pixels: Optional[bool]
codebook_do_normalize: Optional[bool]
codebook_image_mean: Optional[Union[float, Iterable[float]]]
codebook_image_std: Optional[Union[float, Iterable[float]]]
class FlavaProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: FlavaImagesKwargs
_defaults = {}
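FLAVA's mask and codebook options move out of the `__call__` signature and into the typed kwargs class above. One useful property of `TypedDict` subclasses is that `__annotations__` merges inherited keys, so a router can decide which call-time kwargs belong to the image processor; the routing function below is illustrative, not the transformers implementation:

from typing import Optional, TypedDict

class ImagesKwargs(TypedDict, total=False):
    return_tensors: Optional[str]

class ToyFlavaImagesKwargs(ImagesKwargs, total=False):
    return_image_mask: Optional[bool]
    return_codebook_pixels: Optional[bool]

def split_kwargs(kwargs):
    image_keys = ToyFlavaImagesKwargs.__annotations__.keys()  # includes inherited keys
    images_kwargs = {k: v for k, v in kwargs.items() if k in image_keys}
    text_kwargs = {k: v for k, v in kwargs.items() if k not in image_keys}
    return images_kwargs, text_kwargs

print(split_kwargs({"return_image_mask": True, "padding": "max_length"}))
# ({'return_image_mask': True}, {'padding': 'max_length'})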
class FlavaProcessor(ProcessorMixin):
@@ -40,6 +67,7 @@ class FlavaProcessor(ProcessorMixin):
attributes = ["image_processor", "tokenizer"]
image_processor_class = "FlavaImageProcessor"
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
valid_processor_kwargs = FlavaProcessorKwargs
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = None
@@ -52,82 +80,9 @@ class FlavaProcessor(ProcessorMixin):
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def __call__(
self,
images: Optional[ImageInput] = None,
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_image_mask: Optional[bool] = None,
return_codebook_pixels: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
):
"""
This method uses [`FlavaImageProcessor.__call__`] method to prepare image(s) for the model, and
[`BertTokenizerFast.__call__`] to prepare text for the model.
Please refer to the docstring of the above two methods for more information.
"""
if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")
if text is not None:
encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
if images is not None:
image_features = self.image_processor(
images,
return_image_mask=return_image_mask,
return_codebook_pixels=return_codebook_pixels,
return_tensors=return_tensors,
**kwargs,
)
if text is not None and images is not None:
encoding.update(image_features)
return encoding
elif text is not None:
return encoding
else:
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
@property
def feature_extractor_class(self):
warnings.warn(

View File

@@ -357,7 +357,7 @@ class Florence2Processor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@@ -158,7 +158,7 @@ class Florence2Processor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@@ -16,20 +16,7 @@
Image/Text processor class for GIT
"""
from typing import Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import logging
class GitProcessorKwargs(ProcessingKwargs, total=False):
_defaults = {}
logger = logging.get_logger(__name__)
from ...processing_utils import ProcessorMixin
class GitProcessor(ProcessorMixin):
@@ -54,65 +41,5 @@ class GitProcessor(ProcessorMixin):
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def __call__(
self,
images: Optional[ImageInput] = None,
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
audio=None,
videos=None,
**kwargs: Unpack[GitProcessorKwargs],
) -> BatchFeature:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`, *optional*):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")
output_kwargs = self._merge_kwargs(
GitProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
data = {}
if text is not None:
text_features = self.tokenizer(text, **output_kwargs["text_kwargs"])
data.update(text_features)
if images is not None:
image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
data.update(image_features)
return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors"))
__all__ = ["GitProcessor"]

View File

@@ -144,7 +144,7 @@ class GotOcr2Processor(ProcessorMixin):
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text if `text`
is not `None`, otherwise encode default OCR queries which depend on the `format`, `box`, `color`, `multi_page` and
`crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwrags` arguments to
`crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to
GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`.
Args:

View File

@@ -20,7 +20,6 @@ import pathlib
import warnings
from typing import TYPE_CHECKING, Optional, Union
from ...image_processing_utils import BatchFeature
from ...image_transforms import center_to_corners_format
from ...image_utils import AnnotationFormat, ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
@@ -144,6 +143,7 @@ class GroundingDinoProcessor(ProcessorMixin):
attributes = ["image_processor", "tokenizer"]
image_processor_class = "GroundingDinoImageProcessor"
tokenizer_class = "AutoTokenizer"
valid_processor_kwargs = GroundingDinoProcessorKwargs
def __init__(self, image_processor, tokenizer):
super().__init__(image_processor, tokenizer)
@@ -152,8 +152,6 @@ class GroundingDinoProcessor(ProcessorMixin):
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[GroundingDinoProcessorKwargs],
) -> BatchEncoding:
"""
@@ -170,33 +168,9 @@ class GroundingDinoProcessor(ProcessorMixin):
- A merged candidate labels string to be detected on the image, separated by "." (e.g. "a cat. a dog.").
- A batch of merged candidate labels text to be detected on the batch of images (e.g. ["a cat. a dog.", "a car. a person."]).
"""
if images is None and text is None:
raise ValueError("You must specify either text or images.")
output_kwargs = self._merge_kwargs(
GroundingDinoProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
# Get only text
if images is not None:
encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"])
else:
encoding_image_processor = BatchFeature()
if text is not None:
text = self._preprocess_input_text(text)
text_encoding = self.tokenizer(
text=text,
**output_kwargs["text_kwargs"],
)
else:
text_encoding = BatchEncoding()
text_encoding.update(encoding_image_processor)
return text_encoding
return super().__call__(images=images, text=text, **kwargs)
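GroundingDino keeps only a thin override: normalize the text, then delegate everything else to the base `__call__`. Per the docstring above, the text is a "."-separated candidate-labels string (or a batch of them); a standalone sketch of a normalization consistent with that and with the upstream GroundingDINO convention (the exact `_preprocess_input_text` body is not shown in this hunk):

def preprocess_input_text(text):
    # Normalize one caption or a batch of captions: lowercase, strip, and
    # guarantee the trailing "." the model expects between candidate labels.
    if isinstance(text, list):
        return [preprocess_input_text(t) for t in text]
    caption = text.lower().strip()
    return caption if caption.endswith(".") else caption + "."

print(preprocess_input_text("a cat. a dog"))                       # a cat. a dog.
print(preprocess_input_text(["a cat. a dog.", "a car. a person"]))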
def _preprocess_input_text(self, text):
"""

View File

@@ -214,11 +214,6 @@ class IdeficsProcessor(ProcessorMixin):
tokenizer_class = "LlamaTokenizerFast"
def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs):
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
self.image_token_id = (

View File

@@ -90,11 +90,6 @@ class Idefics2Processor(ProcessorMixin):
def __init__(
self, image_processor, tokenizer=None, image_seq_len: int = 64, chat_template: Optional[str] = None, **kwargs
):
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
if not hasattr(tokenizer, "image_token"):
self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True).content
self.image_token = AddedToken("<image>", normalized=False, special=True).content

View File

@@ -138,11 +138,6 @@ class Idefics3Processor(ProcessorMixin):
def __init__(
self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: Optional[str] = None, **kwargs
):
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True).content
self.image_token = AddedToken("<image>", normalized=False, special=True).content
self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True).content

View File

@@ -165,7 +165,7 @@ class InternVLProcessor(ProcessorMixin):
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text if `text`
is not `None`, otherwise encode default OCR queries which depend on the `format`, `box`, `color`, `multi_page` and
`crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwrags` arguments to
`crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to
GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`.
Args:

View File

@@ -88,7 +88,7 @@ class JanusProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
JanusImageProcessor's [`~JanusImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@@ -13,10 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
from ...audio_utils import AudioInput, make_list_of_audio
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...processing_utils import ProcessingKwargs, ProcessorMixin
class KyutaiSpeechToTextProcessorKwargs(ProcessingKwargs, total=False):
@@ -38,53 +36,7 @@ class KyutaiSpeechToTextProcessor(ProcessorMixin):
feature_extractor_class = "KyutaiSpeechToTextFeatureExtractor"
tokenizer_class = "PreTrainedTokenizerFast"
def __call__(
self,
audio: Optional[AudioInput] = None,
**kwargs: Unpack[KyutaiSpeechToTextProcessorKwargs],
):
r"""
Main method to prepare audio to be fed as input to the model. This method forwards the `audio`
arguments to KyutaiSpeechToTextFeatureExtractor's [`~KyutaiSpeechToTextFeatureExtractor.__call__`]. Please refer
to the docstring of the above method for more information.
Args:
audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
tensor.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **input_values** -- List of audio values to be fed to a model. Returned when `audio` is not `None`.
- **padding_mask** -- List of indices specifying which input values should be ignored by the model.
"""
if audio is None:
raise ValueError("`audio` is required.")
output_kwargs = self._merge_kwargs(
KyutaiSpeechToTextProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
audio_kwargs = output_kwargs["audio_kwargs"]
# ensure audio in correct format
audio = make_list_of_audio(audio)
inputs = self.feature_extractor(
audio,
**audio_kwargs,
)
return inputs
valid_processor_kwargs = KyutaiSpeechToTextProcessorKwargs
__all__ = ["KyutaiSpeechToTextProcessor"]

View File

@@ -59,10 +59,6 @@ class LayoutLMv2Processor(ProcessorMixin):
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)

View File

@@ -59,10 +59,6 @@ class LayoutLMv3Processor(ProcessorMixin):
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)

View File

@@ -58,10 +58,6 @@ class LayoutXLMProcessor(ProcessorMixin):
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)

View File

@@ -101,7 +101,7 @@ class LlavaProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@@ -111,7 +111,7 @@ class LlavaNextProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@@ -123,9 +123,9 @@ class LlavaNextVideoProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. To prepare the video(s),
this method forwards the `videos` and `kwrags` arguments to LlavaNextVideoImageProcessor's
this method forwards the `videos` and `kwargs` arguments to LlavaNextVideoImageProcessor's
[`~LlavaNextVideoImageProcessor.__call__`] if `videos` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@@ -121,7 +121,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@@ -67,10 +67,6 @@ class MgpstrProcessor(ProcessorMixin):
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
self.char_tokenizer = tokenizer
self.bpe_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

View File

@@ -60,33 +60,9 @@ class MusicgenProcessor(ProcessorMixin):
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if text is not None:
inputs = self.tokenizer(text, **kwargs)
if audio is not None:
audio_inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if audio is None:
return inputs
elif text is None:
return audio_inputs
else:
inputs["input_values"] = audio_inputs["input_values"]
if "padding_mask" in audio_inputs:
inputs["padding_mask"] = audio_inputs["padding_mask"]
return inputs
kwargs["audio"] = args[0]
return super().__call__(*args, **kwargs)
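Musicgen's override only has to bridge legacy positional calls like `processor(waveform, text=...)` into the keyword-based shared path. A toy version of that shim; note it drops the promoted positional before delegating, which is an assumption about how the base call binds its arguments:

def shared_call(audio=None, text=None, **kwargs):
    return {"audio": audio, "text": text, **kwargs}  # stand-in for super().__call__

def call(*args, **kwargs):
    if len(args) > 0:
        kwargs["audio"] = args[0]  # promote the legacy positional
        args = args[1:]            # assumption: avoid passing it twice
    return shared_call(*args, **kwargs)

print(call([0.1, 0.2], text="a beat"))
# {'audio': [0.1, 0.2], 'text': 'a beat'}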
def batch_decode(self, *args, **kwargs):
"""

View File

@@ -51,49 +51,16 @@ class MusicgenMelodyProcessor(ProcessorMixin):
def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps)
def __call__(self, audio=None, text=None, **kwargs):
def __call__(self, *args, **kwargs):
"""
Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `audio`
and `kwargs` arguments to MusicgenMelodyFeatureExtractor's [`~MusicgenMelodyFeatureExtractor.__call__`] if `audio` is not
`None` to pre-process the audio. It also forwards the `text` and `kwargs` arguments to
PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`. Please refer to the docstring of the above two methods for more information.
Args:
audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case
of a NumPy array/PyTorch tensor, each audio should be a mono-stereo signal of shape (T), where T is the sample length of the audio.
text (`str`, `list[str]`, `list[list[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
kwargs (*optional*):
Remaining dictionary of keyword arguments that will be passed to the feature extractor and/or the
tokenizer.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **input_features** -- Audio input features to be fed to a model. Returned when `audio` is not `None`.
- **attention_mask** -- List of token indices specifying which tokens should be attended to by the model when `text` is not `None`.
When only `audio` is specified, returns the timestamps attention mask.
Forwards the `audio` argument to EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`] and the `text`
argument to [`~T5Tokenizer.__call__`]. Please refer to the docstring of the above two methods for more
information.
"""
sampling_rate = kwargs.pop("sampling_rate", None)
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if text is not None:
inputs = self.tokenizer(text, **kwargs)
if audio is not None:
audio_inputs = self.feature_extractor(audio, sampling_rate=sampling_rate, **kwargs)
if text is None:
return audio_inputs
elif audio is None:
return inputs
else:
inputs["input_features"] = audio_inputs["input_features"]
return inputs
if len(args) > 0:
kwargs["audio"] = args[0]
args = args[1:]
return super().__call__(*args, **kwargs)
# Copied from transformers.models.musicgen.processing_musicgen.MusicgenProcessor.batch_decode with padding_mask->attention_mask
def batch_decode(self, *args, **kwargs):

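A short usage sketch of the shim above: a positional first argument is still accepted and is re-routed to the `audio` keyword before the shared `ProcessorMixin.__call__` takes over. The checkpoint id below is an assumption for illustration, not taken from this diff.

import numpy as np
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("facebook/musicgen-melody")  # illustrative checkpoint
wav = np.zeros(32000, dtype=np.float32)  # one second of silence at 32 kHz

inputs = processor(wav, sampling_rate=32000)        # legacy positional style
inputs = processor(audio=wav, sampling_rate=32000)  # equivalent keyword style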
View File

@ -48,11 +48,6 @@ class OneFormerProcessor(ProcessorMixin):
def __init__(
self, image_processor=None, tokenizer=None, max_seq_length: int = 77, task_seq_length: int = 77, **kwargs
):
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
self.max_seq_length = max_seq_length
self.task_seq_length = task_seq_length

View File

@ -85,7 +85,7 @@ class Ovis2Processor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
Ovis2ImageProcessor's [`~Ovis2ImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@ -86,7 +86,7 @@ class Owlv2Processor(ProcessorMixin):
"""
Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
`kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@ -82,10 +82,6 @@ class OwlViTProcessor(ProcessorMixin):
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
@ -100,7 +96,7 @@ class OwlViTProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
`kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@ -125,10 +125,6 @@ class PaliGemmaProcessor(ProcessorMixin):
chat_template=None,
**kwargs,
):
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
if not hasattr(image_processor, "image_seq_length"):
raise ValueError("Image processor is missing an `image_seq_length` attribute.")
@ -161,7 +157,7 @@ class PaliGemmaProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to GemmaTokenizerFast's [`~GemmaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@ -86,7 +86,7 @@ class Phi4MultimodalProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
Phi4MultimodalImageProcessorFast's [`~Phi4MultimodalImageProcessorFast.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@ -127,7 +127,7 @@ class PixtralProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@ -893,7 +893,7 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
Args:

View File

@ -105,7 +105,7 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
Args:

View File

@ -87,7 +87,7 @@ class Qwen2AudioProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and audio(s). This method forwards the `text`
and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to
the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
WhisperFeatureExtractor's [`~WhisperFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@ -100,7 +100,7 @@ class Qwen2VLProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
Args:

View File

@ -16,7 +16,26 @@
Audio/Text processor class for SeamlessM4T
"""
from ...processing_utils import ProcessorMixin
from typing import Optional, Union
from ...audio_utils import AudioInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import logging
from ...utils.deprecation import deprecate_kwarg
logger = logging.get_logger(__name__)
class SeamlessM4TTextKwargs(TextKwargs):
src_lang: Optional[str]
tgt_lang: Optional[str]
class SeamlessM4TProcessorKwargs(ProcessingKwargs, total=False):
text_kwargs: SeamlessM4TTextKwargs
_defaults = {}
class SeamlessM4TProcessor(ProcessorMixin):
@ -37,15 +56,23 @@ class SeamlessM4TProcessor(ProcessorMixin):
feature_extractor_class = "SeamlessM4TFeatureExtractor"
tokenizer_class = ("SeamlessM4TTokenizer", "SeamlessM4TTokenizerFast")
valid_processor_kwargs = SeamlessM4TProcessorKwargs
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
def __call__(self, text=None, audios=None, src_lang=None, tgt_lang=None, **kwargs):
@deprecate_kwarg("audios", version="v4.59.0", new_name="audio")
def __call__(
self,
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
audios: Optional[AudioInput] = None,
audio: Optional[AudioInput] = None,
**kwargs: Unpack[ProcessingKwargs],
):
"""
Main method to prepare for the model one or several sequence(s) and audio(s). This method forwards the `text`
and `kwargs` arguments to SeamlessM4TTokenizerFast's [`~SeamlessM4TTokenizerFast.__call__`] if `text` is not
`None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to
`None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.__call__`] if `audios` is not `None`. Please refer
to the docstring of the above two methods for more information.
@ -58,14 +85,6 @@ class SeamlessM4TProcessor(ProcessorMixin):
The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case
of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels,
and T the sample length of the audio.
src_lang (`str`, *optional*):
The language code of the input texts/audios. If not specified, the last `src_lang` specified will be
used.
tgt_lang (`str`, *optional*):
The code of the target language. If not specified, the last `tgt_lang` specified will be used.
kwargs (*optional*):
Remaining dictionary of keyword arguments that will be passed to the feature extractor and/or the
tokenizer.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
@ -75,26 +94,16 @@ class SeamlessM4TProcessor(ProcessorMixin):
`None`).
- **input_features** -- Audio input features to be fed to a model. Returned when `audios` is not `None`.
"""
sampling_rate = kwargs.pop("sampling_rate", None)
if text is None and audios is None:
raise ValueError("You have to specify either text or audios. Both cannot be none.")
elif text is not None and audios is not None:
if text is not None and audios is not None:
raise ValueError(
"Text and audios are mututally exclusive when passed to `SeamlessM4T`. Specify one or another."
)
elif text is not None:
if tgt_lang is not None:
self.tokenizer.tgt_lang = tgt_lang
if src_lang is not None:
self.tokenizer.src_lang = src_lang
encoding = self.tokenizer(text, **kwargs)
return encoding
else:
encoding = self.feature_extractor(audios, sampling_rate=sampling_rate, **kwargs)
return encoding
if audio is None and audios is not None:
logger.warning(
"Passing `audios` as keyword argument is deprecated and will be removed in v4.63, please pass `audio` instead."
)
audio = audios
return super().__call__(text=text, audio=audio, **kwargs)
__all__ = ["SeamlessM4TProcessor"]
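A hedged usage sketch of the deprecation path above; the checkpoint id is an assumption, and both audio calls produce the same features:

import numpy as np
from transformers import SeamlessM4TProcessor

processor = SeamlessM4TProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")  # illustrative
wav = np.zeros(16000, dtype=np.float32)

audio_inputs = processor(audio=wav, sampling_rate=16000)   # canonical keyword
audio_inputs = processor(audios=wav, sampling_rate=16000)  # deprecated alias, emits a warning

# `src_lang`/`tgt_lang` are now typed text kwargs and flow to the tokenizer.
text_inputs = processor(text="Hello", src_lang="eng", tgt_lang="fra")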

View File

@ -16,13 +16,7 @@
Image/Text processor class for SigLIP.
"""
from typing import Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class SiglipProcessor(ProcessorMixin):
@ -46,79 +40,5 @@ class SiglipProcessor(ProcessorMixin):
def __init__(self, image_processor, tokenizer):
super().__init__(image_processor, tokenizer)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
images: Optional[ImageInput] = None,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
) -> BatchFeature:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to SiglipTokenizer's [`~SiglipTokenizer.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` argument to
SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.
Args:
text (`str`, `list[str]`, `list[list[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
truncation (`bool`, *optional*):
Activates truncation to cut input sequences longer than `max_length` to `max_length`.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")
if text is not None:
encoding = self.tokenizer(
text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
)
if images is not None:
image_features = self.image_processor(images, return_tensors=return_tensors)
if text is not None and images is not None:
encoding.update(image_features)
return encoding
elif text is not None:
return encoding
else:
return BatchFeature(data=dict(**image_features), tensor_type=return_tensors)
__all__ = ["SiglipProcessor"]

View File

@ -16,12 +16,9 @@
Image/Text processor class for SigLIP2.
"""
from typing import Optional, Union
from typing import Optional
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin
class Siglip2ImagesKwargs(ImagesKwargs, total=False):
@ -63,89 +60,10 @@ class Siglip2Processor(ProcessorMixin):
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
valid_processor_kwargs = Siglip2ProcessorKwargs
def __init__(self, image_processor, tokenizer):
super().__init__(image_processor, tokenizer)
def __call__(
self,
images: Optional[Union[ImageInput, list[ImageInput], list[list[ImageInput]]]] = None,
text: Optional[Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]]] = None,
audio=None,
videos=None,
**kwargs: Unpack[Siglip2ProcessorKwargs],
) -> BatchFeature:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to GemmaTokenizerFast's [`~GemmaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` argument to
Siglip2ImageProcessor's [`~Siglip2ImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
text (`str`, `list[str]`, `list[list[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `max_length`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
lengths).
max_length (`int`, *optional*, defaults to 64):
Maximum length of the returned list and optionally padding length (see above).
truncation (`bool`, *optional*, defaults to `True`):
Activates truncation to cut input sequences longer than `max_length` to `max_length`.
return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'pt'`):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
- **pixel_attention_mask** -- Attention mask for the pixel values. Returned when `images` is not `None`.
- **spatial_shapes** -- The number of horizontal and vertical patches per image.
Returned when `images` is not `None`.
"""
output_kwargs = self._merge_kwargs(
Siglip2ProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")
if text is not None:
encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
if images is not None:
image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
if text is not None and images is not None:
encoding.update(image_features)
return encoding
elif text is not None:
return encoding
else:
return_tensors = output_kwargs["common_kwargs"]["return_tensors"]
return BatchFeature(data=dict(**image_features), tensor_type=return_tensors)
__all__ = ["Siglip2Processor"]

View File

@ -60,10 +60,6 @@ class TrOCRProcessor(ProcessorMixin):
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor

View File

@ -16,8 +16,18 @@
Processor class for TVP.
"""
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from ...processing_utils import ProcessingKwargs, ProcessorMixin
class TvpProcessorKwargs(ProcessingKwargs, total=False):
_defaults = {
"text_kwargs": {
"truncation": True,
"padding": "max_length",
"pad_to_max_length": True,
"return_token_type_ids": False,
},
}
class TvpProcessor(ProcessorMixin):
@ -39,74 +49,8 @@ class TvpProcessor(ProcessorMixin):
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
def __call__(self, text=None, videos=None, return_tensors=None, **kwargs):
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
TvpImageProcessor's [`~TvpImageProcessor.__call__`] if `videos` is not `None`. Please refer to the docstring of
the above two methods for more information.
Args:
text (`str`, `list[str]`, `list[list[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
videos (`list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, `list[list[PIL.Image.Image]]`, `list[list[np.ndarray]]`,:
`list[list[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list
of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors,
each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of
channels.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `videos` is not `None`.
"""
max_text_length = kwargs.pop("max_text_length", None)
if text is None and videos is None:
raise ValueError("You have to specify either text or videos. Both cannot be none.")
encoding = {}
if text is not None:
textual_input = self.tokenizer.batch_encode_plus(
text,
truncation=True,
padding="max_length",
max_length=max_text_length,
pad_to_max_length=True,
return_tensors=return_tensors,
return_token_type_ids=False,
**kwargs,
)
encoding.update(textual_input)
if videos is not None:
image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs)
encoding.update(image_features)
return BatchEncoding(data=encoding, tensor_type=return_tensors)
self.video_processor = image_processor
def post_process_video_grounding(self, logits, video_durations):
"""

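With the `_defaults` above, behavior that used to be hard-coded inside `__call__` (truncation, max-length padding) becomes just the default kwargs layer, and anything passed at call time wins. A sketch under the assumption that the public call shape is unchanged; the checkpoint id is illustrative:

from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Intel/tvp-base")  # illustrative checkpoint
frames = [Image.new("RGB", (448, 448))] * 8  # dummy clip of eight frames

inputs = processor(text="a person skiing", videos=frames)                 # `_defaults` apply
inputs = processor(text="a person skiing", videos=frames, padding=False)  # explicit kwarg overrides the default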
View File

@ -100,7 +100,7 @@ class VideoLlavaProcessor(ProcessorMixin):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
VideoLlavaImageProcessor's [`~VideoLlavaImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

View File

@ -17,11 +17,23 @@ Processor class for ViLT.
"""
import warnings
from typing import Optional, Union
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ...processing_utils import ProcessingKwargs, ProcessorMixin
class ViltProcessorKwargs(ProcessingKwargs, total=False):
_defaults = {
"text_kwargs": {
"add_special_tokens": True,
"padding": False,
"stride": 0,
"return_overflowing_tokens": False,
"return_special_tokens_mask": False,
"return_offsets_mapping": False,
"return_length": False,
"verbose": True,
},
}
class ViltProcessor(ProcessorMixin):
@ -41,6 +53,7 @@ class ViltProcessor(ProcessorMixin):
attributes = ["image_processor", "tokenizer"]
image_processor_class = "ViltImageProcessor"
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
valid_processor_kwargs = ViltProcessorKwargs
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = None
@ -53,64 +66,9 @@ class ViltProcessor(ProcessorMixin):
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def __call__(
self,
images,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> BatchEncoding:
"""
This method uses [`ViltImageProcessor.__call__`] method to prepare image(s) for the model, and
[`BertTokenizerFast.__call__`] to prepare text for the model.
Please refer to the docstring of the above two methods for more information.
"""
encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
# add pixel_values + pixel_mask
encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)
encoding.update(encoding_image_processor)
return encoding
@property
def feature_extractor_class(self):
warnings.warn(

View File

@ -17,11 +17,8 @@ Processor class for VisionTextDualEncoder
"""
import warnings
from typing import Optional, Union
from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin
class VisionTextDualEncoderProcessorKwargs(ProcessingKwargs, total=False):
@ -59,82 +56,9 @@ class VisionTextDualEncoderProcessor(ProcessorMixin):
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You have to specify an image_processor.")
if tokenizer is None:
raise ValueError("You have to specify a tokenizer.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def __call__(
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[VisionTextDualEncoderProcessorKwargs],
) -> BatchEncoding:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to VisionTextDualEncoderTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not
`None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.
Args:
text (`str`, `list[str]`, `list[list[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")
output_kwargs = self._merge_kwargs(
VisionTextDualEncoderProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
if text is not None:
encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
if images is not None:
image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif text is not None:
return encoding
else:
return BatchEncoding(
data=dict(**image_features),
tensor_type=output_kwargs["common_kwargs"].get("return_tensors"),
)
@property
def feature_extractor_class(self):
warnings.warn(

View File

@ -19,7 +19,6 @@ Image/Text processor class for XCLIP
import warnings
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
class XCLIPProcessor(ProcessorMixin):
@ -51,68 +50,10 @@ class XCLIPProcessor(ProcessorMixin):
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.video_processor = self.image_processor
self.current_processor = self.image_processor
def __call__(self, text=None, videos=None, return_tensors=None, **kwargs):
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
VideoMAEImageProcessor's [`~VideoMAEImageProcessor.__call__`] if `videos` is not `None`. Please refer to the
docstring of the above two methods for more information.
Args:
text (`str`, `list[str]`, `list[list[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
videos (`list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, `list[list[PIL.Image.Image]]`, `list[list[np.ndarray]]`,:
`list[list[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list
of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors,
each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of
channels.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `videos` is not `None`.
"""
if text is None and videos is None:
raise ValueError("You have to specify either text or videos. Both cannot be none.")
if text is not None:
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
if videos is not None:
image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs)
if text is not None and videos is not None:
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif text is not None:
return encoding
else:
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
@property
def feature_extractor_class(self):
warnings.warn(

View File

@ -31,12 +31,12 @@ import numpy as np
import typing_extensions
from huggingface_hub.errors import EntryNotFoundError
from .audio_utils import load_audio
from .audio_utils import AudioInput, load_audio
from .dynamic_module_utils import custom_object_save
from .feature_extraction_utils import BatchFeature
from .image_utils import ChannelDimension, is_vision_available
from .image_utils import ChannelDimension, ImageInput, is_vision_available
from .utils.chat_template_utils import render_jinja_template
from .video_utils import VideoMetadata
from .video_utils import VideoInput, VideoMetadata
if is_vision_available():
@ -335,7 +335,8 @@ class CommonKwargs(TypedDict, total=False):
class ProcessingKwargs(TypedDict, total=False):
"""
Base class for kwargs passing to processors.
A model should have its own `ModelProcessorKwargs` class that inherits from `ProcessingKwargs` to provide:
In case a model has specific kwargs that are not present in the base class, or needs default values for existing keys,
it should have its own `ModelProcessorKwargs` class that inherits from `ProcessingKwargs` to provide:
1) Additional typed keys that this model requires to process inputs.
2) Default values for existing keys under a `_defaults` attribute.
New keys have to be defined as follows to ensure type hinting is done correctly.
@ -370,6 +371,8 @@ class ProcessingKwargs(TypedDict, total=False):
"""
_defaults = {}
common_kwargs: CommonKwargs = {
**CommonKwargs.__annotations__,
}
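Following the recipe in the docstring above, a hypothetical model would declare its own kwargs class like this (all `Foo*` names are invented for the example):

from typing import Optional

from transformers.processing_utils import ProcessingKwargs, TextKwargs

class FooTextKwargs(TextKwargs, total=False):
    foo_mode: Optional[str]  # 1) an additional typed key this model understands

class FooProcessorKwargs(ProcessingKwargs, total=False):
    text_kwargs: FooTextKwargs
    _defaults = {
        "text_kwargs": {"padding": "max_length"},  # 2) a default for an existing key
    }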
@ -499,6 +502,7 @@ class ProcessorMixin(PushToHubMixin):
feature_extractor_class = None
tokenizer_class = None
_auto_class = None
valid_processor_kwargs = ProcessingKwargs
# args have to match the attributes class attribute
def __init__(self, *args, **kwargs):
@ -539,6 +543,68 @@ class ProcessorMixin(PushToHubMixin):
self.check_argument_for_proper_class(attribute_name, arg)
setattr(self, attribute_name, arg)
def __call__(
self,
images: Optional[ImageInput] = None,
text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
videos: Optional[VideoInput] = None,
audio: Optional[AudioInput] = None,
**kwargs: Unpack[ProcessingKwargs],
):
"""
Main method to prepare inputs for the model. This method forwards each modality argument to its own processor
along with `kwargs`. Please refer to the docstring of each processor attribute for more information.
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`, *optional*):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
tensor.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] object with processed inputs in a dict format.
"""
if images is None and text is None and videos is None and audio is None:
raise ValueError(f"You need to provide at least one input to call {self.__class__.__name__}")
kwargs = self._merge_kwargs(
self.valid_processor_kwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs if hasattr(self, "tokenizer") else {},
**kwargs,
)
attribute_to_kwargs = {
"tokenizer": (text, "text_kwargs"),
"image_processor": (images, "images_kwargs"),
"video_processor": (videos, "videos_kwargs"),
"feature_extractor": (audio, "audio_kwargs"),
}
outputs = {}
for attribute_name in self.attributes:
attribute = getattr(self, attribute_name, None)
input_data, input_kwargs = attribute_to_kwargs[attribute_name]
if input_data is not None and attribute is not None:
attribute_output = attribute(input_data, **kwargs[input_kwargs])
outputs.update(attribute_output)
return BatchFeature(outputs)
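A minimal sketch of the unified dispatch above, assuming a checkpoint whose processor bundles a tokenizer and an image processor and relies on this shared `__call__` (the id is illustrative): each modality is merged with its kwargs bucket, forwarded to the matching attribute, and the outputs are collected into a single `BatchFeature`.

from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")  # illustrative
image = Image.new("RGB", (224, 224))

batch = processor(text=["a photo of a cat"], images=image, return_tensors="pt")
print(sorted(batch.keys()))  # e.g. ['attention_mask', 'input_ids', 'pixel_values']

# Calling with no inputs at all trips the guard and raises a ValueError.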
def check_argument_for_proper_class(self, argument_name, argument):
"""
Checks the passed argument's class against the expected transformers class. In case of an unexpected

View File

@ -184,7 +184,7 @@ class ChineseCLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase):
inputs = processor(text=input_str, images=image_input)
self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"])
self.assertSetEqual(set(inputs.keys()), {"input_ids", "token_type_ids", "attention_mask", "pixel_values"})
# test if it raises when no input is passed
with pytest.raises(ValueError):

View File

@ -186,21 +186,21 @@ class FlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
inputs = processor(text=input_str, images=image_input)
self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"])
self.assertSetEqual(set(inputs.keys()), {"input_ids", "token_type_ids", "attention_mask", "pixel_values"})
# add extra args
inputs = processor(text=input_str, images=image_input, return_codebook_pixels=True, return_image_mask=True)
self.assertListEqual(
list(inputs.keys()),
[
self.assertSetEqual(
set(inputs.keys()),
{
"input_ids",
"token_type_ids",
"attention_mask",
"pixel_values",
"codebook_pixel_values",
"bool_masked_pos",
],
},
)
# test if it raises when no input is passed

View File

@ -112,7 +112,7 @@ class GitProcessorTest(ProcessorTesterMixin, unittest.TestCase):
inputs = processor(text=input_str, images=image_input)
self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"])
self.assertSetEqual(set(inputs.keys()), {"input_ids", "attention_mask", "pixel_values"})
# test if it raises when no input is passed
with pytest.raises(ValueError):

View File

@ -236,8 +236,8 @@ class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
inputs = processor(text=input_str, images=image_input)
self.assertListEqual(
list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values", "pixel_mask"]
self.assertSetEqual(
set(inputs.keys()), {"input_ids", "token_type_ids", "attention_mask", "pixel_values", "pixel_mask"}
)
# test if it raises when no input is passed

View File

@ -38,6 +38,7 @@ class JanusProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor = self.processor_class.from_pretrained(
"deepseek-community/Janus-Pro-1B",
extra_special_tokens=special_image_tokens,
**self.prepare_processor_dict(),
)
# Set the processor to use the default system prompt to False as it's used based on input modality.
# Hence set to False to avoid any issues in the test irrespective of inputs.

View File

@ -149,7 +149,7 @@ class VisionTextDualEncoderProcessorTest(ProcessorTesterMixin, unittest.TestCase
inputs = processor(text=input_str, images=image_input)
self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"])
self.assertListEqual(list(inputs.keys()), ["pixel_values", "input_ids", "token_type_ids", "attention_mask"])
# test if it raises when no input is passed
with self.assertRaises(ValueError):