mirror of https://github.com/huggingface/transformers.git
synced 2025-10-20 17:13:56 +08:00
[processors] Unbloating simple processors (#40377)
* modularize processor - step 1
* typos
* why raise error, super call check it also
* tiny update
* fix copies
* fix style and test
* lost an import / fix copies
* fix tests
* oops deleted accidentally
committed by GitHub
parent c52889bd51
commit 08edec9f7d
@@ -16,11 +16,7 @@
Image/Text processor class for ALIGN
"""

from typing import Optional, Union

from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin


class AlignProcessorKwargs(ProcessingKwargs, total=False):

@@ -66,74 +62,10 @@ class AlignProcessor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "EfficientNetImageProcessor"
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
    valid_processor_kwargs = AlignProcessorKwargs

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[AlignProcessorKwargs],
    ) -> BatchEncoding:
        """
        Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
        arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` arguments to
        EfficientNetImageProcessor's [`~EfficientNetImageProcessor.__call__`] if `images` is not `None`. Please refer
        to the docstring of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `list[str]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """
        if text is None and images is None:
            raise ValueError("You must specify either text or images.")

        output_kwargs = self._merge_kwargs(
            AlignProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        # then, we can pass correct kwargs to each processor
        if text is not None:
            encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])

        if images is not None:
            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])

        # BC for explicit return_tensors
        if "return_tensors" in output_kwargs["common_kwargs"]:
            return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None)

        if text is not None and images is not None:
            encoding["pixel_values"] = image_features.pixel_values
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)


__all__ = ["AlignProcessor"]
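For context, not part of this diff: caller-facing behavior is unchanged by the removal of the bespoke `__call__`. A minimal smoke test of the slimmed-down processor, assuming the public `kakaobrain/align-base` checkpoint and a dummy image:

import numpy as np
from transformers import AlignProcessor

processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
image = np.zeros((224, 224, 3), dtype=np.uint8)  # dummy RGB image
inputs = processor(images=image, text="a photo of a cat", padding=True, return_tensors="pt")
# Expect input_ids / attention_mask from the tokenizer and pixel_values from
# the image processor, the same fields the removed __call__ produced.
print(sorted(inputs.keys()))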
@@ -16,18 +16,10 @@
Image/Text processor class for AltCLIP
"""

from typing import Optional, Union

from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
from ...processing_utils import ProcessorMixin
from ...utils.deprecation import deprecate_kwarg


class AltClipProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {}


class AltCLIPProcessor(ProcessorMixin):
    r"""
    Constructs a AltCLIP processor which wraps a CLIP image processor and a XLM-Roberta tokenizer into a single

@@ -49,80 +41,7 @@ class AltCLIPProcessor(ProcessorMixin):

    @deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor")
    def __init__(self, image_processor=None, tokenizer=None):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[AltClipProcessorKwargs],
    ) -> BatchEncoding:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not
        `None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:

            images (`ImageInput`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """

        if text is None and images is None:
            raise ValueError("You must specify either text or images.")

        if text is None and images is None:
            raise ValueError("You must specify either text or images.")
        output_kwargs = self._merge_kwargs(
            AltClipProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if text is not None:
            encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
        if images is not None:
            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])

        # BC for explicit return_tensors
        if "return_tensors" in output_kwargs["common_kwargs"]:
            return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None)

        if text is not None and images is not None:
            encoding["pixel_values"] = image_features.pixel_values
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)


__all__ = ["AltCLIPProcessor"]
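The `deprecate_kwarg` decorator imported above renames a keyword on the fly and warns. A simplified stand-in for the idea (not the actual `transformers.utils.deprecation` implementation):

import functools
import warnings

def deprecate_kwarg(old_name, version, new_name):
    # Simplified sketch: remap `old_name` to `new_name` with a FutureWarning.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if old_name in kwargs:
                warnings.warn(
                    f"`{old_name}` is deprecated and will be removed in {version}; use `{new_name}` instead.",
                    FutureWarning,
                )
                kwargs.setdefault(new_name, kwargs.pop(old_name))
            return func(*args, **kwargs)
        return wrapper
    return decorator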
@@ -16,10 +16,7 @@
Processor class for BridgeTower.
"""

from typing import Union

from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin


class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False):

@@ -60,35 +57,10 @@ class BridgeTowerProcessor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "BridgeTowerImageProcessor"
    tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast")
    valid_processor_kwargs = BridgeTowerProcessorKwargs

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        images,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[BridgeTowerProcessorKwargs],
    ) -> BatchEncoding:
        """
        This method uses [`BridgeTowerImageProcessor.__call__`] method to prepare image(s) for the model, and
        [`RobertaTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        """
        output_kwargs = self._merge_kwargs(
            BridgeTowerProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
        # add pixel_values + pixel_mask
        encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"])
        encoding.update(encoding_image_processor)

        return encoding


__all__ = ["BridgeTowerProcessor"]
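A rough mental model of the `_merge_kwargs` call above (illustrative only; the real helper in `processing_utils` validates keys against the schema and handles audio/video groups too):

TEXT_KEYS = {"padding", "truncation", "max_length"}  # illustrative subset
IMAGE_KEYS = {"do_resize", "size", "do_normalize"}   # illustrative subset

def merge_kwargs_sketch(defaults: dict, **user_kwargs) -> dict:
    # Start from the schema's declared defaults (the `_defaults` dict on a
    # ProcessingKwargs subclass), then route flat user kwargs into groups.
    merged = {
        "text_kwargs": dict(defaults.get("text_kwargs", {})),
        "images_kwargs": dict(defaults.get("images_kwargs", {})),
        "common_kwargs": {},
    }
    for key, value in user_kwargs.items():
        if key in TEXT_KEYS:
            merged["text_kwargs"][key] = value
        elif key in IMAGE_KEYS:
            merged["images_kwargs"][key] = value
        else:
            merged["common_kwargs"][key] = value  # e.g. return_tensors
    return merged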
@@ -16,11 +16,22 @@
Processor class for Bros.
"""

from typing import Optional, Union
from ...processing_utils import ProcessingKwargs, ProcessorMixin

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType

class BrosProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {
            "add_special_tokens": True,
            "padding": False,
            "stride": 0,
            "return_overflowing_tokens": False,
            "return_special_tokens_mask": False,
            "return_offsets_mapping": False,
            "return_length": False,
            "verbose": True,
        },
    }


class BrosProcessor(ProcessorMixin):

@@ -37,6 +48,7 @@ class BrosProcessor(ProcessorMixin):

    attributes = ["tokenizer"]
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
    valid_processor_kwargs = BrosProcessorKwargs

    def __init__(self, tokenizer=None, **kwargs):
        if tokenizer is None:

@@ -44,50 +56,5 @@ class BrosProcessor(ProcessorMixin):

        super().__init__(tokenizer)

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchEncoding:
        """
        This method uses [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        """
        encoding = self.tokenizer(
            text=text,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            return_tensors=return_tensors,
            **kwargs,
        )

        return encoding


__all__ = ["BrosProcessor"]
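The long explicit signature removed above is exactly what the `_defaults` block added at the top of the file replaces: defaults become data instead of parameters. A toy illustration of the trade (hypothetical helper names):

DEFAULTS = {"add_special_tokens": True, "padding": False, "stride": 0}

def call_before(tokenizer, text, add_special_tokens=True, padding=False, stride=0):
    # Before: every default is a parameter that must be threaded through.
    return tokenizer(text=text, add_special_tokens=add_special_tokens, padding=padding, stride=stride)

def call_after(tokenizer, text, **overrides):
    # After: defaults are declared once and merged with user overrides.
    return tokenizer(text=text, **{**DEFAULTS, **overrides})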
@@ -99,7 +99,7 @@ class ChameleonProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.
@@ -17,15 +17,8 @@ Image/Text processor class for Chinese-CLIP
"""

import warnings
from typing import Optional, Union

from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput


class ChineseClipProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {}
from ...processing_utils import ProcessorMixin


class ChineseCLIPProcessor(ProcessorMixin):

@@ -58,79 +51,10 @@ class ChineseCLIPProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        images: Optional[ImageInput] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[ChineseClipProcessorKwargs],
    ) -> BatchEncoding:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """

        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")
        output_kwargs = self._merge_kwargs(
            ChineseClipProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if text is not None:
            encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
        if images is not None:
            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])

        # BC for explicit return_tensors
        if "return_tensors" in output_kwargs["common_kwargs"]:
            return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None)

        if text is not None and images is not None:
            encoding["pixel_values"] = image_features.pixel_values
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)

    @property
    def feature_extractor_class(self):
        warnings.warn(
@@ -16,8 +16,16 @@
Audio/Text processor class for CLAP
"""

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from typing import Optional, Union

from ...audio_utils import AudioInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import logging
from ...utils.deprecation import deprecate_kwarg


logger = logging.get_logger(__name__)


class ClapProcessor(ProcessorMixin):

@@ -40,61 +48,28 @@ class ClapProcessor(ProcessorMixin):
    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)

    def __call__(self, text=None, audios=None, return_tensors=None, **kwargs):
    @deprecate_kwarg("audios", version="v4.59.0", new_name="audio")
    def __call__(
        self,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
        audios: Optional[AudioInput] = None,
        audio: Optional[AudioInput] = None,
        **kwargs: Unpack[ProcessingKwargs],
    ):
        """
        Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text`
        and `kwargs` arguments to RobertaTokenizerFast's [`~RobertaTokenizerFast.__call__`] if `text` is not `None` to
        encode the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to
        ClapFeatureExtractor's [`~ClapFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the
        docstring of the above two methods for more information.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            audios (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case
                of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels,
                and T the sample length of the audio.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **audio_features** -- Audio features to be fed to a model. Returned when `audios` is not `None`.
        Forwards the `audio` and `sampling_rate` arguments to [`~ClapFeatureExtractor.__call__`] and the `text`
        argument to [`~RobertaTokenizerFast.__call__`]. Please refer to the docstring of the above two methods for more
        information.
        """
        sampling_rate = kwargs.pop("sampling_rate", None)

        if text is None and audios is None:
            raise ValueError("You have to specify either text or audios. Both cannot be none.")

        if text is not None:
            encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)

        if audios is not None:
            audio_features = self.feature_extractor(
                audios, sampling_rate=sampling_rate, return_tensors=return_tensors, **kwargs
        # The `deprecate_kwarg` will not work if the inputs are passed as arguments, so we check
        # again that the correct naming is used
        if audios is not None and audio is None:
            logger.warning(
                "Using `audios` keyword argument is deprecated when calling ClapProcessor, instead use `audio`."
            )
            audio = audios

        if text is not None and audios is not None:
            encoding.update(audio_features)
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**audio_features), tensor_type=return_tensors)
        return super().__call__(text=text, audio=audio, **kwargs)


__all__ = ["ClapProcessor"]
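Caller-facing sketch of the new shim, assuming the public `laion/clap-htsat-unfused` checkpoint and a dummy waveform; `audios=` still works but triggers the warning above:

import numpy as np
from transformers import ClapProcessor

processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")
waveform = np.zeros(48_000, dtype=np.float32)  # dummy 1-second clip at 48 kHz
inputs = processor(text="a dog barking", audio=waveform, sampling_rate=48_000, return_tensors="pt")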
@@ -19,7 +19,6 @@ Image/Text processor class for CLIP
import warnings

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding


class CLIPProcessor(ProcessorMixin):

@@ -51,71 +50,9 @@ class CLIPProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """
        tokenizer_kwargs, image_processor_kwargs = {}, {}
        if kwargs:
            tokenizer_kwargs = {k: v for k, v in kwargs.items() if k not in self.image_processor._valid_processor_keys}
            image_processor_kwargs = {
                k: v for k, v in kwargs.items() if k in self.image_processor._valid_processor_keys
            }

        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        if text is not None:
            encoding = self.tokenizer(text, return_tensors=return_tensors, **tokenizer_kwargs)

        if images is not None:
            image_features = self.image_processor(images, return_tensors=return_tensors, **image_processor_kwargs)

        if text is not None and images is not None:
            encoding["pixel_values"] = image_features.pixel_values
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)

    @property
    def feature_extractor_class(self):
        warnings.warn(
@@ -51,10 +51,6 @@ class CLIPSegProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

@@ -62,7 +58,7 @@ class CLIPSegProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        ViTImageProcessor's [`~ViTImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of
        the above two methods for more information.
@@ -18,6 +18,10 @@ Processor class for CLVP
"""

from ...processing_utils import ProcessorMixin
from ...utils import logging


logger = logging.get_logger(__name__)


class ClvpProcessor(ProcessorMixin):

@@ -36,11 +40,6 @@ class ClvpProcessor(ProcessorMixin):

    feature_extractor_class = "ClvpFeatureExtractor"
    tokenizer_class = "ClvpTokenizer"
    model_input_names = [
        "input_ids",
        "input_features",
        "attention_mask",
    ]

    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)

@@ -51,27 +50,13 @@ class ClvpProcessor(ProcessorMixin):
        argument to [`~ClvpTokenizer.__call__`]. Please refer to the docstring of the above two methods for more
        information.
        """

        raw_speech = kwargs.pop("raw_speech", None)
        sampling_rate = kwargs.pop("sampling_rate", None)
        text = kwargs.pop("text", None)

        if raw_speech is None and text is None:
            raise ValueError("You need to specify either an `raw_speech` or `text` input to process.")

        if raw_speech is not None:
            inputs = self.feature_extractor(raw_speech, sampling_rate=sampling_rate, **kwargs)
        if text is not None:
            encodings = self.tokenizer(text, **kwargs)

        if text is None:
            return inputs
        elif raw_speech is None:
            return encodings
        else:
            inputs["input_ids"] = encodings["input_ids"]
            inputs["attention_mask"] = encodings["attention_mask"]
            return inputs
            logger.warning(
                "Using `raw_speech` keyword argument is deprecated when calling ClvpProcessor, instead use `audio`."
            )
            kwargs["audio"] = raw_speech
        return super().__call__(*args, **kwargs)


__all__ = ["ClvpProcessor"]
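Same shim as CLAP, here for the legacy `raw_speech` name. An illustrative call (checkpoint name and sampling rate assumed from the CLVP docs, not part of this diff):

from transformers import ClvpProcessor

processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
# processor(raw_speech=waveform, sampling_rate=22050)  # deprecated spelling, warns
# processor(audio=waveform, sampling_rate=22050)       # preferred spelling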
@@ -106,10 +106,6 @@ class ColPaliProcessor(ProcessorMixin):
        query_prefix: str = "Question: ",
    ):
        super().__init__(image_processor, tokenizer, chat_template=chat_template)
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        if not hasattr(image_processor, "image_seq_length"):
            raise ValueError("Image processor is missing an `image_seq_length` attribute.")
@@ -249,7 +249,7 @@ class DeepseekVLProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
        of the above two methods for more information.

@@ -78,7 +78,7 @@ class DeepseekVLProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
        of the above two methods for more information.
@@ -926,7 +926,7 @@ class DeepseekVLHybridProcessor(DeepseekVLProcessor):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        DeepseekVLHybridImageProcessor's [`~DeepseekVLHybridImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
        of the above two methods for more information.

@@ -78,7 +78,7 @@ class DeepseekVLHybridProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        DeepseekVLHybridImageProcessor's [`~DeepseekVLHybridImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
        of the above two methods for more information.
@@ -65,10 +65,6 @@ class DonutProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor
@@ -102,7 +102,7 @@ class Emu3Processor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Emu3TokenizerFast's [`~Emu3TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.
@@ -17,12 +17,39 @@ Image/Text processor class for FLAVA
"""

import warnings
from collections.abc import Iterable
from typing import Optional, Union

from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin


class FlavaImagesKwargs(ImagesKwargs):
    # Mask related params
    return_image_mask: Optional[bool]
    input_size_patches: Optional[int]
    total_mask_patches: Optional[int]
    mask_group_min_patches: Optional[int]
    mask_group_max_patches: Optional[int]
    mask_group_min_aspect_ratio: Optional[float]
    mask_group_max_aspect_ratio: Optional[float]
    # Codebook related params
    return_codebook_pixels: Optional[bool]
    codebook_do_resize: Optional[bool]
    codebook_size: Optional[bool]
    codebook_resample: Optional[int]
    codebook_do_center_crop: Optional[bool]
    codebook_crop_size: Optional[int]
    codebook_do_rescale: Optional[bool]
    codebook_rescale_factor: Optional[Union[int, float]]
    codebook_do_map_pixels: Optional[bool]
    codebook_do_normalize: Optional[bool]
    codebook_image_mean: Optional[Union[float, Iterable[float]]]
    codebook_image_std: Optional[Union[float, Iterable[float]]]


class FlavaProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: FlavaImagesKwargs
    _defaults = {}


class FlavaProcessor(ProcessorMixin):

@@ -40,6 +67,7 @@ class FlavaProcessor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "FlavaImageProcessor"
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
    valid_processor_kwargs = FlavaProcessorKwargs

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        feature_extractor = None

@@ -52,82 +80,9 @@ class FlavaProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_image_mask: Optional[bool] = None,
        return_codebook_pixels: Optional[bool] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ):
        """
        This method uses [`FlavaImageProcessor.__call__`] method to prepare image(s) for the model, and
        [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        """

        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        if text is not None:
            encoding = self.tokenizer(
                text=text,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=pad_to_multiple_of,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
                return_tensors=return_tensors,
                **kwargs,
            )
        if images is not None:
            image_features = self.image_processor(
                images,
                return_image_mask=return_image_mask,
                return_codebook_pixels=return_codebook_pixels,
                return_tensors=return_tensors,
                **kwargs,
            )

        if text is not None and images is not None:
            encoding.update(image_features)
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)

    @property
    def feature_extractor_class(self):
        warnings.warn(
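With `FlavaImagesKwargs` in the schema, the FLAVA-only flags removed from the signature above still flow through the shared call path as plain kwargs. Illustrative usage, assuming the public `facebook/flava-full` checkpoint and a dummy image:

import numpy as np
from transformers import FlavaProcessor

processor = FlavaProcessor.from_pretrained("facebook/flava-full")
image = np.zeros((224, 224, 3), dtype=np.uint8)
inputs = processor(images=image, text="a caption", return_image_mask=True,
                   return_codebook_pixels=True, return_tensors="pt")
print(sorted(inputs.keys()))  # tokenizer fields plus the mask/codebook tensors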
@@ -357,7 +357,7 @@ class Florence2Processor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -158,7 +158,7 @@ class Florence2Processor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.
@@ -16,20 +16,7 @@
Image/Text processor class for GIT
"""

from typing import Optional, Union

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import logging


class GitProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {}


logger = logging.get_logger(__name__)
from ...processing_utils import ProcessorMixin


class GitProcessor(ProcessorMixin):

@@ -54,65 +41,5 @@ class GitProcessor(ProcessorMixin):
        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[GitProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """
        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        output_kwargs = self._merge_kwargs(
            GitProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        data = {}
        if text is not None:
            text_features = self.tokenizer(text, **output_kwargs["text_kwargs"])
            data.update(text_features)
        if images is not None:
            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
            data.update(image_features)

        return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors"))


__all__ = ["GitProcessor"]
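The removed GIT `__call__` already had the merged-dict shape the shared implementation produces; a toy equivalent of that final merge (hypothetical callables, not the transformers API):

def toy_git_call(tokenizer, image_processor, text=None, images=None):
    # Merge whichever modalities are present into one flat dict; the real
    # code wraps the result in BatchFeature(data, tensor_type=...).
    data = {}
    if text is not None:
        data.update(tokenizer(text))
    if images is not None:
        data.update(image_processor(images))
    return data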
@@ -144,7 +144,7 @@ class GotOcr2Processor(ProcessorMixin):
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text if `text`
        is not `None`, otherwise encode default OCR queries which depends on the `format`, `box`, `color`, `multi_page` and
        `crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwrags` arguments to
        `crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to
        GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`.

        Args:
@@ -20,7 +20,6 @@ import pathlib
import warnings
from typing import TYPE_CHECKING, Optional, Union

from ...image_processing_utils import BatchFeature
from ...image_transforms import center_to_corners_format
from ...image_utils import AnnotationFormat, ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack

@@ -144,6 +143,7 @@ class GroundingDinoProcessor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "GroundingDinoImageProcessor"
    tokenizer_class = "AutoTokenizer"
    valid_processor_kwargs = GroundingDinoProcessorKwargs

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)

@@ -152,8 +152,6 @@ class GroundingDinoProcessor(ProcessorMixin):
        self,
        images: Optional[ImageInput] = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[GroundingDinoProcessorKwargs],
    ) -> BatchEncoding:
        """

@@ -170,33 +168,9 @@ class GroundingDinoProcessor(ProcessorMixin):
            - A merged candidate labels string to be detected on the image, separated by "." (e.g. "a cat. a dog.").
            - A batch of merged candidate labels text to be detected on the batch of images (e.g. ["a cat. a dog.", "a car. a person."]).
        """
        if images is None and text is None:
            raise ValueError("You must specify either text or images.")

        output_kwargs = self._merge_kwargs(
            GroundingDinoProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        # Get only text
        if images is not None:
            encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"])
        else:
            encoding_image_processor = BatchFeature()

        if text is not None:
            text = self._preprocess_input_text(text)
            text_encoding = self.tokenizer(
                text=text,
                **output_kwargs["text_kwargs"],
            )
        else:
            text_encoding = BatchEncoding()

        text_encoding.update(encoding_image_processor)

        return text_encoding
        return super().__call__(images=images, text=text, **kwargs)

    def _preprocess_input_text(self, text):
        """
@@ -214,11 +214,6 @@ class IdeficsProcessor(ProcessorMixin):
    tokenizer_class = "LlamaTokenizerFast"

    def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor
        self.image_token_id = (
@@ -90,11 +90,6 @@ class Idefics2Processor(ProcessorMixin):
    def __init__(
        self, image_processor, tokenizer=None, image_seq_len: int = 64, chat_template: Optional[str] = None, **kwargs
    ):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        if not hasattr(tokenizer, "image_token"):
            self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True).content
            self.image_token = AddedToken("<image>", normalized=False, special=True).content
@@ -138,11 +138,6 @@ class Idefics3Processor(ProcessorMixin):
    def __init__(
        self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: Optional[str] = None, **kwargs
    ):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True).content
        self.image_token = AddedToken("<image>", normalized=False, special=True).content
        self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True).content
@@ -165,7 +165,7 @@ class InternVLProcessor(ProcessorMixin):
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text if `text`
        is not `None`, otherwise encode default OCR queries which depends on the `format`, `box`, `color`, `multi_page` and
        `crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwrags` arguments to
        `crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to
        GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`.

        Args:
@@ -88,7 +88,7 @@ class JanusProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        JanusImageProcessor's [`~JanusImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
        of the above two methods for more information.
@ -13,10 +13,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ...audio_utils import AudioInput, make_list_of_audio
|
||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin
|
||||
|
||||
|
||||
class KyutaiSpeechToTextProcessorKwargs(ProcessingKwargs, total=False):
|
||||
@ -38,53 +36,7 @@ class KyutaiSpeechToTextProcessor(ProcessorMixin):
|
||||
|
||||
feature_extractor_class = "KyutaiSpeechToTextFeatureExtractor"
|
||||
tokenizer_class = "PreTrainedTokenizerFast"
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
audio: Optional[AudioInput] = None,
|
||||
**kwargs: Unpack[KyutaiSpeechToTextProcessorKwargs],
|
||||
):
|
||||
r"""
|
||||
Main method to prepare audio to be fed as input to the model. This method forwards the `audio`
|
||||
arguments to KyutaiSpeechToTextFeatureExtractor's [`~KyutaiSpeechToTextFeatureExtractor.__call__`]. Please refer
|
||||
to the docstring of the above method for more information.
|
||||
|
||||
Args:
|
||||
audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
|
||||
The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
|
||||
tensor.
|
||||
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
||||
If set, will return tensors of a particular framework. Acceptable values are:
|
||||
- `'tf'`: Return TensorFlow `tf.constant` objects.
|
||||
- `'pt'`: Return PyTorch `torch.Tensor` objects.
|
||||
- `'np'`: Return NumPy `np.ndarray` objects.
|
||||
- `'jax'`: Return JAX `jnp.ndarray` objects.
|
||||
Returns:
|
||||
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
|
||||
|
||||
- **input_values** -- List of audio values to be fed to a model. Returned when `audio` is not `None`.
|
||||
- **padding_mask** -- List of indices specifying which input values should be ignored by the model.
|
||||
"""
|
||||
|
||||
if audio is None:
|
||||
raise ValueError("`audio` is required.")
|
||||
|
||||
output_kwargs = self._merge_kwargs(
|
||||
KyutaiSpeechToTextProcessorKwargs,
|
||||
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
audio_kwargs = output_kwargs["audio_kwargs"]
|
||||
|
||||
# ensure audio in correct format
|
||||
audio = make_list_of_audio(audio)
|
||||
|
||||
inputs = self.feature_extractor(
|
||||
audio,
|
||||
**audio_kwargs,
|
||||
)
|
||||
|
||||
return inputs
|
||||
valid_processor_kwargs = KyutaiSpeechToTextProcessorKwargs
|
||||
|
||||
|
||||
__all__ = ["KyutaiSpeechToTextProcessor"]
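With its hand-written `__call__` gone, `KyutaiSpeechToTextProcessor` inherits the generic `ProcessorMixin.__call__` (added further down in this diff), and `valid_processor_kwargs` tells the base class which kwargs schema to merge against. A minimal usage sketch; the checkpoint name and sampling rate are illustrative assumptions, not taken from this commit:

```python
import numpy as np
from transformers import KyutaiSpeechToTextProcessor

# Hypothetical checkpoint id; substitute a real Kyutai STT repo.
processor = KyutaiSpeechToTextProcessor.from_pretrained("kyutai/stt-2.6b-en")
audio = np.zeros(24_000, dtype=np.float32)  # 1 s of silence at an assumed 24 kHz

# The inherited __call__ merges kwargs against KyutaiSpeechToTextProcessorKwargs
# and routes `audio` to the feature extractor, as the deleted body did by hand.
inputs = processor(audio=audio, return_tensors="pt")
print(inputs.keys())  # expected: input_values and padding_mask, per the old docstring
```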
@@ -59,10 +59,6 @@ class LayoutLMv2Processor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

@@ -59,10 +59,6 @@ class LayoutLMv3Processor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

@@ -58,10 +58,6 @@ class LayoutXLMProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
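The `None` checks deleted here (and in the analogous hunks below for MgpStr, OneFormer, OwlViT, PaliGemma, TrOCR, ViLT, and others) are redundant because `ProcessorMixin.__init__` already validates the attributes it receives. A simplified sketch of that base-class behavior; the real implementation also resolves optional attributes and type-checks each argument:

```python
# Simplified sketch of the validation the base class performs, so subclasses
# no longer need their own `raise ValueError` guards.
class ProcessorMixinSketch:
    attributes = ["image_processor", "tokenizer"]

    def __init__(self, *args):
        if len(args) != len(self.attributes) or any(arg is None for arg in args):
            raise ValueError(f"You need to specify all of: {', '.join(self.attributes)}.")
        for attribute_name, arg in zip(self.attributes, args):
            setattr(self, attribute_name, arg)
```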
@@ -101,7 +101,7 @@ class LlavaProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -111,7 +111,7 @@ class LlavaNextProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -123,9 +123,9 @@ class LlavaNextVideoProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. To prepare the video(s),
        this method forwards the `videos` and `kwrags` arguments to LlavaNextVideoImageProcessor's
        this method forwards the `videos` and `kwargs` arguments to LlavaNextVideoImageProcessor's
        [`~LlavaNextVideoImageProcessor.__call__`] if `videos` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -121,7 +121,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -67,10 +67,6 @@ class MgpstrProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        self.char_tokenizer = tokenizer
        self.bpe_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

@@ -60,33 +60,9 @@ class MusicgenProcessor(ProcessorMixin):
        if self._in_target_context_manager:
            return self.current_processor(*args, **kwargs)

        audio = kwargs.pop("audio", None)
        sampling_rate = kwargs.pop("sampling_rate", None)
        text = kwargs.pop("text", None)
        if len(args) > 0:
            audio = args[0]
            args = args[1:]

        if audio is None and text is None:
            raise ValueError("You need to specify either an `audio` or `text` input to process.")

        if text is not None:
            inputs = self.tokenizer(text, **kwargs)

        if audio is not None:
            audio_inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)

        if audio is None:
            return inputs

        elif text is None:
            return audio_inputs

        else:
            inputs["input_values"] = audio_inputs["input_values"]
            if "padding_mask" in audio_inputs:
                inputs["padding_mask"] = audio_inputs["padding_mask"]
            return inputs
            kwargs["audio"] = args[0]
        return super().__call__(*args, **kwargs)

    def batch_decode(self, *args, **kwargs):
        """
@@ -51,49 +51,16 @@ class MusicgenMelodyProcessor(ProcessorMixin):
    def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
        return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps)

    def __call__(self, audio=None, text=None, **kwargs):
    def __call__(self, *args, **kwargs):
        """
        Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `audio`
        and `kwargs` arguments to MusicgenMelodyFeatureExtractor's [`~MusicgenMelodyFeatureExtractor.__call__`] if `audio` is not
        `None` to pre-process the audio. It also forwards the `text` and `kwargs` arguments to
        PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`. Please refer to the docstring of the above two methods for more information.

        Args:
            audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case
                of a NumPy array/PyTorch tensor, each audio should be a mono-stereo signal of shape (T), where T is the sample length of the audio.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            kwargs (*optional*):
                Remaining dictionary of keyword arguments that will be passed to the feature extractor and/or the
                tokenizer.
        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **input_features** -- Audio input features to be fed to a model. Returned when `audio` is not `None`.
            - **attention_mask** -- List of token indices specifying which tokens should be attended to by the model when `text` is not `None`.
            When only `audio` is specified, returns the timestamps attention mask.
        Forwards the `audio` argument to EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`] and the `text`
        argument to [`~T5Tokenizer.__call__`]. Please refer to the docstring of the above two methods for more
        information.
        """

        sampling_rate = kwargs.pop("sampling_rate", None)

        if audio is None and text is None:
            raise ValueError("You need to specify either an `audio` or `text` input to process.")

        if text is not None:
            inputs = self.tokenizer(text, **kwargs)
        if audio is not None:
            audio_inputs = self.feature_extractor(audio, sampling_rate=sampling_rate, **kwargs)

        if text is None:
            return audio_inputs
        elif audio is None:
            return inputs
        else:
            inputs["input_features"] = audio_inputs["input_features"]
            return inputs
        if len(args) > 0:
            kwargs["audio"] = args[0]
        return super().__call__(*args, **kwargs)

    # Copied from transformers.models.musicgen.processing_musicgen.MusicgenProcessor.batch_decode with padding_mask->attention_mask
    def batch_decode(self, *args, **kwargs):

@@ -48,11 +48,6 @@ class OneFormerProcessor(ProcessorMixin):
    def __init__(
        self, image_processor=None, tokenizer=None, max_seq_length: int = 77, task_seq_length: int = 77, **kwargs
    ):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        self.max_seq_length = max_seq_length
        self.task_seq_length = task_seq_length

@@ -85,7 +85,7 @@ class Ovis2Processor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        Ovis2ImageProcessor's [`~Ovis2ImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -86,7 +86,7 @@ class Owlv2Processor(ProcessorMixin):
        """
        Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
        `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -82,10 +82,6 @@ class OwlViTProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

@@ -100,7 +96,7 @@ class OwlViTProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
        `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -125,10 +125,6 @@ class PaliGemmaProcessor(ProcessorMixin):
        chat_template=None,
        **kwargs,
    ):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        if not hasattr(image_processor, "image_seq_length"):
            raise ValueError("Image processor is missing an `image_seq_length` attribute.")

@@ -161,7 +157,7 @@ class PaliGemmaProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to GemmaTokenizerFast's [`~GemmaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -86,7 +86,7 @@ class Phi4MultimodalProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forards the `text`
        and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        Phi4MultimodalImageProcessorFast's [`~Phi4MultimodalImageProcessorFast.__call__`] if `images` is not `None`. Please refer to the doctsring
        of the above two methods for more information.

@@ -127,7 +127,7 @@ class PixtralProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -893,7 +893,7 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

        Args:

@@ -105,7 +105,7 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

        Args:

@@ -87,7 +87,7 @@ class Qwen2AudioProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to
        the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
        WhisperFeatureExtractor's [`~WhisperFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -100,7 +100,7 @@ class Qwen2VLProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

        Args:

@@ -16,7 +16,26 @@
Audio/Text processor class for SeamlessM4T
"""

from ...processing_utils import ProcessorMixin
from typing import Optional, Union

from ...audio_utils import AudioInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import logging
from ...utils.deprecation import deprecate_kwarg


logger = logging.get_logger(__name__)


class SeamlessM4TTextKwargs(TextKwargs):
    src_lang: Optional[str]
    tgt_lang: Optional[str]


class SeamlessM4TProcessorKwargs(ProcessingKwargs, total=False):
    text_kwargs: SeamlessM4TTextKwargs
    _defaults = {}


class SeamlessM4TProcessor(ProcessorMixin):

@@ -37,15 +56,23 @@ class SeamlessM4TProcessor(ProcessorMixin):

    feature_extractor_class = "SeamlessM4TFeatureExtractor"
    tokenizer_class = ("SeamlessM4TTokenizer", "SeamlessM4TTokenizerFast")
    valid_processor_kwargs = SeamlessM4TProcessorKwargs

    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)

    def __call__(self, text=None, audios=None, src_lang=None, tgt_lang=None, **kwargs):
    @deprecate_kwarg("audios", version="v4.59.0", new_name="audio")
    def __call__(
        self,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
        audios: Optional[AudioInput] = None,
        audio: Optional[AudioInput] = None,
        **kwargs: Unpack[ProcessingKwargs],
    ):
        """
        Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text`
        and `kwargs` arguments to SeamlessM4TTokenizerFast's [`~SeamlessM4TTokenizerFast.__call__`] if `text` is not
        `None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to
        `None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
        SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.__call__`] if `audios` is not `None`. Please refer
        to the docstring of the above two methods for more information.

@@ -58,14 +85,6 @@ class SeamlessM4TProcessor(ProcessorMixin):
                The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case
                of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels,
                and T the sample length of the audio.
            src_lang (`str`, *optional*):
                The language code of the input texts/audios. If not specified, the last `src_lang` specified will be
                used.
            tgt_lang (`str`, *optional*):
                The code of the target language. If not specified, the last `tgt_lang` specified will be used.
            kwargs (*optional*):
                Remaining dictionary of keyword arguments that will be passed to the feature extractor and/or the
                tokenizer.
        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

@@ -75,26 +94,16 @@ class SeamlessM4TProcessor(ProcessorMixin):
            `None`).
            - **input_features** -- Audio input features to be fed to a model. Returned when `audios` is not `None`.
        """
        sampling_rate = kwargs.pop("sampling_rate", None)

        if text is None and audios is None:
            raise ValueError("You have to specify either text or audios. Both cannot be none.")
        elif text is not None and audios is not None:
        if text is not None and audios is not None:
            raise ValueError(
                "Text and audios are mututally exclusive when passed to `SeamlessM4T`. Specify one or another."
            )
        elif text is not None:
            if tgt_lang is not None:
                self.tokenizer.tgt_lang = tgt_lang
            if src_lang is not None:
                self.tokenizer.src_lang = src_lang
            encoding = self.tokenizer(text, **kwargs)

            return encoding

        else:
            encoding = self.feature_extractor(audios, sampling_rate=sampling_rate, **kwargs)
            return encoding
        if audio is None and audios is not None:
            logger.warning(
                "Passing `audios` as keyword argument is deprecated and will be removed in v4.63, please pass `audio` instead."
            )
            audio = audios
        return super().__call__(text=text, audio=audio, **kwargs)


__all__ = ["SeamlessM4TProcessor"]
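The `audios` keyword keeps working during a deprecation window: the `@deprecate_kwarg` decorator handles the rename, and the explicit warning above covers direct `audios=` callers. A minimal sketch of the same shim using only the standard library; names and the warning text are illustrative:

```python
import warnings


def seamless_call_sketch(text=None, audio=None, audios=None):
    # Migration shim: accept the deprecated `audios` alias but steer callers
    # toward `audio`, mirroring the fallback in the rewritten __call__ above.
    if audio is None and audios is not None:
        warnings.warn("`audios` is deprecated, please pass `audio` instead.", FutureWarning)
        audio = audios
    if text is not None and audio is not None:
        raise ValueError("Text and audio are mutually exclusive for SeamlessM4T.")
    return {"text": text, "audio": audio}
```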
@@ -16,13 +16,7 @@
Image/Text processor class for SigLIP.
"""

from typing import Optional, Union

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType


class SiglipProcessor(ProcessorMixin):

@@ -46,79 +40,5 @@ class SiglipProcessor(ProcessorMixin):
    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        images: Optional[ImageInput] = None,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to SiglipTokenizer's [`~SiglipTokenizer.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` argument to
        SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:
                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`, *optional*):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """

        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        if text is not None:
            encoding = self.tokenizer(
                text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
            )

        if images is not None:
            image_features = self.image_processor(images, return_tensors=return_tensors)

        if text is not None and images is not None:
            encoding.update(image_features)
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchFeature(data=dict(**image_features), tensor_type=return_tensors)


__all__ = ["SiglipProcessor"]
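One subtlety of deleting this `__call__`: the old SigLIP signature defaulted `return_tensors` to `TensorType.PYTORCH`, while the generic path takes its defaults from the merged kwargs. Unless that default is preserved elsewhere (e.g. in a `_defaults` entry not shown in this diff), callers relying on the implicit PyTorch tensors may want to pass it explicitly, as in this illustrative call:

```python
import numpy as np
from PIL import Image
from transformers import SiglipProcessor

processor = SiglipProcessor.from_pretrained("google/siglip-base-patch16-224")
image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))

# Pass return_tensors explicitly rather than relying on the removed default.
inputs = processor(text=["a photo"], images=image, padding="max_length", return_tensors="pt")
print(sorted(inputs.keys()))  # typically ['input_ids', 'pixel_values']
```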
@@ -16,12 +16,9 @@
Image/Text processor class for SigLIP2.
"""

from typing import Optional, Union
from typing import Optional

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin


class Siglip2ImagesKwargs(ImagesKwargs, total=False):

@@ -63,89 +60,10 @@ class Siglip2Processor(ProcessorMixin):

    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"
    valid_processor_kwargs = Siglip2ProcessorKwargs

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        images: Optional[Union[ImageInput, list[ImageInput], list[list[ImageInput]]]] = None,
        text: Optional[Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[Siglip2ProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to GemmaTokenizerFast's [`~GemmaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` argument to
        Siglip2ImageProcessor's [`~Siglip2ImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `max_length`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*, defaults to 64):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`, *optional*, defaults to `True`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'pt'`):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_attention_mask** -- Attention mask for the pixel values. Returned when `images` is not `None`.
            - **spatial_shapes** -- The number of horizontal and vertical patches per image.
              Returned when `images` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            Siglip2ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        if text is not None:
            encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])

        if images is not None:
            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])

        if text is not None and images is not None:
            encoding.update(image_features)
            return encoding
        elif text is not None:
            return encoding
        else:
            return_tensors = output_kwargs["common_kwargs"]["return_tensors"]
            return BatchFeature(data=dict(**image_features), tensor_type=return_tensors)


__all__ = ["Siglip2Processor"]
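The deleted body above also documents the shape `_merge_kwargs` produces: one dict per modality plus `common_kwargs`, with `return_tensors` living in the common bucket. An illustrative snapshot (keys per the deleted code; the exact contents depend on the kwargs schema and its `_defaults`):

```python
# Illustrative only: what a merged kwargs dict may look like for SigLIP2.
output_kwargs = {
    "text_kwargs": {"padding": "max_length", "max_length": 64, "truncation": True},
    "images_kwargs": {"return_tensors": "pt"},
    "videos_kwargs": {},
    "audio_kwargs": {},
    "common_kwargs": {"return_tensors": "pt"},
}
# The image-only branch read the tensor type from the common bucket:
tensor_type = output_kwargs["common_kwargs"]["return_tensors"]
assert tensor_type == "pt"
```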
@@ -60,10 +60,6 @@ class TrOCRProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

@@ -16,8 +16,18 @@
Processor class for TVP.
"""

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from ...processing_utils import ProcessingKwargs, ProcessorMixin


class TvpProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {
            "truncation": True,
            "padding": "max_length",
            "pad_to_max_length": True,
            "return_token_type_ids": False,
        },
    }


class TvpProcessor(ProcessorMixin):

@@ -39,74 +49,8 @@ class TvpProcessor(ProcessorMixin):
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    def __call__(self, text=None, videos=None, return_tensors=None, **kwargs):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
        TvpImageProcessor's [`~TvpImageProcessor.__call__`] if `videos` is not `None`. Please refer to the docstring of
        the above two methods for more information.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            videos (`list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, `list[list[PIL.Image.Image]]`, `list[list[np.ndarray]]`,:
                `list[list[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list
                of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors,
                each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of
                channels.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `videos` is not `None`.
        """

        max_text_length = kwargs.pop("max_text_length", None)

        if text is None and videos is None:
            raise ValueError("You have to specify either text or videos. Both cannot be none.")

        encoding = {}
        if text is not None:
            textual_input = self.tokenizer.batch_encode_plus(
                text,
                truncation=True,
                padding="max_length",
                max_length=max_text_length,
                pad_to_max_length=True,
                return_tensors=return_tensors,
                return_token_type_ids=False,
                **kwargs,
            )
            encoding.update(textual_input)

        if videos is not None:
            image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs)
            encoding.update(image_features)

        return BatchEncoding(data=encoding, tensor_type=return_tensors)
        self.video_processor = image_processor

    def post_process_video_grounding(self, logits, video_durations):
        """

@@ -100,7 +100,7 @@ class VideoLlavaProcessor(ProcessorMixin):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        VideoLlavaImageProcessor's [`~VideoLlavaImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

@@ -17,11 +17,23 @@ Processor class for ViLT.
"""

import warnings
from typing import Optional, Union

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ...processing_utils import ProcessingKwargs, ProcessorMixin


class ViltProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {
            "add_special_tokens": True,
            "padding": False,
            "stride": 0,
            "return_overflowing_tokens": False,
            "return_special_tokens_mask": False,
            "return_offsets_mapping": False,
            "return_length": False,
            "verbose": True,
        },
    }


class ViltProcessor(ProcessorMixin):

@@ -41,6 +53,7 @@ class ViltProcessor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "ViltImageProcessor"
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
    valid_processor_kwargs = ViltProcessorKwargs

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        feature_extractor = None

@@ -53,64 +66,9 @@ class ViltProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

    def __call__(
        self,
        images,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchEncoding:
        """
        This method uses [`ViltImageProcessor.__call__`] method to prepare image(s) for the model, and
        [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        """
        encoding = self.tokenizer(
            text=text,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            return_tensors=return_tensors,
            **kwargs,
        )
        # add pixel_values + pixel_mask
        encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)
        encoding.update(encoding_image_processor)

        return encoding

    @property
    def feature_extractor_class(self):
        warnings.warn(

@@ -17,11 +17,8 @@ Processor class for VisionTextDualEncoder
"""

import warnings
from typing import Optional, Union

from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin


class VisionTextDualEncoderProcessorKwargs(ProcessingKwargs, total=False):

@@ -59,82 +56,9 @@ class VisionTextDualEncoderProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You have to specify an image_processor.")
        if tokenizer is None:
            raise ValueError("You have to specify a tokenizer.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[VisionTextDualEncoderProcessorKwargs],
    ) -> BatchEncoding:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to VisionTextDualEncoderTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not
        `None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """

        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        output_kwargs = self._merge_kwargs(
            VisionTextDualEncoderProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if text is not None:
            encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])

        if images is not None:
            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])

        if text is not None and images is not None:
            encoding["pixel_values"] = image_features.pixel_values
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(
                data=dict(**image_features),
                tensor_type=output_kwargs["common_kwargs"].get("return_tensors"),
            )

    @property
    def feature_extractor_class(self):
        warnings.warn(

@@ -19,7 +19,6 @@ Image/Text processor class for XCLIP
import warnings

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding


class XCLIPProcessor(ProcessorMixin):

@@ -51,68 +50,10 @@ class XCLIPProcessor(ProcessorMixin):
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.video_processor = self.image_processor
        self.current_processor = self.image_processor

    def __call__(self, text=None, videos=None, return_tensors=None, **kwargs):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
        VideoMAEImageProcessor's [`~VideoMAEImageProcessor.__call__`] if `videos` is not `None`. Please refer to the
        docstring of the above two methods for more information.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            videos (`list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, `list[list[PIL.Image.Image]]`, `list[list[np.ndarray]]`,:
                `list[list[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list
                of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors,
                each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of
                channels.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `videos` is not `None`.
        """

        if text is None and videos is None:
            raise ValueError("You have to specify either text or videos. Both cannot be none.")

        if text is not None:
            encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)

        if videos is not None:
            image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs)

        if text is not None and videos is not None:
            encoding["pixel_values"] = image_features.pixel_values
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)

    @property
    def feature_extractor_class(self):
        warnings.warn(

@@ -31,12 +31,12 @@ import numpy as np
import typing_extensions
from huggingface_hub.errors import EntryNotFoundError

from .audio_utils import load_audio
from .audio_utils import AudioInput, load_audio
from .dynamic_module_utils import custom_object_save
from .feature_extraction_utils import BatchFeature
from .image_utils import ChannelDimension, is_vision_available
from .image_utils import ChannelDimension, ImageInput, is_vision_available
from .utils.chat_template_utils import render_jinja_template
from .video_utils import VideoMetadata
from .video_utils import VideoInput, VideoMetadata


if is_vision_available():

@@ -335,7 +335,8 @@ class CommonKwargs(TypedDict, total=False):
class ProcessingKwargs(TypedDict, total=False):
    """
    Base class for kwargs passing to processors.
    A model should have its own `ModelProcessorKwargs` class that inherits from `ProcessingKwargs` to provide:
    In case a model has specific kwargs that are not present in the base class or default values for existing keys,
    it should have its own `ModelProcessorKwargs` class that inherits from `ProcessingKwargs` to provide:
    1) Additional typed keys and that this model requires to process inputs.
    2) Default values for existing keys under a `_defaults` attribute.
    New keys have to be defined as follows to ensure type hinting is done correctly.
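A sketch of the pattern this docstring describes, following the model-specific kwargs classes seen earlier in the diff (TVP, ViLT, SigLIP2); the model name and extra key are illustrative:

```python
from transformers.processing_utils import ImagesKwargs, ProcessingKwargs


class MyModelImagesKwargs(ImagesKwargs, total=False):
    # 1) an additional typed key this hypothetical model needs
    crop_to_patches: bool


class MyModelProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: MyModelImagesKwargs
    # 2) default values for existing keys
    _defaults = {
        "images_kwargs": {"crop_to_patches": True},
        "text_kwargs": {"padding": False},
    }
```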
@@ -370,6 +371,8 @@ class ProcessingKwargs(TypedDict, total=False):

    """

    _defaults = {}

    common_kwargs: CommonKwargs = {
        **CommonKwargs.__annotations__,
    }

@@ -499,6 +502,7 @@ class ProcessorMixin(PushToHubMixin):
    feature_extractor_class = None
    tokenizer_class = None
    _auto_class = None
    valid_processor_kwargs = ProcessingKwargs

    # args have to match the attributes class attribute
    def __init__(self, *args, **kwargs):

@@ -539,6 +543,68 @@ class ProcessorMixin(PushToHubMixin):
            self.check_argument_for_proper_class(attribute_name, arg)
            setattr(self, attribute_name, arg)

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
        videos: Optional[VideoInput] = None,
        audio: Optional[AudioInput] = None,
        **kwargs: Unpack[ProcessingKwargs],
    ):
        """
        Main method to prepare for model inputs. This method forwards the each modality argument to its own processor
        along with `kwargs`. Please refer to the docstring of the each processor attributes for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
            audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
                tensor.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] object with processed inputs in a dict format.
        """
        if images is None and text is None and videos is None and audio is None:
            raise ValueError(f"You need to provide at least one input to call {self.__class__.__name__}")

        kwargs = self._merge_kwargs(
            self.valid_processor_kwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs if hasattr(self, "tokenizer") else {},
            **kwargs,
        )

        attribute_to_kwargs = {
            "tokenizer": (text, "text_kwargs"),
            "image_processor": (images, "images_kwargs"),
            "video_processor": (videos, "videos_kwargs"),
            "feature_extractor": (audio, "audio_kwargs"),
        }
        outputs = {}
        for attribute_name in self.attributes:
            attribute = getattr(self, attribute_name, None)
            input_data, input_kwargs = attribute_to_kwargs[attribute_name]
            if input_data is not None and attribute is not None:
                attribute_output = attribute(input_data, **kwargs[input_kwargs])
                outputs.update(attribute_output)

        return BatchFeature(outputs)

    def check_argument_for_proper_class(self, argument_name, argument):
        """
        Checks the passed argument's class against the expected transformers class. In case of an unexpected
|
||||
|
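To make the dispatch concrete, a usage sketch (the checkpoint id is illustrative, not from this PR): each modality passed to `__call__` is routed to the matching sub-processor, and the merged result comes back as a single `BatchFeature`.

from PIL import Image
from transformers import AutoProcessor

# illustrative checkpoint id; any processor relying on the shared base __call__ behaves the same
processor = AutoProcessor.from_pretrained("my-org/my-multimodal-checkpoint")
image = Image.new("RGB", (224, 224))

inputs = processor(text="a photo of a cat", images=image, return_tensors="pt")
print(inputs.keys())  # e.g. dict_keys(['input_ids', 'attention_mask', 'pixel_values'])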
@ -184,7 +184,7 @@ class ChineseCLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase):

inputs = processor(text=input_str, images=image_input)

self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"])
self.assertSetEqual(set(inputs.keys()), {"input_ids", "token_type_ids", "attention_mask", "pixel_values"})

# test if it raises when no input is passed
with pytest.raises(ValueError):

@ -186,21 +186,21 @@ class FlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):

inputs = processor(text=input_str, images=image_input)

self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"])
self.assertSetEqual(set(inputs.keys()), {"input_ids", "token_type_ids", "attention_mask", "pixel_values"})

# add extra args
inputs = processor(text=input_str, images=image_input, return_codebook_pixels=True, return_image_mask=True)

self.assertListEqual(
list(inputs.keys()),
[
self.assertSetEqual(
set(inputs.keys()),
{
"input_ids",
"token_type_ids",
"attention_mask",
"pixel_values",
"codebook_pixel_values",
"bool_masked_pos",
],
},
)

# test if it raises when no input is passed

@ -112,7 +112,7 @@ class GitProcessorTest(ProcessorTesterMixin, unittest.TestCase):

inputs = processor(text=input_str, images=image_input)

self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"])
self.assertSetEqual(set(inputs.keys()), {"input_ids", "attention_mask", "pixel_values"})

# test if it raises when no input is passed
with pytest.raises(ValueError):

@ -236,8 +236,8 @@ class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase):

inputs = processor(text=input_str, images=image_input)

self.assertListEqual(
list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values", "pixel_mask"]
self.assertSetEqual(
set(inputs.keys()), {"input_ids", "token_type_ids", "attention_mask", "pixel_values", "pixel_mask"}
)

# test if it raises when no input is passed
@ -38,6 +38,7 @@ class JanusProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor = self.processor_class.from_pretrained(
"deepseek-community/Janus-Pro-1B",
extra_special_tokens=special_image_tokens,
**self.prepare_processor_dict(),
)
# Set use of the default system prompt to False, since it is applied based on input modality.
# Disabling it avoids any issues in the test irrespective of inputs.
@ -149,7 +149,7 @@ class VisionTextDualEncoderProcessorTest(ProcessorTesterMixin, unittest.TestCase):

inputs = processor(text=input_str, images=image_input)

self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"])
self.assertListEqual(list(inputs.keys()), ["pixel_values", "input_ids", "token_type_ids", "attention_mask"])

# test if it raises when no input is passed
with self.assertRaises(ValueError):