Mirror of https://github.com/huggingface/transformers.git (synced 2025-10-20 17:13:56 +08:00)
Validate processing kwargs with @strict from huggingface_hub (#40793)
* initial design draft
* delete
* fix a few tests
* fix
* fix the rest of tests
* common-kwargs
* why the runner complains about typing with "|"?
* revert
* forgot to delete
* update
* fix last issues
* add more details in docs
* pin the latest hub release
* fix tests for new models
* also fast image processor
* fix copies
* image processing ast validated
* fix more tests
* typo and fix copies
* bump
* style
* fix some tests
* fix copies
* pin rc4 and mark all TypedDict as non-total
* delete typed dict adaptor
* address comments
* delete optionals
Committed by GitHub
parent 82ffeb28ad
commit 89a4115a6b
setup.py: 2 changed lines
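In short, the diff below pins huggingface_hub to 1.0.0.rc4, marks every processing kwargs TypedDict as non-total (so plain types replace Optional[...] annotations), and has the fast image processors type-check incoming kwargs with validate_typed_dict. A minimal sketch of that pattern follows; MyImagesKwargs and the literal values are made up, only the non-total TypedDict layout and the validate_typed_dict(typed_dict_cls, kwargs) call mirror the diff.

# Sketch only: `MyImagesKwargs` is illustrative and not part of the PR.
from typing import TypedDict

from huggingface_hub.dataclasses import validate_typed_dict


class MyImagesKwargs(TypedDict, total=False):
    # total=False: every key may be omitted, so fields no longer need Optional[...]
    do_resize: bool
    size: dict[str, int]
    crop_pct: float


# Matching keys and types pass silently.
validate_typed_dict(MyImagesKwargs, {"do_resize": True, "size": {"height": 224, "width": 224}})

# A mismatched type is expected to raise a validation error from huggingface_hub.
validate_typed_dict(MyImagesKwargs, {"crop_pct": "0.9"})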
@@ -114,7 +114,7 @@ _deps = [
     "GitPython<3.1.19",
     "hf-doc-builder>=0.3.0",
     "hf_xet",
-    "huggingface-hub==1.0.0.rc2",
+    "huggingface-hub==1.0.0.rc4",
     "importlib_metadata",
     "ipadic>=1.0.0,<2.0",
     "jinja2>=3.1.0",
@@ -23,7 +23,7 @@ deps = {
     "GitPython": "GitPython<3.1.19",
     "hf-doc-builder": "hf-doc-builder>=0.3.0",
     "hf_xet": "hf_xet",
-    "huggingface-hub": "huggingface-hub==1.0.0.rc2",
+    "huggingface-hub": "huggingface-hub==1.0.0.rc4",
     "importlib_metadata": "importlib_metadata",
     "ipadic": "ipadic>=1.0.0,<2.0",
     "jinja2": "jinja2>=3.1.0",
@@ -18,6 +18,7 @@ from functools import lru_cache, partial
 from typing import Any, Optional, Union
 
 import numpy as np
+from huggingface_hub.dataclasses import validate_typed_dict
 
 from .image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
 from .image_transforms import (
@@ -710,6 +711,10 @@ class BaseImageProcessorFast(BaseImageProcessor):
     def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
         # args are not validated, but their order in the `preprocess` and `_preprocess` signatures must be the same
         validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names)
+
+        # Perform type validation on received kwargs
+        validate_typed_dict(self.valid_kwargs, kwargs)
+
         # Set default kwargs from self. This ensures that if a kwarg is not provided
         # by the user, it gets its default value from the instance, or is set to None.
         for kwarg_name in self._valid_kwargs_names:
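As a usage illustration of the new check in BaseImageProcessorFast.preprocess (the checkpoint name below is a placeholder, and the exact exception raised comes from huggingface_hub, not this diff), a kwarg with the wrong type now fails up front instead of deep inside _preprocess:

from PIL import Image
from transformers import AutoImageProcessor

image = Image.new("RGB", (224, 224))
# Placeholder checkpoint; any model with a fast image processor behaves the same way.
processor = AutoImageProcessor.from_pretrained("org/some-checkpoint", use_fast=True)

# Accepted: kwarg names and types match the processor's ImagesKwargs TypedDict.
out = processor(image, do_resize=True, size={"height": 224, "width": 224})

# Rejected by validate_typed_dict before any preprocessing happens:
# do_resize is annotated as bool, so a string is a type error.
processor(image, do_resize="yes")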
@@ -38,7 +38,7 @@ from ...image_utils import (
 )
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_utils import PreTrainedModel
-from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils import PreTokenizedInput, TextInput
 from ...utils import TensorType, TransformersKwargs, auto_docstring, can_return_tuple, logging
 from ..auto import CONFIG_MAPPING, AutoConfig, AutoTokenizer
@@ -904,7 +904,15 @@ class AriaImageProcessor(BaseImageProcessor):
         return num_patches
 
 
+class AriaImagesKwargs(ImagesKwargs, total=False):
+    split_image: bool
+    max_image_size: int
+    min_image_size: int
+
+
 class AriaProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: AriaImagesKwargs
+
     _defaults = {
         "text_kwargs": {
             "padding": False,
@@ -24,13 +24,21 @@ import numpy as np
 
 from ...image_processing_utils import BatchFeature
 from ...image_utils import ImageInput
-from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils import PreTokenizedInput, TextInput
 from ...utils import TensorType
 from ..auto import AutoTokenizer
 
 
+class AriaImagesKwargs(ImagesKwargs, total=False):
+    split_image: bool
+    max_image_size: int
+    min_image_size: int
+
+
 class AriaProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: AriaImagesKwargs
+
     _defaults = {
         "text_kwargs": {
             "padding": False,
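For callers, the practical effect of the Aria changes above is that the model-specific image kwargs now live on a dedicated AriaImagesKwargs class and get type-checked like everything else. A hedged sketch of a call site; the checkpoint name and prompt are placeholders, while the kwarg names come from the class added in the diff:

from PIL import Image
from transformers import AutoProcessor

image = Image.new("RGB", (980, 980))
processor = AutoProcessor.from_pretrained("org/aria-checkpoint")  # placeholder name

inputs = processor(
    text="Describe the picture.",  # placeholder prompt
    images=image,
    split_image=True,    # AriaImagesKwargs.split_image: bool
    max_image_size=980,  # AriaImagesKwargs.max_image_size: int
    return_tensors="pt",
)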
@@ -55,7 +55,7 @@ if is_torch_available():
 logger = logging.get_logger(__name__)
 
 
-class BeitImageProcessorKwargs(ImagesKwargs):
+class BeitImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
         Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
@@ -63,7 +63,7 @@ class BeitImageProcessorKwargs(ImagesKwargs):
         ADE20k). The background label will be replaced by 255.
     """
 
-    do_reduce_labels: Optional[bool]
+    do_reduce_labels: bool
 
 
 @requires(backends=("vision",))
@@ -123,8 +123,8 @@ def get_resize_output_image_size(
     return new_height, new_width
 
 
-class BridgeTowerImageProcessorKwargs(ImagesKwargs):
-    size_divisor: Optional[int]
+class BridgeTowerImageProcessorKwargs(ImagesKwargs, total=False):
+    size_divisor: int
 
 
 class BridgeTowerImageProcessor(BaseImageProcessor):
@@ -33,7 +33,7 @@ from ...processing_utils import ImagesKwargs, Unpack
 from ...utils import TensorType, auto_docstring
 
 
-class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
+class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False):
     """
     crop_to_patches (`bool`, *optional*, defaults to `False`):
         Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
@@ -46,9 +46,9 @@ class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
         set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
     """
 
-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
+    crop_to_patches: bool
+    min_patches: int
+    max_patches: int
 
 
 @lru_cache(maxsize=10)
@@ -303,7 +303,7 @@ def get_optimal_tiled_canvas(
     return best_grid
 
 
-class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
+class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False):
     """
     crop_to_patches (`bool`, *optional*, defaults to `False`):
         Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
@@ -316,9 +316,9 @@ class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs):
         set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
     """
 
-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
+    crop_to_patches: bool
+    min_patches: int
+    max_patches: int
 
 
 @auto_docstring
@@ -729,7 +729,7 @@ def compute_segments(
     return segmentation, segments
 
 
-class ConditionalDetrImageProcessorKwargs(ImagesKwargs):
+class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
         Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@@ -745,9 +745,9 @@ class ConditionalDetrImageProcessorKwargs(ImagesKwargs):
         Path to the directory containing the segmentation masks.
     """
 
-    format: Optional[Union[str, AnnotationFormat]]
-    do_convert_annotations: Optional[bool]
-    return_segmentation_masks: Optional[bool]
+    format: Union[str, AnnotationFormat]
+    do_convert_annotations: bool
+    return_segmentation_masks: bool
     annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
     masks_path: Optional[Union[str, pathlib.Path]]
 
@@ -50,14 +50,14 @@ if is_vision_available():
 logger = logging.get_logger(__name__)
 
 
-class ConvNextImageProcessorKwargs(ImagesKwargs):
+class ConvNextImageProcessorKwargs(ImagesKwargs, total=False):
     """
     crop_pct (`float`, *optional*):
         Percentage of the image to crop. Only has an effect if size < 384. Can be
         overridden by `crop_pct` in the`preprocess` method.
     """
 
-    crop_pct: Optional[float]
+    crop_pct: float
 
 
 @requires(backends=("vision",))
@@ -49,7 +49,7 @@ if is_vision_available():
 logger = logging.get_logger(__name__)
 
 
-class DeepseekVLImageProcessorKwargs(ImagesKwargs):
+class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     min_size (`int`, *optional*, defaults to 14):
         The minimum allowed size for the resized image. Ensures that neither the height nor width
@@ -50,7 +50,7 @@ if is_vision_available():
 logger = logging.get_logger(__name__)
 
 
-class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
+class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     min_size (`int`, *optional*, defaults to 14):
         The minimum allowed size for the resized image. Ensures that neither the height nor width
@@ -71,9 +71,9 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
 
     min_size: int
     high_res_size: dict
-    high_res_resample: "PILImageResampling"
-    high_res_image_mean: list[float]
-    high_res_image_std: list[float]
+    high_res_resample: Union["PILImageResampling", int]
+    high_res_image_mean: Union[float, list[float], tuple[float, ...]]
+    high_res_image_std: Union[float, list[float], tuple[float, ...]]
 
 
 class DeepseekVLHybridImageProcessor(BaseImageProcessor):
@@ -429,7 +429,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneration):
         return model_inputs
 
 
-class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
+class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     min_size (`int`, *optional*, defaults to 14):
         The minimum allowed size for the resized image. Ensures that neither the height nor width
@@ -450,9 +450,9 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
 
     min_size: int
     high_res_size: dict
-    high_res_resample: "PILImageResampling"
-    high_res_image_mean: list[float]
-    high_res_image_std: list[float]
+    high_res_resample: Union["PILImageResampling", int]
+    high_res_image_mean: Union[float, list[float], tuple[float, ...]]
+    high_res_image_std: Union[float, list[float], tuple[float, ...]]
 
 
 class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
@@ -82,7 +82,7 @@ if is_scipy_available():
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
-class DeformableDetrImageProcessorKwargs(ImagesKwargs):
+class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
         Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@@ -98,9 +98,9 @@ class DeformableDetrImageProcessorKwargs(ImagesKwargs):
         Path to the directory containing the segmentation masks.
     """
 
-    format: Optional[Union[str, AnnotationFormat]]
-    do_convert_annotations: Optional[bool]
-    return_segmentation_masks: Optional[bool]
+    format: Union[str, AnnotationFormat]
+    do_convert_annotations: bool
+    return_segmentation_masks: bool
     annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
     masks_path: Optional[Union[str, pathlib.Path]]
 
@@ -84,7 +84,7 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
 
 
-class DetrImageProcessorKwargs(ImagesKwargs):
+class DetrImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
         Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@@ -100,9 +100,9 @@ class DetrImageProcessorKwargs(ImagesKwargs):
         Path to the directory containing the segmentation masks.
     """
 
-    format: Optional[Union[str, AnnotationFormat]]
-    do_convert_annotations: Optional[bool]
-    return_segmentation_masks: Optional[bool]
+    format: Union[str, AnnotationFormat]
+    do_convert_annotations: bool
+    return_segmentation_masks: bool
     annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
     masks_path: Optional[Union[str, pathlib.Path]]
 
@@ -55,7 +55,9 @@ class DiaProcessorKwargs(ProcessingKwargs, total=False):
             "generation": True,
             "sampling_rate": 44100,
         },
-        "common_kwargs": {"return_tensors": "pt"},
+        "common_kwargs": {
+            "return_tensors": "pt",
+        },
     }
 
 
@@ -52,7 +52,7 @@ if is_vision_available():
     import PIL
 
 
-class DonutImageProcessorKwargs(ImagesKwargs):
+class DonutImageProcessorKwargs(ImagesKwargs, total=False):
     """
     do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`):
         Whether to resize the image using thumbnail method.
@@ -60,8 +60,8 @@ class DonutImageProcessorKwargs(ImagesKwargs):
         Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
     """
 
-    do_thumbnail: Optional[bool]
-    do_align_long_axis: Optional[bool]
+    do_thumbnail: bool
+    do_align_long_axis: bool
 
 
 @requires(backends=("vision",))
@@ -64,7 +64,7 @@ if is_vision_available():
 logger = logging.get_logger(__name__)
 
 
-class DPTImageProcessorKwargs(ImagesKwargs):
+class DPTImageProcessorKwargs(ImagesKwargs, total=False):
     """
     ensure_multiple_of (`int`, *optional*, defaults to 1):
         If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
@@ -78,10 +78,10 @@ class DPTImageProcessorKwargs(ImagesKwargs):
         ADE20k). The background label will be replaced by 255.
     """
 
-    ensure_multiple_of: Optional[int]
-    size_divisor: Optional[int]
-    keep_aspect_ratio: Optional[bool]
-    do_reduce_labels: Optional[bool]
+    ensure_multiple_of: int
+    size_divisor: int
+    keep_aspect_ratio: bool
+    do_reduce_labels: bool
 
 
 def get_resize_output_image_size(
@@ -50,13 +50,13 @@ if is_vision_available():
 logger = logging.get_logger(__name__)
 
 
-class EfficientLoFTRImageProcessorKwargs(ImagesKwargs):
+class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     do_grayscale (`bool`, *optional*, defaults to `True`):
         Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
     """
 
-    do_grayscale: Optional[bool] = True
+    do_grayscale: bool
 
 
 # Copied from transformers.models.superpoint.image_processing_superpoint.is_grayscale
@@ -44,7 +44,7 @@ if is_vision_available():
 logger = logging.get_logger(__name__)
 
 
-class EfficientNetImageProcessorKwargs(ImagesKwargs):
+class EfficientNetImageProcessorKwargs(ImagesKwargs, total=False):
     """
     rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`):
         Whether to rescale the image between [-max_range/2, scale_range/2] instead of [0, scale_range].
@@ -47,9 +47,9 @@ if is_vision_available():
 logger = logging.get_logger(__name__)
 
 
-class Emu3ImageProcessorKwargs(ImagesKwargs):
-    ratio: Optional[str]
-    image_area: Optional[int]
+class Emu3ImageProcessorKwargs(ImagesKwargs, total=False):
+    ratio: str
+    image_area: int
 
 
 def smart_resize(
@@ -55,7 +55,7 @@ if is_torch_available():
     import torch.nn.functional as F
 
 
-class EomtImageProcessorKwargs(ImagesKwargs):
+class EomtImageProcessorKwargs(ImagesKwargs, total=False):
     """
     do_split_image (`bool`, *optional*, defaults to `False`):
         Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the
@@ -67,7 +67,7 @@ class EomtImageProcessorKwargs(ImagesKwargs):
     """
 
     do_split_image: bool
-    ignore_index: Optional[int] = None
+    ignore_index: Optional[int]
 
 
 # Adapted from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks
@@ -57,7 +57,7 @@ FLAVA_CODEBOOK_STD = [1.0, 1.0, 1.0]
 LOGIT_LAPLACE_EPS: float = 0.1
 
 
-class FlavaImageProcessorKwargs(ImagesKwargs):
+class FlavaImageProcessorKwargs(ImagesKwargs, total=False):
     """
     return_image_mask (`bool`, *optional*, defaults to `False`):
         Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
@@ -118,26 +118,26 @@ class FlavaImageProcessorKwargs(ImagesKwargs):
     """
 
     # Mask related params
-    return_image_mask: Optional[bool]
-    input_size_patches: Optional[int]
-    total_mask_patches: Optional[int]
-    mask_group_min_patches: Optional[int]
-    mask_group_max_patches: Optional[int]
-    mask_group_min_aspect_ratio: Optional[float]
-    mask_group_max_aspect_ratio: Optional[float]
+    return_image_mask: bool
+    input_size_patches: int
+    total_mask_patches: int
+    mask_group_min_patches: int
+    mask_group_max_patches: int
+    mask_group_min_aspect_ratio: float
+    mask_group_max_aspect_ratio: float
     # Codebook related params
-    return_codebook_pixels: Optional[bool]
-    codebook_do_resize: Optional[bool]
-    codebook_size: Optional[bool]
-    codebook_resample: Optional[int]
-    codebook_do_center_crop: Optional[bool]
-    codebook_crop_size: Optional[int]
-    codebook_do_rescale: Optional[bool]
-    codebook_rescale_factor: Optional[Union[int, float]]
-    codebook_do_map_pixels: Optional[bool]
-    codebook_do_normalize: Optional[bool]
-    codebook_image_mean: Optional[Union[float, Iterable[float]]]
-    codebook_image_std: Optional[Union[float, Iterable[float]]]
+    return_codebook_pixels: bool
+    codebook_do_resize: bool
+    codebook_size: bool
+    codebook_resample: int
+    codebook_do_center_crop: bool
+    codebook_crop_size: int
+    codebook_do_rescale: bool
+    codebook_rescale_factor: Union[int, float]
+    codebook_do_map_pixels: bool
+    codebook_do_normalize: bool
+    codebook_image_mean: Union[float, Iterable[float]]
+    codebook_image_std: Union[float, Iterable[float]]
 
 
 # Inspired from https://github.com/microsoft/unilm/blob/master/beit/masking_generator.py
@@ -51,7 +51,7 @@ if is_vision_available():
     import PIL
 
 
-class Gemma3ImageProcessorKwargs(ImagesKwargs):
+class Gemma3ImageProcessorKwargs(ImagesKwargs, total=False):
     """
     do_pan_and_scan (`bool`, *optional*):
         Whether to apply `pan_and_scan` to images.
@@ -63,10 +63,10 @@ class Gemma3ImageProcessorKwargs(ImagesKwargs):
         Minimum aspect ratio to activate pan and scan.
     """
 
-    do_pan_and_scan: Optional[bool]
-    pan_and_scan_min_crop_size: Optional[int]
-    pan_and_scan_max_num_crops: Optional[int]
-    pan_and_scan_min_ratio_to_activate: Optional[float]
+    do_pan_and_scan: bool
+    pan_and_scan_min_crop_size: int
+    pan_and_scan_max_num_crops: int
+    pan_and_scan_min_ratio_to_activate: float
 
 
 class Gemma3ImageProcessor(BaseImageProcessor):
@@ -47,7 +47,7 @@ from ...video_utils import VideoInput
 logger = logging.get_logger(__name__)
 
 
-class Glm4vImageProcessorKwargs(ImagesKwargs):
+class Glm4vImageProcessorKwargs(ImagesKwargs, total=False):
     """
     patch_size (`int`, *optional*, defaults to 14):
         The spatial patch size of the vision encoder.
@@ -57,9 +57,9 @@ class Glm4vImageProcessorKwargs(ImagesKwargs):
         The merge size of the vision encoder to llm encoder.
     """
 
-    patch_size: Optional[int]
-    temporal_patch_size: Optional[int]
-    merge_size: Optional[int]
+    patch_size: int
+    temporal_patch_size: int
+    merge_size: int
 
 
 def smart_resize(
@@ -36,12 +36,12 @@ from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
 from .image_processing_glm4v import smart_resize
 
 
-class Glm4vVideoProcessorInitKwargs(VideosKwargs):
-    max_image_size: Optional[dict[str, int]]
-    patch_size: Optional[int]
-    temporal_patch_size: Optional[int]
-    merge_size: Optional[int]
-    max_duration: Optional[int]
+class Glm4vVideoProcessorInitKwargs(VideosKwargs, total=False):
+    max_image_size: dict[str, int]
+    patch_size: int
+    temporal_patch_size: int
+    merge_size: int
+    max_duration: int
 
 
 @add_start_docstrings(
@@ -49,7 +49,7 @@ if is_vision_available():
 logger = logging.get_logger(__name__)
 
 
-class GotOcr2ImageProcessorKwargs(ImagesKwargs):
+class GotOcr2ImageProcessorKwargs(ImagesKwargs, total=False):
     """
     crop_to_patches (`bool`, *optional*, defaults to `False`):
         Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
@@ -62,9 +62,9 @@ class GotOcr2ImageProcessorKwargs(ImagesKwargs):
         set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
     """
 
-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
+    crop_to_patches: bool
+    min_patches: int
+    max_patches: int
 
 
 # Similar to image_processing_mllama.get_all_supported_aspect_ratios
@@ -36,13 +36,13 @@ class GotOcr2TextKwargs(TextKwargs, total=False):
 
 
 class GotOcr2ImagesKwargs(ImagesKwargs, total=False):
-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
+    crop_to_patches: bool
+    min_patches: int
+    max_patches: int
     box: Optional[Union[list, tuple[float, float], tuple[float, float, float, float]]]
     color: Optional[str]
-    num_image_tokens: Optional[int]
-    multi_page: Optional[bool]
+    num_image_tokens: int
+    multi_page: bool
 
 
 class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False):
@@ -93,7 +93,7 @@ class AnnotationFormat(ExplicitEnum):
 SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
 
 
-class GroundingDinoImageProcessorKwargs(ImagesKwargs):
+class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
         Data format of the annotations. One of "coco_detection" or "coco_panoptic".
@@ -109,9 +109,9 @@ class GroundingDinoImageProcessorKwargs(ImagesKwargs):
         Path to the directory containing the segmentation masks.
     """
 
-    format: Optional[Union[str, AnnotationFormat]]
-    do_convert_annotations: Optional[bool]
-    return_segmentation_masks: Optional[bool]
+    format: Union[str, AnnotationFormat]
+    do_convert_annotations: bool
+    return_segmentation_masks: bool
     annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
     masks_path: Optional[Union[str, pathlib.Path]]
 
@@ -36,7 +36,7 @@ IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]
 IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]
 
 
-class IdeficsImageProcessorKwargs(ImagesKwargs):
+class IdeficsImageProcessorKwargs(ImagesKwargs, total=False):
     """
     transform (`Callable`, *optional*):
         A custom transform function that accepts a single image can be passed for training. For example,
@@ -47,7 +47,7 @@ class IdeficsImageProcessorKwargs(ImagesKwargs):
     """
 
     transform: Optional[Callable]
-    image_size: Optional[dict[str, int]]
+    image_size: dict[str, int]
 
 
 def convert_to_rgb(image):
@@ -47,13 +47,13 @@ if is_vision_available():
     from PIL import Image
 
 
-class Idefics2ImageProcessorKwargs(ImagesKwargs):
+class Idefics2ImageProcessorKwargs(ImagesKwargs, total=False):
     """
     do_image_splitting (`bool`, *optional*, defaults to `False`):
         Whether to split the image into a sequence 4 equal sub-images concatenated with the original image.
     """
 
-    do_image_splitting: Optional[bool]
+    do_image_splitting: bool
 
 
 def get_resize_output_image_size(image, size, input_data_format) -> tuple[int, int]:
@@ -48,7 +48,7 @@ if is_vision_available():
     from PIL import Image
 
 
-class Idefics3ImageProcessorKwargs(ImagesKwargs):
+class Idefics3ImageProcessorKwargs(ImagesKwargs, total=False):
     """
     do_image_splitting (`bool`, *optional*, defaults to `True`):
         Whether to split the image into sub-images concatenated with the original image. They are split into patches
@@ -59,9 +59,9 @@ class Idefics3ImageProcessorKwargs(ImagesKwargs):
         Whether to return the row and column information of the images.
     """
 
-    do_image_splitting: Optional[bool]
-    max_image_size: Optional[dict[str, int]]
-    return_row_col_info: Optional[bool]
+    do_image_splitting: bool
+    max_image_size: dict[str, int]
+    return_row_col_info: bool
 
 
 def _resize_output_size_rescale_to_max_len(
@@ -45,7 +45,7 @@ if is_torch_available():
 logger = logging.get_logger(__name__)
 
 
-class ImageGPTImageProcessorKwargs(ImagesKwargs):
+class ImageGPTImageProcessorKwargs(ImagesKwargs, total=False):
     """
     clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*):
         The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters`
@@ -56,7 +56,7 @@ class ImageGPTImageProcessorKwargs(ImagesKwargs):
     """
 
     clusters: Optional[Union[np.ndarray, list[list[int]], "torch.Tensor"]]
-    do_color_quantize: Optional[bool]
+    do_color_quantize: bool
 
 
 def squared_euclidean_distance(a, b):
@@ -24,15 +24,11 @@ from torchvision.transforms.v2 import functional as F
 
 from ...image_processing_utils import BatchFeature
 from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict
-from ...processing_utils import Unpack, VideosKwargs
 from ...utils import TensorType
 from ...video_processing_utils import BaseVideoProcessor
 from ...video_utils import group_videos_by_shape, reorder_videos
 
 
-class InstructBlipVideoVideoProcessorInitKwargs(VideosKwargs): ...
-
-
 class InstructBlipVideoVideoProcessor(BaseVideoProcessor):
     resample = PILImageResampling.BICUBIC
     image_mean = OPENAI_CLIP_MEAN
@@ -44,12 +40,8 @@ class InstructBlipVideoVideoProcessor(BaseVideoProcessor):
     do_normalize = True
     do_convert_rgb = True
     do_sample_frames = False  # Set to False for BC, recommended to set `True` in new models
-    valid_kwargs = InstructBlipVideoVideoProcessorInitKwargs
     model_input_names = ["pixel_values"]
 
-    def __init__(self, **kwargs: Unpack[InstructBlipVideoVideoProcessorInitKwargs]):
-        super().__init__(**kwargs)
-
     def _preprocess(
         self,
         videos: list["torch.Tensor"],
@@ -27,7 +27,7 @@ from ...video_processing_utils import BaseVideoProcessor
 from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
 
 
-class InternVLVideoProcessorInitKwargs(VideosKwargs):
+class InternVLVideoProcessorInitKwargs(VideosKwargs, total=False):
     initial_shift: Union[bool, float, int]
 
 
@@ -51,7 +51,7 @@ if is_vision_available():
 logger = logging.get_logger(__name__)
 
 
-class JanusImageProcessorKwargs(ImagesKwargs):
+class JanusImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     min_size (`int`, *optional*, defaults to 14):
         The minimum allowed size for the resized image. Ensures that neither the height nor width
|
@ -1289,7 +1289,7 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
|
|||||||
return generated_tokens
|
return generated_tokens
|
||||||
|
|
||||||
|
|
||||||
class JanusImageProcessorKwargs(ImagesKwargs):
|
class JanusImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
min_size (`int`, *optional*, defaults to 14):
|
min_size (`int`, *optional*, defaults to 14):
|
||||||
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
The minimum allowed size for the resized image. Ensures that neither the height nor width
|
||||||
|
@ -33,15 +33,17 @@ BboxInput = Union[
|
|||||||
list[list[tuple[float, float, float]]],
|
list[list[tuple[float, float, float]]],
|
||||||
]
|
]
|
||||||
|
|
||||||
|
NestedList = list[Union[Optional[int], "NestedList"]]
|
||||||
|
|
||||||
|
|
||||||
class Kosmos2ImagesKwargs(ImagesKwargs, total=False):
|
class Kosmos2ImagesKwargs(ImagesKwargs, total=False):
|
||||||
bboxes: Optional[list[float]]
|
bboxes: Optional[NestedList] # NOTE: hub validators can't accept `Sequence`
|
||||||
num_image_tokens: Optional[int]
|
num_image_tokens: int
|
||||||
first_image_token_id: Optional[int]
|
first_image_token_id: Optional[int]
|
||||||
|
|
||||||
|
|
||||||
class Kosmos2TextKwargs(TextKwargs, total=False):
|
class Kosmos2TextKwargs(TextKwargs, total=False):
|
||||||
add_eos_token: Optional[bool]
|
add_eos_token: bool
|
||||||
|
|
||||||
|
|
||||||
class Kosmos2ProcessorKwargs(ProcessingKwargs, total=False):
|
class Kosmos2ProcessorKwargs(ProcessingKwargs, total=False):
|
||||||
|
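The Kosmos-2 change above swaps the old bboxes annotation for a recursive alias because, per the comment in the diff, the hub validators cannot accept Sequence. A tiny hedged illustration of what such an alias admits (the values are made up):

from typing import Optional, Union

# Same shape as the alias added in the diff: ints, None, or nested lists of them.
NestedList = list[Union[Optional[int], "NestedList"]]

flat: NestedList = [7, 42, None]             # one flat list of entries
nested: NestedList = [[7, 42], [[1], None]]  # arbitrarily nested lists also match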
@@ -46,7 +46,7 @@ logger = logging.get_logger(__name__)
 DEFAULT_FONT_PATH = "ybelkada/fonts"
 
 
-class Kosmos2_5ImageProcessorKwargs(ImagesKwargs):
+class Kosmos2_5ImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
         The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.
@@ -55,8 +55,8 @@ class Kosmos2_5ImageProcessorKwargs(ImagesKwargs):
         [KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419).
     """
 
-    patch_size: Optional[dict[str, int]]
-    max_patches: Optional[int]
+    patch_size: dict[str, int]
+    max_patches: int
 
 
 # Copied from transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches
|
@ -52,7 +52,7 @@ if is_pytesseract_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class LayoutLMv2ImageProcessorKwargs(ImagesKwargs):
|
class LayoutLMv2ImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
apply_ocr (`bool`, *optional*, defaults to `True`):
|
apply_ocr (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
|
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
|
||||||
@ -66,7 +66,7 @@ class LayoutLMv2ImageProcessorKwargs(ImagesKwargs):
|
|||||||
`preprocess` method.
|
`preprocess` method.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
apply_ocr: Optional[bool]
|
apply_ocr: bool
|
||||||
ocr_lang: Optional[str]
|
ocr_lang: Optional[str]
|
||||||
tesseract_config: Optional[str]
|
tesseract_config: Optional[str]
|
||||||
|
|
||||||
|
@@ -56,7 +56,7 @@ if is_pytesseract_available():
 logger = logging.get_logger(__name__)
 
 
-class LayoutLMv3ImageProcessorKwargs(ImagesKwargs):
+class LayoutLMv3ImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     apply_ocr (`bool`, *optional*, defaults to `True`):
         Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
@@ -70,7 +70,7 @@ class LayoutLMv3ImageProcessorKwargs(ImagesKwargs):
         `preprocess` method.
     """
 
-    apply_ocr: Optional[bool]
+    apply_ocr: bool
    ocr_lang: Optional[str]
    tesseract_config: Optional[str]
 
@@ -14,7 +14,7 @@
 # limitations under the License.
 import math
 from functools import lru_cache
-from typing import Optional, Union
+from typing import Union
 
 import torch
 from torchvision.transforms.v2 import functional as F
@@ -169,24 +169,24 @@ def pad_along_first_dim(
     return images, pixel_mask
 
 
-class Lfm2VlImageProcessorKwargs(ImagesKwargs):
+class Lfm2VlImageProcessorKwargs(ImagesKwargs, total=False):
     """
     downsample_factor (`int`, *optional*, defaults to `2`):
         The downsampling factor for images used when resizing the image.
     """
 
-    downsample_factor: Optional[int]
-    do_image_splitting: Optional[bool]
-    min_tiles: Optional[int]
-    max_tiles: Optional[int]
-    use_thumbnail: Optional[bool]
-    min_image_tokens: Optional[int]
-    max_image_tokens: Optional[int]
-    encoder_patch_size: Optional[int]
-    tile_size: Optional[int]
-    max_pixels_tolerance: Optional[float]
-    do_pad: Optional[bool]
-    return_row_col_info: Optional[bool]
+    downsample_factor: int
+    do_image_splitting: bool
+    min_tiles: int
+    max_tiles: int
+    use_thumbnail: bool
+    min_image_tokens: int
+    max_image_tokens: int
+    encoder_patch_size: int
+    tile_size: int
+    max_pixels_tolerance: float
+    do_pad: bool
+    return_row_col_info: bool
 
 
 @auto_docstring
@@ -18,9 +18,9 @@ from typing import Optional, Union
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, make_nested_list_of_images
 from ...processing_utils import (
-    ImagesKwargs,
     ProcessingKwargs,
     ProcessorMixin,
+    TextKwargs,
     Unpack,
 )
 from ...tokenization_utils_base import BatchEncoding, TextInput
@@ -30,25 +30,12 @@ from ...utils import logging
 logger = logging.get_logger(__name__)
 
 
-class Lfm2VlImagesKwargs(ImagesKwargs, total=False):
-    downsample_factor: Optional[int]
-    do_image_splitting: Optional[bool]
-    min_tiles: Optional[int]
-    max_tiles: Optional[int]
-    use_thumbnail: Optional[bool]
-    min_image_tokens: Optional[int]
-    max_image_tokens: Optional[int]
-    encoder_patch_size: Optional[int]
-    tile_size: Optional[int]
-    max_pixels_tolerance: Optional[float]
-    patch_size: Optional[int]
-    do_pad: Optional[bool]
-    return_row_col_info: Optional[bool]
+class Lfm2VlTextKwargs(TextKwargs, total=False):
+    use_image_special_tokens: Optional[bool]
 
 
 class Lfm2VlProcessorKwargs(ProcessingKwargs, total=False):
-    images_kwargs: Lfm2VlImagesKwargs
+    text_kwargs: Lfm2VlTextKwargs
 
     _defaults = {
         "images_kwargs": {
             "return_row_col_info": True,
@@ -75,8 +62,6 @@ class Lfm2VlProcessor(ProcessorMixin):
             An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
         chat_template (`str`, *optional*):
             A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
-        use_image_special_tokens (`bool`, *optional*, defaults to `True`):
-            Whether to use image special tokens or not when processing.
     """
 
     attributes = ["image_processor", "tokenizer"]
@@ -88,12 +73,10 @@ class Lfm2VlProcessor(ProcessorMixin):
         image_processor,
         tokenizer,
         chat_template: Optional[str] = None,
-        use_image_special_tokens: Optional[bool] = True,
         **kwargs,
     ):
         self.image_token = tokenizer.image_token
         self.image_token_id = tokenizer.image_token_id
-        self.use_image_special_tokens = use_image_special_tokens
         self.image_start_token = tokenizer.image_start_token
         self.image_end_token = tokenizer.image_end_token
         self.image_thumbnail_token = tokenizer.image_thumbnail
@@ -308,7 +308,7 @@ def get_best_fit(
     return optimal_canvas
 
 
-class Llama4ImageProcessorKwargs(ImagesKwargs):
+class Llama4ImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     max_patches (`int`, *optional*, defaults to 16):
         The maximum number of patches to be extracted from the image.
@@ -320,8 +320,8 @@ class Llama4ImageProcessorKwargs(ImagesKwargs):
         but never upsample, unless the image is smaller than the patch size.
     """
 
-    max_patches: Optional[int]
-    resize_to_max_canvas: Optional[bool]
+    max_patches: int
+    resize_to_max_canvas: bool
 
 
 @auto_docstring
@@ -59,7 +59,7 @@ if is_vision_available():
     from PIL import Image
 
 
-class LlavaNextImageProcessorKwargs(ImagesKwargs):
+class LlavaNextImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     image_grid_pinpoints (`list[list[int]]`, *optional*):
         A list of possible resolutions to use for processing high resolution images. The best resolution is selected
@@ -67,7 +67,7 @@ class LlavaNextImageProcessorKwargs(ImagesKwargs):
         method.
     """
 
-    image_grid_pinpoints: Optional[list[list[int]]]
+    image_grid_pinpoints: list[list[int]]
 
 
 def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.ndarray]:
@@ -58,7 +58,7 @@ if is_vision_available():
     from PIL import Image
 
 
-class LlavaOnevisionImageProcessorKwargs(ImagesKwargs):
+class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     image_grid_pinpoints (`list[list[int]]`, *optional*):
         A list of possible resolutions to use for processing high resolution images. The best resolution is selected
@@ -66,7 +66,7 @@ class LlavaOnevisionImageProcessorKwargs(ImagesKwargs):
         method.
    """
 
-    image_grid_pinpoints: Optional[list[list[int]]]
+    image_grid_pinpoints: list[list[int]]
 
 
 # Copied from transformers.models.llava_next.image_processing_llava_next.divide_to_patches
@@ -76,8 +76,7 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):
             batch_num_images = [1] * len(images)
         else:
             batch_num_images = [1]
-        kwargs["batch_num_images"] = batch_num_images
-        return super().preprocess(images, **kwargs)
+        return super().preprocess(images, batch_num_images, **kwargs)
 
     def _resize_for_patching(
         self,
@@ -202,6 +201,7 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):
     def _preprocess(
         self,
        images: list["torch.Tensor"],
+        batch_num_images: list[int],
         do_resize: bool,
         size: SizeDict,
         image_grid_pinpoints: list[list[int]],
@@ -214,7 +214,6 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):
         image_mean: Optional[Union[float, list[float]]],
         image_std: Optional[Union[float, list[float]]],
         do_pad: bool,
-        batch_num_images: list[int],
         disable_grouping: Optional[bool],
         return_tensors: Optional[Union[str, TensorType]],
         **kwargs,
@ -35,7 +35,7 @@ from transformers.models.llava_next_video.modeling_llava_next_video import (
|
|||||||
|
|
||||||
from ...cache_utils import Cache
|
from ...cache_utils import Cache
|
||||||
from ...image_processing_utils import BatchFeature
|
from ...image_processing_utils import BatchFeature
|
||||||
from ...image_processing_utils_fast import group_images_by_shape, reorder_images
|
from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
|
||||||
from ...image_utils import (
|
from ...image_utils import (
|
||||||
OPENAI_CLIP_MEAN,
|
OPENAI_CLIP_MEAN,
|
||||||
OPENAI_CLIP_STD,
|
OPENAI_CLIP_STD,
|
||||||
@ -128,12 +128,12 @@ class LlavaOnevisionImageProcessorFast(LlavaNextImageProcessorFast):
|
|||||||
batch_num_images = [1] * len(images)
|
batch_num_images = [1] * len(images)
|
||||||
else:
|
else:
|
||||||
batch_num_images = [1]
|
batch_num_images = [1]
|
||||||
kwargs["batch_num_images"] = batch_num_images
|
return BaseImageProcessorFast.preprocess(images, batch_num_images, **kwargs)
|
||||||
return super().preprocess(images, **kwargs)
|
|
||||||
|
|
||||||
def _preprocess(
|
def _preprocess(
|
||||||
self,
|
self,
|
||||||
images: list["torch.Tensor"],
|
images: list["torch.Tensor"],
|
||||||
|
batch_num_images: list[int],
|
||||||
do_resize: bool,
|
do_resize: bool,
|
||||||
size: SizeDict,
|
size: SizeDict,
|
||||||
image_grid_pinpoints: list[list[int]],
|
image_grid_pinpoints: list[list[int]],
|
||||||
@ -146,7 +146,6 @@ class LlavaOnevisionImageProcessorFast(LlavaNextImageProcessorFast):
|
|||||||
image_mean: Optional[Union[float, list[float]]],
|
image_mean: Optional[Union[float, list[float]]],
|
||||||
image_std: Optional[Union[float, list[float]]],
|
image_std: Optional[Union[float, list[float]]],
|
||||||
do_pad: bool,
|
do_pad: bool,
|
||||||
batch_num_images: list[int],
|
|
||||||
disable_grouping: Optional[bool],
|
disable_grouping: Optional[bool],
|
||||||
return_tensors: Optional[Union[str, TensorType]],
|
return_tensors: Optional[Union[str, TensorType]],
|
||||||
**kwargs,
|
**kwargs,
|
||||||
|
@ -61,7 +61,7 @@ if is_torch_available():
|
|||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
|
|
||||||
class Mask2FormerImageProcessorKwargs(ImagesKwargs):
|
class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
ignore_index (`int`, *optional*):
|
ignore_index (`int`, *optional*):
|
||||||
Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
|
Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
|
||||||
@ -74,9 +74,9 @@ class Mask2FormerImageProcessorKwargs(ImagesKwargs):
|
|||||||
The number of labels in the segmentation map.
|
The number of labels in the segmentation map.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
size_divisor: Optional[int]
|
size_divisor: int
|
||||||
ignore_index: Optional[int]
|
ignore_index: Optional[int]
|
||||||
do_reduce_labels: Optional[bool]
|
do_reduce_labels: bool
|
||||||
num_labels: Optional[int]
|
num_labels: Optional[int]
|
||||||
|
|
||||||
|
|
||||||
|
@ -67,7 +67,7 @@ if is_torch_available():
|
|||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
|
|
||||||
class MaskFormerImageProcessorKwargs(ImagesKwargs):
|
class MaskFormerImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
ignore_index (`int`, *optional*):
|
ignore_index (`int`, *optional*):
|
||||||
Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
|
Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
|
||||||
@ -80,9 +80,9 @@ class MaskFormerImageProcessorKwargs(ImagesKwargs):
|
|||||||
The number of labels in the segmentation map.
|
The number of labels in the segmentation map.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
size_divisor: Optional[int]
|
size_divisor: int
|
||||||
ignore_index: Optional[int]
|
ignore_index: Optional[int]
|
||||||
do_reduce_labels: Optional[bool]
|
do_reduce_labels: bool
|
||||||
num_labels: Optional[int]
|
num_labels: Optional[int]
|
||||||
|
|
||||||
|
|
||||||
|
@ -50,13 +50,13 @@ if is_vision_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class MllamaImageProcessorKwargs(ImagesKwargs):
|
class MllamaImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
"""
|
"""
|
||||||
max_image_tiles (`int`, *optional*):
|
max_image_tiles (`int`, *optional*):
|
||||||
The maximum number of tiles allowed.
|
The maximum number of tiles allowed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
max_image_tiles: Optional[int]
|
max_image_tiles: int
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=10)
|
@lru_cache(maxsize=10)
|
||||||
|
@ -258,9 +258,7 @@ class MllamaProcessor(ProcessorMixin):
|
|||||||
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
images_kwargs = output_kwargs["images_kwargs"]
|
|
||||||
|
|
||||||
data = {}
|
data = {}
|
||||||
if text is not None:
|
if text is not None:
|
||||||
@ -306,7 +304,7 @@ class MllamaProcessor(ProcessorMixin):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if images is not None:
|
if images is not None:
|
||||||
image_features = self.image_processor(images, **images_kwargs)
|
image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
|
||||||
num_tiles = image_features.pop("num_tiles")
|
num_tiles = image_features.pop("num_tiles")
|
||||||
data.update(image_features)
|
data.update(image_features)
|
||||||
|
|
||||||
|
@ -51,7 +51,7 @@ from ...utils.import_utils import requires
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class MobileNetV2ImageProcessorKwargs(ImagesKwargs):
|
class MobileNetV2ImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
"""
|
"""
|
||||||
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
|
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
|
||||||
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
|
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
|
||||||
@ -59,7 +59,7 @@ class MobileNetV2ImageProcessorKwargs(ImagesKwargs):
|
|||||||
ADE20k). The background label will be replaced by 255.
|
ADE20k). The background label will be replaced by 255.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
do_reduce_labels: Optional[bool]
|
do_reduce_labels: bool
|
||||||
|
|
||||||
|
|
||||||
@requires(backends=("vision",))
|
@requires(backends=("vision",))
|
||||||
|
@ -53,7 +53,7 @@ if is_torch_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class MobileVitImageProcessorKwargs(ImagesKwargs):
|
class MobileVitImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
"""
|
"""
|
||||||
do_flip_channel_order (`bool`, *optional*, defaults to `self.do_flip_channel_order`):
|
do_flip_channel_order (`bool`, *optional*, defaults to `self.do_flip_channel_order`):
|
||||||
Whether to flip the color channels from RGB to BGR or vice versa.
|
Whether to flip the color channels from RGB to BGR or vice versa.
|
||||||
@ -63,8 +63,8 @@ class MobileVitImageProcessorKwargs(ImagesKwargs):
|
|||||||
ADE20k). The background label will be replaced by 255.
|
ADE20k). The background label will be replaced by 255.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
do_flip_channel_order: Optional[bool]
|
do_flip_channel_order: bool
|
||||||
do_reduce_labels: Optional[bool]
|
do_reduce_labels: bool
|
||||||
|
|
||||||
|
|
||||||
@requires(backends=("vision",))
|
@requires(backends=("vision",))
|
||||||
|
@ -52,7 +52,7 @@ if is_vision_available():
|
|||||||
import PIL
|
import PIL
|
||||||
|
|
||||||
|
|
||||||
class NougatImageProcessorKwargs(ImagesKwargs):
|
class NougatImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
do_crop_margin (`bool`, *optional*, defaults to `True`):
|
do_crop_margin (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to crop the image margins.
|
Whether to crop the image margins.
|
||||||
@ -62,9 +62,9 @@ class NougatImageProcessorKwargs(ImagesKwargs):
|
|||||||
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
|
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
do_crop_margin: Optional[bool]
|
do_crop_margin: bool
|
||||||
do_thumbnail: Optional[bool]
|
do_thumbnail: bool
|
||||||
do_align_long_axis: Optional[bool]
|
do_align_long_axis: bool
|
||||||
|
|
||||||
|
|
||||||
class NougatImageProcessor(BaseImageProcessor):
|
class NougatImageProcessor(BaseImageProcessor):
|
||||||
|
@ -64,7 +64,7 @@ if is_torch_available():
|
|||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
|
|
||||||
class OneFormerImageProcessorKwargs(ImagesKwargs):
|
class OneFormerImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
repo_path (`str`, *optional*, defaults to `shi-labs/oneformer_demo`):
|
repo_path (`str`, *optional*, defaults to `shi-labs/oneformer_demo`):
|
||||||
Path to a local directory or Hugging Face Hub repository containing model metadata.
|
Path to a local directory or Hugging Face Hub repository containing model metadata.
|
||||||
@ -85,7 +85,7 @@ class OneFormerImageProcessorKwargs(ImagesKwargs):
|
|||||||
num_text: Optional[int]
|
num_text: Optional[int]
|
||||||
num_labels: Optional[int]
|
num_labels: Optional[int]
|
||||||
ignore_index: Optional[int]
|
ignore_index: Optional[int]
|
||||||
do_reduce_labels: Optional[bool]
|
do_reduce_labels: bool
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.image_processing_detr.max_across_indices
|
# Copied from transformers.models.detr.image_processing_detr.max_across_indices
|
||||||
|
@ -44,7 +44,7 @@ if is_vision_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Ovis2ImageProcessorKwargs(ImagesKwargs):
|
class Ovis2ImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
"""
|
"""
|
||||||
crop_to_patches (`bool`, *optional*, defaults to `False`):
|
crop_to_patches (`bool`, *optional*, defaults to `False`):
|
||||||
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
|
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
|
||||||
@ -61,10 +61,10 @@ class Ovis2ImageProcessorKwargs(ImagesKwargs):
|
|||||||
`preprocess` method.
|
`preprocess` method.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
crop_to_patches: Optional[bool]
|
crop_to_patches: bool
|
||||||
min_patches: Optional[int]
|
min_patches: int
|
||||||
max_patches: Optional[int]
|
max_patches: int
|
||||||
use_covering_area_grid: Optional[bool]
|
use_covering_area_grid: bool
|
||||||
|
|
||||||
|
|
||||||
# Similar to image_processing_mllama.get_all_supported_aspect_ratios
|
# Similar to image_processing_mllama.get_all_supported_aspect_ratios
|
||||||
|
@ -42,7 +42,7 @@ from ...utils import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class PerceptionLMImageProcessorKwargs(ImagesKwargs):
|
class PerceptionLMImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
vision_input_type (`str`, *optional*, defaults to `"thumb+tile"`):
|
vision_input_type (`str`, *optional*, defaults to `"thumb+tile"`):
|
||||||
Vision processing strategy. `"thumb+tile"` uses both thumbnails and multiple tiles for
|
Vision processing strategy. `"thumb+tile"` uses both thumbnails and multiple tiles for
|
||||||
@ -54,8 +54,8 @@ class PerceptionLMImageProcessorKwargs(ImagesKwargs):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
vision_input_type: Optional[str]
|
vision_input_type: Optional[str]
|
||||||
tile_size: Optional[int]
|
tile_size: int
|
||||||
max_num_tiles: Optional[int]
|
max_num_tiles: int
|
||||||
|
|
||||||
|
|
||||||
@auto_docstring
|
@auto_docstring
|
||||||
@ -68,7 +68,7 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast):
|
|||||||
do_rescale = True
|
do_rescale = True
|
||||||
do_normalize = True
|
do_normalize = True
|
||||||
do_convert_rgb = True
|
do_convert_rgb = True
|
||||||
vision_input_type = "thumb+tail"
|
vision_input_type = "thumb+tile"
|
||||||
tile_size = 448
|
tile_size = 448
|
||||||
max_num_tiles = 36
|
max_num_tiles = 36
|
||||||
size = {"width": 448, "height": 448} # for backward compatibility in tests
|
size = {"width": 448, "height": 448} # for backward compatibility in tests
|
||||||
|
@ -35,7 +35,7 @@ from ...utils import (
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Phi4MultimodalImageProcessorKwargs(ImagesKwargs):
|
class Phi4MultimodalImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
patch_size (`int`, *optional*):
|
patch_size (`int`, *optional*):
|
||||||
The size of the patch.
|
The size of the patch.
|
||||||
@ -43,8 +43,8 @@ class Phi4MultimodalImageProcessorKwargs(ImagesKwargs):
|
|||||||
The maximum number of crops per image.
|
The maximum number of crops per image.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
patch_size: Optional[int]
|
patch_size: int
|
||||||
dynamic_hd: Optional[int]
|
dynamic_hd: int
|
||||||
|
|
||||||
|
|
||||||
@auto_docstring
|
@auto_docstring
|
||||||
|
@ -49,7 +49,7 @@ logger = logging.get_logger(__name__)
|
|||||||
DEFAULT_FONT_PATH = "ybelkada/fonts"
|
DEFAULT_FONT_PATH = "ybelkada/fonts"
|
||||||
|
|
||||||
|
|
||||||
class Pix2StructImageProcessorKwargs(ImagesKwargs):
|
class Pix2StructImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
"""
|
"""
|
||||||
max_patches (`int`, *optional*):
|
max_patches (`int`, *optional*):
|
||||||
Maximum number of patches to extract.
|
Maximum number of patches to extract.
|
||||||
@ -57,7 +57,7 @@ class Pix2StructImageProcessorKwargs(ImagesKwargs):
|
|||||||
Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
|
Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
max_patches: Optional[int]
|
max_patches: int
|
||||||
header_text: Optional[Union[list[str], str]]
|
header_text: Optional[Union[list[str], str]]
|
||||||
|
|
||||||
|
|
||||||
|
@ -50,13 +50,13 @@ if is_vision_available():
|
|||||||
import PIL
|
import PIL
|
||||||
|
|
||||||
|
|
||||||
class PixtralImageProcessorKwargs(ImagesKwargs):
|
class PixtralImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
"""
|
"""
|
||||||
patch_size (`dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`):
|
patch_size (`Union[dict[str, int], int]` *optional*, defaults to `{"height": 16, "width": 16}`):
|
||||||
Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method.
|
Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
patch_size: Optional[dict[str, int]]
|
patch_size: Union[dict[str, int], int]
|
||||||
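A short sketch of what the widened `patch_size` annotation accepts (illustrative values only; whether a bare int is expanded to a square patch is an assumption about the processor, not something shown in this hunk):

from typing import Union

PatchSize = Union[dict[str, int], int]

explicit: PatchSize = {"height": 16, "width": 16}  # the previous form, still accepted
shorthand: PatchSize = 16                          # newly accepted shorthand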
|
|
||||||
|
|
||||||
# Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white.
|
# Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white.
|
||||||
|
@ -48,13 +48,13 @@ if is_vision_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class PoolFormerImageProcessorKwargs(ImagesKwargs):
|
class PoolFormerImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
crop_pct (`float`, *optional*, defaults to `self.crop_pct`):
|
crop_pct (`float`, *optional*, defaults to `self.crop_pct`):
|
||||||
Percentage of the image to crop. Only has an effect if `do_resize` is set to `True`.
|
Percentage of the image to crop. Only has an effect if `do_resize` is set to `True`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
crop_pct: Optional[float]
|
crop_pct: float
|
||||||
|
|
||||||
|
|
||||||
class PoolFormerImageProcessor(BaseImageProcessor):
|
class PoolFormerImageProcessor(BaseImageProcessor):
|
||||||
|
@ -54,7 +54,7 @@ if is_torch_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class PromptDepthAnythingImageProcessorKwargs(ImagesKwargs):
|
class PromptDepthAnythingImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
keep_aspect_ratio (`bool`, *optional*):
|
keep_aspect_ratio (`bool`, *optional*):
|
||||||
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved.
|
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved.
|
||||||
@ -64,10 +64,10 @@ class PromptDepthAnythingImageProcessorKwargs(ImagesKwargs):
|
|||||||
Scale factor to convert the prompt depth to meters.
|
Scale factor to convert the prompt depth to meters.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
keep_aspect_ratio: Optional[bool]
|
keep_aspect_ratio: bool
|
||||||
ensure_multiple_of: Optional[int]
|
ensure_multiple_of: int
|
||||||
size_divisor: Optional[int]
|
size_divisor: int
|
||||||
prompt_scale_to_meter: Optional[float]
|
prompt_scale_to_meter: float
|
||||||
|
|
||||||
|
|
||||||
def _constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
|
def _constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
|
||||||
|
@ -32,17 +32,17 @@ from ...video_utils import VideoInput
|
|||||||
|
|
||||||
# Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni
|
# Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni
|
||||||
# and does not use them in video processor class
|
# and does not use them in video processor class
|
||||||
class Qwen2_5_OmniVideosKwargs(VideosKwargs):
|
class Qwen2_5_OmniVideosKwargs(VideosKwargs, total=False):
|
||||||
min_pixels: Optional[int]
|
min_pixels: int
|
||||||
max_pixels: Optional[int]
|
max_pixels: int
|
||||||
patch_size: Optional[int]
|
patch_size: int
|
||||||
temporal_patch_size: Optional[int]
|
temporal_patch_size: int
|
||||||
merge_size: Optional[int]
|
merge_size: int
|
||||||
min_frames: Optional[int]
|
min_frames: int
|
||||||
max_frames: Optional[int]
|
max_frames: int
|
||||||
use_audio_in_video: Optional[bool]
|
use_audio_in_video: bool
|
||||||
seconds_per_chunk: Optional[float]
|
seconds_per_chunk: float
|
||||||
position_id_per_seconds: Optional[int]
|
position_id_per_seconds: Union[int, float]
|
||||||
|
|
||||||
|
|
||||||
class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False):
|
class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False):
|
||||||
|
@ -52,7 +52,7 @@ from ...video_utils import VideoInput, make_batched_videos
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Qwen2VLImageProcessorKwargs(ImagesKwargs):
|
class Qwen2VLImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
min_pixels (`int`, *optional*, defaults to `56 * 56`):
|
min_pixels (`int`, *optional*, defaults to `56 * 56`):
|
||||||
The min pixels of the image to resize the image.
|
The min pixels of the image to resize the image.
|
||||||
@ -66,11 +66,11 @@ class Qwen2VLImageProcessorKwargs(ImagesKwargs):
|
|||||||
The merge size of the vision encoder to llm encoder.
|
The merge size of the vision encoder to llm encoder.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
min_pixels: Optional[int]
|
min_pixels: int
|
||||||
max_pixels: Optional[int]
|
max_pixels: int
|
||||||
patch_size: Optional[int]
|
patch_size: int
|
||||||
temporal_patch_size: Optional[int]
|
temporal_patch_size: int
|
||||||
merge_size: Optional[int]
|
merge_size: int
|
||||||
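For context, these are the keys users typically set to bound the visual token budget; a hedged usage sketch (the checkpoint name is only an example, any Qwen2-VL checkpoint would do):

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",       # example checkpoint
    min_pixels=256 * 28 * 28,          # maps onto Qwen2VLImageProcessorKwargs.min_pixels
    max_pixels=1280 * 28 * 28,         # maps onto Qwen2VLImageProcessorKwargs.max_pixels
)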
|
|
||||||
|
|
||||||
def smart_resize(
|
def smart_resize(
|
||||||
|
@ -41,14 +41,14 @@ from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
|
|||||||
from .image_processing_qwen2_vl import smart_resize
|
from .image_processing_qwen2_vl import smart_resize
|
||||||
|
|
||||||
|
|
||||||
class Qwen2VLVideoProcessorInitKwargs(VideosKwargs):
|
class Qwen2VLVideoProcessorInitKwargs(VideosKwargs, total=False):
|
||||||
min_pixels: Optional[int]
|
min_pixels: int
|
||||||
max_pixels: Optional[int]
|
max_pixels: int
|
||||||
patch_size: Optional[int]
|
patch_size: int
|
||||||
temporal_patch_size: Optional[int]
|
temporal_patch_size: int
|
||||||
merge_size: Optional[int]
|
merge_size: int
|
||||||
min_frames: Optional[int]
|
min_frames: int
|
||||||
max_frames: Optional[int]
|
max_frames: int
|
||||||
|
|
||||||
|
|
||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
|
@ -20,7 +20,7 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
import re
|
import re
|
||||||
from typing import Optional
|
from typing import Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
@ -34,17 +34,17 @@ from ...video_utils import VideoInput, make_batched_videos
|
|||||||
|
|
||||||
# Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni
|
# Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni
|
||||||
# and does not use them in video processor class
|
# and does not use them in video processor class
|
||||||
class Qwen3OmniMoeVideosKwargs(VideosKwargs):
|
class Qwen3OmniMoeVideosKwargs(VideosKwargs, total=False):
|
||||||
min_pixels: Optional[int]
|
min_pixels: int
|
||||||
max_pixels: Optional[int]
|
max_pixels: int
|
||||||
patch_size: Optional[int]
|
patch_size: int
|
||||||
temporal_patch_size: Optional[int]
|
temporal_patch_size: int
|
||||||
merge_size: Optional[int]
|
merge_size: int
|
||||||
min_frames: Optional[int]
|
min_frames: int
|
||||||
max_frames: Optional[int]
|
max_frames: int
|
||||||
use_audio_in_video: Optional[bool]
|
use_audio_in_video: bool
|
||||||
seconds_per_chunk: Optional[float]
|
seconds_per_chunk: float
|
||||||
position_id_per_seconds: Optional[int]
|
position_id_per_seconds: Union[int, float]
|
||||||
|
|
||||||
|
|
||||||
class Qwen3OmniMoeProcessorKwargs(ProcessingKwargs, total=False):
|
class Qwen3OmniMoeProcessorKwargs(ProcessingKwargs, total=False):
|
||||||
|
@ -64,12 +64,12 @@ def smart_resize(
|
|||||||
return h_bar, w_bar
|
return h_bar, w_bar
|
||||||
|
|
||||||
|
|
||||||
class Qwen3VLVideoProcessorInitKwargs(VideosKwargs):
|
class Qwen3VLVideoProcessorInitKwargs(VideosKwargs, total=False):
|
||||||
patch_size: Optional[int]
|
patch_size: int
|
||||||
temporal_patch_size: Optional[int]
|
temporal_patch_size: int
|
||||||
merge_size: Optional[int]
|
merge_size: int
|
||||||
min_frames: Optional[int]
|
min_frames: int
|
||||||
max_frames: Optional[int]
|
max_frames: int
|
||||||
|
|
||||||
|
|
||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
|
@ -68,7 +68,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
|||||||
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION,)
|
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION,)
|
||||||
|
|
||||||
|
|
||||||
class RTDetrImageProcessorKwargs(ImagesKwargs):
|
class RTDetrImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||||
@ -84,9 +84,9 @@ class RTDetrImageProcessorKwargs(ImagesKwargs):
|
|||||||
Path to the directory containing the segmentation masks.
|
Path to the directory containing the segmentation masks.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
format: Optional[Union[str, AnnotationFormat]]
|
format: Union[str, AnnotationFormat]
|
||||||
do_convert_annotations: Optional[bool]
|
do_convert_annotations: bool
|
||||||
return_segmentation_masks: Optional[bool]
|
return_segmentation_masks: bool
|
||||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
|
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
|
||||||
masks_path: Optional[Union[str, pathlib.Path]]
|
masks_path: Optional[Union[str, pathlib.Path]]
|
||||||
|
|
||||||
|
@ -58,7 +58,7 @@ if is_torchvision_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class SamImageProcessorKwargs(ImagesKwargs):
|
class SamImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
mask_size (`dict[str, int]`, *optional*):
|
mask_size (`dict[str, int]`, *optional*):
|
||||||
The size `{"longest_edge": int}` to resize the segmentation maps to.
|
The size `{"longest_edge": int}` to resize the segmentation maps to.
|
||||||
@ -67,8 +67,8 @@ class SamImageProcessorKwargs(ImagesKwargs):
|
|||||||
map size provided for preprocessing.
|
map size provided for preprocessing.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
mask_size: Optional[dict[str, int]]
|
mask_size: dict[str, int]
|
||||||
mask_pad_size: Optional[dict[str, int]]
|
mask_pad_size: dict[str, int]
|
||||||
|
|
||||||
|
|
||||||
class SamImageProcessor(BaseImageProcessor):
|
class SamImageProcessor(BaseImageProcessor):
|
||||||
|
@ -31,14 +31,14 @@ if is_torch_available():
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
|
||||||
class SamImagesKwargs(ImagesKwargs):
|
class SamImagesKwargs(ImagesKwargs, total=False):
|
||||||
segmentation_maps: Optional[ImageInput]
|
segmentation_maps: Optional[ImageInput]
|
||||||
input_points: Optional[list[list[float]]]
|
input_points: Optional[list[list[float]]]
|
||||||
input_labels: Optional[list[list[int]]]
|
input_labels: Optional[list[list[int]]]
|
||||||
input_boxes: Optional[list[list[list[float]]]]
|
input_boxes: Optional[list[list[list[float]]]]
|
||||||
point_pad_value: Optional[int]
|
point_pad_value: int
|
||||||
mask_size: Optional[dict[str, int]]
|
mask_size: dict[str, int]
|
||||||
mask_pad_size: Optional[dict[str, int]]
|
mask_pad_size: dict[str, int]
|
||||||
|
|
||||||
|
|
||||||
class SamProcessorKwargs(ProcessingKwargs, total=False):
|
class SamProcessorKwargs(ProcessingKwargs, total=False):
|
||||||
|
@ -43,13 +43,13 @@ from ...processing_utils import ImagesKwargs, Unpack
|
|||||||
from ...utils import TensorType, auto_docstring
|
from ...utils import TensorType, auto_docstring
|
||||||
|
|
||||||
|
|
||||||
class Sam2FastImageProcessorKwargs(ImagesKwargs):
|
class Sam2FastImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
mask_size (`dict[str, int]`, *optional*):
|
mask_size (`dict[str, int]`, *optional*):
|
||||||
The size `{"height": int, "width": int}` to resize the segmentation maps to.
|
The size `{"height": int, "width": int}` to resize the segmentation maps to.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
mask_size: Optional[dict[str, int]]
|
mask_size: dict[str, int]
|
||||||
|
|
||||||
|
|
||||||
def _compute_stability_score(masks: "torch.Tensor", mask_threshold: float, stability_score_offset: int):
|
def _compute_stability_score(masks: "torch.Tensor", mask_threshold: float, stability_score_offset: int):
|
||||||
|
@ -70,13 +70,13 @@ from .configuration_sam2 import (
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Sam2FastImageProcessorKwargs(ImagesKwargs):
|
class Sam2FastImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
mask_size (`dict[str, int]`, *optional*):
|
mask_size (`dict[str, int]`, *optional*):
|
||||||
The size `{"height": int, "width": int}` to resize the segmentation maps to.
|
The size `{"height": int, "width": int}` to resize the segmentation maps to.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
mask_size: Optional[dict[str, int]]
|
mask_size: dict[str, int]
|
||||||
|
|
||||||
|
|
||||||
@auto_docstring
|
@auto_docstring
|
||||||
|
@ -31,14 +31,14 @@ if is_torch_available():
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
|
||||||
class SamHQImagesKwargs(ImagesKwargs):
|
class SamHQImagesKwargs(ImagesKwargs, total=False):
|
||||||
segmentation_maps: Optional[ImageInput]
|
segmentation_maps: Optional[ImageInput]
|
||||||
input_points: Optional[list[list[float]]]
|
input_points: Optional[list[list[float]]]
|
||||||
input_labels: Optional[list[list[int]]]
|
input_labels: Optional[list[list[int]]]
|
||||||
input_boxes: Optional[list[list[list[float]]]]
|
input_boxes: Optional[list[list[list[float]]]]
|
||||||
point_pad_value: Optional[int]
|
point_pad_value: Optional[int]
|
||||||
mask_size: Optional[dict[str, int]]
|
mask_size: dict[str, int]
|
||||||
mask_pad_size: Optional[dict[str, int]]
|
mask_pad_size: dict[str, int]
|
||||||
|
|
||||||
|
|
||||||
class SamHQProcessorKwargs(ProcessingKwargs, total=False):
|
class SamHQProcessorKwargs(ProcessingKwargs, total=False):
|
||||||
|
@ -55,7 +55,7 @@ if is_torch_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class SegformerImageProcessorKwargs(ImagesKwargs):
|
class SegformerImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
|
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
|
||||||
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
|
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
|
||||||
@ -63,7 +63,7 @@ class SegformerImageProcessorKwargs(ImagesKwargs):
|
|||||||
ADE20k). The background label will be replaced by 255.
|
ADE20k). The background label will be replaced by 255.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
do_reduce_labels: Optional[bool]
|
do_reduce_labels: bool
|
||||||
|
|
||||||
|
|
||||||
@requires(backends=("vision",))
|
@requires(backends=("vision",))
|
||||||
|
@ -48,7 +48,7 @@ if is_vision_available():
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
class Siglip2ImageProcessorKwargs(ImagesKwargs):
|
class Siglip2ImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
"""
|
"""
|
||||||
patch_size (`int`, *optional*, defaults to 16):
|
patch_size (`int`, *optional*, defaults to 16):
|
||||||
The size (resolution) of each patch the image will be split to.
|
The size (resolution) of each patch the image will be split to.
|
||||||
@ -57,8 +57,8 @@ class Siglip2ImageProcessorKwargs(ImagesKwargs):
|
|||||||
and then padded in "patch" dimension to match this number exactly.
|
and then padded in "patch" dimension to match this number exactly.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
patch_size: Optional[int]
|
patch_size: int
|
||||||
max_num_patches: Optional[int]
|
max_num_patches: int
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=256)
|
@lru_cache(maxsize=256)
|
||||||
|
@ -53,7 +53,7 @@ if is_vision_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class SmolVLMImageProcessorKwargs(ImagesKwargs):
|
class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
"""
|
"""
|
||||||
do_image_splitting (`bool`, *optional*, defaults to `True`):
|
do_image_splitting (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to split the image into sub-images concatenated with the original image. They are split into patches
|
Whether to split the image into sub-images concatenated with the original image. They are split into patches
|
||||||
@ -64,9 +64,9 @@ class SmolVLMImageProcessorKwargs(ImagesKwargs):
|
|||||||
Whether to return the row and column information of the images.
|
Whether to return the row and column information of the images.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
do_image_splitting: Optional[bool]
|
do_image_splitting: bool
|
||||||
max_image_size: Optional[dict[str, int]]
|
max_image_size: dict[str, int]
|
||||||
return_row_col_info: Optional[bool]
|
return_row_col_info: bool
|
||||||
|
|
||||||
|
|
||||||
MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum
|
MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum
|
||||||
|
@ -90,8 +90,8 @@ def get_resize_output_image_size(
|
|||||||
return height, width
|
return height, width
|
||||||
|
|
||||||
|
|
||||||
class SmolVLMVideoProcessorInitKwargs(VideosKwargs):
|
class SmolVLMVideoProcessorInitKwargs(VideosKwargs, total=False):
|
||||||
max_image_size: Optional[dict[str, int]]
|
max_image_size: dict[str, int]
|
||||||
|
|
||||||
|
|
||||||
class SmolVLMVideoProcessor(BaseVideoProcessor):
|
class SmolVLMVideoProcessor(BaseVideoProcessor):
|
||||||
|
@ -46,13 +46,13 @@ if is_vision_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class SuperPointImageProcessorKwargs(ImagesKwargs):
|
class SuperPointImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
do_grayscale (`bool`, *optional*, defaults to `True`):
|
do_grayscale (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
|
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
do_grayscale: Optional[bool] = True
|
do_grayscale: bool
|
||||||
|
|
||||||
|
|
||||||
def is_grayscale(
|
def is_grayscale(
|
||||||
|
@ -38,8 +38,8 @@ from ...utils.deprecation import deprecate_kwarg
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Swin2SRImageProcessorKwargs(ImagesKwargs):
|
class Swin2SRImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
size_divisor: Optional[int]
|
size_divisor: int
|
||||||
|
|
||||||
|
|
||||||
class Swin2SRImageProcessor(BaseImageProcessor):
|
class Swin2SRImageProcessor(BaseImageProcessor):
|
||||||
|
@ -49,8 +49,8 @@ if is_vision_available():
|
|||||||
import PIL
|
import PIL
|
||||||
|
|
||||||
|
|
||||||
class TextNetImageProcessorKwargs(ImagesKwargs):
|
class TextNetImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
size_divisor: Optional[int]
|
size_divisor: int
|
||||||
|
|
||||||
|
|
||||||
class TextNetImageProcessor(BaseImageProcessor):
|
class TextNetImageProcessor(BaseImageProcessor):
|
||||||
|
@ -50,7 +50,7 @@ if is_vision_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class TvpImageProcessorKwargs(ImagesKwargs):
|
class TvpImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
do_flip_channel_order (`bool`, *optional*):
|
do_flip_channel_order (`bool`, *optional*):
|
||||||
Whether to flip the channel order of the image from RGB to BGR.
|
Whether to flip the channel order of the image from RGB to BGR.
|
||||||
@ -60,7 +60,7 @@ class TvpImageProcessorKwargs(ImagesKwargs):
|
|||||||
Padding mode to use — `'constant'`, `'edge'`, `'reflect'`, or `'symmetric'`.
|
Padding mode to use — `'constant'`, `'edge'`, `'reflect'`, or `'symmetric'`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
do_flip_channel_order: Optional[bool]
|
do_flip_channel_order: bool
|
||||||
constant_values: Optional[Union[float, list[float]]]
|
constant_values: Optional[Union[float, list[float]]]
|
||||||
pad_mode: Optional[str]
|
pad_mode: Optional[str]
|
||||||
|
|
||||||
|
@ -31,7 +31,7 @@ logger = logging.get_logger(__name__)
|
|||||||
|
|
||||||
class UdopTextKwargs(TextKwargs, total=False):
|
class UdopTextKwargs(TextKwargs, total=False):
|
||||||
word_labels: Optional[Union[list[int], list[list[int]]]]
|
word_labels: Optional[Union[list[int], list[list[int]]]]
|
||||||
boxes: Union[list[list[int]], list[list[list[int]]]]
|
boxes: Optional[Union[list[list[int]], list[list[list[int]]]]]
|
||||||
|
|
||||||
|
|
||||||
class UdopProcessorKwargs(ProcessingKwargs, total=False):
|
class UdopProcessorKwargs(ProcessingKwargs, total=False):
|
||||||
|
@ -47,8 +47,8 @@ if is_vision_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class ViltImageProcessorKwargs(ImagesKwargs):
|
class ViltImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
size_divisor: Optional[int]
|
size_divisor: int
|
||||||
|
|
||||||
|
|
||||||
def max_across_indices(values: Iterable[Any]) -> list[Any]:
|
def max_across_indices(values: Iterable[Any]) -> list[Any]:
|
||||||
|
@ -41,8 +41,8 @@ from ...utils.deprecation import deprecate_kwarg
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class VitMatteImageProcessorKwargs(ImagesKwargs):
|
class VitMatteImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
size_divisor: Optional[int]
|
size_divisor: int
|
||||||
|
|
||||||
|
|
||||||
class VitMatteImageProcessor(BaseImageProcessor):
|
class VitMatteImageProcessor(BaseImageProcessor):
|
||||||
|
@ -81,7 +81,7 @@ logger = logging.get_logger(__name__)
|
|||||||
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
|
||||||
|
|
||||||
|
|
||||||
class YolosImageProcessorKwargs(ImagesKwargs):
|
class YolosImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
r"""
|
r"""
|
||||||
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
|
||||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
||||||
@ -97,9 +97,9 @@ class YolosImageProcessorKwargs(ImagesKwargs):
|
|||||||
Path to the directory containing the segmentation masks.
|
Path to the directory containing the segmentation masks.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
format: Optional[Union[str, AnnotationFormat]]
|
format: Union[str, AnnotationFormat]
|
||||||
do_convert_annotations: Optional[bool]
|
do_convert_annotations: bool
|
||||||
return_segmentation_masks: Optional[bool]
|
return_segmentation_masks: bool
|
||||||
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
|
annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
|
||||||
masks_path: Optional[Union[str, pathlib.Path]]
|
masks_path: Optional[Union[str, pathlib.Path]]
|
||||||
|
|
||||||
|
@ -62,7 +62,7 @@ if is_torch_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class ZoeDepthImageProcessorKwargs(ImagesKwargs):
|
class ZoeDepthImageProcessorKwargs(ImagesKwargs, total=False):
|
||||||
"""
|
"""
|
||||||
keep_aspect_ratio (`bool`, *optional*, defaults to `True`):
|
keep_aspect_ratio (`bool`, *optional*, defaults to `True`):
|
||||||
If `True`, the image is resized by choosing the smaller of the height and width scaling factors and using it
|
If `True`, the image is resized by choosing the smaller of the height and width scaling factors and using it
|
||||||
@ -77,8 +77,8 @@ class ZoeDepthImageProcessorKwargs(ImagesKwargs):
|
|||||||
Can be overridden by `ensure_multiple_of` in `preprocess`.
|
Can be overridden by `ensure_multiple_of` in `preprocess`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
keep_aspect_ratio: Optional[bool]
|
keep_aspect_ratio: bool
|
||||||
ensure_multiple_of: Optional[int]
|
ensure_multiple_of: int
|
||||||
|
|
||||||
|
|
||||||
def get_resize_output_image_size(
|
def get_resize_output_image_size(
|
||||||
|
@ -25,10 +25,11 @@ import typing
|
|||||||
import warnings
|
import warnings
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Optional, TypedDict, TypeVar, Union
|
from typing import Annotated, Any, Literal, Optional, TypedDict, TypeVar, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import typing_extensions
|
import typing_extensions
|
||||||
|
from huggingface_hub.dataclasses import validate_typed_dict
|
||||||
from huggingface_hub.errors import EntryNotFoundError
|
from huggingface_hub.errors import EntryNotFoundError
|
||||||
|
|
||||||
from .audio_utils import AudioInput, load_audio
|
from .audio_utils import AudioInput, load_audio
|
||||||
@ -36,13 +37,23 @@ from .dynamic_module_utils import custom_object_save
|
|||||||
from .feature_extraction_utils import BatchFeature
|
from .feature_extraction_utils import BatchFeature
|
||||||
from .image_utils import ChannelDimension, ImageInput, is_vision_available
|
from .image_utils import ChannelDimension, ImageInput, is_vision_available
|
||||||
from .utils.chat_template_utils import render_jinja_template
|
from .utils.chat_template_utils import render_jinja_template
|
||||||
from .video_utils import VideoInput, VideoMetadata
|
from .utils.type_validators import (
|
||||||
|
device_validator,
|
||||||
|
image_size_validator,
|
||||||
|
padding_validator,
|
||||||
|
positive_any_number,
|
||||||
|
positive_int,
|
||||||
|
resampling_validator,
|
||||||
|
tensor_type_validator,
|
||||||
|
truncation_validator,
|
||||||
|
video_metadata_validator,
|
||||||
|
)
|
||||||
|
from .video_utils import VideoInput, VideoMetadataType
|
||||||
|
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
from .image_utils import PILImageResampling
|
from .image_utils import PILImageResampling
|
||||||
|
|
||||||
|
|
||||||
from .tokenization_utils_base import (
|
from .tokenization_utils_base import (
|
||||||
PaddingStrategy,
|
PaddingStrategy,
|
||||||
PreTokenizedInput,
|
PreTokenizedInput,
|
||||||
@ -72,8 +83,6 @@ from .utils.deprecation import deprecate_kwarg
|
|||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
|
||||||
|
|
||||||
from .modeling_utils import PreTrainedAudioTokenizerBase
|
from .modeling_utils import PreTrainedAudioTokenizerBase
|
||||||
|
|
||||||
|
|
||||||
@ -137,18 +146,22 @@ class TextKwargs(TypedDict, total=False):
|
|||||||
The side on which padding will be applied.
|
The side on which padding will be applied.
|
||||||
return_mm_token_type_ids (`bool`, *optional*):
|
return_mm_token_type_ids (`bool`, *optional*):
|
||||||
Whether to return multimodal token type ids indicating mm placeholder token positions.
|
Whether to return multimodal token type ids indicating mm placeholder token positions.
|
||||||
|
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
||||||
|
If set, will return tensors of a particular framework. Acceptable values are:
|
||||||
|
- `'pt'`: Return PyTorch `torch.Tensor` objects.
|
||||||
|
- `'np'`: Return NumPy `np.ndarray` objects.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]]
|
text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]]
|
||||||
text_target: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]
|
text_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]]
|
||||||
text_pair_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]]
|
text_pair_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]]
|
||||||
add_special_tokens: Optional[bool]
|
add_special_tokens: Optional[bool]
|
||||||
padding: Union[bool, str, PaddingStrategy]
|
padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()]
|
||||||
truncation: Union[bool, str, TruncationStrategy]
|
truncation: Annotated[Optional[Union[bool, str, TruncationStrategy]], truncation_validator()]
|
||||||
max_length: Optional[int]
|
max_length: Annotated[Optional[int], positive_int()]
|
||||||
stride: Optional[int]
|
stride: Annotated[Optional[int], positive_int()]
|
||||||
is_split_into_words: Optional[bool]
|
is_split_into_words: Optional[bool]
|
||||||
pad_to_multiple_of: Optional[int]
|
pad_to_multiple_of: Annotated[Optional[int], positive_int()]
|
||||||
return_token_type_ids: Optional[bool]
|
return_token_type_ids: Optional[bool]
|
||||||
return_attention_mask: Optional[bool]
|
return_attention_mask: Optional[bool]
|
||||||
return_overflowing_tokens: Optional[bool]
|
return_overflowing_tokens: Optional[bool]
|
||||||
@ -156,9 +169,9 @@ class TextKwargs(TypedDict, total=False):
|
|||||||
return_offsets_mapping: Optional[bool]
|
return_offsets_mapping: Optional[bool]
|
||||||
return_length: Optional[bool]
|
return_length: Optional[bool]
|
||||||
verbose: Optional[bool]
|
verbose: Optional[bool]
|
||||||
padding_side: Optional[str]
|
padding_side: Optional[Literal["left", "right"]]
|
||||||
return_mm_token_type_ids: Optional[bool]
|
return_mm_token_type_ids: Optional[bool]
|
||||||
return_tensors: Optional[Union[str, TensorType]]
|
return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()]
|
||||||
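The kwargs above are ultimately forwarded to the tokenizer; a hedged usage sketch of the typed text kwargs (the checkpoint name is only an example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # example checkpoint
encoded = tokenizer(
    "hello world",
    padding="max_length",   # must be a bool, a PaddingStrategy, or one of its string names
    truncation=True,
    max_length=16,          # annotated with positive_int() above
    return_tensors="np",
)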
|
|
||||||
|
|
||||||
class ImagesKwargs(TypedDict, total=False):
|
class ImagesKwargs(TypedDict, total=False):
|
||||||
@ -175,6 +188,8 @@ class ImagesKwargs(TypedDict, total=False):
|
|||||||
Resize the shorter side of the input to `size["shortest_edge"]`.
|
Resize the shorter side of the input to `size["shortest_edge"]`.
|
||||||
crop_size (`dict[str, int]`, *optional*):
|
crop_size (`dict[str, int]`, *optional*):
|
||||||
Desired output size when applying center-cropping.
|
Desired output size when applying center-cropping.
|
||||||
|
do_convert_rgb (`bool`):
|
||||||
|
Whether to convert the video to RGB format.
|
||||||
resample (`PILImageResampling`, *optional*):
|
resample (`PILImageResampling`, *optional*):
|
||||||
Resampling filter to use if resizing the image.
|
Resampling filter to use if resizing the image.
|
||||||
do_rescale (`bool`, *optional*):
|
do_rescale (`bool`, *optional*):
|
||||||
@ -183,9 +198,9 @@ class ImagesKwargs(TypedDict, total=False):
|
|||||||
Scale factor to use if rescaling the image.
|
Scale factor to use if rescaling the image.
|
||||||
do_normalize (`bool`, *optional*):
|
do_normalize (`bool`, *optional*):
|
||||||
Whether to normalize the image.
|
Whether to normalize the image.
|
||||||
image_mean (`float` or `list[float]`, *optional*):
|
image_mean (`float` or `list[float] or tuple[float, float, float]`, *optional*):
|
||||||
Mean to use if normalizing the image.
|
Mean to use if normalizing the image.
|
||||||
image_std (`float` or `list[float]`, *optional*):
|
image_std (`float` or `list[float] or tuple[float, float, float]`, *optional*):
|
||||||
Standard deviation to use if normalizing the image.
|
Standard deviation to use if normalizing the image.
|
||||||
do_pad (`bool`, *optional*):
|
do_pad (`bool`, *optional*):
|
||||||
Whether to pad the images in the batch.
|
Whether to pad the images in the batch.
|
||||||
@ -199,28 +214,32 @@ class ImagesKwargs(TypedDict, total=False):
|
|||||||
The channel dimension format for the input image.
|
The channel dimension format for the input image.
|
||||||
device (`Union[str, torch.Tensor]`, *optional*):
|
device (`Union[str, torch.Tensor]`, *optional*):
|
||||||
The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing.
|
The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing.
|
||||||
|
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
||||||
|
If set, will return tensors of a particular framework. Acceptable values are:
|
||||||
|
- `'pt'`: Return PyTorch `torch.Tensor` objects.
|
||||||
|
- `'np'`: Return NumPy `np.ndarray` objects.
|
||||||
disable_grouping (`bool`, *optional*):
|
disable_grouping (`bool`, *optional*):
|
||||||
Whether to group images by shapes when processing or not, only relevant for fast image processing.
|
Whether to group images by shapes when processing or not, only relevant for fast image processing.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
do_convert_rgb: Optional[bool]
|
do_convert_rgb: Optional[bool]
|
||||||
do_resize: Optional[bool]
|
do_resize: Optional[bool]
|
||||||
size: Optional[dict[str, int]]
|
size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()]
|
||||||
crop_size: Optional[dict[str, int]]
|
crop_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()]
|
||||||
resample: Optional[Union["PILImageResampling", int]]
|
resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()]
|
||||||
do_rescale: Optional[bool]
|
do_rescale: Optional[bool]
|
||||||
rescale_factor: Optional[float]
|
rescale_factor: Optional[float]
|
||||||
do_normalize: Optional[bool]
|
do_normalize: Optional[bool]
|
||||||
image_mean: Optional[Union[float, list[float]]]
|
image_mean: Optional[Union[float, list[float], tuple[float, ...]]]
|
||||||
image_std: Optional[Union[float, list[float]]]
|
image_std: Optional[Union[float, list[float], tuple[float, ...]]]
|
||||||
do_pad: Optional[bool]
|
do_pad: Optional[bool]
|
||||||
pad_size: Optional[dict[str, int]]
|
pad_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()]
|
||||||
do_center_crop: Optional[bool]
|
do_center_crop: Optional[bool]
|
||||||
data_format: Optional[ChannelDimension]
|
data_format: Optional[Union[str, ChannelDimension]]
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]]
|
input_data_format: Optional[Union[str, ChannelDimension]]
|
||||||
device: Optional[Union[str, "torch.device"]]
|
device: Annotated[Optional[str], device_validator()]
|
||||||
|
return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()]
|
||||||
disable_grouping: Optional[bool]
|
disable_grouping: Optional[bool]
|
||||||
return_tensors: Optional[Union[str, TensorType]]
|
|
||||||
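A hedged usage sketch of the image kwargs typed above, using a fast image processor (the checkpoint is only an example; any model with a fast image processor would do):

from PIL import Image
from transformers import AutoImageProcessor

image = Image.new("RGB", (640, 480))
image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32", use_fast=True)
batch = image_processor(
    image,
    size={"shortest_edge": 224},  # accepted by the widened `size` annotation
    device="cpu",                 # `device` is now typed as a plain string
    return_tensors="pt",
)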
|
|
||||||
|
|
||||||
class VideosKwargs(TypedDict, total=False):
|
class VideosKwargs(TypedDict, total=False):
|
||||||
@ -244,9 +263,9 @@ class VideosKwargs(TypedDict, total=False):
|
|||||||
Scale factor to use if rescaling the video.
|
Scale factor to use if rescaling the video.
|
||||||
do_normalize (`bool`, *optional*):
|
do_normalize (`bool`, *optional*):
|
||||||
Whether to normalize the video.
|
Whether to normalize the video.
|
||||||
image_mean (`float` or `list[float]`, *optional*):
|
image_mean (`float` or `list[float] or tuple[float, float, float]`, *optional*):
|
||||||
Mean to use if normalizing the video.
|
Mean to use if normalizing the video.
|
||||||
image_std (`float` or `list[float]`, *optional*):
|
image_std (`float` or `list[float] or tuple[float, float, float]`, *optional*):
|
||||||
Standard deviation to use if normalizing the video.
|
Standard deviation to use if normalizing the video.
|
||||||
do_center_crop (`bool`, *optional*):
|
do_center_crop (`bool`, *optional*):
|
||||||
Whether to center crop the video.
|
Whether to center crop the video.
|
||||||
@ -268,32 +287,36 @@ class VideosKwargs(TypedDict, total=False):
|
|||||||
The channel dimension format for the input video.
|
The channel dimension format for the input video.
|
||||||
device (`Union[str, torch.Tensor]`, *optional*):
|
device (`Union[str, torch.Tensor]`, *optional*):
|
||||||
The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing.
|
The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing.
|
||||||
return_metadata (`ChannelDimension` or `str`, *optional*):
|
return_metadata (`bool`, *optional*):
|
||||||
Whether to return video metadata or not.
|
Whether to return video metadata or not.
|
||||||
|
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
||||||
|
If set, will return tensors of a particular framework. Acceptable values are:
|
||||||
|
- `'pt'`: Return PyTorch `torch.Tensor` objects.
|
||||||
|
- `'np'`: Return NumPy `np.ndarray` objects.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
do_convert_rgb: Optional[bool]
|
do_convert_rgb: Optional[bool]
|
||||||
do_resize: Optional[bool]
|
do_resize: Optional[bool]
|
||||||
size: Optional[dict[str, int]]
|
size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()]
|
||||||
default_to_square: Optional[bool]
|
default_to_square: Optional[bool]
|
||||||
resample: Optional["PILImageResampling"]
|
resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()]
|
||||||
do_rescale: Optional[bool]
|
do_rescale: Optional[bool]
|
||||||
rescale_factor: Optional[float]
|
rescale_factor: Optional[float]
|
||||||
do_normalize: Optional[bool]
|
do_normalize: Optional[bool]
|
||||||
image_mean: Optional[Union[float, list[float]]]
|
image_mean: Optional[Union[float, list[float], tuple[float, ...]]]
|
||||||
image_std: Optional[Union[float, list[float]]]
|
image_std: Optional[Union[float, list[float], tuple[float, ...]]]
|
||||||
do_center_crop: Optional[bool]
|
do_center_crop: Optional[bool]
|
||||||
do_pad: Optional[bool]
|
do_pad: Optional[bool]
|
||||||
crop_size: Optional[dict[str, int]]
|
crop_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()]
|
||||||
data_format: Optional[ChannelDimension]
|
data_format: Optional[Union[str, ChannelDimension]]
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]]
|
input_data_format: Optional[Union[str, ChannelDimension]]
|
||||||
device: Optional[Union[str, "torch.device"]]
|
device: Annotated[Optional[str], device_validator()]
|
||||||
do_sample_frames: Optional[bool]
|
do_sample_frames: Optional[bool]
|
||||||
video_metadata: Optional[Union[VideoMetadata, dict]]
|
video_metadata: Annotated[Optional[VideoMetadataType], video_metadata_validator()]
|
||||||
fps: Optional[Union[int, float]]
|
fps: Annotated[Optional[Union[int, float]], positive_any_number()]
|
||||||
num_frames: Optional[int]
|
num_frames: Annotated[Optional[int], positive_int()]
|
||||||
return_metadata: Optional[bool]
|
return_metadata: Optional[bool]
|
||||||
return_tensors: Optional[Union[str, TensorType]]
|
return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()]
|
||||||
|
|
||||||
|
|
||||||
class AudioKwargs(TypedDict, total=False):
|
class AudioKwargs(TypedDict, total=False):
|
||||||
@ -324,16 +347,20 @@ class AudioKwargs(TypedDict, total=False):
|
|||||||
If set, will pad the sequence to a multiple of the provided value.
|
If set, will pad the sequence to a multiple of the provided value.
|
||||||
return_attention_mask (`bool`, *optional*):
|
return_attention_mask (`bool`, *optional*):
|
||||||
Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`.
|
Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`.
|
||||||
|
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
||||||
|
If set, will return tensors of a particular framework. Acceptable values are:
|
||||||
|
- `'pt'`: Return PyTorch `torch.Tensor` objects.
|
||||||
|
- `'np'`: Return NumPy `np.ndarray` objects.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
sampling_rate: Optional[int]
|
sampling_rate: Annotated[Optional[int], positive_int()]
|
||||||
raw_speech: Optional[Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]]]
|
raw_speech: Optional[Union["np.ndarray", list[float], list["np.ndarray"], list[list[float]]]]
|
||||||
padding: Optional[Union[bool, str, PaddingStrategy]]
|
padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()]
|
||||||
max_length: Optional[int]
|
max_length: Annotated[Optional[int], positive_int()]
|
||||||
truncation: Optional[bool]
|
truncation: Annotated[Optional[Union[bool, str, TruncationStrategy]], truncation_validator()]
|
||||||
pad_to_multiple_of: Optional[int]
|
pad_to_multiple_of: Annotated[Optional[int], positive_int()]
|
||||||
return_attention_mask: Optional[bool]
|
return_attention_mask: Optional[bool]
|
||||||
return_tensors: Optional[Union[str, TensorType]]
|
return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()]
|
||||||
|
|
||||||
|
|
||||||
class ProcessingKwargs(TypedDict, total=False):
|
class ProcessingKwargs(TypedDict, total=False):
|
||||||
@ -1361,6 +1388,18 @@ class ProcessorMixin(PushToHubMixin):
|
|||||||
f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored."
|
f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
for key, typed_dict_obj in ModelProcessorKwargs.__annotations__.items():
|
||||||
|
if key in map_preprocessor_kwargs:
|
||||||
|
preprocessor = getattr(self, map_preprocessor_kwargs[key], None)
|
||||||
|
if preprocessor is None or getattr(preprocessor, "valid_kwargs", None) is None:
|
||||||
|
continue
|
||||||
|
preprocessor_typed_dict_obj = getattr(preprocessor, "valid_kwargs")
|
||||||
|
typed_dict_obj = TypedDict(
|
||||||
|
"merged_typed_dict",
|
||||||
|
{**preprocessor_typed_dict_obj.__annotations__, **typed_dict_obj.__annotations__},
|
||||||
|
total=False,
|
||||||
|
)
|
||||||
|
validate_typed_dict(typed_dict_obj, output_kwargs[key])
|
||||||
return output_kwargs
|
return output_kwargs
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
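The merge above folds each preprocessor's own TypedDict annotations into the model-specific ones before validation. A minimal standalone sketch of that step, using hypothetical class names rather than anything from this diff:

from typing import Optional, TypedDict

class BaseImagesKwargs(TypedDict, total=False):
    do_resize: Optional[bool]
    size: Optional[dict[str, int]]

class ModelImagesKwargs(TypedDict, total=False):
    crop_to_patches: Optional[bool]  # hypothetical model-specific kwarg

# Functional TypedDict constructor, mirroring the merged_typed_dict built in the loop above
MergedKwargs = TypedDict(
    "merged_typed_dict",
    {**BaseImagesKwargs.__annotations__, **ModelImagesKwargs.__annotations__},
    total=False,
)
print(sorted(MergedKwargs.__annotations__))  # ['crop_to_patches', 'do_resize', 'size']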
|
115
src/transformers/utils/type_validators.py
Normal file
115
src/transformers/utils/type_validators.py
Normal file
@ -0,0 +1,115 @@
|
|||||||
|
from collections.abc import Sequence
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
from ..tokenization_utils_base import PaddingStrategy, TruncationStrategy
|
||||||
|
from ..video_utils import VideoMetadataType
|
||||||
|
from .generic import TensorType
|
||||||
|
from .import_utils import is_vision_available
|
||||||
|
|
||||||
|
|
||||||
|
if is_vision_available():
|
||||||
|
from ..image_utils import PILImageResampling
|
||||||
|
|
||||||
|
|
||||||
|
def positive_any_number(value: Optional[Union[int, float]] = None):
|
||||||
|
if value is not None and (not isinstance(value, (int, float)) or not value >= 0):
|
||||||
|
raise ValueError(f"Value must be a positive integer or floating number, got {value}")
|
||||||
|
|
||||||
|
|
||||||
|
def positive_int(value: Optional[int] = None):
|
||||||
|
if value is not None and (not isinstance(value, int) or not value >= 0):
|
||||||
|
raise ValueError(f"Value must be a positive integer, got {value}")
|
||||||
|
|
||||||
|
|
||||||
|
def padding_validator(value: Optional[Union[bool, str, PaddingStrategy]] = None):
|
||||||
|
possible_names = ["longest", "max_length", "do_not_pad"]
|
||||||
|
if value is None:
|
||||||
|
pass
|
||||||
|
elif not isinstance(value, (bool, str, PaddingStrategy)):
|
||||||
|
raise ValueError("Value for padding must be either a boolean, a string or a `PaddingStrategy`")
|
||||||
|
elif isinstance(value, str) and value not in possible_names:
|
||||||
|
raise ValueError(f"If padding is a string, the value must be one of {possible_names}")
|
||||||
|
|
||||||
|
|
||||||
|
def truncation_validator(value: Optional[Union[bool, str, TruncationStrategy]] = None):
|
||||||
|
possible_names = ["only_first", "only_second", "longest_first", "do_not_truncate"]
|
||||||
|
if value is None:
|
||||||
|
pass
|
||||||
|
elif not isinstance(value, (bool, str, TruncationStrategy)):
|
||||||
|
raise ValueError("Value for truncation must be either a boolean, a string or a `TruncationStrategy`")
|
||||||
|
elif isinstance(value, str) and value not in possible_names:
|
||||||
|
raise ValueError(f"If truncation is a string, value must be one of {possible_names}")
|
||||||
|
|
||||||
|
|
||||||
|
def image_size_validator(value: Optional[Union[int, Sequence[int], dict[str, int]]] = None):
|
||||||
|
possible_keys = ["height", "width", "longest_edge", "shortest_edge", "max_height", "max_width"]
|
||||||
|
if value is None:
|
||||||
|
pass
|
||||||
|
elif isinstance(value, dict) and any(k not in possible_keys for k in value.keys()):
|
||||||
|
raise ValueError(f"Value for size must be a dict with keys {possible_keys} but got size={value}")
|
||||||
|
|
||||||
|
|
||||||
|
def device_validator(value: Optional[Union[str, int]] = None):
|
||||||
|
possible_names = ["cpu", "cuda", "xla", "xpu", "mps", "meta"]
|
||||||
|
if value is None:
|
||||||
|
pass
|
||||||
|
elif isinstance(value, int) and value < 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"If device is an integer, the value must be a strictly positive integer but got device={value}"
|
||||||
|
)
|
||||||
|
elif isinstance(value, str) and value.split(":")[0] not in possible_names:
|
||||||
|
raise ValueError(f"If device is an string, the value must be one of {possible_names} but got device={value}")
|
||||||
|
elif not isinstance(value, (int, str)):
|
||||||
|
raise ValueError(
|
||||||
|
f"Device must be either an integer device ID or a string (e.g., 'cpu', 'cuda:0'), but got device={value}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def resampling_validator(value: Optional[Union[int, "PILImageResampling"]] = None):
|
||||||
|
if value is None:
|
||||||
|
pass
|
||||||
|
elif isinstance(value, int) and value not in list(range(6)):
|
||||||
|
raise ValueError(
|
||||||
|
f"The resampling should be one of {list(range(6))} when provided as integer, but got resampling={value}"
|
||||||
|
)
|
||||||
|
elif is_vision_available() and not isinstance(value, (PILImageResampling, int)):
|
||||||
|
raise ValueError(f"The resampling should an integer or `PIL.Image.Resampling`, but got resampling={value}")
|
||||||
|
|
||||||
|
|
||||||
|
def video_metadata_validator(value: Optional[VideoMetadataType] = None):
|
||||||
|
if value is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
valid_keys = ["total_num_frames", "fps", "width", "height", "duration", "video_backend", "frames_indices"]
|
||||||
|
|
||||||
|
def check_dict_keys(d: dict) -> bool:
|
||||||
|
return all(key in valid_keys for key in d.keys())
|
||||||
|
|
||||||
|
if isinstance(value, Sequence) and isinstance(value[0], Sequence) and isinstance(value[0][0], dict):
|
||||||
|
for sublist in value:
|
||||||
|
for item in sublist:
|
||||||
|
if not check_dict_keys(item):
|
||||||
|
raise ValueError(
|
||||||
|
f"Invalid keys found in video metadata. Valid keys: {valid_keys} got: {list(item.keys())}"
|
||||||
|
)
|
||||||
|
|
||||||
|
elif isinstance(value, Sequence) and isinstance(value[0], dict):
|
||||||
|
for item in value:
|
||||||
|
if not check_dict_keys(item):
|
||||||
|
raise ValueError(
|
||||||
|
f"Invalid keys found in video metadata. Valid keys: {valid_keys} got: {list(item.keys())}"
|
||||||
|
)
|
||||||
|
|
||||||
|
elif isinstance(value, dict):
|
||||||
|
if not check_dict_keys(value):
|
||||||
|
raise ValueError(
|
||||||
|
f"Invalid keys found in video metadata. Valid keys: {valid_keys}, got: {list(value.keys())}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def tensor_type_validator(value: Optional[Union[str, TensorType]] = None):
|
||||||
|
possible_names = ["pt", "np", "mlx"]
|
||||||
|
if value is None:
|
||||||
|
pass
|
||||||
|
elif not isinstance(value, str) or value not in possible_names:
|
||||||
|
raise ValueError(f"The tensor type should be one of {possible_names} but got tensor_type={value}")
|
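As a quick sanity check of the new helpers, a minimal sketch; the import path assumes the module lands at src/transformers/utils/type_validators.py as shown above:

from transformers.utils.type_validators import device_validator, positive_int

positive_int(8)              # accepted: non-negative int
device_validator("cuda:0")   # accepted: known device string with an index
try:
    positive_int(-3)         # rejected by the isinstance / >= 0 check above
except ValueError as err:
    print(err)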
@ -21,6 +21,7 @@ from functools import partial
|
|||||||
from typing import Any, Callable, Optional, Union
|
from typing import Any, Callable, Optional, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from huggingface_hub.dataclasses import validate_typed_dict
|
||||||
|
|
||||||
from .dynamic_module_utils import custom_object_save
|
from .dynamic_module_utils import custom_object_save
|
||||||
from .image_processing_utils import (
|
from .image_processing_utils import (
|
||||||
@ -358,6 +359,10 @@ class BaseVideoProcessor(BaseImageProcessorFast):
|
|||||||
captured_kwargs=kwargs.keys(),
|
captured_kwargs=kwargs.keys(),
|
||||||
valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"],
|
valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Perform type validation on received kwargs
|
||||||
|
validate_typed_dict(self.valid_kwargs, kwargs)
|
||||||
|
|
||||||
# Set default kwargs from self. This ensures that if a kwarg is not provided
|
# Set default kwargs from self. This ensures that if a kwarg is not provided
|
||||||
# by the user, it gets its default value from the instance, or is set to None.
|
# by the user, it gets its default value from the instance, or is set to None.
|
||||||
for kwarg_name in self.valid_kwargs.__annotations__:
|
for kwarg_name in self.valid_kwargs.__annotations__:
|
||||||
|
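The added call means a mistyped kwarg now fails before any processing happens. A minimal sketch, assuming validate_typed_dict raises when a provided value does not match its annotated type; the VideosKwargs class here is a stand-in, not the real one:

from typing import Optional, TypedDict

from huggingface_hub.dataclasses import validate_typed_dict

class VideosKwargs(TypedDict, total=False):
    do_resize: Optional[bool]
    num_frames: Optional[int]

validate_typed_dict(VideosKwargs, {"do_resize": True, "num_frames": 8})  # passes
validate_typed_dict(VideosKwargs, {"num_frames": "eight"})               # expected to raise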
@ -112,6 +112,11 @@ class VideoMetadata(Mapping):
|
|||||||
setattr(self, key, value)
|
setattr(self, key, value)
|
||||||
|
|
||||||
|
|
||||||
|
VideoMetadataType = Union[
|
||||||
|
VideoMetadata, dict, list[Union[dict, VideoMetadata]], list[list[Union[dict, VideoMetadata]]]
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def is_valid_video_frame(frame):
|
def is_valid_video_frame(frame):
|
||||||
return isinstance(frame, PIL.Image.Image) or (
|
return isinstance(frame, PIL.Image.Image) or (
|
||||||
(is_numpy_array(frame) or is_torch_tensor(frame)) and frame.ndim == 3
|
(is_numpy_array(frame) or is_torch_tensor(frame)) and frame.ndim == 3
|
||||||
@ -217,7 +222,7 @@ def make_batched_videos(videos) -> list[Union[np.ndarray, "torch.Tensor", "URL",
|
|||||||
return flat_videos_list
|
return flat_videos_list
|
||||||
|
|
||||||
|
|
||||||
def make_batched_metadata(videos: VideoInput, video_metadata: Union[VideoMetadata, dict]):
|
def make_batched_metadata(videos: VideoInput, video_metadata: VideoMetadataType) -> list[VideoMetadata]:
|
||||||
if video_metadata is None:
|
if video_metadata is None:
|
||||||
# Create default metadata and fill attributes we can infer from given video
|
# Create default metadata and fill attributes we can infer from given video
|
||||||
video_metadata = [
|
video_metadata = [
|
||||||
|
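With the widened VideoMetadataType, metadata can be passed as a single dict, a flat list with one entry per video, or a nested list with one sub-list per batch item; the validator added above only checks that the dict keys are recognized. A small sketch with illustrative values:

from transformers.utils.type_validators import video_metadata_validator

single = {"fps": 24, "total_num_frames": 120}
flat = [{"fps": 24}, {"fps": 30}]
nested = [[{"fps": 24}], [{"fps": 30}, {"fps": 60}]]

for metadata in (single, flat, nested):
    video_metadata_validator(metadata)  # no error: all keys are in valid_keys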
@ -176,8 +176,8 @@ class Cohere2VisionProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_inputs[0],
|
image_inputs[0],
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
input_data_format="channels_last",
|
input_data_format="channels_last",
|
||||||
image_mean=0,
|
image_mean=(0.0, 0.0, 0.0, 0.0),
|
||||||
image_std=1,
|
image_std=(1.0, 1.0, 1.0, 1.0),
|
||||||
).pixel_values
|
).pixel_values
|
||||||
self.assertEqual(tuple(encoded_images.shape), (10, 4, 30, 30))
|
self.assertEqual(tuple(encoded_images.shape), (10, 4, 30, 30))
|
||||||
|
|
||||||
@ -186,7 +186,7 @@ class Cohere2VisionProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_inputs,
|
image_inputs,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
input_data_format="channels_last",
|
input_data_format="channels_last",
|
||||||
image_mean=0,
|
image_mean=(0.0, 0.0, 0.0, 0.0),
|
||||||
image_std=1,
|
image_std=(1.0, 1.0, 1.0, 1.0),
|
||||||
).pixel_values
|
).pixel_values
|
||||||
self.assertEqual(tuple(encoded_images.shape), (70, 4, 30, 30))
|
self.assertEqual(tuple(encoded_images.shape), (70, 4, 30, 30))
|
||||||
|
@ -133,7 +133,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
def test_image_processor_defaults_preserved_by_image_kwargs(self):
|
def test_image_processor_defaults_preserved_by_image_kwargs(self):
|
||||||
"""
|
"""
|
||||||
We use do_rescale=True, rescale_factor=-1 to ensure that image_processor kwargs are preserved in the processor.
|
We use do_rescale=True, rescale_factor=-1.0 to ensure that image_processor kwargs are preserved in the processor.
|
||||||
We then check that the mean of the pixel_values is less than or equal to 0 after processing.
|
We then check that the mean of the pixel_values is less than or equal to 0 after processing.
|
||||||
Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied.
|
Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied.
|
||||||
"""
|
"""
|
||||||
@ -141,7 +141,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||||
processor_components = self.prepare_components()
|
processor_components = self.prepare_components()
|
||||||
processor_components["image_processor"] = self.get_component(
|
processor_components["image_processor"] = self.get_component(
|
||||||
"image_processor", do_rescale=True, rescale_factor=-1
|
"image_processor", do_rescale=True, rescale_factor=-1.0
|
||||||
)
|
)
|
||||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
||||||
|
|
||||||
@ -179,7 +179,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
|
|
||||||
inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt")
|
inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1.0, return_tensors="pt")
|
||||||
self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
|
self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
|
||||||
|
|
||||||
def test_unstructured_kwargs(self):
|
def test_unstructured_kwargs(self):
|
||||||
@ -194,7 +194,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
text=input_str,
|
text=input_str,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
do_rescale=True,
|
do_rescale=True,
|
||||||
rescale_factor=-1,
|
rescale_factor=-1.0,
|
||||||
padding="max_length",
|
padding="max_length",
|
||||||
max_length=76,
|
max_length=76,
|
||||||
)
|
)
|
||||||
@ -213,7 +213,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
images=image_input,
|
images=image_input,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
do_rescale=True,
|
do_rescale=True,
|
||||||
rescale_factor=-1,
|
rescale_factor=-1.0,
|
||||||
padding="longest",
|
padding="longest",
|
||||||
max_length=76,
|
max_length=76,
|
||||||
)
|
)
|
||||||
@ -231,7 +231,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
with self.assertRaises(ValueError):
|
with self.assertRaises(ValueError):
|
||||||
_ = processor(
|
_ = processor(
|
||||||
images=image_input,
|
images=image_input,
|
||||||
images_kwargs={"do_rescale": True, "rescale_factor": -1},
|
images_kwargs={"do_rescale": True, "rescale_factor": -1.0},
|
||||||
do_rescale=True,
|
do_rescale=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
)
|
)
|
||||||
@ -248,7 +248,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
# Define the kwargs for each modality
|
# Define the kwargs for each modality
|
||||||
all_kwargs = {
|
all_kwargs = {
|
||||||
"common_kwargs": {"return_tensors": "pt"},
|
"common_kwargs": {"return_tensors": "pt"},
|
||||||
"images_kwargs": {"do_rescale": True, "rescale_factor": -1},
|
"images_kwargs": {"do_rescale": True, "rescale_factor": -1.0},
|
||||||
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -268,7 +268,7 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
# Define the kwargs for each modality
|
# Define the kwargs for each modality
|
||||||
all_kwargs = {
|
all_kwargs = {
|
||||||
"common_kwargs": {"return_tensors": "pt"},
|
"common_kwargs": {"return_tensors": "pt"},
|
||||||
"images_kwargs": {"do_rescale": True, "rescale_factor": -1},
|
"images_kwargs": {"do_rescale": True, "rescale_factor": -1.0},
|
||||||
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -132,7 +132,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
def test_image_processor_defaults_preserved_by_image_kwargs(self):
|
def test_image_processor_defaults_preserved_by_image_kwargs(self):
|
||||||
"""
|
"""
|
||||||
We use do_rescale=True, rescale_factor=-1 to ensure that image_processor kwargs are preserved in the processor.
|
We use do_rescale=True, rescale_factor=-1.0 to ensure that image_processor kwargs are preserved in the processor.
|
||||||
We then check that the mean of the pixel_values is less than or equal to 0 after processing.
|
We then check that the mean of the pixel_values is less than or equal to 0 after processing.
|
||||||
Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied.
|
Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied.
|
||||||
"""
|
"""
|
||||||
@ -140,7 +140,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||||
processor_components = self.prepare_components()
|
processor_components = self.prepare_components()
|
||||||
processor_components["image_processor"] = self.get_component(
|
processor_components["image_processor"] = self.get_component(
|
||||||
"image_processor", do_rescale=True, rescale_factor=-1
|
"image_processor", do_rescale=True, rescale_factor=-1.0
|
||||||
)
|
)
|
||||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
||||||
|
|
||||||
@ -178,7 +178,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
|
|
||||||
inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt")
|
inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1.0, return_tensors="pt")
|
||||||
self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
|
self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
|
||||||
|
|
||||||
def test_unstructured_kwargs(self):
|
def test_unstructured_kwargs(self):
|
||||||
@ -193,7 +193,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
text=input_str,
|
text=input_str,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
do_rescale=True,
|
do_rescale=True,
|
||||||
rescale_factor=-1,
|
rescale_factor=-1.0,
|
||||||
padding="max_length",
|
padding="max_length",
|
||||||
max_length=76,
|
max_length=76,
|
||||||
)
|
)
|
||||||
@ -212,7 +212,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
images=image_input,
|
images=image_input,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
do_rescale=True,
|
do_rescale=True,
|
||||||
rescale_factor=-1,
|
rescale_factor=-1.0,
|
||||||
padding="longest",
|
padding="longest",
|
||||||
max_length=76,
|
max_length=76,
|
||||||
)
|
)
|
||||||
@ -230,7 +230,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
with self.assertRaises(ValueError):
|
with self.assertRaises(ValueError):
|
||||||
_ = processor(
|
_ = processor(
|
||||||
images=image_input,
|
images=image_input,
|
||||||
images_kwargs={"do_rescale": True, "rescale_factor": -1},
|
images_kwargs={"do_rescale": True, "rescale_factor": -1.0},
|
||||||
do_rescale=True,
|
do_rescale=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
)
|
)
|
||||||
@ -247,7 +247,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
# Define the kwargs for each modality
|
# Define the kwargs for each modality
|
||||||
all_kwargs = {
|
all_kwargs = {
|
||||||
"common_kwargs": {"return_tensors": "pt"},
|
"common_kwargs": {"return_tensors": "pt"},
|
||||||
"images_kwargs": {"do_rescale": True, "rescale_factor": -1},
|
"images_kwargs": {"do_rescale": True, "rescale_factor": -1.0},
|
||||||
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -267,7 +267,7 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
# Define the kwargs for each modality
|
# Define the kwargs for each modality
|
||||||
all_kwargs = {
|
all_kwargs = {
|
||||||
"common_kwargs": {"return_tensors": "pt"},
|
"common_kwargs": {"return_tensors": "pt"},
|
||||||
"images_kwargs": {"do_rescale": True, "rescale_factor": -1},
|
"images_kwargs": {"do_rescale": True, "rescale_factor": -1.0},
|
||||||
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -236,8 +236,8 @@ class ViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_inputs[0],
|
image_inputs[0],
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
input_data_format="channels_last",
|
input_data_format="channels_last",
|
||||||
image_mean=0,
|
image_mean=(0.0, 0.0, 0.0, 0.0),
|
||||||
image_std=1,
|
image_std=(1.0, 1.0, 1.0, 1.0),
|
||||||
).pixel_values
|
).pixel_values
|
||||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
|
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
|
||||||
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
|
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
|
||||||
@ -247,8 +247,8 @@ class ViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_inputs,
|
image_inputs,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
input_data_format="channels_last",
|
input_data_format="channels_last",
|
||||||
image_mean=0,
|
image_mean=(0.0, 0.0, 0.0, 0.0),
|
||||||
image_std=1,
|
image_std=(1.0, 1.0, 1.0, 1.0),
|
||||||
).pixel_values
|
).pixel_values
|
||||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
|
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
|
||||||
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
|
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
|
||||||
|
@ -250,8 +250,8 @@ class Glm4vVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
|
|||||||
video_inputs[0],
|
video_inputs[0],
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
input_data_format="channels_last",
|
input_data_format="channels_last",
|
||||||
image_mean=0,
|
image_mean=(0.0, 0.0, 0.0, 0.0),
|
||||||
image_std=1,
|
image_std=(1.0, 1.0, 1.0, 1.0),
|
||||||
)[self.input_name]
|
)[self.input_name]
|
||||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
|
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
|
||||||
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||||
@ -261,8 +261,8 @@ class Glm4vVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
|
|||||||
video_inputs,
|
video_inputs,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
input_data_format="channels_last",
|
input_data_format="channels_last",
|
||||||
image_mean=0,
|
image_mean=(0.0, 0.0, 0.0, 0.0),
|
||||||
image_std=1,
|
image_std=(1.0, 1.0, 1.0, 1.0),
|
||||||
)[self.input_name]
|
)[self.input_name]
|
||||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
|
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
|
||||||
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||||
|
@ -444,7 +444,7 @@ class JanusProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
tokenize=True,
|
tokenize=True,
|
||||||
return_dict=True,
|
return_dict=True,
|
||||||
do_rescale=True,
|
do_rescale=True,
|
||||||
rescale_factor=-1,
|
rescale_factor=-1.0,
|
||||||
return_tensors="np",
|
return_tensors="np",
|
||||||
)
|
)
|
||||||
self.assertLessEqual(out_dict[self.images_input_name][0][0].mean(), 0)
|
self.assertLessEqual(out_dict[self.images_input_name][0][0].mean(), 0)
|
||||||
|
@ -100,7 +100,7 @@ class Lfm2VlProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
"{{'<|im_start|>assistant\n' }}"
|
"{{'<|im_start|>assistant\n' }}"
|
||||||
"{% endif %}"
|
"{% endif %}"
|
||||||
)
|
)
|
||||||
return {"chat_template": chat_template, "use_image_special_tokens": True}
|
return {"chat_template": chat_template}
|
||||||
|
|
||||||
# Override as Lfm2VL needs images/video to be an explicitly nested batch
|
# Override as Lfm2VL needs images/video to be an explicitly nested batch
|
||||||
def prepare_image_inputs(self, batch_size=None):
|
def prepare_image_inputs(self, batch_size=None):
|
||||||
|
@ -386,7 +386,7 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
images=image_input,
|
images=image_input,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
do_rescale=True,
|
do_rescale=True,
|
||||||
rescale_factor=-1,
|
rescale_factor=-1.0,
|
||||||
padding="longest",
|
padding="longest",
|
||||||
max_length=76,
|
max_length=76,
|
||||||
)
|
)
|
||||||
|
Some files were not shown because too many files have changed in this diff.