Fix hidden torchvision>=0.15 dependency issue (#39928)

* use pil_torch_interpolation_mapping for NEAREST/NEAREST_EXACT

* fix min torchvision version

* use InterpolationMode directly

* remove unused is_torchvision_greater_or_equal

* nit
This commit is contained in:
Yoni Gozlan
2025-08-13 11:13:42 -04:00
committed by GitHub
parent 11537c3e0c
commit f445caeb0f
16 changed files with 88 additions and 28 deletions

View File

@@ -31,6 +31,7 @@ from .utils import (
is_torch_available,
is_torch_tensor,
is_torchvision_available,
is_torchvision_v2_available,
is_vision_available,
logging,
requires_backends,
@@ -59,7 +60,9 @@ if is_vision_available():
from torchvision.transforms import InterpolationMode
pil_torch_interpolation_mapping = {
PILImageResampling.NEAREST: InterpolationMode.NEAREST_EXACT,
PILImageResampling.NEAREST: InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else InterpolationMode.NEAREST,
PILImageResampling.BOX: InterpolationMode.BOX,
PILImageResampling.BILINEAR: InterpolationMode.BILINEAR,
PILImageResampling.HAMMING: InterpolationMode.HAMMING,

View File

@@ -57,6 +57,7 @@ if is_torch_available():
if is_torchvision_v2_available():
from torchvision.io import read_image
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.io import read_image
from torchvision.transforms import functional as F
@@ -454,10 +455,16 @@ class ConditionalDetrImageProcessorFast(BaseImageProcessorFast):
The target size of the image, as returned by the preprocessing `resize` step.
threshold (`float`, *optional*, defaults to 0.5):
The threshold used to binarize the segmentation masks.
resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST_EXACT`):
resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`):
The resampling filter to use when resizing the masks.
"""
interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST_EXACT
interpolation = (
interpolation
if interpolation is not None
else F.InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else F.InterpolationMode.NEAREST
)
ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
new_annotation = {}

View File

@@ -48,6 +48,7 @@ if is_torch_available():
if is_torchvision_v2_available():
from torchvision.io import read_image
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.io import read_image
from torchvision.transforms import functional as F
@@ -445,10 +446,16 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
The target size of the image, as returned by the preprocessing `resize` step.
threshold (`float`, *optional*, defaults to 0.5):
The threshold used to binarize the segmentation masks.
resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST_EXACT`):
resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`):
The resampling filter to use when resizing the masks.
"""
interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST_EXACT
interpolation = (
interpolation
if interpolation is not None
else F.InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else F.InterpolationMode.NEAREST
)
ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
new_annotation = {}

View File

@@ -70,6 +70,7 @@ if is_vision_available():
if is_torchvision_v2_available():
from torchvision.io import read_image
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.io import read_image
from torchvision.transforms import functional as F
@@ -466,10 +467,16 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
The target size of the image, as returned by the preprocessing `resize` step.
threshold (`float`, *optional*, defaults to 0.5):
The threshold used to binarize the segmentation masks.
resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST_EXACT`):
resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`):
The resampling filter to use when resizing the masks.
"""
interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST_EXACT
interpolation = (
interpolation
if interpolation is not None
else F.InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else F.InterpolationMode.NEAREST
)
ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
new_annotation = {}

View File

@@ -33,7 +33,6 @@ from ...image_utils import (
ImageInput,
PILImageResampling,
SizeDict,
pil_torch_interpolation_mapping,
)
from ...processing_utils import Unpack
from ...utils import (
@@ -210,7 +209,9 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
"do_normalize": False,
"do_rescale": False,
# Nearest interpolation is used for segmentation maps instead of BILINEAR.
"interpolation": pil_torch_interpolation_mapping[PILImageResampling.NEAREST],
"interpolation": F.InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else F.InterpolationMode.NEAREST,
}
)

View File

@@ -51,6 +51,7 @@ if is_torch_available():
if is_torchvision_v2_available():
from torchvision.io import read_image
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.io import read_image
from torchvision.transforms import functional as F
@@ -476,10 +477,16 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
The target size of the image, as returned by the preprocessing `resize` step.
threshold (`float`, *optional*, defaults to 0.5):
The threshold used to binarize the segmentation masks.
resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST_EXACT`):
resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`):
The resampling filter to use when resizing the masks.
"""
interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST_EXACT
interpolation = (
interpolation
if interpolation is not None
else F.InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else F.InterpolationMode.NEAREST
)
ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
new_annotation = {}

View File

@@ -62,6 +62,7 @@ if is_torch_available():
if is_torchvision_v2_available():
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.transforms import functional as F
@@ -369,7 +370,9 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast):
image=grouped_segmentation_maps[shape],
size=size,
size_divisor=size_divisor,
interpolation=F.InterpolationMode.NEAREST_EXACT,
interpolation=F.InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else F.InterpolationMode.NEAREST,
)
resized_images_grouped[shape] = stacked_images
if segmentation_maps is not None:

View File

@@ -66,6 +66,7 @@ if is_torch_available():
if is_torchvision_v2_available():
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.transforms import functional as F
@@ -370,7 +371,9 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast):
image=grouped_segmentation_maps[shape],
size=size,
size_divisor=size_divisor,
interpolation=F.InterpolationMode.NEAREST_EXACT,
interpolation=F.InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else F.InterpolationMode.NEAREST,
)
resized_images_grouped[shape] = stacked_images
if segmentation_maps is not None:

View File

@@ -31,7 +31,6 @@ from ...image_utils import (
PILImageResampling,
SizeDict,
is_torch_tensor,
pil_torch_interpolation_mapping,
)
from ...processing_utils import Unpack
from ...utils import (
@@ -139,7 +138,9 @@ class MobileNetV2ImageProcessorFast(BaseImageProcessorFast):
"do_normalize": False,
"do_rescale": False,
# Nearest interpolation is used for segmentation maps instead of BILINEAR.
"interpolation": pil_torch_interpolation_mapping[PILImageResampling.NEAREST],
"interpolation": F.InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else F.InterpolationMode.NEAREST,
}
)

View File

@@ -29,7 +29,6 @@ from ...image_utils import (
PILImageResampling,
SizeDict,
is_torch_tensor,
pil_torch_interpolation_mapping,
)
from ...processing_utils import Unpack
from ...utils import (
@@ -140,7 +139,9 @@ class MobileViTImageProcessorFast(BaseImageProcessorFast):
"do_rescale": False,
"do_flip_channel_order": False,
# Nearest interpolation is used for segmentation maps instead of BILINEAR.
"interpolation": pil_torch_interpolation_mapping[PILImageResampling.NEAREST],
"interpolation": F.InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else F.InterpolationMode.NEAREST,
}
)

View File

@@ -457,7 +457,11 @@ class OneFormerImageProcessorFast(BaseImageProcessorFast):
for shape, stacked_segmentation_maps in grouped_segmentation_maps.items():
if do_resize:
stacked_segmentation_maps = self.resize(
stacked_segmentation_maps, size=size, interpolation=F.InterpolationMode.NEAREST_EXACT
stacked_segmentation_maps,
size=size,
interpolation=F.InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else F.InterpolationMode.NEAREST,
)
processed_segmentation_maps_grouped[shape] = stacked_segmentation_maps
processed_segmentation_maps = reorder_images(

View File

@@ -264,10 +264,16 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
The target size of the image, as returned by the preprocessing `resize` step.
threshold (`float`, *optional*, defaults to 0.5):
The threshold used to binarize the segmentation masks.
resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST_EXACT`):
resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`):
The resampling filter to use when resizing the masks.
"""
interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST_EXACT
interpolation = (
interpolation
if interpolation is not None
else F.InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else F.InterpolationMode.NEAREST
)
ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
new_annotation = {}

View File

@@ -36,7 +36,6 @@ from ...image_utils import (
ImageInput,
PILImageResampling,
SizeDict,
pil_torch_interpolation_mapping,
)
from ...processing_utils import Unpack
from ...utils import (
@@ -244,7 +243,9 @@ class SamImageProcessorFast(BaseImageProcessorFast):
{
"do_normalize": False,
"do_rescale": False,
"interpolation": pil_torch_interpolation_mapping[PILImageResampling.NEAREST],
"interpolation": F.InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else F.InterpolationMode.NEAREST,
"size": segmentation_maps_kwargs.pop("mask_size"),
"pad_size": segmentation_maps_kwargs.pop("mask_pad_size"),
}

View File

@@ -36,7 +36,6 @@ from ...image_utils import (
PILImageResampling,
SizeDict,
is_torch_tensor,
pil_torch_interpolation_mapping,
)
from ...processing_utils import Unpack
from ...utils import (
@@ -143,7 +142,9 @@ class SegformerImageProcessorFast(BaseImageProcessorFast):
"do_normalize": False,
"do_rescale": False,
# Nearest interpolation is used for segmentation maps instead of BILINEAR.
"interpolation": pil_torch_interpolation_mapping[PILImageResampling.NEAREST],
"interpolation": F.InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else F.InterpolationMode.NEAREST,
}
)
processed_segmentation_maps = self._preprocess(

View File

@@ -30,7 +30,6 @@ from ...image_utils import (
ImageInput,
PILImageResampling,
SizeDict,
pil_torch_interpolation_mapping,
)
from ...processing_utils import Unpack
from ...utils import (
@@ -100,7 +99,9 @@ class SegformerImageProcessorFast(BeitImageProcessorFast):
"do_normalize": False,
"do_rescale": False,
# Nearest interpolation is used for segmentation maps instead of BILINEAR.
"interpolation": pil_torch_interpolation_mapping[PILImageResampling.NEAREST],
"interpolation": F.InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else F.InterpolationMode.NEAREST,
}
)
processed_segmentation_maps = self._preprocess(

View File

@@ -47,6 +47,7 @@ if is_torch_available():
if is_torchvision_v2_available():
from torchvision.io import read_image
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.io import read_image
from torchvision.transforms import functional as F
@@ -493,10 +494,16 @@ class YolosImageProcessorFast(BaseImageProcessorFast):
The target size of the image, as returned by the preprocessing `resize` step.
threshold (`float`, *optional*, defaults to 0.5):
The threshold used to binarize the segmentation masks.
resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST_EXACT`):
resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`):
The resampling filter to use when resizing the masks.
"""
interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST_EXACT
interpolation = (
interpolation
if interpolation is not None
else F.InterpolationMode.NEAREST_EXACT
if is_torchvision_v2_available()
else F.InterpolationMode.NEAREST
)
ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
new_annotation = {}