Remove references to AutoModelForVision2Seq (#41513)

* Since Vision2Seq is deprecated, remove it from pipelines and docstrings

* Catch some more references
Matt authored 2025-10-13 17:00:07 +01:00, committed by GitHub
parent 1ee3b288a6
commit b84c0b31c6
6 changed files with 14 additions and 15 deletions

View File

@@ -61,10 +61,10 @@ pipeline(image=image, question="What time is the coffee break?")
# pip install datasets
import torch
from datasets import load_dataset
-from transformers import AutoProcessor, AutoModelForVision2Seq
+from transformers import AutoProcessor, AutoModelForImageTextToText
processor = AutoProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
-model = AutoModelForVision2Seq.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
+model = AutoModelForImageTextToText.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
dataset = load_dataset("hf-internal-testing/example-documents", split="test")
image = dataset[0]["image"]
@@ -92,11 +92,11 @@ The example below uses [torchao](../quantization/torchao) to only quantize the w
# pip install datasets torchao
import torch
from datasets import load_dataset
-from transformers import TorchAoConfig, AutoProcessor, AutoModelForVision2Seq
+from transformers import TorchAoConfig, AutoProcessor, AutoModelForImageTextToText
quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
processor = AutoProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
-model = AutoModelForVision2Seq.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa", quantization_config=quantization_config)
+model = AutoModelForImageTextToText.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa", quantization_config=quantization_config)
dataset = load_dataset("hf-internal-testing/example-documents", split="test")
image = dataset[0]["image"]
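
Both snippets stop right after loading the document image. As a point of reference, here is a minimal sketch of how generation might continue with the renamed class; the task prompt below assumes the standard Donut DocVQA prompt format, and the question string is only illustrative:

question = "What time is the coffee break?"
task_prompt = f"<s_docvqa><s_question>{question}</s_question><s_answer>"
# Donut is an encoder-decoder model: the image goes in as pixel_values and the
# task prompt is fed to the decoder as decoder_input_ids
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
pixel_values = processor(image, return_tensors="pt").pixel_values

outputs = model.generate(
    pixel_values.to(model.device),
    decoder_input_ids=decoder_input_ids.to(model.device),
    max_new_tokens=128,
    bad_words_ids=[[processor.tokenizer.unk_token_id]],
)
print(processor.batch_decode(outputs, skip_special_tokens=True)[0])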

View File

@@ -39,12 +39,12 @@ import torch
from torchvision import io
from typing import Dict
from transformers.image_utils import load_images, load_video
-from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoProcessor
+from transformers import AutoModelForImageTextToText, AutoTokenizer, AutoProcessor
from accelerate import Accelerator
device = Accelerator().device
-model = AutoModelForVision2Seq.from_pretrained(
+model = AutoModelForImageTextToText.from_pretrained(
"thisisiron/Ovis2-2B-hf",
dtype=torch.bfloat16,
).eval().to(device)
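
The hunk ends right after the model is loaded. A minimal sketch of how a generation call might look with the renamed class, assuming the Ovis2 processor supports the standard chat-template path; the image URL and prompt are only illustrative:

processor = AutoProcessor.from_pretrained("thisisiron/Ovis2-2B-hf")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
# Tokenize the chat and move the tensors to the same device as the model
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(device)

generated_ids = model.generate(**inputs, max_new_tokens=128)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])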

View File

@@ -1089,7 +1089,7 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin)
>>> from PIL import Image
>>> from io import BytesIO
->>> from transformers import AutoProcessor, AutoModelForVision2Seq
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> from transformers.image_utils import load_image
>>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
@@ -1098,7 +1098,7 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin)
>>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
>>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-base")
->>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics2-8b-base", device_map="auto")
+>>> model = AutoModelForImageTextToText.from_pretrained("HuggingFaceM4/idefics2-8b-base", device_map="auto")
>>> BAD_WORDS_IDS = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
>>> EOS_WORDS_IDS = [processor.tokenizer.eos_token_id]
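
A sketch of how the docstring example typically continues from this point (image1 and image2 are assumed to be loaded with load_image like image3; the prompts are illustrative):

>>> prompts = [
...     "<image>In this image, we can see the city of New York, and more specifically the Statue of Liberty.<image>In this image, we can see",
...     "In which city is that bridge located?<image>",
... ]
>>> images = [[image1, image2], [image3]]
>>> inputs = processor(images=images, text=prompts, padding=True, return_tensors="pt").to(model.device)

>>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=20)
>>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
>>> print(generated_texts)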

View File

@@ -855,7 +855,7 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin)
>>> from PIL import Image
>>> from io import BytesIO
->>> from transformers import AutoProcessor, AutoModelForVision2Seq
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> from transformers.image_utils import load_image
>>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
@@ -864,7 +864,7 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin)
>>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
>>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
->>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", dtype=torch.bfloat16, device_map="auto")
+>>> model = AutoModelForImageTextToText.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", dtype=torch.bfloat16, device_map="auto")
>>> # Create inputs
>>> messages = [
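
The visible context cuts off at the start of the chat messages. A sketch of how the rest of the example might look, using the processor's chat template (the message content and generation settings are illustrative, and image1 is assumed to be loaded with load_image like image3):

>>> messages = [
...     {
...         "role": "user",
...         "content": [
...             {"type": "image"},
...             {"type": "text", "text": "What do we see in this image?"},
...         ],
...     }
... ]
>>> prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
>>> inputs = processor(text=prompt, images=[image1], return_tensors="pt").to(model.device)

>>> generated_ids = model.generate(**inputs, max_new_tokens=64)
>>> print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])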

View File

@@ -118,7 +118,6 @@ if is_torch_available():
AutoModelForTextToWaveform,
AutoModelForTokenClassification,
AutoModelForVideoClassification,
-AutoModelForVision2Seq,
AutoModelForVisualQuestionAnswering,
AutoModelForZeroShotImageClassification,
AutoModelForZeroShotObjectDetection,
@@ -277,7 +276,7 @@ SUPPORTED_TASKS = {
},
"image-to-text": {
"impl": ImageToTextPipeline,
-"pt": (AutoModelForVision2Seq,) if is_torch_available() else (),
+"pt": (AutoModelForImageTextToText,) if is_torch_available() else (),
"default": {"model": ("ydshieh/vit-gpt2-coco-en", "5bebf1e")},
"type": "multimodal",
},
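
The image-to-text task itself is unchanged; only the backing auto class is swapped. A quick sketch of using it through the pipeline API with the default checkpoint listed above (the image URL is illustrative):

from transformers import pipeline

captioner = pipeline("image-to-text", model="ydshieh/vit-gpt2-coco-en")
print(captioner("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"))
# [{'generated_text': '...'}]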

View File

@@ -34,7 +34,7 @@ if is_vision_available():
if is_torch_available():
import torch
-from ..models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
+from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
logger = logging.get_logger(__name__)
@@ -42,7 +42,7 @@ logger = logging.get_logger(__name__)
@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
class ImageToTextPipeline(Pipeline):
"""
-Image To Text pipeline using a `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image.
+Image To Text pipeline using a `AutoModelForImageTextToText`. This pipeline predicts a caption for a given image.
Unless the model you're using explicitly sets these generation parameters in its configuration files
(`generation_config.json`), the following default values will be used:
@@ -80,7 +80,7 @@ class ImageToTextPipeline(Pipeline):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
requires_backends(self, "vision")
-self.check_model_type(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
+self.check_model_type(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)
def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None, timeout=None):
forward_params = {}
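
The keyword arguments sanitized here are the ones accepted at call time. A brief sketch, assuming the usual pipeline call signature (the image path is a placeholder, and prompt is only honored by models trained for conditional captioning):

from transformers import pipeline

captioner = pipeline("image-to-text", model="ydshieh/vit-gpt2-coco-en")
captioner("path/to/image.png", max_new_tokens=20)
# For models that accept a text prompt (conditional captioning):
# captioner("path/to/image.png", prompt="A photo of", max_new_tokens=20)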