Remove references to AutoModelForVision2Seq (#41513)
* Since Vision2Seq is deprecated, remove it from pipelines and docstrings
* Catch some more references
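For user code the change is a drop-in rename: for the checkpoints touched here, `AutoModelForImageTextToText` loads the same weights the deprecated `AutoModelForVision2Seq` class did. A minimal sketch of the migration, using the Donut DocVQA checkpoint that appears in the docs hunks below:

```python
from transformers import AutoModelForImageTextToText, AutoProcessor

# Old (deprecated):
#   from transformers import AutoModelForVision2Seq
#   model = AutoModelForVision2Seq.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")

# New: same checkpoint, same call, only the auto class name changes.
processor = AutoProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
model = AutoModelForImageTextToText.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
```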
@@ -61,10 +61,10 @@ pipeline(image=image, question="What time is the coffee break?")
 # pip install datasets
 import torch
 from datasets import load_dataset
-from transformers import AutoProcessor, AutoModelForVision2Seq
+from transformers import AutoProcessor, AutoModelForImageTextToText

 processor = AutoProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
-model = AutoModelForVision2Seq.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
+model = AutoModelForImageTextToText.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")

 dataset = load_dataset("hf-internal-testing/example-documents", split="test")
 image = dataset[0]["image"]
@@ -92,11 +92,11 @@ The example below uses [torchao](../quantization/torchao) to only quantize the w
 # pip install datasets torchao
 import torch
 from datasets import load_dataset
-from transformers import TorchAoConfig, AutoProcessor, AutoModelForVision2Seq
+from transformers import TorchAoConfig, AutoProcessor, AutoModelForImageTextToText

 quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
 processor = AutoProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
-model = AutoModelForVision2Seq.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa", quantization_config=quantization_config)
+model = AutoModelForImageTextToText.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa", quantization_config=quantization_config)

 dataset = load_dataset("hf-internal-testing/example-documents", split="test")
 image = dataset[0]["image"]
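Not part of the original docs snippet, but a quick way to confirm that the int4 weight-only quantization above took effect is to look at the model's reported memory footprint (a hedged sketch, assuming the quantized `model` from the block above):

```python
# get_memory_footprint() sums parameter and buffer sizes, so the int4 weight-only
# model should report far less memory than an unquantized bfloat16 load of the
# same checkpoint.
print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
```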
@@ -39,12 +39,12 @@ import torch
 from torchvision import io
 from typing import Dict
 from transformers.image_utils import load_images, load_video
-from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoProcessor
+from transformers import AutoModelForImageTextToText, AutoTokenizer, AutoProcessor
 from accelerate import Accelerator

 device = Accelerator().device

-model = AutoModelForVision2Seq.from_pretrained(
+model = AutoModelForImageTextToText.from_pretrained(
     "thisisiron/Ovis2-2B-hf",
     dtype=torch.bfloat16,
 ).eval().to(device)
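The hunk above only covers imports and model loading. As a rough sketch only (not taken from the original doc), generation could continue with the usual image-text-to-text chat-template flow, assuming the matching `AutoProcessor` is loaded from the same checkpoint and that Ovis2 follows the common message format:

```python
# Sketch continuing from the loading snippet above; assumes
# processor = AutoProcessor.from_pretrained("thisisiron/Ovis2-2B-hf") was run as well.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://example.com/some-image.jpg"},  # placeholder URL
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(device)
# Depending on the checkpoint, pixel values may additionally need casting to torch.bfloat16.
output_ids = model.generate(**inputs, max_new_tokens=128)
print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])
```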
@@ -1089,7 +1089,7 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin)
         >>> from PIL import Image
         >>> from io import BytesIO

-        >>> from transformers import AutoProcessor, AutoModelForVision2Seq
+        >>> from transformers import AutoProcessor, AutoModelForImageTextToText
         >>> from transformers.image_utils import load_image

         >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
@@ -1098,7 +1098,7 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin)
         >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

         >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-base")
-        >>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics2-8b-base", device_map="auto")
+        >>> model = AutoModelForImageTextToText.from_pretrained("HuggingFaceM4/idefics2-8b-base", device_map="auto")

         >>> BAD_WORDS_IDS = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
         >>> EOS_WORDS_IDS = [processor.tokenizer.eos_token_id]
@@ -855,7 +855,7 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin)
         >>> from PIL import Image
         >>> from io import BytesIO

-        >>> from transformers import AutoProcessor, AutoModelForVision2Seq
+        >>> from transformers import AutoProcessor, AutoModelForImageTextToText
         >>> from transformers.image_utils import load_image

         >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
@@ -864,7 +864,7 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin)
         >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

         >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
-        >>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", dtype=torch.bfloat16, device_map="auto")
+        >>> model = AutoModelForImageTextToText.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", dtype=torch.bfloat16, device_map="auto")

         >>> # Create inputs
         >>> messages = [
@@ -118,7 +118,6 @@ if is_torch_available():
         AutoModelForTextToWaveform,
         AutoModelForTokenClassification,
         AutoModelForVideoClassification,
-        AutoModelForVision2Seq,
         AutoModelForVisualQuestionAnswering,
         AutoModelForZeroShotImageClassification,
         AutoModelForZeroShotObjectDetection,
@@ -277,7 +276,7 @@ SUPPORTED_TASKS = {
     },
     "image-to-text": {
         "impl": ImageToTextPipeline,
-        "pt": (AutoModelForVision2Seq,) if is_torch_available() else (),
+        "pt": (AutoModelForImageTextToText,) if is_torch_available() else (),
         "default": {"model": ("ydshieh/vit-gpt2-coco-en", "5bebf1e")},
         "type": "multimodal",
     },
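After this mapping change, the `image-to-text` pipeline resolves its PyTorch model class through `AutoModelForImageTextToText`; from the user's side nothing changes. A small sketch using the task default listed above (the image path is a placeholder, not from the original):

```python
from transformers import pipeline

# "ydshieh/vit-gpt2-coco-en" is the task default shown in the mapping above; it is now
# instantiated through AutoModelForImageTextToText instead of AutoModelForVision2Seq.
captioner = pipeline("image-to-text", model="ydshieh/vit-gpt2-coco-en")
print(captioner("my_image.png"))  # placeholder path; any local image or URL works
```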
@@ -34,7 +34,7 @@ if is_vision_available():
 if is_torch_available():
     import torch

-    from ..models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
+    from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES

 logger = logging.get_logger(__name__)

@@ -42,7 +42,7 @@ logger = logging.get_logger(__name__)
 @add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
 class ImageToTextPipeline(Pipeline):
     """
-    Image To Text pipeline using a `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image.
+    Image To Text pipeline using a `AutoModelForImageTextToText`. This pipeline predicts a caption for a given image.

     Unless the model you're using explicitly sets these generation parameters in its configuration files
     (`generation_config.json`), the following default values will be used:
@@ -80,7 +80,7 @@ class ImageToTextPipeline(Pipeline):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         requires_backends(self, "vision")
-        self.check_model_type(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
+        self.check_model_type(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)

     def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None, timeout=None):
         forward_params = {}