@@ -297,9 +297,14 @@ class Blip2PreTrainedModel(PreTrainedModel):
         elif isinstance(module, nn.Linear) and module.bias is not None:
             module.bias.data.zero_()
 
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, Blip2Encoder):
-            module.gradient_checkpointing = value
+    def _set_gradient_checkpointing(self, module, gradient_checkpointing_func=None):
+        if isinstance(module, (Blip2Encoder, Blip2QFormerEncoder)):
+            module.gradient_checkpointing_func = gradient_checkpointing_func
+            module.gradient_checkpointing = gradient_checkpointing_func is not None
+
+        # Enable / disable GC for the language model as well
+        if hasattr(self, "language_model") and hasattr(self.language_model, "_set_gradient_checkpointing"):
+            self.language_model._set_gradient_checkpointing(module, gradient_checkpointing_func)
 
 
 BLIP_2_START_DOCSTRING = r"""
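Note (not part of the patch itself): with the new hook signature above, turning gradient checkpointing on for the wrapper model is expected to look roughly like the sketch below. `gradient_checkpointing_enable()` is the public entry point on `PreTrainedModel`; the call into `_set_gradient_checkpointing` happens internally.

```python
# Hedged usage sketch, not taken from this diff; the checkpoint id and training
# setup are illustrative only.
import torch
from transformers import Blip2ForConditionalGeneration

model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
model.gradient_checkpointing_enable()  # dispatches to _set_gradient_checkpointing under the hood
model.train()  # checkpointing is only active in training mode (see the `self.training` checks below)
```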
@@ -473,17 +478,11 @@ class Blip2Encoder(nn.Module):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             if self.gradient_checkpointing and self.training:
-
-                def create_custom_forward(module):
-                    def custom_forward(*inputs):
-                        return module(*inputs, output_attentions)
-
-                    return custom_forward
-
-                layer_outputs = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(encoder_layer),
+                layer_outputs = self.gradient_checkpointing_func(
+                    encoder_layer.__call__,
                     hidden_states,
                     attention_mask,
+                    output_attentions,
                 )
             else:
                 layer_outputs = encoder_layer(
@@ -944,15 +943,8 @@ class Blip2QFormerEncoder(nn.Module):
                         "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                     )
                     use_cache = False
-
-                def create_custom_forward(module):
-                    def custom_forward(*inputs):
-                        return module(*inputs, past_key_value, output_attentions, query_length)
-
-                    return custom_forward
-
-                layer_outputs = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(layer_module),
+                layer_outputs = self.gradient_checkpointing_func(
+                    layer_module.__call__,
                     hidden_states,
                     attention_mask,
                     layer_head_mask,
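For context (again not part of the patch), the two encoder hunks above swap the nested `create_custom_forward` closure for a configurable `gradient_checkpointing_func`. A minimal standalone sketch of the pattern, assuming that function is `torch.utils.checkpoint.checkpoint`, could look like this:

```python
# Self-contained sketch with assumed names; only the layer.__call__ and
# functools.partial wiring mirror the pattern in the hunks above.
import functools

import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

layer = nn.Linear(16, 16)
gradient_checkpointing_func = functools.partial(checkpoint, use_reentrant=False)

hidden_states = torch.randn(2, 16, requires_grad=True)
# Extra positional arguments (attention masks, flags, ...) are passed straight
# to the layer instead of being captured by a closure.
layer_outputs = gradient_checkpointing_func(layer.__call__, hidden_states)
layer_outputs.sum().backward()  # activations are recomputed during this backward pass
```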
@@ -1272,14 +1264,10 @@ class Blip2Model(Blip2PreTrainedModel):
         >>> import torch
         >>> from transformers import AutoTokenizer, Blip2Model
 
-        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
-
-        >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
-
-        >>> model.to(device)  # doctest: +IGNORE_RESULT
+        >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
 
         >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip2-opt-2.7b")
-        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt").to(device)
+        >>> inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pt")
         >>> text_features = model.get_text_features(**inputs)
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1333,16 +1321,12 @@ class Blip2Model(Blip2PreTrainedModel):
         >>> import requests
         >>> from transformers import AutoProcessor, Blip2Model
 
-        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
-
-        >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
-
-        >>> model.to(device)  # doctest: +IGNORE_RESULT
+        >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
 
         >>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
-        >>> inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
+        >>> inputs = processor(images=image, return_tensors="pt")
         >>> image_outputs = model.get_image_features(**inputs)
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1381,15 +1365,12 @@ class Blip2Model(Blip2PreTrainedModel):
         >>> import requests
         >>> from transformers import Blip2Processor, Blip2Model
 
-        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
-
         >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
-        >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
-        >>> model.to(device)  # doctest: +IGNORE_RESULT
+        >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
-        >>> inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
+        >>> inputs = processor(images=image, return_tensors="pt")
         >>> qformer_outputs = model.get_qformer_features(**inputs)
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1654,34 +1635,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel):
 
         Examples:
 
-        Image captioning (without providing a text prompt):
-
-        ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import Blip2Processor, Blip2ForConditionalGeneration
-        >>> import torch
-
-        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
-
-        >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
-        >>> model = Blip2ForConditionalGeneration.from_pretrained(
-        ...     "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
-        ... )
-        >>> model.to(device)  # doctest: +IGNORE_RESULT
-
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
-        >>> inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
-
-        >>> generated_ids = model.generate(**inputs)
-        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
-        >>> print(generated_text)
-        two cats laying on a couch
-        ```
-
-        Visual question answering (prompt = question):
+        Prepare processor, model and image input
 
         ```python
         >>> from PIL import Image
@@ -1698,7 +1652,22 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel):
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
-
+        ```
+
+        Image captioning (without providing a text prompt):
+
+        ```python
+        >>> inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
+
+        >>> generated_ids = model.generate(**inputs)
+        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+        >>> print(generated_text)
+        two cats laying on a couch
+        ```
+
+        Visual question answering (prompt = question):
+
+        ```python
         >>> prompt = "Question: how many cats are there? Answer:"
         >>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device="cuda", dtype=torch.float16)
 
@@ -1712,20 +1681,10 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel):
         This greatly reduces the amount of memory used by the model while maintaining the same performance.
 
         ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import Blip2Processor, Blip2ForConditionalGeneration
-        >>> import torch
-
-        >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
         >>> model = Blip2ForConditionalGeneration.from_pretrained(
-        ...     "Salesforce/blip2-flan-t5-xl", load_in_8bit=True, device_map={"": 0}, torch_dtype=torch.bfloat16
+        ...     "Salesforce/blip2-opt-2.7b", load_in_8bit=True, device_map={"": 0}, torch_dtype=torch.bfloat16
         ... )  # doctest: +IGNORE_RESULT
 
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
-        >>> prompt = "Question: how many cats are there? Answer:"
         >>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device="cuda", dtype=torch.bfloat16)
 
         >>> generated_ids = model.generate(**inputs)