Compare commits

..

4 Commits

Author SHA1 Message Date
2be1dd7af2 no changes 2025-10-24 10:47:55 +02:00
a20a326408 Merge branch 'main' into test-bart-dummy 2025-10-24 10:31:20 +02:00
12b253e921 test 2025-10-23 20:23:42 +02:00
cecef75790 init 2025-10-23 20:17:07 +02:00
42 changed files with 486 additions and 642 deletions

View File

@ -28,9 +28,6 @@ on:
report_repo_id:
required: false
type: string
pytest_marker:
required: false
type: string
env:
HF_HOME: /mnt/cache
@ -140,7 +137,7 @@ jobs:
- name: Run all tests on GPU
working-directory: /transformers
run: |
script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v -m '${{ inputs.pytest_marker }}' --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
ls -la
# Extract the exit code from the output file
EXIT_CODE=$(tail -1 test_outputs.txt | grep -o 'COMMAND_EXIT_CODE="[0-9]*"' | cut -d'"' -f2)

View File

@ -1,60 +0,0 @@
name: Nvidia CI - Flash Attn
on:
repository_dispatch:
schedule:
- cron: "17 2 * * *"
push:
branches:
- run_nvidia_ci_flash_attn*
workflow_dispatch:
inputs:
prev_workflow_run_id:
description: 'previous workflow run id to compare'
type: string
required: false
default: ""
other_workflow_run_id:
description: 'other workflow run id to compare'
type: string
required: false
default: ""
# Used for `push` to easily modify the target workflow runs to compare against
env:
prev_workflow_run_id: ""
other_workflow_run_id: ""
jobs:
setup:
name: Setup
runs-on: ubuntu-22.04
steps:
- name: Setup
run: |
mkdir "setup_values"
echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt"
echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt"
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: setup_values
path: setup_values
model-ci:
name: Model CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_models_gpu
slack_report_channel: "#transformers-ci-flash-attn"
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
runner_type: "a10"
report_repo_id: hf-internal-testing/transformers_flash_attn_ci
commit_sha: ${{ github.sha }}
pytest_marker: "flash_attn_test or flash_attn_3_test"
secrets: inherit

View File

@ -38,10 +38,6 @@ on:
default: ""
required: false
type: string
pytest_marker:
required: false
type: string
env:
HF_HOME: /mnt/cache
@ -131,7 +127,6 @@ jobs:
commit_sha: ${{ inputs.commit_sha || github.sha }}
runner_type: ${{ inputs.runner_type }}
report_repo_id: ${{ inputs.report_repo_id }}
pytest_marker: ${{ inputs.pytest_marker }}
secrets: inherit
run_trainer_and_fsdp_gpu:

View File

@ -14,7 +14,7 @@ This AGENTS.md file provides guidance for code agents working with this codebase
- PRs should be as brief as possible. Bugfix PRs in particular can often be only one or two lines long, and do not need large comments, docstrings or new functions in this case. Aim to minimize the size of the diff.
- When writing tests, they should be added to an existing file. The only exception is for PRs to add a new model, when a new test directory should be created for that model.
- Code style is enforced in the CI. You can install the style tools with `pip install -e ".[quality]"`. You can then run `make fixup` to apply style and consistency fixes to your code.
- Code style is enforced in the CI. You can install the style tools with `pip install -e .[quality]`. You can then run `make fixup` to apply style and consistency fixes to your code.
## Copying and inheritance
@ -36,4 +36,4 @@ After making changes, you should usually run `make fixup` to ensure any copies a
the model you made the changes in and any other models that were updated by `make fixup`. Tests can be run with `pytest tests/models/[name]/test_modeling_[name].py`
If your changes affect code in other classes like tokenizers or processors, you should run those tests instead, like `test_processing_[name].py` or `test_tokenization_[name].py`.
In order to run tests, you may need to install dependencies. You can do this with `pip install -e ".[testing]"`. You will probably also need to `pip install torch accelerate` if your environment does not already have them.
In order to run tests, you may need to install dependencies. You can do this with `pip install -e .[testing]`. You will probably also need to `pip install torch accelerate` if your environment does not already have them.
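Taken together, the guidance above boils down to a short local loop; here is a condensed sketch of it (the `[name]` part is a placeholder for the model directory you touched):

```sh
# install the style and testing extras referenced above
pip install -e ".[quality]"
pip install -e ".[testing]"
pip install torch accelerate
# apply style and consistency fixes, then run the relevant model tests
make fixup
pytest tests/models/[name]/test_modeling_[name].py
```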

View File

@ -9,12 +9,6 @@ In this list, we showcase incredibly impactful and novel projects that have push
adding other projects to the list. If you believe a project should be here and it's not, then please, open a PR
to add it.
## [◉ Universal Intelligence](https://github.com/blueraai/universal-intelligence)
[Universal Intelligence](https://github.com/blueraai/universal-intelligence) aims to standardize models, tools, and agents —transforming them into simple, composable, portable, interoperable, framework-agnostic, hardware-agnostic interfaces (through auto-negotiation and resource sharing); for fast and accessible development of AI applications.
Keywords: Protocol, Open-source, LLMs, Large Language Models, Agents, Low-code
## [gpt4all](https://github.com/nomic-ai/gpt4all)
[gpt4all](https://github.com/nomic-ai/gpt4all) is an ecosystem of open-source chatbots trained on massive collections of clean assistant data including code, stories and dialogue. It offers open-source, large language models such as LLaMA and GPT-J trained in an assistant-style.

View File

@ -1,235 +0,0 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
---
# Brainstorm
## Persona
A model developer who wants to evaluate their model implementation on a dataset, or a model "trainer" who wants to run inference for their GRPO policy.
Prerequisites to understand the docs:
- knows what a KV cache is
- familiarity with transformers and inference
## What we want to include in the doc
- [ ] CB usage examples
- [ ] CB API reference
- [x] light refresher on what CB is + links to blog post
- [x] installation / setup instructions
- [x] open telemetry support
- [ ] subsection in Transformers > Inference
- [x] supported & unsupported features
- [ ] performance considerations
- [ ] note on benchmarks (CI + space)
- [ ] cuda graphs
- [ ] compile
- [ ] attn impl
- [x] explicit intended use cases, the why of CB in transformers
- [x] integration with serving
---
# Continuous Batching
Continuous Batching (CB) is an advanced technique to optimize the inference of transformer models by dynamically grouping multiple requests into batches. This approach maximizes GPU utilization and throughput, particularly for workloads with many variable-length inputs.
We are particularly interested in having Continuous Batching in transformers for the following use cases:
- Evaluation of models on large datasets with variable-length inputs
- Generating outputs for multiple sequences for GRPO policies
CB is what makes inference engines like vLLM or SGLang efficient. That being said, transformers does not aim to be a production-ready inference engine, but a complete framework for model development. For this reason, CB is available in `transformers serve`.
If you are not familiar with some of the core concepts CB is built upon, we invite you to read the associated blog post: [Continuous Batching: Efficient Inference for Large Language Models](https://huggingface.co/blog/continuous-batching). _broken link for now_
## Installation
Nothing to do, it comes built-in with `transformers`! :nice:
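Concretely, a regular install (or upgrade) of `transformers` is all that is required; no additional packages are needed for CB:

```sh
pip install -U transformers
```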
## API Reference
## Usage Examples
The main way to use CB in transformers is via the `generate_batch` method.
Unlike `generate`, CB takes already tokenized inputs, known as input IDs. Each sequence of input IDs is represented as a list of integers, in Python: `list[int]`.
For a more detailed example, please refer to: [examples/continuous_batching](./path/to/example)
### `generate_batch` example
We have created a `ContinuousMixin` that is inherited by `GenerationMixin` so that all autoregressive text models support CB.
This adds the `generate_batch` method to all models that inherit from `GenerationMixin`.
You can use it as follows:
```py
import datasets
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

MODEL_ID = "Qwen/Qwen3-4B-Instruct-2507"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    attn_implementation="sdpa_paged",
    device_map="cuda",  # if you need cuda
    dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")

# prepare a batch of inputs
dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
dataset = dataset.select(range(32))  # number of samples to run; tune as needed
tokenized_datasets = dataset.map(lambda x: tokenizer(x["question"]), batched=True)
simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]

generation_config = GenerationConfig(
    max_new_tokens=32,
    use_cuda_graph=False,  # not supported for the simple version
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    do_sample=False,
    max_batch_tokens=512,  # max number of tokens in a batch; a default value you should tune based on your hardware
)

batch_outputs = model.generate_batch(
    inputs=simple_batch_inputs,
    generation_config=generation_config,
)
for request_id, output in batch_outputs.items():
    generated_text = tokenizer.decode(output.generated_tokens, skip_special_tokens=True)
    print(f"Request {request_id} output: {generated_text}")
```
### `ContinuousBatchingManager` example
If you want more control w.r.t. how you want to schedule requests using CB, you can use the `ContinuousBatchingManager` class directly.
This is what we use in `transformers serve` because requests arrive asynchronously and we can leverage the asynchronous nature of the CB process to make things more efficient.
Under the hood, the `ContinuousBatchingManager` creates a background thread that receives inputs from a python `queue.Queue` which it uses to get requests to batch in each forward pass.
Note that the manager is thread safe!
```py
import datasets
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
from transformers.generation.continuous_batching import RequestStatus

MODEL_ID = "Qwen/Qwen3-4B-Instruct-2507"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    attn_implementation="sdpa_paged",
    device_map="cuda",  # if you need cuda
    dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")

# prepare a batch of inputs
dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
dataset = dataset.select(range(32))  # number of samples to run; tune as needed
tokenized_datasets = dataset.map(lambda x: tokenizer(x["question"]), batched=True)
simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]

# same configuration as in the `generate_batch` example
generation_config = GenerationConfig(
    max_new_tokens=32,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    do_sample=False,
    max_batch_tokens=512,
)

# initialize the manager, a method available thanks to the `ContinuousMixin`
manager = model.init_continuous_batching(generation_config=generation_config)
# start the background thread
manager.start()

# this is for demonstration purposes only; in practice this is most useful to do concurrently
for i, input_ids in enumerate(simple_batch_inputs):
    request_id = manager.add_request(input_ids=input_ids, request_id=f"request_{i}")  # if you do not specify a request_id, one will be generated for you

# can be done in another thread
for id, request in manager.get_result():
    generated_text = tokenizer.decode(request.generated_tokens, skip_special_tokens=True)
    print(f"Request {id} output: {generated_text}")

# you can also get results for a specific request id
result = manager.get_result(request_id="request_5")  # this is blocking and will wait for the result to be ready

# or get results for a request that is streaming
manager.add_request(
    input_ids=input_ids,
    request_id="streaming_request",
    stream=True,
)
for chunk in manager.request_id_iter(request_id="streaming_request"):
    generated_text = tokenizer.decode(chunk.generated_tokens, skip_special_tokens=True)
    print(generated_text)
    # FIXME: stop iteration in `request_id_iter` when finished instead of doing it externally
    if chunk.status == RequestStatus.FINISHED:
        break

# stop the background thread before exiting the process
manager.stop()
```
## Supported & Unsupported Features
### Supported Features
- Dynamic scheduling of variable-length requests
- Chunked prefill
- Paged Attention Cache
- Sliding window attention
- Chat templates
### Unsupported Features
- Prefix caching
- Beam search
- Speculative decoding (including assisted decoding)
- Tool calling
- MTP (multi-token prediction)
- Medusa
- Anything related to `do_sample`
Note that these aren't supported at the moment. Some features, such as prefix caching, beam search, and tool calling, are on our roadmap; others will be best effort!
Please do let us know if you'd like to see support for any of these features!
## Performance Considerations
## Integration with Serving
You can use CB in `transformers serve` by passing the `--continuous-batching` flag when starting the server.
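For example, the server can be started with CB enabled like this (other serving options stay the same):

```sh
transformers serve --continuous-batching
```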
## Monitoring
We have added OpenTelemetry support to Continuous Batching to help you monitor its performance in production. To enable it, you need to install the `open-telemetry` extra when installing `transformers`:
```sh
# this installs `opentelemetry-api`, `opentelemetry-sdk` and `opentelemetry-exporter-otlp`
pip install transformers[open-telemetry]
```
This will enable traces and metrics collection in CB. You will then have to set up a backend to collect and visualize the traces and metrics.
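As a sketch of one possible setup (the collector image and ports below are the standard OTLP defaults, not something specific to CB), you can run any OTLP-compatible collector and point the OpenTelemetry SDK at it via the standard environment variable before starting your workload or `transformers serve`:

```sh
# run an OTLP-compatible collector, here the upstream OpenTelemetry Collector image
docker run --rm -p 4317:4317 -p 4318:4318 otel/opentelemetry-collector:latest
# standard endpoint variable picked up by the OpenTelemetry SDK exporters
export OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:4317"
```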

View File

@ -38,7 +38,7 @@ pip install transformers[dev]
or for an editable install:
```bash
pip install -e ".[dev]"
pip install -e .[dev]
```
inside the Transformers repo. Since the number of optional dependencies of Transformers has grown a lot, it's possible you don't manage to get all of them. If the dev install fails, make sure to install PyTorch then do
@ -50,7 +50,7 @@ pip install transformers[quality]
or for an editable install:
```bash
pip install -e ".[quality]"
pip install -e .[quality]
```
## Tests

View File

@ -37,7 +37,7 @@ pip install transformers[dev]
or an editable install:
```bash
pip install -e ".[dev]"
pip install -e .[dev]
```
inside the Transformers repo.

View File

@ -37,7 +37,7 @@ pip install transformers[dev]
or an editable install:
```bash
pip install -e ".[dev]"
pip install -e .[dev]
```
inside the Transformers repo.

View File

@ -40,7 +40,7 @@ pip install transformers[dev]
```bash
pip install -e ".[dev]"
pip install -e .[dev]
```
inside the Transformers repo. Since the number of optional dependencies of Transformers has grown a lot, it's possible you don't manage to get all of them. If the dev install fails, install the deep learning framework you are working with (PyTorch, TensorFlow and/or Flax) and run the following steps.
@ -53,7 +53,7 @@ pip install transformers[quality]
or for an editable install:
```bash
pip install -e ".[quality]"
pip install -e .[quality]
```
## Tests

View File

@ -37,7 +37,7 @@ pip install transformers[dev]
or an editable install inside the Transformers repo:
```bash
pip install -e ".[dev]"
pip install -e .[dev]
```
Since the number of optional dependencies of Transformers has grown a lot, the dev install may fail. If it does, install the deep learning framework you are working with (PyTorch, TensorFlow and/or Flax) and run the following command.
@ -49,7 +49,7 @@ pip install transformers[quality]
For an editable install, run the following command.
```bash
pip install -e ".[quality]"
pip install -e .[quality]
```

View File

@ -392,7 +392,6 @@ extras["torchhub"] = deps_list(
extras["benchmark"] = deps_list("optimum-benchmark")
# OpenTelemetry dependencies for metrics collection in continuous batching
# TODO: refactor this to split API and SDK; SDK and exporter should only be needed to run code that collects metrics whereas API is what people will need to instrument their code and handle exporter themselves
extras["open-telemetry"] = deps_list("opentelemetry-api") + ["opentelemetry-exporter-otlp", "opentelemetry-sdk"]
# when modifying the following list, make sure to update src/transformers/dependency_versions_check.py

View File

@ -919,7 +919,6 @@ class ContinuousBatchingManager:
if result is not None:
yield result
# FIXME: stop iteration when request status is finished?
def request_id_iter(self, request_id: str) -> Generator[GenerationOutput]:
"""Iterate over results matching a specific request id as they become available."""
request_cancelled = False

View File

@ -628,7 +628,7 @@ def maybe_load_adapters(
**adapter_kwargs,
):
if pretrained_model_name_or_path is None or not is_peft_available():
return None, pretrained_model_name_or_path, adapter_kwargs
return None, pretrained_model_name_or_path
token = download_kwargs.get("token")
@ -670,4 +670,4 @@ def maybe_load_adapters(
_adapter_model_path = pretrained_model_name_or_path
pretrained_model_name_or_path = json.load(f)["base_model_name_or_path"]
return _adapter_model_path, pretrained_model_name_or_path, adapter_kwargs
return _adapter_model_path, pretrained_model_name_or_path

View File

@ -4353,7 +4353,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
if adapter_kwargs is None:
adapter_kwargs = {}
_adapter_model_path, pretrained_model_name_or_path, adapter_kwargs = maybe_load_adapters(
_adapter_model_path, pretrained_model_name_or_path = maybe_load_adapters(
pretrained_model_name_or_path,
download_kwargs_with_commit,
**adapter_kwargs,

View File

@ -538,12 +538,12 @@ class BartEncoder(BartPreTrainedModel):
self.max_source_positions = config.max_position_embeddings
embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.embed_tokens = BartScaledWordEmbedding(
config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
)
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = BartScaledWordEmbedding(
config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
)
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = BartLearnedPositionalEmbedding(
config.max_position_embeddings,
@ -682,12 +682,12 @@ class BartDecoder(BartPreTrainedModel):
self.max_target_positions = config.max_position_embeddings
embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = BartScaledWordEmbedding(
config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
)
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = BartScaledWordEmbedding(
config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
)
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = BartLearnedPositionalEmbedding(
config.max_position_embeddings,

View File

@ -22,7 +22,7 @@ import torch
from torch import nn
from ...activations import ACT2FN
from ...masking_utils import create_causal_mask
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
@ -310,6 +310,7 @@ class CLIPAttention(nn.Module):
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Input shape: Batch x Time x Channel"""
@ -323,6 +324,15 @@ class CLIPAttention(nn.Module):
queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
# CLIP text model uses both `causal_attention_mask` and `attention_mask`
# in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask`
if self.config._attn_implementation == "flash_attention_2":
self.is_causal = causal_attention_mask is not None
else:
if attention_mask is not None and causal_attention_mask is not None:
attention_mask = attention_mask + causal_attention_mask
elif causal_attention_mask is not None:
attention_mask = causal_attention_mask
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
@ -334,12 +344,13 @@ class CLIPAttention(nn.Module):
keys,
values,
attention_mask,
is_causal=self.is_causal,
scaling=self.scale,
dropout=0.0 if not self.training else self.dropout,
**kwargs,
)
attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
attn_output = self.out_proj(attn_output)
return attn_output, attn_weights
@ -373,14 +384,16 @@ class CLIPEncoderLayer(GradientCheckpointingLayer):
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor,
**kwargs: Unpack[TransformersKwargs],
) -> torch.FloatTensor:
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, _ = self.self_attn(
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
**kwargs,
)
hidden_states = residual + hidden_states
@ -484,6 +497,7 @@ class CLIPEncoder(nn.Module):
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> BaseModelOutput:
r"""
@ -498,6 +512,13 @@ class CLIPEncoder(nn.Module):
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Causal mask for the text model. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
"""
hidden_states = inputs_embeds
@ -505,6 +526,7 @@ class CLIPEncoder(nn.Module):
hidden_states = encoder_layer(
hidden_states,
attention_mask,
causal_attention_mask,
**kwargs,
)
@ -541,19 +563,17 @@ class CLIPTextTransformer(nn.Module):
hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
attention_mask = create_causal_mask(
config=self.config,
input_embeds=hidden_states,
attention_mask=attention_mask,
cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device),
past_key_values=None,
causal_attention_mask = _create_4d_causal_attention_mask(
input_shape, hidden_states.dtype, device=hidden_states.device
)
kwargs.pop("is_causal", None)
if attention_mask is not None and self.config._attn_implementation != "flash_attention_2":
attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
encoder_outputs: BaseModelOutput = self.encoder(
inputs_embeds=hidden_states,
attention_mask=attention_mask,
is_causal=True,
causal_attention_mask=causal_attention_mask,
**kwargs,
)
@ -598,6 +618,7 @@ class CLIPTextModel(CLIPPreTrainedModel):
input_modalities = "text"
_no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]
_supports_flash_attn = False # mask creation only accounts for sdpa/eager
def __init__(self, config: CLIPTextConfig):
super().__init__(config)
@ -611,7 +632,8 @@ class CLIPTextModel(CLIPPreTrainedModel):
def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value
@check_model_inputs(tie_last_hidden_states=False)
@check_model_inputs()
@can_return_tuple
@auto_docstring
def forward(
self,
@ -704,6 +726,7 @@ class CLIPVisionModel(CLIPPreTrainedModel):
return self.vision_model.embeddings.patch_embedding
@check_model_inputs(tie_last_hidden_states=False)
@can_return_tuple
@auto_docstring
def forward(
self,
@ -743,6 +766,7 @@ class CLIPVisionModel(CLIPPreTrainedModel):
class CLIPModel(CLIPPreTrainedModel):
config: CLIPConfig
_no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"]
_supports_flash_attn = False # mask creation only accounts for sdpa/eager
def __init__(self, config: CLIPConfig):
super().__init__(config)
@ -942,6 +966,7 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel):
config: CLIPTextConfig
input_modalities = "text"
_supports_flash_attn = False
_no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]
def __init__(self, config: CLIPTextConfig):
@ -961,7 +986,8 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel):
def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value
@check_model_inputs(tie_last_hidden_states=False)
@check_model_inputs()
@can_return_tuple
@auto_docstring
def forward(
self,
@ -1023,6 +1049,7 @@ class CLIPVisionModelWithProjection(CLIPPreTrainedModel):
return self.vision_model.embeddings.patch_embedding
@check_model_inputs(tie_last_hidden_states=False)
@can_return_tuple
@auto_docstring
def forward(
self,
@ -1090,7 +1117,8 @@ class CLIPForImageClassification(CLIPPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@check_model_inputs(tie_last_hidden_states=False)
@check_model_inputs()
@can_return_tuple
@auto_docstring
def forward(
self,

View File

@ -1392,7 +1392,7 @@ class Emu3Model(Emu3PreTrainedModel):
image_features = torch.split(image_features, split_sizes)
return image_features
@torch.no_grad()
@torch.no_grad
def decode_image_tokens(self, image_tokens: torch.LongTensor, height: int, width: int):
"""
Decodes generated image tokens from language model to continuous pixel values

View File

@ -946,7 +946,7 @@ class Emu3Model(Emu3PreTrainedModel):
image_features = torch.split(image_features, split_sizes)
return image_features
@torch.no_grad()
@torch.no_grad
def decode_image_tokens(self, image_tokens: torch.LongTensor, height: int, width: int):
"""
Decodes generated image tokens from language model to continuous pixel values

View File

@ -1283,7 +1283,7 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
decoded_image = decoded_image.permute(0, 2, 3, 1)
return decoded_image
@torch.no_grad()
@torch.no_grad
def generate(
self,
inputs: Optional[torch.Tensor] = None,

View File

@ -1099,7 +1099,7 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
decoded_image = decoded_image.permute(0, 2, 3, 1)
return decoded_image
@torch.no_grad()
@torch.no_grad
def generate(
self,
inputs: Optional[torch.Tensor] = None,

View File

@ -12,7 +12,7 @@ import torch
from torch import nn
from ...activations import ACT2FN
from ...masking_utils import create_causal_mask
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
@ -200,6 +200,7 @@ class MetaClip2Attention(nn.Module):
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Input shape: Batch x Time x Channel"""
@ -213,6 +214,15 @@ class MetaClip2Attention(nn.Module):
queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
# METACLIP_2 text model uses both `causal_attention_mask` and `attention_mask`
# in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask`
if self.config._attn_implementation == "flash_attention_2":
self.is_causal = causal_attention_mask is not None
else:
if attention_mask is not None and causal_attention_mask is not None:
attention_mask = attention_mask + causal_attention_mask
elif causal_attention_mask is not None:
attention_mask = causal_attention_mask
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
@ -224,12 +234,13 @@ class MetaClip2Attention(nn.Module):
keys,
values,
attention_mask,
is_causal=self.is_causal,
scaling=self.scale,
dropout=0.0 if not self.training else self.dropout,
**kwargs,
)
attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
attn_output = self.out_proj(attn_output)
return attn_output, attn_weights
@ -263,14 +274,16 @@ class MetaClip2EncoderLayer(GradientCheckpointingLayer):
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor,
**kwargs: Unpack[TransformersKwargs],
) -> torch.FloatTensor:
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, _ = self.self_attn(
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
**kwargs,
)
hidden_states = residual + hidden_states
@ -374,6 +387,7 @@ class MetaClip2Encoder(nn.Module):
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> BaseModelOutput:
r"""
@ -388,6 +402,13 @@ class MetaClip2Encoder(nn.Module):
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Causal mask for the text model. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
"""
hidden_states = inputs_embeds
@ -395,6 +416,7 @@ class MetaClip2Encoder(nn.Module):
hidden_states = encoder_layer(
hidden_states,
attention_mask,
causal_attention_mask,
**kwargs,
)
@ -415,12 +437,14 @@ class MetaClip2TextTransformer(nn.Module):
# For `pooled_output` computation
self.eos_token_id = config.eos_token_id
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self,
input_ids,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
**kwargs: Unpack[TransformersKwargs],
) -> BaseModelOutputWithPooling:
input_shape = input_ids.size()
@ -428,19 +452,21 @@ class MetaClip2TextTransformer(nn.Module):
hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
attention_mask = create_causal_mask(
config=self.config,
input_embeds=hidden_states,
attention_mask=attention_mask,
cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device),
past_key_values=None,
# CLIP's text model uses causal mask, prepare it here.
# https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
causal_attention_mask = _create_4d_causal_attention_mask(
input_shape, hidden_states.dtype, device=hidden_states.device
)
kwargs.pop("is_causal", None)
# expand attention_mask
if attention_mask is not None and self.config._attn_implementation != "flash_attention_2":
# [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
encoder_outputs: BaseModelOutput = self.encoder(
inputs_embeds=hidden_states,
attention_mask=attention_mask,
is_causal=True,
causal_attention_mask=causal_attention_mask,
**kwargs,
)
@ -501,6 +527,7 @@ class MetaClip2TextModel(MetaClip2PreTrainedModel):
input_modalities = "text"
_no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer"]
_supports_flash_attn = False # mask creation only accounts for sdpa/eager
def __init__(self, config: MetaClip2TextConfig):
super().__init__(config)
@ -514,13 +541,16 @@ class MetaClip2TextModel(MetaClip2PreTrainedModel):
def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value
@check_model_inputs(tie_last_hidden_states=False)
@check_model_inputs()
@can_return_tuple
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
**kwargs: Unpack[TransformersKwargs],
) -> BaseModelOutputWithPooling:
r"""
@ -600,6 +630,7 @@ class MetaClip2TextModelWithProjection(MetaClip2PreTrainedModel):
config: MetaClip2TextConfig
input_modalities = "text"
_supports_flash_attn = False
_no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer"]
def __init__(self, config: MetaClip2TextConfig):
@ -619,13 +650,16 @@ class MetaClip2TextModelWithProjection(MetaClip2PreTrainedModel):
def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value
@check_model_inputs(tie_last_hidden_states=False)
@check_model_inputs()
@can_return_tuple
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
**kwargs: Unpack[TransformersKwargs],
) -> MetaClip2TextModelOutput:
r"""
@ -758,6 +792,7 @@ class MetaClip2Model(MetaClip2PreTrainedModel):
config: MetaClip2Config
_no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer", "MetaClip2VisionEmbeddings"]
_supports_flash_attn = False # mask creation only accounts for sdpa/eager
def __init__(self, config: MetaClip2Config):
super().__init__(config)
@ -1043,7 +1078,7 @@ class MetaClip2VisionModel(MetaClip2PreTrainedModel):
return self.vision_model.embeddings.patch_embedding
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
@can_return_tuple
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
@ -1152,6 +1187,7 @@ class MetaClip2VisionModelWithProjection(MetaClip2PreTrainedModel):
return self.vision_model.embeddings.patch_embedding
@check_model_inputs(tie_last_hidden_states=False)
@can_return_tuple
@auto_docstring
def forward(
self,
@ -1218,7 +1254,8 @@ class MetaClip2ForImageClassification(MetaClip2PreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@check_model_inputs(tie_last_hidden_states=False)
@check_model_inputs()
@can_return_tuple
@auto_docstring
def forward(
self,

View File

@ -3,8 +3,9 @@ from typing import Optional
import torch
from torch import nn
from ...masking_utils import create_causal_mask
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.generic import check_model_inputs
@ -12,9 +13,9 @@ from ..clip.configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConf
from ..clip.modeling_clip import (
CLIPMLP,
CLIPAttention,
CLIPEncoderLayer,
CLIPForImageClassification,
CLIPModel,
CLIPPreTrainedModel,
CLIPTextEmbeddings,
CLIPTextModel,
CLIPTextModelWithProjection,
@ -213,9 +214,24 @@ class MetaClip2MLP(CLIPMLP):
pass
class MetaClip2EncoderLayer(CLIPEncoderLayer):
pass
@auto_docstring
class MetaClip2PreTrainedModel(CLIPPreTrainedModel):
class MetaClip2PreTrainedModel(PreTrainedModel):
config: MetaClip2Config
base_model_prefix = "metaclip_2"
input_modalities = ["image", "text"]
supports_gradient_checkpointing = True
_supports_sdpa = True
_supports_flash_attn = True
_supports_flex_attn = True
_supports_attention_backend = True
_can_record_outputs = {
"hidden_states": MetaClip2EncoderLayer,
"attentions": MetaClip2Attention,
}
def _init_weights(self, module):
"""Initialize the weights"""
@ -275,12 +291,14 @@ class MetaClip2PreTrainedModel(CLIPPreTrainedModel):
class MetaClip2TextTransformer(CLIPTextTransformer):
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self,
input_ids,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
**kwargs: Unpack[TransformersKwargs],
) -> BaseModelOutputWithPooling:
input_shape = input_ids.size()
@ -288,19 +306,21 @@ class MetaClip2TextTransformer(CLIPTextTransformer):
hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
attention_mask = create_causal_mask(
config=self.config,
input_embeds=hidden_states,
attention_mask=attention_mask,
cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device),
past_key_values=None,
# CLIP's text model uses causal mask, prepare it here.
# https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
causal_attention_mask = _create_4d_causal_attention_mask(
input_shape, hidden_states.dtype, device=hidden_states.device
)
kwargs.pop("is_causal", None)
# expand attention_mask
if attention_mask is not None and self.config._attn_implementation != "flash_attention_2":
# [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
encoder_outputs: BaseModelOutput = self.encoder(
inputs_embeds=hidden_states,
attention_mask=attention_mask,
is_causal=True,
causal_attention_mask=causal_attention_mask,
**kwargs,
)
@ -352,13 +372,22 @@ class MetaClip2TextModel(CLIPTextModel):
>>> pooled_output = outputs.pooler_output # pooled (EOS token) states
```"""
@check_model_inputs(tie_last_hidden_states=False)
def __init__(self, config: MetaClip2TextConfig):
super().__init__(config)
self.text_model = MetaClip2TextTransformer(config)
# Initialize weights and apply final processing
self.post_init()
@check_model_inputs()
@can_return_tuple
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
**kwargs: Unpack[TransformersKwargs],
):
r"""
@ -380,6 +409,8 @@ class MetaClip2TextModel(CLIPTextModel):
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
**kwargs,
)
@ -415,13 +446,24 @@ class MetaClip2TextModelWithProjection(CLIPTextModelWithProjection):
>>> text_embeds = outputs.text_embeds
```"""
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def __init__(self, config: MetaClip2TextConfig):
super().__init__(config)
text_model = MetaClip2TextModel._from_config(config)
self.text_model = text_model.text_model
self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
**kwargs: Unpack[TransformersKwargs],
):
r"""
@ -442,6 +484,8 @@ class MetaClip2TextModelWithProjection(CLIPTextModelWithProjection):
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
**kwargs,
)
@ -506,8 +550,6 @@ class MetaClip2Model(CLIPModel):
# Initialize weights and apply final processing
self.post_init()
@can_return_tuple
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -652,7 +694,7 @@ class MetaClip2VisionModel(CLIPVisionModel):
```"""
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
@can_return_tuple
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
@ -722,8 +764,6 @@ class MetaClip2VisionModelWithProjection(CLIPVisionModelWithProjection):
>>> image_embeds = outputs.image_embeds
```"""
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,

View File

@ -25,12 +25,12 @@ import torch
import torch.nn as nn
from ...activations import ACT2FN
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, torch_int
from ...utils.generic import check_model_inputs
from .configuration_mlcd import MLCDVisionConfig
@ -259,7 +259,7 @@ class MLCDAttention(nn.Module):
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Input shape: Batch x Time x Channel"""
batch_size, seq_length = hidden_states.shape[:-1]
@ -316,7 +316,7 @@ class MLCDEncoderLayer(GradientCheckpointingLayer):
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
output_attentions: Optional[bool] = False,
) -> tuple[torch.FloatTensor]:
"""
Args:
@ -328,15 +328,18 @@ class MLCDEncoderLayer(GradientCheckpointingLayer):
Represents absolute positional embeddings for the query and key in the attention mechanism.
attention_mask (`torch.FloatTensor`):
Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, _ = self.self_attn(
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
**kwargs,
output_attentions=output_attentions,
)
hidden_states = residual + hidden_states
@ -345,7 +348,12 @@ class MLCDEncoderLayer(GradientCheckpointingLayer):
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
return hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class MLCDEncoder(nn.Module):
@ -369,7 +377,9 @@ class MLCDEncoder(nn.Module):
inputs_embeds: torch.FloatTensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutput]:
r"""
Args:
@ -385,18 +395,114 @@ class MLCDEncoder(nn.Module):
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
hidden_states = inputs_embeds
for encoder_layer in self.layers:
hidden_states = encoder_layer(
hidden_states,
position_embeddings,
attention_mask,
**kwargs,
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
layer_outputs = encoder_layer(
hidden_states=hidden_states,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_states,
attentions=all_attentions,
)
class MLCDVisionTransformer(nn.Module):
def __init__(self, config: MLCDVisionConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = MLCDVisionEmbeddings(config)
self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.encoder = MLCDEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutputWithPooling]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
num_patches_height = pixel_values.shape[-2] // self.config.patch_size
num_patches_width = pixel_values.shape[-1] // self.config.patch_size
rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
position_embeddings = (emb.cos(), emb.sin())
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layrnorm(hidden_states)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
position_embeddings=position_embeddings,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
pooled_output = last_hidden_state[:, 0, :]
pooled_output = self.post_layernorm(pooled_output)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@ -405,15 +511,8 @@ class MLCDPreTrainedModel(PreTrainedModel):
config: MLCDVisionConfig
base_model_prefix = "mlcd"
supports_gradient_checkpointing = True
accepts_loss_kwargs = False
_supports_flash_attn = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_attention_backend = True
_can_record_outputs = {
"hidden_states": MLCDEncoderLayer,
"attentions": MLCDAttention,
}
def _init_weights(self, module):
"""Initialize the weights"""
@ -447,55 +546,6 @@ class MLCDPreTrainedModel(PreTrainedModel):
module.bias.data.zero_()
class MLCDVisionTransformer(nn.Module):
def __init__(self, config: MLCDVisionConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = MLCDVisionEmbeddings(config)
self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.encoder = MLCDEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> Union[tuple, BaseModelOutputWithPooling]:
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
num_patches_height = pixel_values.shape[-2] // self.config.patch_size
num_patches_width = pixel_values.shape[-1] // self.config.patch_size
rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
position_embeddings = (emb.cos(), emb.sin())
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layrnorm(hidden_states)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
position_embeddings=position_embeddings,
**kwargs,
)
last_hidden_state = encoder_outputs[0]
pooled_output = last_hidden_state[:, 0, :]
pooled_output = self.post_layernorm(pooled_output)
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
)
@auto_docstring(
custom_intro="""
The vision model from M_L_C_D without any head or projection on top.
@ -516,12 +566,13 @@ class MLCDVisionModel(MLCDPreTrainedModel):
def get_input_embeddings(self) -> nn.Module:
return self.vision_model.embeddings.patch_embedding
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
**kwargs: Unpack[TransformersKwargs],
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutputWithPooling]:
r"""
Example:
@ -545,9 +596,17 @@ class MLCDVisionModel(MLCDPreTrainedModel):
>>> print(f"Number of attention layers: {len(outputs.attentions)}")
>>> print(f"Attention shape: {outputs.attentions[0].shape}")
```"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
return self.vision_model(
pixel_values=pixel_values,
**kwargs,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)

View File

@ -19,11 +19,11 @@ import torch
import torch.nn as nn
from ...configuration_utils import PreTrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, logging
from ...utils.generic import check_model_inputs
from ...utils import auto_docstring, logging
from ..clip.modeling_clip import (
CLIPMLP,
CLIPAttention,
@ -206,7 +206,7 @@ class MLCDAttention(CLIPAttention):
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
batch_size, seq_length = hidden_states.shape[:-1]
@ -258,7 +258,7 @@ class MLCDEncoderLayer(CLIPEncoderLayer):
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
output_attentions: Optional[bool] = False,
) -> tuple[torch.FloatTensor]:
"""
Args:
@ -270,15 +270,18 @@ class MLCDEncoderLayer(CLIPEncoderLayer):
Represents absolute positional embeddings for the query and key in the attention mechanism.
attention_mask (`torch.FloatTensor`):
Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, _ = self.self_attn(
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
**kwargs,
output_attentions=output_attentions,
)
hidden_states = residual + hidden_states
@ -287,7 +290,12 @@ class MLCDEncoderLayer(CLIPEncoderLayer):
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
return hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class MLCDEncoder(CLIPEncoder):
@ -308,7 +316,9 @@ class MLCDEncoder(CLIPEncoder):
inputs_embeds: torch.FloatTensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutput]:
r"""
Args:
@ -324,18 +334,107 @@ class MLCDEncoder(CLIPEncoder):
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
hidden_states = inputs_embeds
for encoder_layer in self.layers:
hidden_states = encoder_layer(
hidden_states,
position_embeddings,
attention_mask,
**kwargs,
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
layer_outputs = encoder_layer(
hidden_states=hidden_states,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_states,
attentions=all_attentions,
)
class MLCDVisionTransformer(CLIPVisionTransformer):
def __init__(self, config: MLCDVisionConfig):
super().__init__(config)
self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutputWithPooling]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
num_patches_height = pixel_values.shape[-2] // self.config.patch_size
num_patches_width = pixel_values.shape[-1] // self.config.patch_size
rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
position_embeddings = (emb.cos(), emb.sin())
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layrnorm(hidden_states)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
position_embeddings=position_embeddings,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
pooled_output = last_hidden_state[:, 0, :]
pooled_output = self.post_layernorm(pooled_output)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@ -344,15 +443,8 @@ class MLCDPreTrainedModel(PreTrainedModel):
config: MLCDVisionConfig
base_model_prefix = "mlcd"
supports_gradient_checkpointing = True
accepts_loss_kwargs = False
_supports_flash_attn = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_attention_backend = True
_can_record_outputs = {
"hidden_states": MLCDEncoderLayer,
"attentions": MLCDAttention,
}
def _init_weights(self, module):
"""Initialize the weights"""
@ -386,55 +478,14 @@ class MLCDPreTrainedModel(PreTrainedModel):
module.bias.data.zero_()
class MLCDVisionTransformer(CLIPVisionTransformer):
def __init__(self, config: MLCDVisionConfig):
super().__init__(config)
self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> Union[tuple, BaseModelOutputWithPooling]:
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
num_patches_height = pixel_values.shape[-2] // self.config.patch_size
num_patches_width = pixel_values.shape[-1] // self.config.patch_size
rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
position_embeddings = (emb.cos(), emb.sin())
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layrnorm(hidden_states)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
position_embeddings=position_embeddings,
**kwargs,
)
last_hidden_state = encoder_outputs[0]
pooled_output = last_hidden_state[:, 0, :]
pooled_output = self.post_layernorm(pooled_output)
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
)
class MLCDVisionModel(CLIPVisionModel):
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
**kwargs: Unpack[TransformersKwargs],
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutputWithPooling]:
r"""
Example:
@ -458,9 +509,17 @@ class MLCDVisionModel(CLIPVisionModel):
>>> print(f"Number of attention layers: {len(outputs.attentions)}")
>>> print(f"Attention shape: {outputs.attentions[0].shape}")
```"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
return self.vision_model(
pixel_values=pixel_values,
**kwargs,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
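
One side of the MLCD hunks above passes `output_attentions`, `output_hidden_states` and `return_dict` explicitly instead of routing everything through `**kwargs`. A minimal usage sketch assuming that explicit signature; the checkpoint name is the one used by the integration test further down, and the shapes in the comments are indicative only:

```python
# Minimal usage sketch for the explicit MLCD signature shown above.
import requests
import torch
from PIL import Image

from transformers import AutoProcessor, MLCDVisionModel

model_name = "DeepGlint-AI/mlcd-vit-bigG-patch14-448"
model = MLCDVisionModel.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    # With the explicit signature these flags are plain keyword arguments.
    outputs = model(**inputs, output_attentions=True, output_hidden_states=True)

print(outputs.last_hidden_state.shape)            # (batch, 1 + num_patches, hidden_size)
print(len(outputs.hidden_states), len(outputs.attentions))
```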

View File

@ -343,12 +343,12 @@ class PLBartEncoder(PLBartPreTrainedModel):
self.max_source_positions = config.max_position_embeddings
embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.embed_tokens = PLBartScaledWordEmbedding(
config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
)
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = PLBartScaledWordEmbedding(
config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
)
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = PLBartLearnedPositionalEmbedding(
config.max_position_embeddings,
@ -595,12 +595,12 @@ class PLBartDecoder(PLBartPreTrainedModel):
self.max_target_positions = config.max_position_embeddings
embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = PLBartScaledWordEmbedding(
config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
)
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = PLBartScaledWordEmbedding(
config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
)
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = PLBartLearnedPositionalEmbedding(
config.max_position_embeddings,
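
The PLBart hunks above switch between two ways of sharing the word embedding with the parent model: reusing the passed-in module outright versus building a fresh `PLBartScaledWordEmbedding` and tying its `.weight` afterwards. A toy sketch of the reuse variant (class names here are made up, not the actual PLBart modules):

```python
# Toy sketch of the shared-embedding idiom: reuse the module handed down by the parent
# model if given, otherwise build a private scaled embedding. The other variant instead
# ties weights afterwards via `self.embed_tokens.weight = embed_tokens.weight`.
import math
from typing import Optional

import torch.nn as nn


class ScaledWordEmbedding(nn.Embedding):
    """nn.Embedding whose output is multiplied by a fixed scale."""

    def __init__(self, num_embeddings, embedding_dim, padding_idx, embed_scale=1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids):
        return super().forward(input_ids) * self.embed_scale


class ToyEncoder(nn.Module):
    def __init__(self, vocab_size, d_model, padding_idx, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__()
        embed_scale = math.sqrt(d_model)
        if embed_tokens is not None:
            self.embed_tokens = embed_tokens  # reuse: weights stay shared by construction
        else:
            self.embed_tokens = ScaledWordEmbedding(vocab_size, d_model, padding_idx, embed_scale)


shared = ScaledWordEmbedding(50005, 768, padding_idx=1, embed_scale=math.sqrt(768))
encoder = ToyEncoder(50005, 768, padding_idx=1, embed_tokens=shared)
assert encoder.embed_tokens is shared
```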

View File

@ -25,7 +25,6 @@ from torch import nn
from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...masking_utils import create_bidirectional_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, ModelOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
@ -775,19 +774,14 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMi
lengths_expand = audio_feat_lengths.unsqueeze(1).expand(batch_size, max_seq_len)
# Create mask
padding_mask = seq_range >= lengths_expand
audio_attention_mask_2d = (~padding_mask).to(dtype=torch.long, device=audio_feat_lengths.device)
dummy_embeds = torch.zeros(
(batch_size, max_seq_len, 1),
dtype=inputs_embeds.dtype,
device=inputs_embeds.device,
audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand(
batch_size, 1, max_seq_len, max_seq_len
)
audio_attention_mask = create_bidirectional_mask(
config=self.audio_tower.config,
input_embeds=dummy_embeds,
attention_mask=audio_attention_mask_2d,
audio_attention_mask = audio_attention_mask_.to(
dtype=self.audio_tower.conv1.weight.dtype, device=self.audio_tower.conv1.weight.device
)
audio_attention_mask[audio_attention_mask_] = float("-inf")
audio_outputs = self.audio_tower(input_features, attention_mask=audio_attention_mask)
selected_audio_feature = audio_outputs.last_hidden_state
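
The Qwen2-Audio hunk above rebuilds the audio mask by hand: per-sample feature lengths become a boolean padding mask, which is expanded to `(batch, 1, seq_len, seq_len)` and turned into an additive mask with `-inf` at padded positions. A standalone sketch of that construction with made-up lengths:

```python
# Standalone sketch of the mask construction visible above; the lengths are made up.
import torch

audio_feat_lengths = torch.tensor([6, 3])
batch_size, max_seq_len = audio_feat_lengths.shape[0], int(audio_feat_lengths.max())

seq_range = torch.arange(max_seq_len).unsqueeze(0).expand(batch_size, max_seq_len)
lengths_expand = audio_feat_lengths.unsqueeze(1).expand(batch_size, max_seq_len)
padding_mask = seq_range >= lengths_expand          # True where the position is padding

audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand(
    batch_size, 1, max_seq_len, max_seq_len
)
# dtype change => fresh tensor, so the in-place fill below is safe
audio_attention_mask = audio_attention_mask_.to(torch.float32)
audio_attention_mask[audio_attention_mask_] = float("-inf")

print(audio_attention_mask.shape)    # torch.Size([2, 1, 6, 6])
print(audio_attention_mask[1, 0, 0]) # tensor([0., 0., 0., -inf, -inf, -inf])
```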

View File

@ -1316,6 +1316,7 @@ class Qwen3VLProcessor(Qwen2VLProcessor):
video_metadata = videos_inputs.pop("video_metadata")
else:
video_metadata = videos_inputs["video_metadata"]
video_grid_thw = videos_inputs["video_grid_thw"]
else:
videos_inputs = {}
video_grid_thw = None

View File

@ -157,6 +157,7 @@ class Qwen3VLProcessor(ProcessorMixin):
video_metadata = videos_inputs.pop("video_metadata")
else:
video_metadata = videos_inputs["video_metadata"]
video_grid_thw = videos_inputs["video_grid_thw"]
else:
videos_inputs = {}
video_grid_thw = None

View File

@ -383,10 +383,6 @@ class Mxfp4HfQuantizer(HfQuantizer):
state_dict = model.state_dict()
# Get num_local_experts from model config
num_local_experts = getattr(model.config, "num_local_experts", 32)
hidden_size = getattr(model.config, "hidden_size", 2880)
for name, module in model.named_modules():
if (
isinstance(module, Mxfp4GptOssExperts)
@ -396,7 +392,7 @@ class Mxfp4HfQuantizer(HfQuantizer):
state_dict[f"{name}.gate_up_proj_blocks"] = (
module.gate_up_proj.storage.layout.unswizzle_data(module.gate_up_proj.storage.data)
.transpose(-1, -2)
.reshape(num_local_experts, -1, 90, 16)
.reshape(32, -1, 90, 16)
)
state_dict[f"{name}.gate_up_proj_scales"] = (
module.gate_up_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
@ -406,7 +402,7 @@ class Mxfp4HfQuantizer(HfQuantizer):
state_dict[f"{name}.down_proj_blocks"] = (
module.down_proj.storage.layout.unswizzle_data(module.down_proj.storage.data)
.transpose(-1, -2)
.reshape(num_local_experts, hidden_size, 90, -1)
.reshape(32, 2880, 90, -1)
)
state_dict[f"{name}.down_proj_scales"] = (
module.down_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
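
The hard-coded `90 × 16` block layout in the reshapes above is consistent with a 2880-wide dimension if MXFP4 packs two 4-bit values per byte in blocks of 32 elements (an assumption about the packing, not something stated in the hunk):

```python
# Shape arithmetic behind the hard-coded reshapes above, assuming MXFP4 stores two
# 4-bit values per byte in blocks of 32 elements.
hidden_size = 2880                                   # value hard-coded in the reshape
values_per_byte = 2                                  # two FP4 values per uint8
block_elems = 32                                     # assumed MXFP4 block size
bytes_per_block = block_elems // values_per_byte     # 16
num_blocks = hidden_size // block_elems              # 90

assert (num_blocks, bytes_per_block) == (90, 16)
assert num_blocks * bytes_per_block * values_per_byte == hidden_size  # 90 * 16 * 2 == 2880
```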

View File

@ -173,12 +173,12 @@ def recursive_parse(
return parsed_schema
elif isinstance(node_content, dict):
for key, child_node in node_schema.get("properties", {}).items():
if "const" in child_node:
parsed_schema[key] = child_node["const"]
elif key in node_content:
if key in node_content:
parsed_schema[key] = recursive_parse(node_content[key], child_node)
elif "default" in child_node:
parsed_schema[key] = child_node["default"]
else:
pass
if "additionalProperties" in node_schema:
for key, value in node_content.items():
if key not in node_schema.get("properties", {}):
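
For the schema-parsing hunk above, a simplified stand-in (a hypothetical helper, not the library's `recursive_parse`) shows the difference between the two sides: with `const` handling the schema dictates the value, without it the value comes from the content and then from `default`. This is presumably also why `"role": "assistant"` entries appear in or disappear from the chat-schema test expectations further down.

```python
# Hypothetical, simplified stand-in for the property-filling logic above.
def fill_properties(node_content: dict, node_schema: dict, use_const: bool = False) -> dict:
    parsed = {}
    for key, child_schema in node_schema.get("properties", {}).items():
        if use_const and "const" in child_schema:
            parsed[key] = child_schema["const"]   # fixed value dictated by the schema
        elif key in node_content:
            parsed[key] = node_content[key]       # value actually present in the message
        elif "default" in child_schema:
            parsed[key] = child_schema["default"] # schema-provided fallback
    return parsed


schema = {"properties": {"role": {"const": "assistant"}, "content": {"type": "string"}}}
msg = {"role": "user", "content": "hi"}
print(fill_properties(msg, schema))                  # {'role': 'user', 'content': 'hi'}
print(fill_properties(msg, schema, use_const=True))  # {'role': 'assistant', 'content': 'hi'}
```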

View File

@ -47,7 +47,8 @@ PACKAGE_DISTRIBUTION_MAPPING = importlib.metadata.packages_distributions()
def _is_package_available(pkg_name: str, return_version: bool = False) -> tuple[bool, str] | bool:
"""Check if `pkg_name` exist, and optionally try to get its version"""
spec = importlib.util.find_spec(pkg_name)
package_exists = spec is not None
# the spec might be not None but not importable
package_exists = spec is not None and spec.loader is not None
package_version = "N/A"
if package_exists and return_version:
try:
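
The extra `spec.loader is not None` guard covers the case where `find_spec` succeeds without anything importable behind it, most commonly a bare directory on `sys.path` that CPython treats as a namespace package at `find_spec` time. A small demonstration; the directory name is arbitrary:

```python
# Demonstration of a spec that exists but has no loader: an empty directory on sys.path
# is picked up as a namespace-package portion by find_spec.
import importlib.util
import sys
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    (Path(tmp) / "not_really_a_package").mkdir()   # no __init__.py, no modules
    sys.path.insert(0, tmp)
    try:
        spec = importlib.util.find_spec("not_really_a_package")
        print(spec is not None)                     # True: a namespace-package spec is returned
        print(spec.loader is None)                  # typically True: nothing importable behind it
        print(spec is not None and spec.loader is not None)  # False -> treated as "not available"
    finally:
        sys.path.remove(tmp)
```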

View File

@ -16,7 +16,6 @@
import copy
import tempfile
import unittest
import unittest.mock
from functools import cached_property
import timeout_decorator # noqa
@ -478,23 +477,6 @@ class BartModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
with torch.no_grad():
model(**inputs)[0]
def test_input_embeddings_support_forward_hook(self):
# Make sure that registering hooks on the input embeddings are indeed called
# in forward. This is necessary for gradient checkpointing in PEFT, see also #41821.
config, inputs_dict = self.model_tester.prepare_config_and_inputs()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
hook = unittest.mock.MagicMock(return_value=None)
model.get_input_embeddings().register_forward_hook(hook)
inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
model(**inputs)
self.assertGreater(hook.call_count, 0)
@require_torch_fp16
def test_generate_fp16(self):
config, input_dict = self.model_tester.prepare_config_and_inputs()

View File

@ -28,6 +28,7 @@ from transformers.testing_utils import (
require_torch,
require_torch_accelerator,
require_torch_fp16,
require_torch_gpu,
require_torch_multi_accelerator,
require_vision,
slow,
@ -1733,7 +1734,7 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
self.assertEqual(predictions[0].tolist(), expected_ids_and_text[0])
self.assertEqual(generated_text, expected_ids_and_text[1])
@require_torch_accelerator
@require_torch_gpu
def test_inference_itm(self):
model_name = "Salesforce/blip2-itm-vit-g"
processor = Blip2Processor.from_pretrained(model_name)

View File

@ -23,7 +23,7 @@ from transformers.testing_utils import (
Expectations,
get_device_properties,
require_torch,
require_torch_accelerator,
require_torch_gpu,
slow,
torch_device,
)
@ -400,7 +400,7 @@ class FalconH1ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
@slow
@require_torch
@require_torch_accelerator
@require_torch_gpu
class FalconH1ModelIntegrationTest(unittest.TestCase):
@slow
def test_falcon_h1_hard(self):
@ -448,36 +448,10 @@ class FalconH1ModelIntegrationTest(unittest.TestCase):
6.
"""
EXPECTED_TEXT_XPU = """
user
Tell me about the french revolution.
assistant
The French Revolution (1789–1799) was a period of radical social and political upheaval in France that fundamentally transformed the nation and had profound effects on the rest of Europe and the world. Here are the key aspects of the revolution:

### **Causes**
1. **Economic Crisis**: France was in severe financial trouble due to costly wars (particularly the American Revolution), extravagant spending by the monarchy, and inefficient taxation.
2. **Social Inequality**: The rigid class system (the Ancien Régime) favored the nobility and clergy while the majority of the population (the Third Estate) bore the brunt of taxation and had limited rights.
3. **Enlightenment Ideas**: Philosophers like Rousseau, Voltaire, and Montesquieu inspired ideas of liberty, equality, and popular sovereignty.
4. **Settlement of 1789**: The Estates-General convened to address the financial crisis, leading to debates that exposed the weaknesses of the monarchy and the grievances of the common people.
### **Key Events**
1. **Opening of the Revolution (1789)**:
- **Storming of the Bastille**: A symbol of royal tyranny, marking the start of the revolution.
- **Declaration of the Rights of Man and of the Citizen**: A foundational document proclaiming liberty, equality, and fraternity.
2. **Stages of the Revolution**:
- **Staffords' Reforms (1789–1791)**: Attempts to address grievances, including the abolition of feudal privileges and the introduction of the Civil Constitution of the Church.
- **Reign of Terror (1793–1794)**: Led by Maximilien Robespierre, characterized by mass executions of perceived enemies of the revolution, including King Louis XVI and Queen Marie Antoinette.
- **Thermidorian Reaction (1794)**: The fall of Robespierre and the end of the Reign of Terror.
3. **
"""
expected_texts = Expectations(
{
(None, None): EXPECTED_TEXT_DEFAULT,
("cuda", 8): EXPECTED_TEXT_A10,
("xpu", None): EXPECTED_TEXT_XPU,
}
)
EXPECTED_TEXT = expected_texts.get_expectation()
@ -492,9 +466,10 @@ class FalconH1ModelIntegrationTest(unittest.TestCase):
model_id = "tiiuae/Falcon-H1-1.5B-Deep-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = FalconH1ForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map="auto")
device = "cuda"
messages = [{"role": "user", "content": "Tell me about the french revolution."}]
input_text = tokenizer.apply_chat_template(messages, tokenize=False)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(torch_device)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
with torch.no_grad():
outputs = model.generate(inputs, max_new_tokens=512, do_sample=False)

View File

@ -48,6 +48,7 @@ class HeliumModelTest(CausalLMModelTest, unittest.TestCase):
@slow
# @require_torch_gpu
class HeliumIntegrationTest(unittest.TestCase):
input_text = ["Hello, today is a great day to"]

View File

@ -146,7 +146,7 @@ class MLCDVisionModelIntegrationTest(unittest.TestCase):
@slow
def test_inference(self):
model_name = "DeepGlint-AI/mlcd-vit-bigG-patch14-448"
model = MLCDVisionModel.from_pretrained(model_name, attn_implementation="eager").to(torch_device)
model = MLCDVisionModel.from_pretrained(model_name).to(torch_device)
processor = AutoProcessor.from_pretrained(model_name)
# process single image

View File

@ -25,7 +25,7 @@ from transformers import (
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_torch_accelerator,
require_torch_gpu,
slow,
torch_device,
)
@ -39,7 +39,7 @@ if is_accelerate_available():
from accelerate import init_empty_weights
@require_torch_accelerator
@require_torch_gpu
class BitNetQuantConfigTest(unittest.TestCase):
def test_to_dict(self):
"""
@ -53,7 +53,7 @@ class BitNetQuantConfigTest(unittest.TestCase):
@slow
@require_torch_accelerator
@require_torch_gpu
@require_accelerate
class BitNetTest(unittest.TestCase):
model_name = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"
@ -197,7 +197,7 @@ class BitNetTest(unittest.TestCase):
@slow
@require_torch_accelerator
@require_torch_gpu
@require_accelerate
class BitNetSerializationTest(unittest.TestCase):
def test_model_serialization(self):

View File

@ -4142,7 +4142,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
# perfect world: fp32_init/2 == fp16_eval
self.assertAlmostEqual(fp16_eval, fp32_init / 2, delta=5_000)
@require_torch_accelerator
@require_torch_gpu
@pytest.mark.torch_compile_test
def test_torch_compile_train(self):
with tempfile.TemporaryDirectory() as tmp_dir:
@ -4154,7 +4154,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
metrics = trainer.train()
self.assertAlmostEqual(metrics.training_loss, original_train_loss)
@require_torch_accelerator
@require_torch_gpu
@pytest.mark.torch_compile_test
def test_torch_compile_eval(self):
with tempfile.TemporaryDirectory() as tmp_dir:
@ -4165,7 +4165,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
trainer = get_regression_trainer(torch_compile=True, output_dir=tmp_dir)
metrics = trainer.evaluate()
self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss, delta=1e-6)
self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss)
@require_torch_accelerator
@require_torch_bf16
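
A side note on the eval-loss assertion above: without `delta`, `assertAlmostEqual` rounds the difference to 7 decimal places, while `delta=1e-6` bounds the absolute difference directly, so the two forms accept different tolerances. A tiny illustration with made-up numbers:

```python
# Illustration of the two tolerance modes of unittest's assertAlmostEqual;
# the numbers are made up and unrelated to the actual trainer losses.
import unittest


class ToleranceDemo(unittest.TestCase):
    def test_default_places(self):
        # Default mode: round(a - b, 7) == 0, so a difference of 4e-8 passes
        # while 9e-7 would fail.
        self.assertAlmostEqual(1.0, 1.0 + 4e-8)

    def test_explicit_delta(self):
        # delta mode: abs(a - b) <= delta, here a looser 1e-6 bound that 9e-7 satisfies.
        self.assertAlmostEqual(1.0, 1.0 + 9e-7, delta=1e-6)


if __name__ == "__main__":
    unittest.main()
```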

View File

@ -281,7 +281,6 @@ class ChatSchemaParserTest(unittest.TestCase):
self.assertEqual(
parsed_chat,
{
"role": "assistant",
"thinking": 'Okay, the user said, "Hello! How are you?" I need to respond appropriately. Since this is the first message, I should greet them back and ask how I can assist. I should keep it friendly and open-ended. Let me make sure the response is welcoming and encourages them to share what they need help with. I\'ll avoid any technical jargon and keep it simple. Let me check for any typos and ensure the tone is positive.',
"tool_calls": [
{
@ -303,10 +302,9 @@ class ChatSchemaParserTest(unittest.TestCase):
self.assertEqual(
parsed_chat,
{
"role": "assistant",
"tool_calls": [
{"type": "function", "function": {"name": "get_weather", "arguments": {"city": "Paris"}}}
],
]
},
)
@ -316,7 +314,6 @@ class ChatSchemaParserTest(unittest.TestCase):
self.assertEqual(
parsed_chat,
{
"role": "assistant",
"content": "Some content about gravity goes here but I'm cutting it off to make this shorter!",
"thinking": 'Okay, the user asked, "Hey! Can you tell me about gravity?" Let me start by breaking down what they might be looking for. They probably want a basic understanding of gravity, maybe for a school project or just personal curiosity. I should explain what gravity is, how it works, and maybe some examples.',
},
@ -328,7 +325,6 @@ class ChatSchemaParserTest(unittest.TestCase):
self.assertEqual(
parsed_chat,
{
"role": "assistant",
"tool_calls": [
{
"type": "function",
@ -340,6 +336,6 @@ class ChatSchemaParserTest(unittest.TestCase):
},
},
}
],
]
},
)

View File

@ -97,17 +97,7 @@ def is_bad_commit(target_test, commit):
# Restore to original commit
repo.git.checkout(original_head)
n_passed = 0
o = re.findall(r"====.* (\d+) passed", result.stdout)
if len(o) > 0:
n_passed = int(o[0])
n_failed = 0
o = re.findall(r"====.* (\d+) failed", result.stdout)
if len(o) > 0:
n_failed = int(o[0])
return result.returncode != 0, n_failed, n_passed
return result.returncode != 0
def find_bad_commit(target_test, start_commit, end_commit):
@ -123,8 +113,7 @@ def find_bad_commit(target_test, start_commit, end_commit):
"""
# check if `end_commit` fails the test
# (we only need one failure to conclude the test is flaky on the previous run with `end_commit`)
failed_before, _, _ = is_bad_commit(target_test, end_commit)
failed_before = is_bad_commit(target_test, end_commit)
if failed_before:
return (
None,
@ -141,9 +130,8 @@ def find_bad_commit(target_test, start_commit, end_commit):
# Now, we are (almost) sure `target_test` is not failing at `end_commit`
# check if `start_commit` fail the test
# **IMPORTANT** we only need one pass to conclude the test is flaky on the current run with `start_commit`!
_, n_failed, n_passed = is_bad_commit(target_test, start_commit)
if n_passed > 0:
failed_now = is_bad_commit(target_test, start_commit)
if not failed_now:
# failed on CI run, but not reproducible here --> don't report
return None, f"flaky: test fails on the current CI run (commit: {start_commit}) but passes during the check."
@ -206,13 +194,12 @@ def get_commit_info(commit):
if pr_for_commit["merged_by"] is not None:
merged_author = pr_for_commit["merged_by"]["login"]
url = f"https://api.github.com/repos/huggingface/transformers/commits/{commit}"
commit_info = requests.get(url).json()
parent = commit_info["parents"][0]["sha"]
if author is None:
url = f"https://api.github.com/repos/huggingface/transformers/commits/{commit}"
commit_info = requests.get(url).json()
author = commit_info["author"]["login"]
return {"commit": commit, "pr_number": pr_number, "author": author, "merged_by": merged_author, "parent": parent}
return {"commit": commit, "pr_number": pr_number, "author": author, "merged_by": merged_author}
if __name__ == "__main__":
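
The richer `is_bad_commit` variant above also scrapes pass/fail counts from the pytest summary line; a standalone sketch of that extraction on a made-up summary string:

```python
# Standalone sketch of the pass/fail extraction used by the richer is_bad_commit
# variant above; the summary line is made up for illustration.
import re

stdout = "==== 2 failed, 5 passed, 1 skipped in 12.34s ===="

n_passed = 0
o = re.findall(r"====.* (\d+) passed", stdout)
if len(o) > 0:
    n_passed = int(o[0])

n_failed = 0
o = re.findall(r"====.* (\d+) failed", stdout)
if len(o) > 0:
    n_failed = int(o[0])

print(n_failed, n_passed)  # 2 5
```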

View File

@ -1407,10 +1407,7 @@ if __name__ == "__main__":
if not os.path.isdir(os.path.join(os.getcwd(), f"ci_results_{job_name}")):
os.makedirs(os.path.join(os.getcwd(), f"ci_results_{job_name}"))
nvidia_daily_ci_workflow = (
"huggingface/transformers/.github/workflows/self-scheduled-caller.yml",
"huggingface/transformers/.github/workflows/self-scheduled-flash-attn-caller.yml",
)
nvidia_daily_ci_workflow = "huggingface/transformers/.github/workflows/self-scheduled-caller.yml"
amd_daily_ci_workflows = (
"huggingface/transformers/.github/workflows/self-scheduled-amd-mi325-caller.yml",
"huggingface/transformers/.github/workflows/self-scheduled-amd-mi355-caller.yml",