fix(ci): improper git sha checkout out

refactor(benchmarks): change default lvls of cb bench config
refactor(benchmarks): error if no benchmarks were run
2025-11-15 15:15:02 +08:00 · 2025-11-07 17:11:17 +01:00 · 2025-11-07 15:35:57 +00:00 · 2025-11-07 15:35:57 +00:00 · 2025-11-07 15:35:57 +00:00 · 2025-11-07 15:35:57 +00:00
394 changed files with 2882 additions and 5940 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -40,6 +40,7 @@ jobs:
        run: python3 -m pip install -r benchmark_v2/requirements.txt kernels

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]"

      - name: Run benchmark
--- a/.github/workflows/check-workflow-permissions.yml
+++ b/.github/workflows/check-workflow-permissions.yml
@ -20,4 +20,4 @@ jobs:
      contents: read
    with:
      workflow_name: ${{ inputs.workflow_name }}
-      run_count: ${{ fromJSON(inputs.run_count) }}
+      run_count: ${{ fromJSON(inputs.run_count) }}
--- a/.github/workflows/get-pr-info.yml
+++ b/.github/workflows/get-pr-info.yml
@ -87,6 +87,9 @@ jobs:
      PR_FILES: ${{ steps.pr_info.outputs.files }}
    if: ${{ inputs.pr_number != '' }}
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - name: Extract PR details
        id: pr_info
        uses: actions/github-script@v6
--- a/.github/workflows/get-pr-number.yml
+++ b/.github/workflows/get-pr-number.yml
@ -13,6 +13,9 @@ jobs:
    outputs:
      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - name: Get PR number
        shell: bash
        env:
--- a/.github/workflows/new_model_pr_merged_notification.yml
+++ b/.github/workflows/new_model_pr_merged_notification.yml
@ -13,6 +13,9 @@ jobs:
    name: Notify new model
    runs-on: ubuntu-22.04
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
--- a/.github/workflows/pr_build_doc_with_comment.yml
+++ b/.github/workflows/pr_build_doc_with_comment.yml
@ -35,6 +35,9 @@ jobs:
      PR_MERGE_COMMIT_DATE: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
      PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - run: |
          COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
          echo "COMMENT_DATE: $COMMENT_DATE"
@ -54,6 +57,9 @@ jobs:
      statuses: write
    runs-on: ubuntu-22.04
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - name: Create Run
        id: create_run
        env:
@ -77,6 +83,9 @@ jobs:
      pull-requests: write
    runs-on: ubuntu-22.04
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - name: Reply to the comment
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@ -112,6 +121,9 @@ jobs:
      GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
      STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.create_run.result) }}
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - name: Get `build-doc` job status
        run: |
          echo "${{ needs.build-doc.result }}"
--- a/.github/workflows/pr_slow_ci_suggestion.yml
+++ b/.github/workflows/pr_slow_ci_suggestion.yml
@ -23,6 +23,10 @@ jobs:
    outputs:
      jobs: ${{ steps.get_jobs.outputs.jobs_to_run }}
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+      
      # This checkout to the main branch
      - uses: actions/checkout@v4
        with:
@ -89,6 +93,10 @@ jobs:
      pull-requests: write
    runs-on: ubuntu-22.04
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Check and update comment if needed
        uses: actions/github-script@v7
        env:
--- a/.github/workflows/push-important-models.yml
+++ b/.github/workflows/push-important-models.yml
@ -11,6 +11,10 @@ jobs:
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Check out code
        uses: actions/checkout@v4

--- a/.github/workflows/release-conda.yml
+++ b/.github/workflows/release-conda.yml
@ -18,6 +18,10 @@ jobs:
        shell: bash -l {0}

    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Checkout repository
        uses: actions/checkout@v4

--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -46,6 +46,10 @@ jobs:
      PR_HEAD_SHA: ${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}
      PR_MERGE_SHA: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Verify `merge_commit` timestamp is older than the issue comment timestamp
        env:
          COMMENT_DATE: ${{ github.event.comment.created_at }}
@ -67,6 +71,10 @@ jobs:
      models: ${{ steps.models_to_run.outputs.models }}
      quantizations: ${{ steps.models_to_run.outputs.quantizations }}
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - uses: actions/checkout@v4
        with:
          fetch-depth: "0"
@ -109,6 +117,10 @@ jobs:
      pull-requests: write
    runs-on: ubuntu-22.04
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Reply to the comment
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@ -131,6 +143,10 @@ jobs:
      pull-requests: write
    runs-on: ubuntu-22.04
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Reply to the comment
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@ -152,6 +168,10 @@ jobs:
      statuses: write
    runs-on: ubuntu-22.04
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Create Run
        id: create_run
        env:
@ -210,6 +230,10 @@ jobs:
    if: ${{ always() && needs.create_run.result == 'success' }}
    runs-on: ubuntu-22.04
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Show reports from jobs
        env:
          MODEL_REPORT: ${{ needs.model-ci.outputs.report }}
--- a/.github/workflows/self-nightly-caller.yml
+++ b/.github/workflows/self-nightly-caller.yml
@ -30,6 +30,10 @@ jobs:
    name: Setup
    runs-on: ubuntu-22.04
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Setup
        run: |
          mkdir "setup_values"
--- a/.github/workflows/self-nightly-past-ci-caller.yml
+++ b/.github/workflows/self-nightly-past-ci-caller.yml
@ -14,6 +14,9 @@ jobs:
    outputs:
      run_number: ${{ steps.get_number.outputs.run_number }}
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - name: Get number
        id: get_number
        run: |
--- a/.github/workflows/self-scheduled-amd-caller.yml
+++ b/.github/workflows/self-scheduled-amd-caller.yml
@ -10,5 +10,9 @@ jobs:
    runs-on: ubuntu-22.04
    if: ${{ always() }}
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Trigger scheduled AMD CI via workflow_run
        run: echo "Trigger scheduled AMD CI via workflow_run"
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@ -32,6 +32,9 @@ jobs:
    name: Setup
    runs-on: ubuntu-22.04
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - name: Setup
        env:
          prev_workflow_run_id: ${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}
--- a/.github/workflows/self-scheduled-flash-attn-caller.yml
+++ b/.github/workflows/self-scheduled-flash-attn-caller.yml
@ -32,6 +32,10 @@ jobs:
    name: Setup
    runs-on: ubuntu-22.04
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Setup
        run: |
          mkdir "setup_values"
--- a/.github/workflows/self-scheduled-intel-gaudi.yml
+++ b/.github/workflows/self-scheduled-intel-gaudi.yml
@ -38,6 +38,10 @@ jobs:
      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
      quantization_matrix: ${{ steps.set-matrix.outputs.quantization_matrix }}
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Checkout
        uses: actions/checkout@v4
        with:
@ -122,6 +126,10 @@ jobs:
        --cap-add=sys_nice
        --shm-size=64G
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Checkout
        uses: actions/checkout@v4
        with:
@ -191,6 +199,10 @@ jobs:
        --cap-add=sys_nice
        --shm-size=64G
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Checkout
        uses: actions/checkout@v4
        with:
@ -263,6 +275,10 @@ jobs:
        --cap-add=sys_nice
        --shm-size=64G
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Checkout
        uses: actions/checkout@v4
        with:
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -78,6 +78,9 @@ jobs:
      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
      quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - name: Update clone
        working-directory: /transformers
        env:
@ -184,6 +187,9 @@ jobs:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - name: Update clone
        working-directory: /transformers
        env:
@ -256,6 +262,9 @@ jobs:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - name: Update clone
        working-directory: /transformers
        env:
@ -329,6 +338,9 @@ jobs:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - name: Update clone
        working-directory: ${{ inputs.working-directory-prefix }}/transformers
        env:
@ -434,6 +446,9 @@ jobs:
      image: huggingface/transformers-quantization-latest-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        env:
@ -518,6 +533,9 @@ jobs:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - name: Update clone
        working-directory: /transformers
        env:
@ -588,6 +606,9 @@ jobs:
    steps:
      # Checkout in order to run `utils/extract_warnings.py`. Avoid **explicit** checkout (i.e. don't specify `ref`) for
      # security reason.
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
      - name: Checkout transformers
        uses: actions/checkout@v4

--- a/.github/workflows/slack-report.yml
+++ b/.github/workflows/slack-report.yml
@ -38,6 +38,10 @@ jobs:
    runs-on: ubuntu-22.04
    if: always() && !cancelled()
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Preliminary job status
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@ -30,6 +30,10 @@ jobs:
    outputs:
      RUNNER: ${{ steps.set_runner.outputs.RUNNER }}
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Get runner to use
        shell: bash
        env:
@ -58,6 +62,10 @@ jobs:
    container:
      image: ${{ github.event.inputs.docker_image }}
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Update clone
        working-directory: /transformers
        env:
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -14,16 +14,21 @@ jobs:
    env:
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
-    - uses: actions/checkout@v4
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}

-    - name: Setup Python
-      uses: actions/setup-python@v5
-      with:
-        python-version: 3.8
+      - name: Checkout
+        uses: actions/checkout@v4

-    - name: Install requirements
-      run: |
-        pip install PyGithub
-    - name: Close stale issues
-      run: |
-        python scripts/stale.py
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.8
+
+      - name: Install requirements
+        run: |
+          pip install PyGithub
+      - name: Close stale issues
+        run: |
+          python scripts/stale.py
--- a/.github/workflows/trufflehog.yml
+++ b/.github/workflows/trufflehog.yml
@ -10,6 +10,10 @@ jobs:
  trufflehog:
    runs-on: ubuntu-latest
    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - name: Checkout code
        uses: actions/checkout@v4
        with:
--- a/.github/workflows/update_metdata.yml
+++ b/.github/workflows/update_metdata.yml
@ -14,6 +14,10 @@ jobs:
        shell: bash -l {0}

    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+        with:
+          config: ${{ vars.PERMISSIONS_CONFIG }}
+
      - uses: actions/checkout@v4

      - name: Setup environment
--- a/benchmark_v2/framework/benchmark_runner.py
+++ b/benchmark_v2/framework/benchmark_runner.py
@ -117,6 +117,8 @@ def flush_memory():
    # Clear CUDA cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()
    gc.collect()

--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -1008,8 +1008,6 @@
        title: AltCLIP
      - local: model_doc/aria
        title: Aria
-      - local: model_doc/audioflamingo3
-        title: AudioFlamingo3
      - local: model_doc/aya_vision
        title: AyaVision
      - local: model_doc/blip
--- a/docs/source/en/model_doc/audioflamingo3.md
+++ b/docs/source/en/model_doc/audioflamingo3.md
@ -1,402 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-*This model was released on 2025-07-10 and added to Hugging Face Transformers on 2025-11-11.*
-
-# Audio Flamingo 3
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-## Overview
-
-Audio Flamingo 3 (AF3) is a fully open large audio–language model designed for robust understanding and reasoning over speech, environmental sounds, and music. AF3 pairs a Whisper-style audio encoder with a causal language model and performs replace-in-place audio–text fusion: the processor aligns post-pool audio frames to a dedicated placeholder token and the model replaces those token slots with projected audio embeddings during the forward pass.
-
-The model checkpoint is available at: [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf)
-
-Highlights:
-
- Unified audio encoder across speech, sound, and music.
- **Long-audio support via windowing and post-pool alignment (up to 10 minutes maximum).** The model processes audio in 30-second windows with a hard limit of 20 windows (10 minutes total). Audio longer than 10 minutes will be truncated.
- Deterministic fusion that preserves sequence length by replacing audio placeholder tokens with audio embeddings.
-
-This model was contributed by [Lasha Koroshinadze](https://huggingface.co/lashahub) and [Eric Bezzam](https://huggingface.co/bezzam).
-
-### Paper
-
-[Audio Flamingo 3](https://huggingface.co/papers/2507.08128): Advancing Audio Intelligence with Fully Open Large Audio Language Models  
-A. Goel, S. Ghosh, J. Kim, S. Kumar, Z. Kong, S. Lee, C.-H. H. Yang, R. Duraiswami, D. Manocha, R. Valle, B. Catanzaro  
-NVIDIA and University of Maryland  
-Project: https://research.nvidia.com/labs/adlr/AF3/
-
-## Usage
-
-### Audio Instruct Mode
-
-The model supports audio-text instructions, including multi-turn interactions, all processed in batches.
-
-➡️ audio + text instruction
-
-```python
-from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
-
-model_id = "nvidia/audio-flamingo-3-hf"
-processor = AutoProcessor.from_pretrained(model_id)
-model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
-
-conversation = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "text", "text": "Transcribe the input speech."},
-            {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3"},
-        ],
-    }
-]
-
-inputs = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    add_generation_prompt=True,
-    return_dict=True,
-).to(model.device)
-
-outputs = model.generate(**inputs, max_new_tokens=500)
-
-decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-print(decoded_outputs)
-```
-
-➡️ multi-turn:
-
-```python
-from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
-
-model_id = "nvidia/audio-flamingo-3-hf"
-processor = AutoProcessor.from_pretrained(model_id)
-model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
-
-conversation = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "Instruction: How does the tone of female speech change throughout the audio? Choose the correct option among the options below: (A) Sad to happy (B) Happy to sad (C) Neutral to happy (D) Happy to neutral.",
-            },
-            {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/000000786159.31.wav"},
-        ],
-    },
-    {
-        "role": "assistant",
-        "content": [{"type": "text", "text": "(A) Sad to happy"}],
-    },
-    {
-        "role": "user",
-        "content": [
-            {"type": "text", "text": "Why do you think so?"},
-        ],
-    },
-]
-
-inputs = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    add_generation_prompt=True,
-    return_dict=True,
-).to(model.device)
-
-outputs = model.generate(**inputs, max_new_tokens=500)
-
-decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-print(decoded_outputs)
-```
-
-➡️ text only:
-
-```python
-from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
-
-model_id = "nvidia/audio-flamingo-3-hf"
-processor = AutoProcessor.from_pretrained(model_id)
-model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
-
-conversation = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "text", "text": "What is the capital of France?"},
-        ],
-    }
-]
-
-inputs = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    add_generation_prompt=True,
-    return_dict=True,
-).to(model.device)
-
-outputs = model.generate(**inputs, max_new_tokens=500)
-
-decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-print(decoded_outputs)
-```
-
-➡️ audio only:
-
-```python
-from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
-
-model_id = "nvidia/audio-flamingo-3-hf"
-processor = AutoProcessor.from_pretrained(model_id)
-model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
-
-conversation = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3"},
-        ],
-    }
-]
-
-inputs = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    add_generation_prompt=True,
-    return_dict=True,
-).to(model.device)
-
-outputs = model.generate(**inputs, max_new_tokens=500)
-
-decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-print(decoded_outputs)
-```
-
-➡️ batched inference!
-
-```python
-from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
-
-model_id = "nvidia/audio-flamingo-3-hf"
-processor = AutoProcessor.from_pretrained(model_id)
-model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
-
-conversations = [
-    [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": "Transcribe the input speech."},
-                {
-                    "type": "audio",
-                    "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
-                },
-            ],
-        }
-    ],
-    [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
-                },
-                {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
-            ],
-        }
-    ],
-]
-
-inputs = processor.apply_chat_template(
-    conversations,
-    tokenize=True,
-    add_generation_prompt=True,
-    return_dict=True,
-).to(model.device)
-
-outputs = model.generate(**inputs, max_new_tokens=500)
-
-decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-print(decoded_outputs)
-```
-
-➡️ Training:
-
-```python
-from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
-
-model_id = "nvidia/audio-flamingo-3-hf"
-processor = AutoProcessor.from_pretrained(model_id)
-model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
-model.train()
-
-conversation = [
-    [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": "Transcribe the input speech."},
-                {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3"},
-            ],
-        },
-        {
-            "role": "assistant",
-            "content": [{"type": "text", "text": "The transcription of the audio is 'summer follows spring the days grow longer and the nights are warm'."}],
-        }
-    ],
-    [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
-                },
-                {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
-            ],
-        },
-        {
-            "role": "assistant",
-            "content": [{"type": "text", "text": "The transcription of the audio is 'some transcription of the audio'."}],
-        }
-
-    ]
-]
-
-inputs = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    add_generation_prompt=True,
-    return_dict=True,
-    output_labels=True,
-).to(model.device)
-
-loss = model(**inputs).loss
-loss.backward()
-```
-
-➡️ transcription shortcut
-
-```python
-from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
-
-model_id = "nvidia/audio-flamingo-3-hf"
-processor = AutoProcessor.from_pretrained(model_id)
-model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
-
-inputs = processor.apply_transcription_request(audio="https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav").to(model.device)
-
-outputs = model.generate(**inputs, max_new_tokens=500)
-decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True, strip_prefix=True)
-
-print(decoded_outputs)
-```
-
-The model is trained to emit transcriptions prefixed with assistant framing such as `The spoken content of the audio is "<text>".`. Use `strip_prefix=True` (as shown above) to remove the fixed assistant sentence and surrounding quotes so that only the transcription remains.
-
-## How the model works
-
-### Architecture
-
-* **AudioFlamingo3Encoder**
-  Whisper-style feature extractor + encoder → average-pool over time (stride 2) → LayerNorm.
-  Produces per-frame hidden states at the post-pool rate.
-
-* **AudioFlamingo3MultiModalProjector**
-  A small MLP that maps encoder features to the language model’s hidden size.
-
-* **AudioFlamingo3ForConditionalGeneration**
-  A causal language model that accepts text embeddings where each audio placeholder token slot is replaced, in place, by an audio frame embedding. No sequence-length change is introduced by fusion.
-
-### Processor-level alignment
-
-1. Each raw waveform is split into fixed-length windows based on the feature extractor’s `chunk_length` (seconds) and `sampling_rate` (Hz).
-2. For each window, the processor computes the number of post-pool frames `post_pool_len` that the encoder will output (matching the conv/pool schedule).
-3. The processor expands the audio placeholder token by the total number of post-pool frames across all windows.
-4. The model later replaces those token positions with the corresponding projected audio embeddings.
-
-## Usage patterns
-
-### Transcription shortcut
-
-For automatic speech recognition you can skip writing the default instruction each time and call
-[`~transformers.AudioFlamingo3Processor.apply_transcription_request`]:
-
-```python
-inputs = processor.apply_transcription_request(audio=audio_array)
-```
-
-Pass `prompt="Transcribe the input speech."` (or a list of prompts for batch audio) to customize the instruction while
-keeping the audio placeholder handling.
-
-`audio` accepts in-memory arrays, local file paths, or URLs. Any processor kwargs (`text_kwargs`, `audio_kwargs`, etc.)
-are forwarded, so you can tweak padding or tensor formats just like when calling `processor(...)`.
-
-## Long audio and windowing
-
-**Important: Maximum audio length is 10 minutes.** Audio longer than this will be truncated.
-
-* The default setup processes 30-second windows at 16 kHz mono.
-* **The processor enforces a hard limit of 20 windows per sample, resulting in a maximum of 10 minutes of audio (20 windows × 30 seconds).**
-* For each window:
-
-  * `mel_len` is the padded mel length.
-  * A conv stack reduces time as `conv_output_len = (mel_len - 1) // 2 + 1`.
-  * Post-pool frames per window: `post_pool_len = (conv_output_len - 2) // 2 + 1`.
-  * An audio placeholder token is expanded to the sum of `post_pool_len` across all windows.
-
-## Padding, attention, and caching
-
-* **Left padding vs right padding**
-  For generation with mixed prompt lengths in a batch, left padding is usually preferable.
-  For training, right padding is common; AF3’s fusion mechanism itself is padding-agnostic because it replaces in place.
-* **Attention masks**
-  The processor returns `attention_mask` (text) and `input_features_mask` (audio). The model builds an internal 4-D mask on the encoder’s pre-pool axis with negative infinity at pad positions.
-* **Caching**
-  During generation, `input_features` and `input_features_mask` are only passed on the first step. Subsequent steps use cached keys/values from the language model.
-
-## Troubleshooting
-
-* Empty or truncated outputs when batching
-  Use left padding for batched generation and decode only the new tokens after the prompt length, as shown in the quickstart.
-
-## AudioFlamingo3Config
-
-[[autodoc]] AudioFlamingo3Config
-
-## AudioFlamingo3EncoderConfig
-
-[[autodoc]] AudioFlamingo3EncoderConfig
-
-## AudioFlamingo3Processor
-
-[[autodoc]] AudioFlamingo3Processor
-
-## AudioFlamingo3Encoder
-
-[[autodoc]] AudioFlamingo3Encoder
-    - forward
-
-## AudioFlamingo3ForConditionalGeneration
-
-[[autodoc]] AudioFlamingo3ForConditionalGeneration
-    - forward
--- a/docs/source/en/model_doc/dinov3.md
+++ b/docs/source/en/model_doc/dinov3.md
@ -169,9 +169,6 @@ print("Pooled output shape:", pooled_output.shape)
 [[autodoc]] DINOv3ViTModel
    - forward

-## DINOv3ViTBackbone    
-[[autodoc]] DINOv3ViTBackbone
-
 ## DINOv3ConvNextModel

 [[autodoc]] DINOv3ConvNextModel
--- a/docs/source/en/model_doc/smolvlm.md
+++ b/docs/source/en/model_doc/smolvlm.md
@ -159,7 +159,7 @@ conversation3 = [

 conversations = [conversation1, conversation2, conversation3]
 inputs = processor.apply_chat_template(
-    conversations,
+    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@ -329,7 +329,7 @@ from torchao.dtypes import Int4XPULayout
 from torchao.quantization.quant_primitives import ZeroPointDomain


-quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT, int4_packing_format="plain_int32")
+quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT)
 quantization_config = TorchAoConfig(quant_type=quant_config)

 # Load and quantize the model
@ -342,7 +342,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
 input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device)
+input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

 # auto-compile the quantized model with `cache_implementation="static"` to get speed up
 output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@ -395,7 +395,7 @@ from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
 from torchao.quantization import Int4WeightOnlyConfig
 from torchao.dtypes import Int4CPULayout

-quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4CPULayout(), int4_packing_format="opaque")
+quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4CPULayout())
 quantization_config = TorchAoConfig(quant_type=quant_config)

 # Load and quantize the model
@ -422,7 +422,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))

 #### 1. Skip quantization for certain layers

-With `FqnToConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.
+With `ModuleFqnToConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.

 ```py
 import torch
@ -430,11 +430,11 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

 model_id = "meta-llama/Llama-3.1-8B-Instruct"

-from torchao.quantization import Int4WeightOnlyConfig, FqnToConfig
+from torchao.quantization import Int4WeightOnlyConfig, ModuleFqnToConfig
 config = Int4WeightOnlyConfig(group_size=128)

 # set default to int4 (for linears), and skip quantizing `model.layers.0.self_attn.q_proj`
-quant_config = FqnToConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
+quant_config = ModuleFqnToConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
 quantization_config = TorchAoConfig(quant_type=quant_config)
 quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16, quantization_config=quantization_config)
 # lm_head is not quantized and model.layers.0.self_attn.q_proj is not quantized
@ -459,7 +459,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

 model_id = "facebook/opt-125m"

-from torchao.quantization import Int4WeightOnlyConfig, FqnToConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType
+from torchao.quantization import Int4WeightOnlyConfig, ModuleFqnToConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType

 weight_dtype = torch.int8
 granularity = PerAxis(0)
@ -470,7 +470,7 @@ embedding_config = IntxWeightOnlyConfig(
    mapping_type=mapping_type,
 )
 linear_config = Int8DynamicActivationInt4WeightConfig(group_size=128)
-quant_config = FqnToConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
+quant_config = ModuleFqnToConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
 # set `include_embedding` to True in order to include embedding in quantization
 # when `include_embedding` is True, we'll remove input embedding from `modules_not_to_convert` as well
 quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True)
@ -521,7 +521,7 @@ from torchao.quantization import (
    IntxWeightOnlyConfig,
    PerRow,
    PerAxis,
-    FqnToConfig,
+    ModuleFqnToConfig,
    Float8Tensor,
    Int4TilePackedTo4dTensor,
    IntxUnpackedToInt8Tensor,
@ -550,7 +550,7 @@ qconfig_dict = {

    "_default": intxwo,
 }
-quant_config = FqnToConfig(qconfig_dict)
+quant_config = ModuleFqnToConfig(qconfig_dict)
 quantization_config = TorchAoConfig(quant_type=quant_config)
 quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@ -127,7 +127,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/examples/pytorch/language-modeling/run_fim_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py
@ -132,7 +132,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@ -130,7 +130,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@ -128,7 +128,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the HuggingFace Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@ -151,7 +151,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@ -223,7 +223,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/examples/pytorch/test_accelerate_examples.py
+++ b/examples/pytorch/test_accelerate_examples.py
@ -74,7 +74,6 @@ class ExamplesTestsNoTrainer(TestCasePlus):
    def tearDownClass(cls):
        shutil.rmtree(cls.tmpdir)

-    @slow
    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
    def test_run_glue_no_trainer(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
@ -148,7 +147,6 @@ class ExamplesTestsNoTrainer(TestCasePlus):
        self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
        self.assertTrue(os.path.exists(os.path.join(tmp_dir, "mlm_no_trainer")))

-    @slow
    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
    def test_run_ner_no_trainer(self):
        # with so little data distributed training needs more epochs to get the score on par with 0/1 gpu
@ -177,7 +175,6 @@ class ExamplesTestsNoTrainer(TestCasePlus):
        self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
        self.assertTrue(os.path.exists(os.path.join(tmp_dir, "ner_no_trainer")))

-    @slow
    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
    def test_run_squad_no_trainer(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
@ -206,7 +203,6 @@ class ExamplesTestsNoTrainer(TestCasePlus):
        self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
        self.assertTrue(os.path.exists(os.path.join(tmp_dir, "qa_no_trainer")))

-    @slow
    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
    def test_run_swag_no_trainer(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
@ -309,7 +305,6 @@ class ExamplesTestsNoTrainer(TestCasePlus):
        result = get_results(tmp_dir)
        self.assertGreaterEqual(result["eval_overall_accuracy"], 0.10)

-    @slow
    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
    def test_run_image_classification_no_trainer(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
--- a/examples/pytorch/test_pytorch_examples.py
+++ b/examples/pytorch/test_pytorch_examples.py
@ -374,7 +374,6 @@ class ExamplesTests(TestCasePlus):
            result = get_results(tmp_dir)
            self.assertGreaterEqual(result["eval_bleu"], 30)

-    @slow
    def test_run_image_classification(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
        testargs = f"""
@ -404,7 +403,6 @@ class ExamplesTests(TestCasePlus):
            result = get_results(tmp_dir)
            self.assertGreaterEqual(result["eval_accuracy"], 0.8)

-    @slow
    def test_run_speech_recognition_ctc(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
        testargs = f"""
@ -575,7 +573,6 @@ class ExamplesTests(TestCasePlus):
            model = ViTMAEForPreTraining.from_pretrained(tmp_dir)
            self.assertIsNotNone(model)

-    @slow
    def test_run_semantic_segmentation(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
        testargs = f"""
@ -600,7 +597,6 @@ class ExamplesTests(TestCasePlus):
            result = get_results(tmp_dir)
            self.assertGreaterEqual(result["eval_overall_accuracy"], 0.1)

-    @slow
    @patch.dict(os.environ, {"WANDB_DISABLED": "true"})
    def test_run_object_detection(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
@ -628,7 +624,6 @@ class ExamplesTests(TestCasePlus):
            result = get_results(tmp_dir)
            self.assertGreaterEqual(result["test_map"], 0.1)

-    @slow
    @patch.dict(os.environ, {"WANDB_DISABLED": "true"})
    def test_run_instance_segmentation(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@ -120,7 +120,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@ -212,7 +212,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/i18n/README_zh-hans.md
+++ b/i18n/README_zh-hans.md
@ -50,7 +50,6 @@ checkpoint: 检查点
 </p>

 <p align="center">
-    <a href="https://huggingface.co/models"><img alt="Checkpoints on Hub" src="https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen"></a>
    <a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
    <a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>
    <a href="https://huggingface.co/docs/transformers/index"><img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online"></a>
@ -61,7 +60,7 @@ checkpoint: 检查点

 <h4 align="center">
    <p>
-        <a href="https://github.com/huggingface/transformers/blob/main/README.md">English</a> |
+        <a href="https://github.com/huggingface/transformers/">English</a> |
        <b>简体中文</b> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_zh-hant.md">繁體中文</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ko.md">한국어</a> |
@ -69,7 +68,7 @@ checkpoint: 检查点
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ja.md">日本語</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_hd.md">हिन्दी</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ru.md">Русский</a> |
-        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Português</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Рortuguês</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_te.md">తెలుగు</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
@ -82,258 +81,182 @@ checkpoint: 检查点
 </h4>

 <h3 align="center">
-    <p>为文本、视觉、音频、视频与多模态提供推理与训练的先进预训练模型</p>
+    <p>为 Jax、PyTorch 和 TensorFlow 打造的先进的自然语言处理函数库</p>
 </h3>

 <h3 align="center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_as_a_model_definition.png"/>
+    <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
 </h3>

-Transformers 充当跨文本、计算机视觉、音频、视频与多模态的最先进机器学习模型的「模型定义框架」，同时覆盖推理与训练。
+🤗 Transformers 提供了数以千计的预训练模型，支持 100 多种语言的文本分类、信息抽取、问答、摘要、翻译、文本生成。它的宗旨是让最先进的 NLP 技术人人易用。

-它将模型的定义集中化，使整个生态系统对该定义达成一致。`transformers` 是跨框架的枢纽：一旦某模型定义被支持，它通常就能兼容多数训练框架（如 Axolotl、Unsloth、DeepSpeed、FSDP、PyTorch‑Lightning 等）、推理引擎（如 vLLM、SGLang、TGI 等），以及依赖 `transformers` 模型定义的相关库（如 llama.cpp、mlx 等）。
+🤗 Transformers 提供了便于快速下载和使用的API，让你可以把预训练模型用在给定文本、在你的数据集上微调然后通过 [model hub](https://huggingface.co/models) 与社区共享。同时，每个定义的 Python 模块都是完全独立的，便于修改和快速进行研究实验。

-我们的目标是持续支持新的最先进模型，并通过让模型定义保持简单、可定制且高效来普及其使用。
+🤗 Transformers 支持三个最热门的深度学习库： [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) 以及 [TensorFlow](https://www.tensorflow.org/) — 并与之无缝整合。你可以直接使用一个框架训练你的模型然后用另一个加载和推理。

-目前在 [Hugging Face Hub](https://huggingface.com/models) 上有超过 1M+ 使用 `transformers` 的[模型检查点](https://huggingface.co/models?library=transformers&sort=trending)，可随取随用。
- 
-今天就去探索 Hub，找到一个模型，并用 Transformers 立刻开始吧。
+## 在线演示

-## 安装
+你可以直接在模型页面上测试大多数 [model hub](https://huggingface.co/models) 上的模型。 我们也提供了 [私有模型托管、模型版本管理以及推理API](https://huggingface.co/pricing)。

-Transformers 支持 Python 3.9+，以及 [PyTorch](https://pytorch.org/get-started/locally/) 2.1+。
+这里是一些例子：
+- [用 BERT 做掩码填词](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
+- [用 Electra 做命名实体识别](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [用 GPT-2 做文本生成](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+)
+- [用 RoBERTa 做自然语言推理](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [用 BART 做文本摘要](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
+- [用 DistilBERT 做问答](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
+- [用 T5 做翻译](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)

-使用 [venv](https://docs.python.org/3/library/venv.html) 或 [uv](https://docs.astral.sh/uv/)（一个基于 Rust 的快速 Python 包与项目管理器）创建并激活虚拟环境：
+**[Write With Transformer](https://transformer.huggingface.co)**，由 Hugging Face 团队打造，是一个文本生成的官方 demo。

-```py
-# venv
-python -m venv .my-env
-source .my-env/bin/activate
-# uv
-uv venv .my-env
-source .my-env/bin/activate
-```
+## 如果你在寻找由 Hugging Face 团队提供的定制化支持服务

-在虚拟环境中安装 Transformers：
-
-```py
-# pip
-pip install "transformers[torch]"
-
-# uv
-uv pip install "transformers[torch]"
-```
-
-如果你需要库中的最新改动或计划参与贡献，可从源码安装（注意：最新版可能不稳定；如遇错误，欢迎在 [issues](https://github.com/huggingface/transformers/issues) 中反馈）：
-
-```shell
-git clone https://github.com/huggingface/transformers.git
-cd transformers
-
-# pip
-pip install '.[torch]'
-
-# uv
-uv pip install '.[torch]'
-```
+<a target="_blank" href="https://huggingface.co/support">
+    <img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
+</a><br>

 ## 快速上手

-使用 [Pipeline](https://huggingface.co/docs/transformers/pipeline_tutorial) API 一步上手。`Pipeline` 是一个高级推理类，支持文本、音频、视觉与多模态任务，负责输入预处理并返回适配的输出。
+我们为快速使用模型提供了 `pipeline` API。Pipeline 聚合了预训练模型和对应的文本预处理。下面是一个快速使用 pipeline 去判断正负面情绪的例子：

-实例化一个用于文本生成的 pipeline，指定使用的模型。模型会被下载并缓存，方便复用。最后传入文本作为提示：
+```python
+>>> from transformers import pipeline

-```py
-from transformers import pipeline
-
-pipeline = pipeline(task="text-generation", model="Qwen/Qwen2.5-1.5B")
-pipeline("the secret to baking a really good cake is ")
-[{'generated_text': 'the secret to baking a really good cake is 1) to use the right ingredients and 2) to follow the recipe exactly. the recipe for the cake is as follows: 1 cup of sugar, 1 cup of flour, 1 cup of milk, 1 cup of butter, 1 cup of eggs, 1 cup of chocolate chips. if you want to make 2 cakes, how much sugar do you need? To make 2 cakes, you will need 2 cups of sugar.'}]
+# 使用情绪分析 pipeline
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
 ```

-要与模型进行「聊天」，用法也一致。唯一不同是需要构造一段「聊天历史」（即 `Pipeline` 的输入）：
+第二行代码下载并缓存了 pipeline 使用的预训练模型，而第三行代码则在给定的文本上进行了评估。这里的答案"正面" (positive) 具有 99 的置信度。

-> [!TIP]
-> 你也可以直接在命令行与模型聊天：
-> ```shell
-> transformers chat Qwen/Qwen2.5-0.5B-Instruct
-> ```
+许多的 NLP 任务都有开箱即用的预训练 `pipeline`。比如说，我们可以轻松的从给定文本中抽取问题答案：

-```py
-import torch
-from transformers import pipeline
+``` python
+>>> from transformers import pipeline

-chat = [
-    {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
-    {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
-]
+# 使用问答 pipeline
+>>> question_answerer = pipeline('question-answering')
+>>> question_answerer({
+...     'question': 'What is the name of the repository ?',
+...     'context': 'Pipeline has been included in the huggingface/transformers repository'
+... })
+{'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}

-pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", dtype=torch.bfloat16, device_map="auto")
-response = pipeline(chat, max_new_tokens=512)
-print(response[0]["generated_text"][-1]["content"])
 ```

-展开下方示例，查看 `Pipeline` 在不同模态与任务中的用法。
+除了给出答案，预训练模型还给出了对应的置信度分数、答案在词符化 (tokenized) 后的文本中开始和结束的位置。你可以从[这个教程](https://huggingface.co/docs/transformers/task_summary)了解更多 `pipeline` API 支持的任务。

-<details>
-<summary>自动语音识别</summary>
+要在你的任务上下载和使用任意预训练模型也很简单，只需三行代码。这里是 PyTorch 版的示例：
+```python
+>>> from transformers import AutoTokenizer, AutoModel

-```py
-from transformers import pipeline
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased")

-pipeline = pipeline(task="automatic-speech-recognition", model="openai/whisper-large-v3")
-pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
-{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+这里是等效的 TensorFlow 代码：
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
 ```

-</details>
+词符化器 (tokenizer) 为所有的预训练模型提供了预处理，并可以直接对单个字符串进行调用（比如上面的例子）或对列表 (list) 调用。它会输出一个你可以在下游代码里使用或直接通过 `**` 解包表达式传给模型的词典 (dict)。

-<details>
-<summary>图像分类</summary>
+模型本身是一个常规的 [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) 或 [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model)（取决于你的后端），可以常规方式使用。 [这个教程](https://huggingface.co/transformers/training.html)解释了如何将这样的模型整合到经典的 PyTorch 或 TensorFlow 训练循环中，或是如何使用我们的 `Trainer` （训练器）API 来在一个新的数据集上快速微调。

-<h3 align="center">
-    <a><img src="https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"></a>
-</h3>
+## 为什么要用 transformers？

-```py
-from transformers import pipeline
+1. 便于使用的先进模型：
+    - NLU 和 NLG 上表现优越
+    - 对教学和实践友好且低门槛
+    - 高级抽象，只需了解三个类
+    - 对所有模型统一的API

-pipeline = pipeline(task="image-classification", model="facebook/dinov2-small-imagenet1k-1-layer")
-pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
-[{"label": "macaw", "score": 0.997848391532898},
- {"label": "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita",
-  "score": 0.0016551691805943847},
- {"label": "lorikeet", "score": 0.00018523589824326336},
- {"label": "African grey, African gray, Psittacus erithacus",
-  "score": 7.85409429227002e-05},
- {"label": "quail", "score": 5.502637941390276e-05}]
+1. 更低计算开销，更少的碳排放：
+    - 研究人员可以分享已训练的模型而非每次从头开始训练
+    - 工程师可以减少计算用时和生产环境开销
+    - 数十种模型架构、两千多个预训练模型、100多种语言支持
+
+1. 对于模型生命周期的每一个部分都面面俱到：
+    - 训练先进的模型，只需 3 行代码
+    - 模型在不同深度学习框架间任意转移，随你心意
+    - 为训练、评估和生产选择最适合的框架，衔接无缝
+
+1. 为你的需求轻松定制专属模型和用例：
+    - 我们为每种模型架构提供了多个用例来复现原论文结果
+    - 模型内部结构保持透明一致
+    - 模型文件可单独使用，方便修改和快速实验
+
+## 什么情况下我不该用 transformers？
+
+- 本库并不是模块化的神经网络工具箱。模型文件中的代码特意呈若璞玉，未经额外抽象封装，以便研究人员快速迭代修改而不致溺于抽象和文件跳转之中。
+- `Trainer` API 并非兼容任何模型，只为本库之模型优化。若是在寻找适用于通用机器学习的训练循环实现，请另觅他库。
+- 尽管我们已尽力而为，[examples 目录](https://github.com/huggingface/transformers/tree/main/examples)中的脚本也仅为用例而已。对于你的特定问题，它们并不一定开箱即用，可能需要改几行代码以适之。
+
+## 安装
+
+### 使用 pip
+
+这个仓库已在 Python 3.9+、Flax 0.4.1+、PyTorch 2.1+ 和 TensorFlow 2.6+ 下经过测试。
+
+你可以在[虚拟环境](https://docs.python.org/3/library/venv.html)中安装 🤗 Transformers。如果你还不熟悉 Python 的虚拟环境，请阅此[用户说明](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。
+
+首先，用你打算使用的版本的 Python 创建一个虚拟环境并激活。
+
+然后，你需要安装 Flax、PyTorch 或 TensorFlow 其中之一。关于在你使用的平台上安装这些框架，请参阅 [TensorFlow 安装页](https://www.tensorflow.org/install/), [PyTorch 安装页](https://pytorch.org/get-started/locally/#start-locally) 或 [Flax 安装页](https://github.com/google/flax#quick-install)。
+
+当这些后端之一安装成功后， 🤗 Transformers 可依此安装：
+
+```bash
+pip install transformers
 ```

-</details>
+如果你想要试试用例或者想在正式发布前使用最新的开发中代码，你得[从源代码安装](https://huggingface.co/docs/transformers/installation#installing-from-source)。

-<details>
-<summary>视觉问答</summary>
+### 使用 conda

-<h3 align="center">
-    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg"></a>
-</h3>
+🤗 Transformers 可以通过 conda 依此安装：

-```py
-from transformers import pipeline
-
-pipeline = pipeline(task="visual-question-answering", model="Salesforce/blip-vqa-base")
-pipeline(
-    image="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg",
-    question="What is in the image?",
-)
-[{"answer": "statue of liberty"}]
+```shell script
+conda install conda-forge::transformers
 ```

-</details>
+> **_笔记:_** 从 `huggingface` 渠道安装 `transformers` 已被废弃。

-## 为什么要用 Transformers？
+要通过 conda 安装 Flax、PyTorch 或 TensorFlow 其中之一，请参阅它们各自安装页的说明。

-1. 易于使用的最先进模型：
-    - 在自然语言理解与生成、计算机视觉、音频、视频与多模态任务上表现优越。
-    - 对研究者、工程师与开发者友好且低门槛。
-    - 少量用户侧抽象，仅需学习三个类。
-    - 统一的 API，使用所有预训练模型体验一致。
+## 模型架构

-1. 更低计算开销与更小碳足迹：
-    - 共享已训练的模型，而非每次从零开始训练。
-    - 减少计算时间与生产环境成本。
-    - 覆盖数十种模型架构，跨所有模态提供 1M+ 预训练检查点。
+🤗 Transformers 支持的[**所有的模型检查点**](https://huggingface.co/models)由[用户](https://huggingface.co/users)和[组织](https://huggingface.co/organizations)上传，均与 huggingface.co [model hub](https://huggingface.co) 无缝整合。

-1. 在模型生命周期的每个阶段都可以选用合适的框架：
-    - 3 行代码即可训练最先进模型。
-    - 在 PyTorch/JAX/TF2.0 间自由迁移同一个模型。
-    - 为训练、评估与生产挑选最合适的框架。
+目前的检查点数量： ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)

-1. 轻松定制模型或用例：
-    - 为每个架构提供示例以复现原论文结果。
-    - 尽可能一致地暴露模型内部。
-    - 模型文件可独立于库使用，便于快速实验。
+🤗 Transformers 目前支持如下的架构: 模型概述请阅[这里](https://huggingface.co/docs/transformers/model_summary).

-<a target="_blank" href="https://huggingface.co/enterprise">
-    <img alt="Hugging Face Enterprise Hub" src="https://github.com/user-attachments/assets/247fb16d-d251-4583-96c4-d3d76dda4925">
-</a><br>
+要检查某个模型是否已有 Flax、PyTorch 或 TensorFlow 的实现，或其是否在 🤗 Tokenizers 库中有对应词符化器（tokenizer），敬请参阅[此表](https://huggingface.co/docs/transformers/index#supported-frameworks)。

-## 为什么我不该用 Transformers？
-
- 该库不是一个可自由拼搭的神经网络模块化工具箱。模型文件中的代码刻意减少额外抽象，以便研究者能快速在各个模型上迭代，而无需深入更多抽象或文件跳转。
- 训练 API 优化用于 Transformers 提供的 PyTorch 模型。若需要通用的机器学习训练循环，请使用其它库，如 [Accelerate](https://huggingface.co/docs/accelerate)。
- [示例脚本](https://github.com/huggingface/transformers/tree/main/examples)只是「示例」。它们不一定能直接适配你的具体用例，需要你进行必要的改动。
+这些实现均已于多个数据集测试（请参看用例脚本）并应于原版实现表现相当。你可以在用例文档的[此节](https://huggingface.co/docs/transformers/examples)中了解表现的细节。


-## 100 个使用 Transformers 的项目
+## 了解更多

-Transformers 不止是一个使用预训练模型的工具包，它还是围绕 Hugging Face Hub 构建的项目社区。我们希望 Transformers 能助力开发者、研究人员、学生、老师、工程师与任何人构建理想项目。
-
-为庆祝 Transformers 获得 100,000 颗星，我们制作了 [awesome-transformers](./awesome-transformers.md) 页面，展示了 100 个由社区构建的优秀项目。
-
-如果你拥有或使用某个项目，认为它应该在列表中出现，欢迎提交 PR 添加它！
-
-## 示例模型
-
-你可以直接在它们的 [Hub 模型页](https://huggingface.co/models) 上测试我们的多数模型。
-
-展开每个模态以查看不同用例中的部分示例模型。
-
-<details>
-<summary>音频</summary>
-
- 使用 [Whisper](https://huggingface.co/openai/whisper-large-v3-turbo) 进行音频分类
- 使用 [Moonshine](https://huggingface.co/UsefulSensors/moonshine) 进行自动语音识别
- 使用 [Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks) 进行关键词检索
- 使用 [Moshi](https://huggingface.co/kyutai/moshiko-pytorch-bf16) 进行语音到语音生成
- 使用 [MusicGen](https://huggingface.co/facebook/musicgen-large) 文本到音频生成
- 使用 [Bark](https://huggingface.co/suno/bark) 文本到语音生成
-
-</details>
-
-<details>
-<summary>计算机视觉</summary>
-
- 使用 [SAM](https://huggingface.co/facebook/sam-vit-base) 自动生成掩码
- 使用 [DepthPro](https://huggingface.co/apple/DepthPro-hf) 进行深度估计
- 使用 [DINO v2](https://huggingface.co/facebook/dinov2-base) 进行图像分类
- 使用 [SuperPoint](https://huggingface.co/magic-leap-community/superpoint) 进行关键点检测
- 使用 [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor) 进行关键点匹配
- 使用 [RT-DETRv2](https://huggingface.co/PekingU/rtdetr_v2_r50vd) 进行目标检测
- 使用 [VitPose](https://huggingface.co/usyd-community/vitpose-base-simple) 进行姿态估计
- 使用 [OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_swin_large) 进行通用分割
- 使用 [VideoMAE](https://huggingface.co/MCG-NJU/videomae-large) 进行视频分类
-
-</details>
-
-<details>
-<summary>多模态</summary>
-
- 使用 [Qwen2-Audio](https://huggingface.co/Qwen/Qwen2-Audio-7B) 实现音频或文本到文本
- 使用 [LayoutLMv3](https://huggingface.co/microsoft/layoutlmv3-base) 进行文档问答
- 使用 [Qwen-VL](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) 实现图像或文本到文本
- 使用 [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b) 进行图文描述
- 使用 [GOT-OCR2](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf) 进行基于 OCR 的文档理解
- 使用 [TAPAS](https://huggingface.co/google/tapas-base) 进行表格问答
- 使用 [Emu3](https://huggingface.co/BAAI/Emu3-Gen) 进行统一的多模态理解与生成
- 使用 [Llava-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) 视觉到文本
- 使用 [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf) 进行视觉问答
- 使用 [Kosmos-2](https://huggingface.co/microsoft/kosmos-2-patch14-224) 进行视觉指代表达分割
-
-</details>
-
-<details>
-<summary>NLP</summary>
-
- 使用 [ModernBERT](https://huggingface.co/answerdotai/ModernBERT-base) 进行掩码词填充
- 使用 [Gemma](https://huggingface.co/google/gemma-2-2b) 进行命名实体识别（NER）
- 使用 [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) 进行问答
- 使用 [BART](https://huggingface.co/facebook/bart-large-cnn) 进行摘要
- 使用 [T5](https://huggingface.co/google-t5/t5-base) 进行翻译
- 使用 [Llama](https://huggingface.co/meta-llama/Llama-3.2-1B) 进行文本生成
- 使用 [Qwen](https://huggingface.co/Qwen/Qwen2.5-0.5B) 进行文本分类
-
-</details>
+| 章节 | 描述 |
+|-|-|
+| [文档](https://huggingface.co/docs/transformers/) | 完整的 API 文档和教程 |
+| [任务总结](https://huggingface.co/docs/transformers/task_summary) | 🤗 Transformers 支持的任务 |
+| [预处理教程](https://huggingface.co/docs/transformers/preprocessing) | 使用 `Tokenizer` 来为模型准备数据 |
+| [训练和微调](https://huggingface.co/docs/transformers/training) | 在 PyTorch/TensorFlow 的训练循环或 `Trainer` API 中使用 🤗 Transformers 提供的模型 |
+| [快速上手：微调和用例脚本](https://github.com/huggingface/transformers/tree/main/examples) | 为各种任务提供的用例脚本 |
+| [模型分享和上传](https://huggingface.co/docs/transformers/model_sharing) | 和社区上传和分享你微调的模型 |
+| [迁移](https://huggingface.co/docs/transformers/migration) | 从 `pytorch-transformers` 或 `pytorch-pretrained-bert` 迁移到 🤗 Transformers |

 ## 引用

--- a/i18n/README_zh-hant.md
+++ b/i18n/README_zh-hant.md
@ -14,6 +14,43 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->

+<!---
+A useful guide for English-Traditional Chinese translation of Hugging Face documentation
+- Add space around English words and numbers when they appear between Chinese characters. E.g., 共 100 多種語言; 使用 transformers 函式庫。
+- Use square quotes, e.g.,「引用」
+- Some of terms in the file can be found at National Academy for Educational Research (https://terms.naer.edu.tw/), an official website providing bilingual translations between English and Traditional Chinese.
+
+Dictionary
+
+API: API (不翻譯）
+add: 加入
+checkpoint: 檢查點
+code: 程式碼
+community: 社群
+confidence: 信賴度
+dataset: 資料集
+documentation: 文件
+example: 基本翻譯為「範例」，或依語意翻為「例子」
+finetune: 微調
+Hugging Face: Hugging Face（不翻譯）
+implementation: 實作
+inference: 推論
+library: 函式庫
+module: 模組
+NLP/Natural Language Processing: 以 NLP 出現時不翻譯，以 Natural Language Processing 出現時翻譯為自然語言處理
+online demos: 線上Demo
+pipeline: pipeline（不翻譯）
+pretrained/pretrain: 預訓練
+Python data structures (e.g., list, set, dict): 翻譯為串列，集合，字典，並用括號標註原英文
+repository: repository（不翻譯）
+summary: 概覽
+token-: token-（不翻譯）
+Trainer: Trainer（不翻譯）
+transformer: transformer（不翻譯）
+tutorial: 教學
+user: 使用者
+-->
+
 <p align="center">
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
@ -25,7 +62,6 @@ limitations under the License.
 </p>

 <p align="center">
-    <a href="https://huggingface.com/models"><img alt="Checkpoints on Hub" src="https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen"></a>
    <a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
    <a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>
    <a href="https://huggingface.co/docs/transformers/index"><img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online"></a>
@ -36,7 +72,7 @@ limitations under the License.

 <h4 align="center">
    <p>
-        <a href="https://github.com/huggingface/transformers/blob/main/README.md">English</a> |
+        <a href="https://github.com/huggingface/transformers/">English</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_zh-hans.md">简体中文</a> |
        <b>繁體中文</b> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ko.md">한국어</a> |
@ -44,7 +80,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ja.md">日本語</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_hd.md">हिन्दी</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ru.md">Русский</a> |
-        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Português</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Рortuguês</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_te.md">తెలుగు</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
@ -57,261 +93,186 @@ limitations under the License.
 </h4>

 <h3 align="center">
-    <p>最先進的預訓練模型，專為推理與訓練而生</p>
+    <p>為 Jax、PyTorch 以及 TensorFlow 打造的先進自然語言處理函式庫</p>
 </h3>

 <h3 align="center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_as_a_model_definition.png"/>
+    <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
 </h3>

-Transformers 是一個為最先進的機器學習模型（涵蓋文字、電腦視覺、音訊、影片及多模態）提供推理和訓練支援的模型定義框架。
+🤗 Transformers 提供了數以千計的預訓練模型，支援 100 多種語言的文本分類、資訊擷取、問答、摘要、翻譯、文本生成。它的宗旨是讓最先進的 NLP 技術人人易用。

-它將模型定義集中化，使得該定義在整個生態系中能夠達成共識。`transformers` 是貫穿各個框架的樞紐：如果一個模型定義受到支援，它將與大多數訓練框架（如 Axolotl、Unsloth、DeepSpeed、FSDP、PyTorch-Lightning 等）、推理引擎（如 vLLM、SGLang、TGI 等）以及利用 `transformers` 模型定義的周邊建模函式庫（如 llama.cpp、mlx 等）相容。
+🤗 Transformers 提供了便於快速下載和使用的API，讓你可以將預訓練模型用在給定文本、在你的資料集上微調然後經由 [model hub](https://huggingface.co/models) 與社群共享。同時，每個定義的 Python 模組架構均完全獨立，方便修改和快速研究實驗。

-我們致力於支援最新的頂尖模型，並透過使其模型定義變得簡單、可客製化且高效，來普及它們的應用。
+🤗 Transformers 支援三個最熱門的深度學習函式庫： [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) 以及 [TensorFlow](https://www.tensorflow.org/) — 並與之完美整合。你可以直接使用其中一個框架訓練你的模型，然後用另一個載入和推論。

-在 [Hugging Face Hub](https://huggingface.com/models) 上，有超過 100 萬個 Transformers [模型檢查點](https://huggingface.co/models?library=transformers&sort=trending) 供您使用。
+## 線上Demo

-立即探索 [Hub](https://huggingface.com/)，尋找合適的模型，並使用 Transformers 幫助您快速上手。
+你可以直接在 [model hub](https://huggingface.co/models) 上測試大多數的模型。我們也提供了 [私有模型託管、模型版本管理以及推論API](https://huggingface.co/pricing)。
+
+這裡是一些範例：
+- [用 BERT 做遮蓋填詞](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
+- [用 Electra 做專有名詞辨識](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [用 GPT-2 做文本生成](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+)
+- [用 RoBERTa 做自然語言推論](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [用 BART 做文本摘要](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
+- [用 DistilBERT 做問答](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
+- [用 T5 做翻譯](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
+
+**[Write With Transformer](https://transformer.huggingface.co)**，由 Hugging Face 團隊所打造，是一個文本生成的官方 demo。
+
+## 如果你在尋找由 Hugging Face 團隊所提供的客製化支援服務
+
+<a target="_blank" href="https://huggingface.co/support">
+    <img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
+</a><br>
+
+## 快速上手
+
+我們為快速使用模型提供了 `pipeline` API。 Pipeline 包含了預訓練模型和對應的文本預處理。下面是一個快速使用 pipeline 去判斷正負面情緒的例子：
+
+```python
+>>> from transformers import pipeline
+
+# 使用情緒分析 pipeline
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
+```
+
+第二行程式碼下載並快取 pipeline 使用的預訓練模型，而第三行程式碼則在給定的文本上進行了評估。這裡的答案“正面” (positive) 具有 99.97% 的信賴度。
+
+許多的 NLP 任務都有隨選即用的預訓練 `pipeline`。例如，我們可以輕鬆地從給定文本中擷取問題答案：
+
+``` python
+>>> from transformers import pipeline
+
+# 使用問答 pipeline
+>>> question_answerer = pipeline('question-answering')
+>>> question_answerer({
+...     'question': 'What is the name of the repository ?',
+...     'context': 'Pipeline has been included in the huggingface/transformers repository'
+... })
+{'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}
+
+```
+
+除了提供問題解答，預訓練模型還提供了對應的信賴度分數以及解答在 tokenized 後的文本中開始和結束的位置。你可以從[這個教學](https://huggingface.co/docs/transformers/task_summary)了解更多 `pipeline` API支援的任務。
+
+要在你的任務中下載和使用任何預訓練模型很簡單，只需三行程式碼。這裡是 PyTorch 版的範例：
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+這裡是對應的 TensorFlow 程式碼：
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
+```
+
+Tokenizer 為所有的預訓練模型提供了預處理，並可以直接轉換單一字串（比如上面的例子）或串列 (list)。它會輸出一個的字典 (dict) 讓你可以在下游程式碼裡使用或直接藉由 `**` 運算式傳給模型。
+
+模型本身是一個常規的 [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) 或 [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model)（取決於你的後端），可依常規方式使用。 [這個教學](https://huggingface.co/transformers/training.html)解釋了如何將這樣的模型整合到一般的 PyTorch 或 TensorFlow 訓練迴圈中，或是如何使用我們的 `Trainer` API 在一個新的資料集上快速進行微調。
+
+## 為什麼要用 transformers？
+
+1. 便於使用的先進模型：
+    - NLU 和 NLG 上性能卓越
+    - 對教學和實作友好且低門檻
+    - 高度抽象，使用者只須學習 3 個類別
+    - 對所有模型使用的制式化API
+
+1. 更低的運算成本，更少的碳排放：
+    - 研究人員可以分享已訓練的模型而非每次從頭開始訓練
+    - 工程師可以減少計算時間以及生產成本
+    - 數十種模型架構、兩千多個預訓練模型、100多種語言支援
+
+1. 對於模型生命週期的每一個部分都面面俱到：
+    - 訓練先進的模型，只需 3 行程式碼
+    - 模型可以在不同深度學習框架之間任意轉換
+    - 為訓練、評估和生產選擇最適合的框架，並完美銜接
+
+1. 為你的需求輕鬆客製化專屬模型和範例：
+    - 我們為每種模型架構提供了多個範例來重現原論文結果
+    - 一致的模型內部架構
+    - 模型檔案可單獨使用，便於修改和快速實驗
+
+## 什麼情況下我不該用 transformers？
+
+- 本函式庫並不是模組化的神經網絡工具箱。模型文件中的程式碼並未做額外的抽象封裝，以便研究人員快速地翻閱及修改程式碼，而不會深陷複雜的類別包裝之中。
+- `Trainer` API 並非相容任何模型，它只為本函式庫中的模型最佳化。對於一般的機器學習用途，請使用其他函式庫。
+- 儘管我們已盡力而為，[examples 目錄](https://github.com/huggingface/transformers/tree/main/examples)中的腳本也僅為範例而已。對於特定問題，它們並不一定隨選即用，可能需要修改幾行程式碼以符合需求。

 ## 安裝

-Transformers 支援 Python 3.9+ 和 [PyTorch](https://pytorch.org/get-started/locally/) 2.1+。
+### 使用 pip

-使用 [venv](https://docs.python.org/3/library/venv.html) 或基於 Rust 的高速 Python 套件及專案管理器 [uv](https://docs.astral.sh/uv/) 來建立並啟用虛擬環境。
+這個 Repository 已在 Python 3.9+、Flax 0.4.1+、PyTorch 2.1+ 和 TensorFlow 2.6+ 下經過測試。

-```py
-# venv
-python -m venv .my-env
-source .my-env/bin/activate
-# uv
-uv venv .my-env
-source .my-env/bin/activate
+你可以在[虛擬環境](https://docs.python.org/3/library/venv.html)中安裝 🤗 Transformers。如果你還不熟悉 Python 的虛擬環境，請閱此[使用者指引](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。
+
+首先，用你打算使用的版本的 Python 創建一個虛擬環境並進入。
+
+然後，你需要安裝 Flax、PyTorch 或 TensorFlow 其中之一。對於該如何在你使用的平台上安裝這些框架，請參閱 [TensorFlow 安裝頁面](https://www.tensorflow.org/install/), [PyTorch 安裝頁面](https://pytorch.org/get-started/locally/#start-locally) 或 [Flax 安裝頁面](https://github.com/google/flax#quick-install)。
+
+當其中一個後端安裝成功後，🤗 Transformers 可依此安裝：
+
+```bash
+pip install transformers
 ```

-在您的虛擬環境中安裝 Transformers。
+如果你想要試試範例或者想在正式發布前使用最新開發中的程式碼，你必須[從原始碼安裝](https://huggingface.co/docs/transformers/installation#installing-from-source)。

-```py
-# pip
-pip install "transformers[torch]"
+### 使用 conda

-# uv
-uv pip install "transformers[torch]"
+🤗 Transformers 可以藉由 conda 依此安裝：
+
+```shell script
+conda install conda-forge::transformers
 ```

-如果您想使用函式庫的最新變更或有興趣參與貢獻，可以從原始碼安裝 Transformers。然而，*最新*版本可能不穩定。如果您遇到任何錯誤，歡迎隨時提交一個 [issue](https://github.com/huggingface/transformers/issues)。
+> **_筆記:_** 從 `huggingface` 頻道安裝 `transformers` 已被淘汰。

-```shell
-git clone https://github.com/huggingface/transformers.git
-cd transformers
+要藉由 conda 安裝 Flax、PyTorch 或 TensorFlow 其中之一，請參閱它們各自安裝頁面的說明。

-# pip
-pip install '.[torch]'
+## 模型架構

-# uv
-uv pip install '.[torch]'
-```
+**🤗 Transformers 支援的[所有的模型檢查點](https://huggingface.co/models)**，由[使用者](https://huggingface.co/users)和[組織](https://huggingface.co/organizations)上傳，均與 huggingface.co [model hub](https://huggingface.co) 完美結合。

-## 快速入門
+目前的檢查點數量： ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)

-透過 [Pipeline](https://huggingface.co/docs/transformers/pipeline_tutorial) API 快速開始使用 Transformers。`Pipeline` 是一個高階的推理類別，支援文字、音訊、視覺和多模態任務。它負責處理輸入資料的預處理，並回傳適當的輸出。
+🤗 Transformers 目前支援以下的架構: 模型概覽請參閱[這裡](https://huggingface.co/docs/transformers/model_summary).

-實例化一個 pipeline 並指定用於文字生成的模型。該模型會被下載並快取，方便您之後輕鬆複用。最後，傳入一些文字來提示模型。
+要檢查某個模型是否已有 Flax、PyTorch 或 TensorFlow 的實作，或其是否在🤗 Tokenizers 函式庫中有對應的 tokenizer，敬請參閱[此表](https://huggingface.co/docs/transformers/index#supported-frameworks)。

-```py
-from transformers import pipeline
+這些實作均已於多個資料集測試（請參閱範例腳本）並應與原版實作表現相當。你可以在範例文件的[此節](https://huggingface.co/docs/transformers/examples)中了解實作的細節。

-pipeline = pipeline(task="text-generation", model="Qwen/Qwen2.5-1.5B")
-pipeline("the secret to baking a really good cake is ")
-[{'generated_text': 'the secret to baking a really good cake is 1) to use the right ingredients and 2) to follow the recipe exactly. the recipe for the cake is as follows: 1 cup of sugar, 1 cup of flour, 1 cup of milk, 1 cup of butter, 1 cup of eggs, 1 cup of chocolate chips. if you want to make 2 cakes, how much sugar do you need? To make 2 cakes, you will need 2 cups of sugar.'}]
-```

-與模型進行聊天，使用模式是相同的。唯一的區別是您需要建構一個您與系統之間的聊天歷史（作為 `Pipeline` 的輸入）。
+## 了解更多

-> [!TIP]
-> 你也可以直接在命令列中與模型聊天。
-> ```shell
-> transformers chat Qwen/Qwen2.5-0.5B-Instruct
-> ```
-
-```py
-import torch
-from transformers import pipeline
-
-chat = [
-    {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
-    {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
-]
-
-pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", dtype=torch.bfloat16, device_map="auto")
-response = pipeline(chat, max_new_tokens=512)
-print(response[0]["generated_text"][-1]["content"])
-```
-
-展開下面的範例，查看 `Pipeline` 如何在不同模態和任務上運作。
-
-<details>
-<summary>自動語音辨識</summary>
-
-```py
-from transformers import pipeline
-
-pipeline = pipeline(task="automatic-speech-recognition", model="openai/whisper-large-v3")
-pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
-{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
-```
-
-</details>
-
-<details>
-<summary>影像分類</summary>
-
-<h3 align="center">
-    <a><img src="https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"></a>
-</h3>
-
-```py
-from transformers import pipeline
-
-pipeline = pipeline(task="image-classification", model="facebook/dinov2-small-imagenet1k-1-layer")
-pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
-[{'label': 'macaw', 'score': 0.997848391532898},
- {'label': 'sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita',
-  'score': 0.0016551691805943847},
- {'label': 'lorikeet', 'score': 0.00018523589824326336},
- {'label': 'African grey, African gray, Psittacus erithacus',
-  'score': 7.85409429227002e-05},
- {'label': 'quail', 'score': 5.502637941390276e-05}]
-```
-
-</details>
-
-<details>
-<summary>視覺問答</summary>
-
-<h3 align="center">
-    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg"></a>
-</h3>
-
-```py
-from transformers import pipeline
-
-pipeline = pipeline(task="visual-question-answering", model="Salesforce/blip-vqa-base")
-pipeline(
-    image="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg",
-    question="What is in the image?",
-)
-[{'answer': 'statue of liberty'}]
-```
-
-</details>
-
-## 為什麼我應該使用 Transformers？
-
-1.  易於使用的最先進模型：
-    *   在自然語言理解與生成、電腦視覺、音訊、影片和多模態任務上表現卓越。
-    *   為研究人員、工程師與開發者提供了低門檻的入門途徑。
-    *   面向使用者的抽象層級少，只需學習三個核心類別。
-    *   為所有預訓練模型提供了統一的 API 介面。
-
-2.  更低的運算成本，更小的碳足跡：
-    *   分享訓練好的模型，而不是從零開始訓練。
-    *   減少運算時間和生產成本。
-    *   擁有數十種模型架構和超過100萬個橫跨所有模態的預訓練檢查點。
-
-3.  為模型的每個生命週期階段選擇合適的框架：
-    *   僅用3行程式碼即可訓練最先進的模型。
-    *   在PyTorch/JAX/TF2.0框架之間輕鬆切換單一模型。
-    *   為訓練、評估和生產選擇最合適的框架。
-
-4.  輕鬆根據您的需求客製化模型或範例：
-    *   我們為每個架構提供了範例，以重現其原作者發表的結果。
-    *   模型內部結構盡可能保持一致地暴露給使用者。
-    *   模型檔案可以獨立於函式庫使用，便於快速實驗。
-
-<a target="_blank" href="https://huggingface.co/enterprise">
-    <img alt="Hugging Face Enterprise Hub" src="https://github.com/user-attachments/assets/247fb16d-d251-4583-96c4-d3d76dda4925">
-</a><br>
-
-## 為什麼我不應該使用 Transformers？
-
-   本函式庫並非一個用於建構神經網路的模組化工具箱。模型檔案中的程式碼為了讓研究人員能快速在模型上迭代，而沒有進行過度的抽象重構，避免了深入額外的抽象層/檔案。
-   訓練 API 針對 Transformers 提供的 PyTorch 模型進行了最佳化。對於通用的機器學習迴圈，您應該使用像 [Accelerate](https://huggingface.co/docs/accelerate) 這樣的其他函式庫。
-   [範例指令稿](https://github.com/huggingface/transformers/tree/main/examples)僅僅是*範例*。它們不一定能在您的特定用例上開箱即用，您可能需要修改程式碼才能使其正常運作。
-
-## 100個使用 Transformers 的專案
-
-Transformers 不僅僅是一個使用預訓練模型的工具包，它還是一個圍繞它和 Hugging Face Hub 建構的專案社群。我們希望 Transformers 能夠賦能開發者、研究人員、學生、教授、工程師以及其他任何人，去建構他們夢想中的專案。
-
-為了慶祝 Transformers 獲得 10 萬顆星標，我們希望透過 [awesome-transformers](./awesome-transformers.md) 頁面來聚焦社群，該頁面列出了100個基於 Transformers 建構的精彩專案。
-
-如果您擁有或使用一個您認為應該被列入其中的專案，請隨時提交 PR 將其加入！
-
-## 範例模型
-
-您可以在我們大多數模型的 [Hub 模型頁面](https://huggingface.co/models) 上直接進行測試。
-
-展開下面的每個模態，查看一些用於不同用例的範例模型。
-
-<details>
-<summary>音訊</summary>
-
-   Audio classification with [Whisper](https://huggingface.co/openai/whisper-large-v3-turbo)
-   Automatic speech recognition with [Moonshine](https://huggingface.co/UsefulSensors/moonshine)
-   Keyword spotting with [Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks)
-   Speech to speech generation with [Moshi](https://huggingface.co/kyutai/moshiko-pytorch-bf16)
-   Text to audio with [MusicGen](https://huggingface.co/facebook/musicgen-large)
-   Text to speech with [Bark](https://huggingface.co/suno/bark)
-
-</details>
-
-<details>
-<summary>電腦視覺</summary>
-
-   Automatic mask generation with [SAM](https://huggingface.co/facebook/sam-vit-base)
-   Depth estimation with [DepthPro](https://huggingface.co/apple/DepthPro-hf)
-   Image classification with [DINO v2](https://huggingface.co/facebook/dinov2-base)
-   Keypoint detection with [SuperPoint](https://huggingface.co/magic-leap-community/superpoint)
-   Keypoint matching with [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor)
-   Object detection with [RT-DETRv2](https://huggingface.co/PekingU/rtdetr_v2_r50vd)
-   Pose Estimation with [VitPose](https://huggingface.co/usyd-community/vitpose-base-simple)
-   Universal segmentation with [OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_swin_large)
-   Video classification with [VideoMAE](https://huggingface.co/MCG-NJU/videomae-large)
-
-</details>
-
-<details>
-<summary>多模態</summary>
-
-   Audio or text to text with [Qwen2-Audio](https://huggingface.co/Qwen/Qwen2-Audio-7B)
-   Document question answering with [LayoutLMv3](https://huggingface.co/microsoft/layoutlmv3-base)
-   Image or text to text with [Qwen-VL](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)
-   Image captioning [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b)
-   OCR-based document understanding with [GOT-OCR2](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf)
-   Table question answering with [TAPAS](https://huggingface.co/google/tapas-base)
-   Unified multimodal understanding and generation with [Emu3](https://huggingface.co/BAAI/Emu3-Gen)
-   Vision to text with [Llava-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)
-   Visual question answering with [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
-   Visual referring expression segmentation with [Kosmos-2](https://huggingface.co/microsoft/kosmos-2-patch14-224)
-
-</details>
-
-<details>
-<summary>自然語言處理 (NLP)</summary>
-
-   Masked word completion with [ModernBERT](https://huggingface.co/answerdotai/ModernBERT-base)
-   Named entity recognition with [Gemma](https://huggingface.co/google/gemma-2-2b)
-   Question answering with [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
-   Summarization with [BART](https://huggingface.co/facebook/bart-large-cnn)
-   Translation with [T5](https://huggingface.co/google-t5/t5-base)
-   Text generation with [Llama](https://huggingface.co/meta-llama/Llama-3.2-1B)
-   Text classification with [Qwen](https://huggingface.co/Qwen/Qwen2.5-0.5B)
-
-</details>
+| 章節 | 描述 |
+|-|-|
+| [文件](https://huggingface.co/transformers/) | 完整的 API 文件和教學 |
+| [任務概覽](https://huggingface.co/docs/transformers/task_summary) | 🤗 Transformers 支援的任務 |
+| [預處理教學](https://huggingface.co/docs/transformers/preprocessing) | 使用 `Tokenizer` 來為模型準備資料 |
+| [訓練和微調](https://huggingface.co/docs/transformers/training) | 使用 PyTorch/TensorFlow 的內建的訓練方式或於 `Trainer` API 中使用 🤗 Transformers 提供的模型 |
+| [快速上手：微調和範例腳本](https://github.com/huggingface/transformers/tree/main/examples) | 為各種任務提供的範例腳本 |
+| [模型分享和上傳](https://huggingface.co/docs/transformers/model_sharing) | 上傳並與社群分享你微調的模型 |
+| [遷移](https://huggingface.co/docs/transformers/migration) | 從 `pytorch-transformers` 或 `pytorch-pretrained-bert` 遷移到 🤗 Transformers |

 ## 引用

-現在我們有一篇可供您引用的關於 🤗 Transformers 函式庫的 [論文](https://www.aclweb.org/anthology/2020.emnlp-demos.6/)：
+我們已將此函式庫的[論文](https://www.aclweb.org/anthology/2020.emnlp-demos.6/)正式發表。如果你使用了 🤗 Transformers 函式庫，可以引用：
 ```bibtex
@inproceedings{wolf-etal-2020-transformers,
    title = "Transformers: State-of-the-Art Natural Language Processing",
@ -324,4 +285,4 @@ Transformers 不僅僅是一個使用預訓練模型的工具包，它還是一
    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
    pages = "38--45"
 }
-```
+```
--- a/setup.py
+++ b/setup.py
@ -137,8 +137,8 @@ _deps = [
    "psutil",
    "pyyaml>=5.1",
    "pydantic>=2",
-    "pytest>=7.2.0,<9.0.0",
-    "pytest-asyncio>=1.2.0",
+    "pytest>=7.2.0",
+    "pytest-asyncio",
    "pytest-rerunfailures<16.0",
    "pytest-timeout",
    "pytest-xdist",
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@ -723,7 +723,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):

            if self.mask_replace_prob < 1:
                warnings.warn(
-                    "Random token replacement is not supported with whole word masking. "
+                    "Random token replacement is not supported with whole word masking.",
                    "Setting mask_replace_prob to 1.",
                )
                self.mask_replace_prob = 1
--- a/src/transformers/data/datasets/glue.py
+++ b/src/transformers/data/datasets/glue.py
@ -82,7 +82,7 @@ class GlueDataset(Dataset):
        cache_dir: Optional[str] = None,
    ):
        warnings.warn(
-            "This dataset will be removed from the library soon, preprocessing should be handled with the Hugging Face Datasets "
+            "This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
            "library. You can have a look at this example script for pointers: "
            "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py",
            FutureWarning,
--- a/src/transformers/data/metrics/init.py
+++ b/src/transformers/data/metrics/init.py
@ -21,7 +21,7 @@ if is_sklearn_available():


 DEPRECATION_WARNING = (
-    "This metric will be removed from the library soon, metrics should be handled with the Hugging Face Evaluate "
+    "This metric will be removed from the library soon, metrics should be handled with the 🤗 Evaluate "
    "library. You can have a look at this example script for pointers: "
    "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
 )
--- a/src/transformers/data/processors/glue.py
+++ b/src/transformers/data/processors/glue.py
@ -28,7 +28,7 @@ from .utils import DataProcessor, InputExample, InputFeatures
 logger = logging.get_logger(__name__)

 DEPRECATION_WARNING = (
-    "This {0} will be removed from the library soon, preprocessing should be handled with the Hugging Face Datasets "
+    "This {0} will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
    "library. You can have a look at this example script for pointers: "
    "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
 )
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@ -47,8 +47,8 @@ deps = {
    "psutil": "psutil",
    "pyyaml": "pyyaml>=5.1",
    "pydantic": "pydantic>=2",
-    "pytest": "pytest>=7.2.0,<9.0.0",
-    "pytest-asyncio": "pytest-asyncio>=1.2.0",
+    "pytest": "pytest>=7.2.0",
+    "pytest-asyncio": "pytest-asyncio",
    "pytest-rerunfailures": "pytest-rerunfailures<16.0",
    "pytest-timeout": "pytest-timeout",
    "pytest-xdist": "pytest-xdist",
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@ -39,7 +39,6 @@ from .utils import (
    is_torch_dtype,
    logging,
    requires_backends,
-    safe_load_json_file,
 )
 from .utils.hub import cached_file

@ -428,42 +427,35 @@ class FeatureExtractionMixin(PushToHubMixin):
            feature_extractor_file = os.path.join(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME)
        if os.path.isfile(pretrained_model_name_or_path):
            resolved_feature_extractor_file = pretrained_model_name_or_path
-            resolved_processor_file = None
            is_local = True
        elif is_remote_url(pretrained_model_name_or_path):
            feature_extractor_file = pretrained_model_name_or_path
-            resolved_processor_file = None
            resolved_feature_extractor_file = download_url(pretrained_model_name_or_path)
        else:
            feature_extractor_file = FEATURE_EXTRACTOR_NAME
            try:
                # Load from local folder or from cache or download from model Hub and cache
-                resolved_processor_file = cached_file(
-                    pretrained_model_name_or_path,
-                    filename=PROCESSOR_NAME,
-                    cache_dir=cache_dir,
-                    force_download=force_download,
-                    proxies=proxies,
-                    local_files_only=local_files_only,
-                    token=token,
-                    user_agent=user_agent,
-                    revision=revision,
-                    subfolder=subfolder,
-                    _raise_exceptions_for_missing_entries=False,
-                )
-                resolved_feature_extractor_file = cached_file(
-                    pretrained_model_name_or_path,
-                    filename=feature_extractor_file,
-                    cache_dir=cache_dir,
-                    force_download=force_download,
-                    proxies=proxies,
-                    local_files_only=local_files_only,
-                    token=token,
-                    user_agent=user_agent,
-                    revision=revision,
-                    subfolder=subfolder,
-                    _raise_exceptions_for_missing_entries=False,
-                )
+                resolved_feature_extractor_files = [
+                    resolved_file
+                    for filename in [feature_extractor_file, PROCESSOR_NAME]
+                    if (
+                        resolved_file := cached_file(
+                            pretrained_model_name_or_path,
+                            filename=filename,
+                            cache_dir=cache_dir,
+                            force_download=force_download,
+                            proxies=proxies,
+                            local_files_only=local_files_only,
+                            subfolder=subfolder,
+                            token=token,
+                            user_agent=user_agent,
+                            revision=revision,
+                            _raise_exceptions_for_missing_entries=False,
+                        )
+                    )
+                    is not None
+                ]
+                resolved_feature_extractor_file = resolved_feature_extractor_files[0]
            except OSError:
                # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
                # the original exception.
@ -477,24 +469,19 @@ class FeatureExtractionMixin(PushToHubMixin):
                    f" directory containing a {FEATURE_EXTRACTOR_NAME} file"
                )

-        # Load feature_extractor dict. Priority goes as (nested config if found -> image processor config)
-        # We are downloading both configs because almost all models have a `processor_config.json` but
-        # not all of these are nested. We need to check if it was saved recebtly as nested or if it is legacy style
-        feature_extractor_dict = None
-        if resolved_processor_file is not None:
-            processor_dict = safe_load_json_file(resolved_processor_file)
-            if "feature_extractor" in processor_dict or "audio_processor" in processor_dict:
-                feature_extractor_dict = processor_dict.get("feature_extractor", processor_dict.get("audio_processor"))
+        try:
+            # Load feature_extractor dict
+            with open(resolved_feature_extractor_file, encoding="utf-8") as reader:
+                text = reader.read()
+            feature_extractor_dict = json.loads(text)
+            if "audio_processor" in feature_extractor_dict:
+                feature_extractor_dict = feature_extractor_dict["audio_processor"]
+            else:
+                feature_extractor_dict = feature_extractor_dict.get("feature_extractor", feature_extractor_dict)

-        if resolved_feature_extractor_file is not None and feature_extractor_dict is None:
-            feature_extractor_dict = safe_load_json_file(resolved_feature_extractor_file)
-
-        if feature_extractor_dict is None:
+        except json.JSONDecodeError:
            raise OSError(
-                f"Can't load feature extractor for '{pretrained_model_name_or_path}'. If you were trying to load"
-                " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
-                f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
-                f" directory containing a {feature_extractor_file} file"
+                f"It looks like the config file at '{resolved_feature_extractor_file}' is not a valid JSON file."
            )

        if is_local:
--- a/src/transformers/generation/continuous_batching/cache.py
+++ b/src/transformers/generation/continuous_batching/cache.py
@ -189,9 +189,7 @@ class PagedAttentionCache:
        num_blocks, max_batch_tokens = memory_handler.infer_num_blocks_and_max_batch_tokens(
            num_blocks=getattr(generation_config, "num_blocks", None),
            max_batch_tokens=getattr(generation_config, "max_batch_tokens", None),
-            max_memory_percent=getattr(
-                generation_config, "max_memory", 0.8
-            ),  # FIXME: it seems we overcommit memory, was changed from 0.9 which caused OOMs in our benchmarking CI
+            max_memory_percent=getattr(generation_config, "max_memory", 0.9),
            cache_dtype=self.dtype,
        )

@ -416,7 +414,7 @@ class PagedAttentionMemoryHandler:
        self,
        num_blocks: Optional[int] = None,
        max_batch_tokens: Optional[int] = None,
-        max_memory_percent: float = 0.8,  # FIXME: it seems we overcommit memory, was changed from 0.9 which caused OOMs in our benchmarking CI
+        max_memory_percent: float = 0.9,
        cache_dtype: torch.dtype = torch.float16,
    ) -> tuple[int, int]:
        """Determine optimal number of blocks and maximum number of tokens per batch based on available memory and
@ -456,7 +454,7 @@ class PagedAttentionMemoryHandler:

    def compute_num_blocks_and_max_batch_tokens(
        self,
-        max_memory_percent: float,
+        max_memory_percent: float = 0.9,
        cache_dtype: torch.dtype = torch.float16,
        m: float = 0.01,
    ) -> tuple[int, int]:
@ -505,7 +503,7 @@ class PagedAttentionMemoryHandler:
    def compute_max_batch_tokens(
        self,
        num_blocks: int,
-        max_memory_percent: float,
+        max_memory_percent: float = 0.9,
        cache_dtype: torch.dtype = torch.float16,
    ) -> int:
        """Calculate maximum batch tokens M given a fixed number of cache blocks. The formula for M is given by:
@ -533,7 +531,7 @@ class PagedAttentionMemoryHandler:
    def compute_num_blocks(
        self,
        max_batch_tokens: int,
-        max_memory_percent: float,
+        max_memory_percent: float = 0.9,
        cache_dtype: torch.dtype = torch.float16,
    ) -> int:
        """Calculate number of cache blocks N given a fixed maximum token per token M. The formula for N is given by:
--- a/src/transformers/generation/continuous_batching/continuous_api.py
+++ b/src/transformers/generation/continuous_batching/continuous_api.py
@ -826,8 +826,6 @@ class ContinuousBatchingManager:
        if block:
            self.join(stop_trigger_time, timeout)

-        self.batch_processor = None
-
    def join(self, stop_trigger_time: float, timeout: Optional[float] = None) -> None:
        """Wait for the background thread to finish.

--- a/src/transformers/generation/continuous_batching/requests.py
+++ b/src/transformers/generation/continuous_batching/requests.py
@ -19,7 +19,6 @@ from typing import Optional

 import torch

-from ...utils import is_torch_xpu_available
 from ...utils.logging import logging
 from ...utils.metrics import traced

@ -36,13 +35,6 @@ def get_device_and_memory_breakdown() -> tuple[torch.device, int, int, int]:
        total_memory = torch.cuda.get_device_properties(device).total_memory
        reserved_memory = torch.cuda.memory_reserved(device)
        allocated_memory = torch.cuda.memory_allocated(device)
-    elif is_torch_xpu_available():
-        device = torch.device("xpu")
-        torch.xpu.empty_cache()
-        torch.xpu.synchronize()
-        total_memory = torch.xpu.get_device_properties(device).total_memory
-        reserved_memory = torch.xpu.memory_reserved(device)
-        allocated_memory = torch.xpu.memory_allocated(device)
    elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
        device = torch.device("mps")
        # MPS memory reporting (PyTorch 2.0+)
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@ -608,7 +608,7 @@ class GenerationMixin(ContinuousMixin):
        use_cache = kwargs.get("use_cache")
        if use_cache is None:
            use_cache = getattr(self.config, "use_cache", False)
-        if past_key_values is not None or use_cache:
+        if past_key_values is None or use_cache:
            # TODO (joao): handle the case where cache length == input_ids length. The function below results in an
            # exception because we get empty input_ids after slicing. In essence, we need to roll back the cache 1
            # token to recompute the logits for the first token to be generated (but not all caches support roll backs)
@ -2170,7 +2170,7 @@ class GenerationMixin(ContinuousMixin):
            return False

        # Base logic
-        valid_hardware = self.device.type in ["cuda", "xpu"] or bool(
+        valid_hardware = self.device.type == "cuda" or bool(
            generation_config.compile_config is not None and generation_config.compile_config._compile_all_devices
        )
        using_compilable_cache = (
--- a/src/transformers/image_processing_base.py
+++ b/src/transformers/image_processing_base.py
@ -32,7 +32,6 @@ from .utils import (
    is_offline_mode,
    is_remote_url,
    logging,
-    safe_load_json_file,
 )
 from .utils.hub import cached_file

@ -281,41 +280,35 @@ class ImageProcessingMixin(PushToHubMixin):
            image_processor_file = os.path.join(pretrained_model_name_or_path, image_processor_filename)
        if os.path.isfile(pretrained_model_name_or_path):
            resolved_image_processor_file = pretrained_model_name_or_path
-            resolved_processor_file = None
            is_local = True
        elif is_remote_url(pretrained_model_name_or_path):
            image_processor_file = pretrained_model_name_or_path
-            resolved_processor_file = None
            resolved_image_processor_file = download_url(pretrained_model_name_or_path)
        else:
            image_processor_file = image_processor_filename
            try:
-                resolved_processor_file = cached_file(
-                    pretrained_model_name_or_path,
-                    filename=PROCESSOR_NAME,
-                    cache_dir=cache_dir,
-                    force_download=force_download,
-                    proxies=proxies,
-                    local_files_only=local_files_only,
-                    token=token,
-                    user_agent=user_agent,
-                    revision=revision,
-                    subfolder=subfolder,
-                    _raise_exceptions_for_missing_entries=False,
-                )
-                resolved_image_processor_file = cached_file(
-                    pretrained_model_name_or_path,
-                    filename=image_processor_file,
-                    cache_dir=cache_dir,
-                    force_download=force_download,
-                    proxies=proxies,
-                    local_files_only=local_files_only,
-                    token=token,
-                    user_agent=user_agent,
-                    revision=revision,
-                    subfolder=subfolder,
-                    _raise_exceptions_for_missing_entries=False,
-                )
+                # Load from local folder or from cache or download from model Hub and cache
+                resolved_image_processor_files = [
+                    resolved_file
+                    for filename in [image_processor_file, PROCESSOR_NAME]
+                    if (
+                        resolved_file := cached_file(
+                            pretrained_model_name_or_path,
+                            filename=filename,
+                            cache_dir=cache_dir,
+                            force_download=force_download,
+                            proxies=proxies,
+                            local_files_only=local_files_only,
+                            token=token,
+                            user_agent=user_agent,
+                            revision=revision,
+                            subfolder=subfolder,
+                            _raise_exceptions_for_missing_entries=False,
+                        )
+                    )
+                    is not None
+                ]
+                resolved_image_processor_file = resolved_image_processor_files[0]
            except OSError:
                # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
                # the original exception.
@ -329,24 +322,16 @@ class ImageProcessingMixin(PushToHubMixin):
                    f" directory containing a {image_processor_filename} file"
                )

-        # Load image_processor dict. Priority goes as (nested config if found -> image processor config)
-        # We are downloading both configs because almost all models have a `processor_config.json` but
-        # not all of these are nested. We need to check if it was saved recebtly as nested or if it is legacy style
-        image_processor_dict = None
-        if resolved_processor_file is not None:
-            processor_dict = safe_load_json_file(resolved_processor_file)
-            if "image_processor" in processor_dict:
-                image_processor_dict = processor_dict["image_processor"]
+        try:
+            # Load image_processor dict
+            with open(resolved_image_processor_file, encoding="utf-8") as reader:
+                text = reader.read()
+            image_processor_dict = json.loads(text)
+            image_processor_dict = image_processor_dict.get("image_processor", image_processor_dict)

-        if resolved_image_processor_file is not None and image_processor_dict is None:
-            image_processor_dict = safe_load_json_file(resolved_image_processor_file)
-
-        if image_processor_dict is None:
+        except json.JSONDecodeError:
            raise OSError(
-                f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load"
-                " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
-                f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
-                f" directory containing a {image_processor_filename} file"
+                f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file."
            )

        if is_local:
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@ -821,26 +821,14 @@ def split_to_tiles(images: "torch.Tensor", num_tiles_height: int, num_tiles_widt
    return image


+def _cast_tensor_to_float(x):
+    if x.is_floating_point():
+        return x
+    return x.float()
+
+
 def _group_images_by_shape(nested_images, *paired_inputs, is_nested: bool = False):
-    """
-    Helper function to flatten a single level of nested image and batch structures and group by shape.
-    Args:
-        nested_images (list):
-            A list of images or a single tensor
-        paired_inputs (Any, *optional*):
-            Zero or more lists that mirror the structure of `nested_images` (flat list, or list of lists when
-            `is_nested=True`). Each element is paired 1:1 with the corresponding image so it can be grouped by the
-            same shape key. These paired values are grouped alongside `nested_images` but are not stacked in the output, so
-            they do not need to be tensors.
-        is_nested (bool, *optional*, defaults to False):
-            Whether the images are nested.
-    Returns:
-        tuple[dict, ...]:
-            - A dictionary with shape as key and list of images with that shape as value
-            - A dictionary with shape as key and list of paired values with that shape as value
-            - A dictionary mapping original indices to (shape, index) tuples
-            - A dictionary mapping original indices to (shape, index) tuples for each paired input
-    """
+    """Helper function to flatten a single level of nested image and batch structures and group by shape."""
    grouped_images = defaultdict(list)
    grouped_images_index = {}
    paired_grouped_values = [defaultdict(list) for _ in paired_inputs]
@ -892,20 +880,27 @@ def _reconstruct_nested_structure(indices, processed_images):
    return result


-def _iterate_items(items, is_nested: bool):
-    """
-    Helper function to iterate over items yielding (key, item) pairs.
+def _disable_grouping_output_nested(images, *paired_inputs):
+    """Build the disable_grouping output tuple for a single-level nested structure."""
+    outer_range = range(len(images))
+    inner_ranges = [range(len(images[i])) for i in outer_range]

-    For nested structures, yields ((row_index, col_index), item).
-    For flat structures, yields (index, item).
-    """
-    if is_nested:
-        for i, row in enumerate(items):
-            for j, item in enumerate(row):
-                yield (i, j), item
-    else:
-        for i, item in enumerate(items):
-            yield i, item
+    # Precompute all (i, j) pairs
+    ij_pairs = [(i, j) for i in outer_range for j in inner_ranges[i]]
+
+    images_dict = {(i, j): images[i][j].unsqueeze(0) for (i, j) in ij_pairs}
+    paired_dicts = [{(i, j): paired_list[i][j].unsqueeze(0) for (i, j) in ij_pairs} for paired_list in paired_inputs]
+    index_map = {(i, j): ((i, j), 0) for (i, j) in ij_pairs}
+    return images_dict, *paired_dicts, index_map
+
+
+def _disable_grouping_output_flat(images, *paired_inputs):
+    """Build the disable_grouping output tuple for a flat list structure."""
+    idx_range = range(len(images))
+    images_dict = {i: images[i].unsqueeze(0) for i in idx_range}
+    paired_dicts = [{i: paired_list[i].unsqueeze(0) for i in idx_range} for paired_list in paired_inputs]
+    index_map = {i: (i, 0) for i in idx_range}
+    return images_dict, *paired_dicts, index_map


 def group_images_by_shape(
@ -925,7 +920,7 @@ def group_images_by_shape(
    Args:
        images (Union[list["torch.Tensor"], "torch.Tensor"]):
            A list of images or a single tensor
-        paired_inputs (Any, *optional*):
+        *paired_inputs (Any):
            Zero or more lists that mirror the structure of `images` (flat list, or list of lists when
            `is_nested=True`). Each element is paired 1:1 with the corresponding image so it can be grouped by the
            same shape key. These paired values are grouped alongside `images` but are not stacked in the output, so
@ -949,14 +944,10 @@ def group_images_by_shape(
        disable_grouping = device == "cpu"

    if disable_grouping:
-        return (
-            {key: img.unsqueeze(0) for key, img in _iterate_items(images, is_nested)},
-            *[
-                {key: item.unsqueeze(0) for key, item in _iterate_items(paired_list, is_nested)}
-                for paired_list in paired_inputs
-            ],
-            {key: (key, 0) for key, _ in _iterate_items(images, is_nested)},
-        )
+        if is_nested:
+            return _disable_grouping_output_nested(images, *paired_inputs)
+        else:
+            return _disable_grouping_output_flat(images, *paired_inputs)

    # Handle single level nested structure
    grouped_images, *paired_grouped_values, grouped_images_index = _group_images_by_shape(
@ -999,3 +990,14 @@ def reorder_images(
        ]

    return _reconstruct_nested_structure(grouped_images_index, processed_images)
+
+
+class NumpyToTensor:
+    """
+    Convert a numpy array to a PyTorch tensor.
+    """
+
+    def __call__(self, image: np.ndarray):
+        # Same as in PyTorch, we assume incoming numpy images are in HWC format
+        # c.f. https://github.com/pytorch/vision/blob/61d97f41bc209e1407dcfbd685d2ee2da9c1cdad/torchvision/transforms/functional.py#L154
+        return torch.from_numpy(image.transpose(2, 0, 1)).contiguous()
--- a/src/transformers/integrations/executorch.py
+++ b/src/transformers/integrations/executorch.py
@ -11,6 +11,7 @@
 # specific language governing permissions and limitations under the License.

 import logging
+from collections.abc import Callable
 from typing import Optional

 import torch
@ -23,7 +24,13 @@ from ..cache_utils import (
    StaticCache,
 )
 from ..generation.configuration_utils import GenerationConfig
-from ..modeling_utils import PreTrainedModel
+from ..masking_utils import (
+    ALL_MASK_ATTENTION_FUNCTIONS,
+    _ignore_causal_mask_sdpa,
+    _is_torch_greater_or_equal_than_2_5,
+    prepare_padding_mask,
+)
+from ..modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ..pytorch_utils import (
    is_torch_greater_or_equal,
    is_torch_greater_or_equal_than_2_3,
@ -222,6 +229,10 @@ class TorchExportableModuleForDecoderOnlyLM(torch.nn.Module):
                "Using `StaticCache` for export as `layer_types` is not specified or `sliding_window` is `null` in the config."
            )
            self.model = TorchExportableModuleWithStaticCache(model, batch_size, max_cache_len, device)
+        # This is the same as sdpa, but mask creation does not use `vmap` which is not exportable
+        ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", sdpa_mask_without_vmap)
+        ALL_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", ALL_ATTENTION_FUNCTIONS["sdpa"])
+        self.model.model.config._attn_implementation = "sdpa_without_vmap"

    def forward(
        self,
@ -757,6 +768,11 @@ def convert_and_export_with_cache(

    import torch.export._trace

+    # This is the same as sdpa, but mask creation does not use `vmap` which is not exportable
+    ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", sdpa_mask_without_vmap)
+    ALL_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", ALL_ATTENTION_FUNCTIONS["sdpa"])
+    model.config._attn_implementation = "sdpa_without_vmap"
+
    with torch.no_grad():
        # TODO: The default inputs only work for text models. We need to add support for vision/audio models.
        example_input_ids = (
@ -1020,6 +1036,11 @@ def export_with_dynamic_cache(
    if not is_torch_greater_or_equal_than_2_3:
        raise ImportError("torch >= 2.3 is required.")

+    # This is the same as sdpa, but mask creation does not use `vmap` which is not exportable
+    ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", sdpa_mask_without_vmap)
+    ALL_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", ALL_ATTENTION_FUNCTIONS["sdpa"])
+    model.config._attn_implementation = "sdpa_without_vmap"
+
    register_dynamic_cache_export_support()

    with torch.no_grad():
@ -1088,3 +1109,92 @@ def _unflatten_dynamic_cache(values, context: torch.utils._pytree.Context):
        value = value_list[idx] if idx < len(value_list) else None
        cache.update(key, value, idx)
    return cache
+
+
+def sdpa_mask_without_vmap(
+    batch_size: int,
+    cache_position: torch.Tensor,
+    kv_length: int,
+    kv_offset: int = 0,
+    mask_function: Optional[Callable] = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    local_size: Optional[int] = None,
+    allow_is_causal_skip: bool = True,
+    allow_torch_fix: bool = True,
+    **kwargs,
+) -> Optional[torch.Tensor]:
+    """
+    Create a 4D boolean mask of shape `(batch_size, 1, query_length, kv_length)` where a value of True indicates that
+    the element should take part in the attention computation, and False that it should not.
+
+    This is similar to `masking_utils.sdpa_mask` but does not use `vmap` which is incompatible with export.
+
+    Args:
+        batch_size (`int`):
+            The batch size of the input sequence.
+        cache_position (`torch.Tensor`):
+            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
+        kv_length (`int`):
+            The size that the key and value states will have during the attention computation.
+        kv_offset (`int`, optional):
+            An optional offset to indicate at which first position the key and values states will refer to.
+        mask_function (`Callable`):
+            The mask factory function describing the mask pattern.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
+        local_size (`int`, optional):
+            The size of the local attention, if we do not use full attention. This is used only if `allow_is_causal_skip=True`
+            to try to skip mask creation if possible.
+        allow_is_causal_skip (`bool`, optional):
+            Whether to allow to return `None` for the mask under conditions where we can use the `is_causal` argument in
+            `torch.sdpa` instead. Default to `True`.
+        allow_torch_fix (`bool`, optional):
+            Whether to update the mask in case a query is not attending to any tokens, to solve a bug in torch's older
+            versions. We need an arg to skip it when using eager. By default `True`.
+
+    """
+
+    q_length = cache_position.shape[0]
+    # Potentially pad the 2D mask, and slice it correctly
+    padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset)
+
+    #  Under specific conditions, we can avoid materializing the mask, instead relying on the `is_causal` argument
+    if allow_is_causal_skip and _ignore_causal_mask_sdpa(padding_mask, q_length, kv_length, local_size):
+        return None
+
+    # Similar to `kv_arange = torch.arange(start=kv_offset, end=kv_offset + kv_length, device=cache_position.device)`
+    # but without data-dependent slicing (i.e. torch.compile friendly)
+    kv_arange = torch.arange(kv_length, device=cache_position.device)
+    kv_arange += kv_offset
+    reshaped_cache_position = cache_position.view(-1, 1)
+
+    # This is a bit hacky to know what pattern we are using, but all mask creation function actually forward
+    # the config through kwargs anyway, so it allows to rely on it
+    # Usually, the `mask_function` is the only entry-point to define the pattern - we could do for loops over it,
+    # but this is more efficient
+    sliding_window = getattr(kwargs["config"], "sliding_window", None)
+    chunk_size = getattr(kwargs["config"], "attention_chunk_size", None)
+
+    if sliding_window is not None and chunk_size is not None:
+        raise ValueError("Cannot use both `sliding_window` and `attention_chunk_size`")
+
+    # Simplest and most efficient way to obtain a causal mask
+    causal_mask = kv_arange <= reshaped_cache_position
+    # If using sliding window, add the sliding mask
+    if sliding_window is not None:
+        sliding_mask_overlay = kv_arange > reshaped_cache_position - sliding_window
+        causal_mask *= sliding_mask_overlay
+    # If using chunk attention, add the chunked mask
+    elif chunk_size is not None:
+        chunked_mask_overlay = kv_arange // chunk_size == reshaped_cache_position // chunk_size
+        causal_mask *= chunked_mask_overlay
+
+    causal_mask = causal_mask[None, None, :, :].expand(batch_size, -1, -1, -1)
+    if padding_mask is not None:
+        causal_mask = causal_mask * padding_mask[:, None, None, :]
+
+    # Due to a bug in some older torch version, we need to update the mask in case a query is not attending to any
+    # tokens (due to padding). See details in https://github.com/pytorch/pytorch/issues/110213
+    if not _is_torch_greater_or_equal_than_2_5 and allow_torch_fix:
+        causal_mask |= torch.all(~causal_mask, dim=-1, keepdim=True)
+    return causal_mask
--- a/src/transformers/integrations/hub_kernels.py
+++ b/src/transformers/integrations/hub_kernels.py
@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
 import re
 from collections.abc import Callable
 from functools import partial
@ -19,7 +18,7 @@ from types import ModuleType
 from typing import Optional, Union

 from ..modeling_flash_attention_utils import lazy_import_flash_attention
-from ..utils import ENV_VARS_TRUE_VALUES, logging
+from ..utils import logging
 from ..utils.import_utils import is_kernels_available
 from .flash_attention import flash_attention_forward

@ -34,22 +33,10 @@ try:
        get_kernel,
        register_kernel_mapping,
        replace_kernel_forward_from_hub,
+        use_kernel_forward_from_hub,
    )

-    _TRANSFORMERS_USE_HUB_KERNELS = os.environ.get("USE_HUB_KERNELS", "YES").upper()
    _kernels_available = True
-    _kernels_enabled = _TRANSFORMERS_USE_HUB_KERNELS in ENV_VARS_TRUE_VALUES
-
-    def use_kernel_forward_from_hub(layer_name: str):
-        if _kernels_enabled:
-            from kernels import use_kernel_forward_from_hub as _kernels_use_kernel_forward_from_hub
-
-            return _kernels_use_kernel_forward_from_hub(layer_name)
-        else:
-            logger.warning_once(
-                f"kernels hub usage is disabled through the environment USE_HUB_KERNELS={_TRANSFORMERS_USE_HUB_KERNELS}"
-            )
-            return lambda cls: cls

    _KERNEL_MAPPING: dict[str, dict[Union[Device, str], LayerRepository]] = {
        "MultiScaleDeformableAttention": {
@ -84,12 +71,6 @@ try:
                    layer_name="RMSNorm",
                )
            },
-            "npu": {
-                Mode.INFERENCE: LayerRepository(
-                    repo_id="kernels-community/liger_kernels",
-                    layer_name="LigerRMSNorm",
-                )
-            },
        },
        "MLP": {
            "cuda": LayerRepository(
@ -180,7 +161,6 @@ try:

 except ImportError:
    _kernels_available = False
-    _kernels_enabled = False

    # Stub to make decorators int transformers work when `kernels`
    # is not installed.
--- a/src/transformers/integrations/integration_utils.py
+++ b/src/transformers/integrations/integration_utils.py
@ -38,7 +38,7 @@ from transformers.utils.import_utils import _is_package_available


 if os.getenv("WANDB_MODE") == "offline":
-    print("[INFO] Running in WANDB offline mode")
+    print("⚙️  Running in WANDB offline mode")

 from .. import PreTrainedModel, TrainingArguments
 from .. import __version__ as version
--- a/src/transformers/integrations/mxfp4.py
+++ b/src/transformers/integrations/mxfp4.py
@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from ..utils import is_accelerate_available, is_torch_available, is_torch_xpu_available, logging
+from ..utils import is_accelerate_available, is_torch_available, logging


 if is_torch_available():
@ -114,9 +114,6 @@ def convert_moe_packed_tensors(
    if not blocks.is_cuda and torch.cuda.is_available():
        blocks = blocks.cuda()
        scales = scales.cuda()
-    elif (blocks.device.type != "xpu") and is_torch_xpu_available():
-        blocks = blocks.to("xpu")
-        scales = scales.to("xpu")

    scales = scales.to(torch.int32) - 127  # TODO that's because 128=2**7

@ -354,8 +351,6 @@ def dequantize(module, param_name, param_value, target_device, dq_param_name, **
                dequantized = convert_moe_packed_tensors(getattr(module, blocks_attr), getattr(module, scales_attr))
                if target_device == "cpu" and torch.cuda.is_available():
                    torch.cuda.empty_cache()
-                elif target_device == "cpu" and is_torch_xpu_available():
-                    torch.xpu.empty_cache()
                setattr(module, proj, torch.nn.Parameter(dequantized.to(target_device)))
                delattr(module, blocks_attr)
                delattr(module, scales_attr)
@ -400,7 +395,7 @@ def load_and_swizzle_mxfp4(module, param_name, param_value, target_device, trito
        else:
            blocks = blocks.reshape(local_experts, -1, module.intermediate_size // 2)
        if getattr(target_device, "type", target_device) == "cpu":
-            target_device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
+            target_device = "cuda"
        blocks = blocks.to(target_device).contiguous()
        scales = scales.to(target_device).contiguous()
        with on_device(target_device):
--- a/src/transformers/integrations/sdpa_attention.py
+++ b/src/transformers/integrations/sdpa_attention.py
@ -63,6 +63,9 @@ def sdpa_attention_forward(
        else:
            sdpa_kwargs = {"enable_gqa": True}

+    if attention_mask is not None and attention_mask.ndim == 4:
+        attention_mask = attention_mask[:, :, :, : key.shape[-2]]
+
    # Instead of relying on the value set in the module directly, we use the is_causal passed in kwargs if it is presented
    is_causal = is_causal if is_causal is not None else getattr(module, "is_causal", True)

--- a/src/transformers/masking_utils.py
+++ b/src/transformers/masking_utils.py
@ -82,10 +82,8 @@ def causal_mask_function(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int)
 def bidirectional_mask_function(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
    """
    This creates a full bidirectional mask.
-
-    NOTE: It is important to keep an index-based version for non-vmap expansion.
    """
-    return q_idx >= 0
+    return q_idx.new_ones((), dtype=torch.bool)


 def sliding_window_overlay(sliding_window: int) -> Callable:
@ -112,6 +110,18 @@ def chunked_overlay(chunk_size: int, left_padding: torch.Tensor) -> Callable:
    return inner_mask


+def _legacy_chunked_overlay(chunk_size: int) -> Callable:
+    """
+    Same as the above function, but do not correctly account for left padding tokens.
+    Only kept for compatibility with older torch versions (< 2.6).
+    """
+
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        return kv_idx // chunk_size == q_idx // chunk_size
+
+    return inner_mask
+
+
 def sliding_window_causal_mask_function(sliding_window: int) -> Callable:
    """
    This return the mask_function function to create a sliding window mask.
@ -123,6 +133,8 @@ def chunked_causal_mask_function(chunk_size: int, left_padding: torch.Tensor) ->
    """
    This return the mask_function function to create a chunked attention mask.
    """
+    if not _is_torch_greater_or_equal_than_2_6:
+        return and_masks(_legacy_chunked_overlay(chunk_size), causal_mask_function)
    return and_masks(chunked_overlay(chunk_size, left_padding), causal_mask_function)


@ -163,56 +175,55 @@ def add_offsets_to_mask_function(mask_function: Callable, q_offset: int, kv_offs
    return inner_mask


+def _vmap_for_bhqkv(mask_function: Callable, bh_indices: bool = True) -> Callable:
+    """
+    Used to vmap our mask_functions over the q_idx and kv_idx dimensions of the inputs. Optionally, vmap over
+    the batch and head indices as well if `bh_indices=True`.
+    Using vmap here allows us to keep the performance of vectorized ops, while having a single set of primitive
+    functions between attention interfaces (i.e. between flex and sdpa/eager, FA2 being a bit different).
+
+    Args:
+        mask_function (`Callable`):
+            The mask_function to vmap.
+        bh_indices (`bool`, optional):
+            Whether to vmap over the batch and head indices as well, or only q and kv indices.
+
+    Returns:
+        Callable: The vmapped function.
+    """
+    # We vmap the function 2 times, broadcasting the [q_idx, kv_idx] dimensions
+    dimensions = [(None, None, None, 0), (None, None, 0, None)]
+    if bh_indices:
+        # We extend broadcasting over the [batch_idx, head_idx] dimensions
+        dimensions.extend([(None, 0, None, None), (0, None, None, None)])
+
+    for dims in dimensions:
+        mask_function = torch.vmap(mask_function, in_dims=dims, out_dims=0)
+    return mask_function
+
+
 def prepare_padding_mask(
-    attention_mask: Optional[torch.Tensor], kv_length: int, kv_offset: int
+    attention_mask: Optional[torch.Tensor], kv_length: int, kv_offset: int, _slice: bool = True
 ) -> Optional[torch.Tensor]:
    """
-    From the 2D attention mask, prepare the correct padding mask to use by potentially padding it.
+    From the 2D attention mask, prepare the correct padding mask to use by potentially padding it, and slicing
+    according to the `kv_offset` if `_slice` is `True`.
    """
    local_padding_mask = attention_mask
    if attention_mask is not None:
        # Pad it if necessary
        if (padding_length := kv_length + kv_offset - attention_mask.shape[-1]) > 0:
            local_padding_mask = torch.nn.functional.pad(attention_mask, (0, padding_length))
+        # For flex, we should not slice them, only use an offset
+        if _slice:
+            # Equivalent to: `local_padding_mask = attention_mask[:, kv_offset : kv_offset + kv_length]`,
+            # but without data-dependent slicing (i.e. torch.compile friendly)
+            mask_indices = torch.arange(kv_length, device=local_padding_mask.device)
+            mask_indices += kv_offset
+            local_padding_mask = local_padding_mask[:, mask_indices]
    return local_padding_mask


-def _can_skip_causal_mask_xpu(
-    padding_mask: Optional[torch.Tensor],
-    query_length: int,
-    kv_length: int,
-    local_attention_size: Optional[int],
-) -> bool:
-    """
-    XPU-specific logic for determining if we can skip causal mask creation.
-
-    For XPU devices, we have special handling:
-    - Single query tokens (query_length == 1) use the same logic as CUDA
-    - Multi-query tokens can skip if padding_mask is provided and correctly structured
-      The mask must have all True values in the query window and all False after
-    """
-
-    if is_tracing(padding_mask):
-        return False
-
-    # Check local attention constraint (same as CUDA)
-    if local_attention_size is not None and kv_length >= local_attention_size:
-        return False
-
-    if padding_mask is None:
-        # Without padding mask, can skip if single query token or full causal attention
-        return query_length == 1 or kv_length == query_length
-
-    # XPU allows skipping under additional conditions when padding_mask is provided
-    if query_length == 1:
-        # Single query token: skip only if no padding tokens present
-        return padding_mask.all()
-
-    # XPU-specific: check if query window is all True and rest is all False
-    # This allows XPU to optimize the 1st token in static cache
-    return padding_mask[:, :query_length].all() and not padding_mask[:, query_length:].any()
-
-
 def _ignore_causal_mask_sdpa(
    padding_mask: Optional[torch.Tensor],
    query_length: int,
@ -233,12 +244,6 @@ def _ignore_causal_mask_sdpa(
        mask_indices += kv_offset
        padding_mask = padding_mask[:, mask_indices]

-    if _is_torch_xpu_available:
-        # XPU devices have special handling for mask skipping:
-        # - Single query tokens use the same logic as CUDA
-        # - Multi-query tokens can skip if padding_mask is provided and correctly structured
-        #   (all True in query window, all False after)
-        return _can_skip_causal_mask_xpu(padding_mask, query_length, kv_length, local_attention_size)
    # When using `torch.export` or `torch.onnx.dynamo_export`, we must pass an example input, and `is_causal` behavior is
    # hard-coded to the forward. If a user exports a model with query_length > 1, the exported model will hard-code `is_causal=True`
    # which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108). Thus, we only set
@ -246,11 +251,18 @@ def _ignore_causal_mask_sdpa(
    if (
        not is_tracing(padding_mask)
        # only cases when lower and upper diags are the same, see https://github.com/pytorch/pytorch/issues/108108
-        and (query_length == 1 or kv_length == query_length)
+        and (query_length == 1 or (kv_length == query_length or _is_torch_xpu_available))
        # in this case we need to add special patterns to the mask so cannot be skipped otherwise
        and (local_attention_size is None or kv_length < local_attention_size)
        # In this case, we need to add padding to the mask, so cannot be skipped otherwise
-        and (padding_mask is None or padding_mask.all())
+        and (
+            padding_mask is None
+            or (
+                padding_mask.all()
+                if not _is_torch_xpu_available or query_length == 1
+                else padding_mask[:, :query_length].all()
+            )
+        )
    ):
        return True

@ -270,39 +282,7 @@ def _ignore_bidirectional_mask_sdpa(padding_mask: Optional[torch.Tensor]) -> boo
    return False


-def _vmap_expansion_sdpa(mask_function: Callable) -> Callable:
-    """
-    Used to vmap our mask_functions over the all 4 dimensions (b_idx, h_idx, q_idx, kv_idx) of the inputs.
-    Using vmap here allows us to keep the performance of vectorized ops, while having a single set of primitive
-    functions between attention interfaces (i.e. between flex and sdpa/eager, FA2 being a bit different).
-    """
-    # We vmap the function over all 4 dimensions, broadcasting [b_idx, h_idx, q_idx, kv_idx]
-    dimensions = [(None, None, None, 0), (None, None, 0, None), (None, 0, None, None), (0, None, None, None)]
-    for dims in dimensions:
-        mask_function = torch.vmap(mask_function, in_dims=dims, out_dims=0)
-    return mask_function
-
-
-def _non_vmap_expansion_sdpa(
-    batch_indices: torch.Tensor, head_indices: torch.Tensor, q_indices: torch.Tensor, kv_indices: torch.Tensor
-):
-    """
-    Used to broadcast our mask_functions over the all 4 dimensions (b_idx, h_idx, q_idx, kv_idx) of the inputs.
-    Allows the usage of any index-based mask function without relying on vmap.
-
-    NOTE: This is limited to index based functions only and is not guaranteed to work otherwise.
-
-    Reference:
-        - https://github.com/huggingface/optimum-onnx/blob/c123e8f4fab61b54a8e0e31ce74462bcacca576e/optimum/exporters/onnx/model_patcher.py#L362-L365
-    """
-    batch_indices = batch_indices[:, None, None, None]
-    head_indices = head_indices[None, :, None, None]
-    q_indices = q_indices[None, None, :, None]
-    kv_indices = kv_indices[None, None, None, :]
-    return batch_indices, head_indices, q_indices, kv_indices
-
-
-def sdpa_mask(
+def sdpa_mask_recent_torch(
    batch_size: int,
    cache_position: torch.Tensor,
    kv_length: int,
@ -312,8 +292,6 @@ def sdpa_mask(
    local_size: Optional[int] = None,
    allow_is_causal_skip: bool = True,
    allow_is_bidirectional_skip: bool = False,
-    allow_torch_fix: bool = True,
-    use_vmap: bool = False,
    **kwargs,
 ) -> Optional[torch.Tensor]:
    """
@ -346,12 +324,6 @@ def sdpa_mask(
        allow_is_bidirectional_skip (`bool`, optional):
            Whether to allow to return `None` for the mask under conditions where we do not have to add any bias,
            i.e. full attention without any padding. Default to `False`.
-        allow_torch_fix (`bool`, optional):
-            Whether to update the mask in case a query is not attending to any tokens, to solve a bug in torch's older
-            versions. We need an arg to skip it when using eager. By default `True`.
-        use_vmap (`bool`, optional):
-            Whether to use `vmap` during the mask construction or not. Allows powerful custom patterns that may not be
-            index-based (for the cost of speed performance). By default `False`.


    ## Creating a simple causal mask:
@ -419,8 +391,97 @@ def sdpa_mask(

    """
    q_length = cache_position.shape[0]
+    # Potentially pad the 2D mask, and slice it correctly
+    padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset, _slice=False)

-    # Potentially pad the 2D mask
+    # Under specific conditions, we can avoid materializing the mask
+    #   1. Causal masks can rely on the `is_causal` argument
+    #   2. Bidirectional do not need any further processing (no bias)
+    if allow_is_causal_skip and _ignore_causal_mask_sdpa(padding_mask, q_length, kv_length, kv_offset, local_size):
+        return None
+    if allow_is_bidirectional_skip and _ignore_bidirectional_mask_sdpa(padding_mask):
+        return None
+
+    # vmap can incur performance issues as reported in #41566 for bidirectional mask as we only need to expand the
+    # padding mask. Thus, we allow early exit here if we do not detect any modification to the base mask function
+    if mask_function is bidirectional_mask_function:
+        if padding_mask is not None:
+            # used for slicing without data-dependent slicing
+            mask_indices = torch.arange(kv_length, device=cache_position.device) + kv_offset
+            return padding_mask[:, None, None, mask_indices].expand(-1, -1, q_length, -1)
+        else:
+            return torch.ones(batch_size, 1, q_length, kv_length, dtype=torch.bool, device=cache_position.device)
+
+    # Similar to `kv_arange = torch.arange(start=kv_offset, end=kv_offset + kv_length, device=cache_position.device)`
+    # but without data-dependent slicing (i.e. torch.compile friendly)
+    kv_arange = torch.arange(kv_length, device=cache_position.device)
+    kv_arange += kv_offset
+
+    # Potentially add the padding 2D mask
+    if padding_mask is not None:
+        mask_function = and_masks(mask_function, padding_mask_function(padding_mask))
+
+    batch_arange = torch.arange(batch_size, device=cache_position.device)
+    head_arange = torch.arange(1, device=cache_position.device)
+    # This creates the 4D mask easily. Note that we need this context manager as vmap cannot handle slicing a tensor from
+    # scalar tensor (it internally calls `.item()` which vmap does not allow, but this context works around it
+    # We don't need to add an offset to the mask_function either, as we vmap directly the correct indices for k and kv indices
+    with TransformGetItemToIndex():
+        causal_mask = _vmap_for_bhqkv(mask_function)(batch_arange, head_arange, cache_position, kv_arange)
+
+    return causal_mask
+
+
+def sdpa_mask_older_torch(
+    batch_size: int,
+    cache_position: torch.Tensor,
+    kv_length: int,
+    kv_offset: int = 0,
+    mask_function: Callable = causal_mask_function,
+    attention_mask: Optional[torch.Tensor] = None,
+    local_size: Optional[int] = None,
+    allow_is_causal_skip: bool = True,
+    allow_torch_fix: bool = True,
+    allow_is_bidirectional_skip: bool = False,
+    **kwargs,
+) -> Optional[torch.Tensor]:
+    """
+    NOTE: This function is only used when torch version is torch<2.5 - see `sdpa_mask_recent_torch` otherwise.
+
+    Create a 4D boolean mask of shape `(batch_size, 1, query_length, kv_length)` where a value of True indicates that
+    the element should take part in the attention computation, and False that it should not.
+    If `allow_torch_fix=True` (the default), rows corresponding to query tokens that do not attend
+    to any other tokens (due to padding) will be fully attended to instead, in order to avoid `nan` propagation (this does
+    not change the final result).
+
+    Args:
+        batch_size (`int`):
+            The batch size of the input sequence.
+        cache_position (`torch.Tensor`):
+            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
+        kv_length (`int`):
+            The size that the key and value states will have during the attention computation.
+        kv_offset (`int`, optional):
+            An optional offset to indicate at which first position the key and values states will refer to.
+        mask_function (`Callable`):
+            The mask factory function describing the mask pattern.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
+        local_size (`int`, optional):
+            The size of the local attention, if we do not use full attention. This is used only if `allow_is_causal_skip=True`
+            to try to skip mask creation if possible.
+        allow_is_causal_skip (`bool`, optional):
+            Whether to allow to return `None` for the mask under conditions where we can use the `is_causal` argument in
+            `torch.sdpa` instead. Default to `True`.
+        allow_torch_fix (`bool`, optional):
+            Whether to update the mask in case a query is not attending to any tokens, to solve a bug in torch's older
+            versions. We need an arg to skip it when using eager. By default `True`.
+        allow_is_bidirectional_skip (`bool`, optional):
+            Whether to allow to return `None` for the mask under conditions where we do not have to add any bias,
+            i.e. full attention without any padding. Default to `False`.
+    """
+    q_length = cache_position.shape[0]
+    # Potentially pad the 2D mask, and slice it correctly
    padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset)

    # Under specific conditions, we can avoid materializing the mask
@ -431,45 +492,38 @@ def sdpa_mask(
    if allow_is_bidirectional_skip and _ignore_bidirectional_mask_sdpa(padding_mask):
        return None

-    # Potentially add the padding 2D mask
-    if padding_mask is not None:
-        mask_function = and_masks(mask_function, padding_mask_function(padding_mask))
+    # vmap can incur performance issues as reported in #41566 for bidirectional mask as we only need to expand the
+    # padding mask. Thus, we allow early exit here if we do not detect any modification to the base mask function
+    if mask_function is bidirectional_mask_function:
+        if padding_mask is not None:
+            return padding_mask[:, None, None, :].expand(-1, -1, q_length, -1)
+        else:
+            return torch.ones(batch_size, 1, q_length, kv_length, dtype=torch.bool, device=cache_position.device)

-    batch_arange = torch.arange(batch_size, device=cache_position.device)
-    head_arange = torch.arange(1, device=cache_position.device)
    # Similar to `kv_arange = torch.arange(start=kv_offset, end=kv_offset + kv_length, device=cache_position.device)`
    # but without data-dependent slicing (i.e. torch.compile friendly)
-    kv_arange = torch.arange(kv_length, device=cache_position.device) + kv_offset
+    kv_arange = torch.arange(kv_length, device=cache_position.device)
+    kv_arange += kv_offset

-    # Actual mask creation
-    # Option 1: Fast non-vmap mask creation (default)
-    if not use_vmap:
-        # Apply mask function element-wise through broadcasting
-        attention_mask = mask_function(*_non_vmap_expansion_sdpa(batch_arange, head_arange, cache_position, kv_arange))
-        # Expand the mask to match batch size and query length if they weren't used in the mask function
-        attention_mask = attention_mask.expand(batch_size, -1, q_length, kv_length)
-
-    # Option 2: Vmap mask creation (torch>=2.6 and custom patterns)
-    elif _is_torch_greater_or_equal_than_2_6:
-        # This creates the 4D mask easily. Note that we need this context manager as vmap cannot handle slicing a tensor from
-        # scalar tensor (it internally calls `.item()` which vmap does not allow, but this context works around it
-        # We don't need to add an offset to the mask_function either, as we vmap directly the correct indices for k and kv indices
-        with TransformGetItemToIndex():
-            attention_mask = _vmap_expansion_sdpa(mask_function)(batch_arange, head_arange, cache_position, kv_arange)
-
-    # Option 3: Error out since it indicates that the user did something custom, which they shouldn't have (torch<2.6)
-    else:
-        raise ValueError(
-            "The vmap functionality for mask creation is only supported from torch>=2.6. "
-            "Please update your torch version or use `use_vmap=False` with index-based masks."
-        )
+    # This creates the 4D mask easily. Note that we do not include vmap over the batch_idx dimension as well,
+    # as vmap cannot handle slicing a tensor from scalar tensor (it internally calls `.item()` which vmap does not allow
+    # However, in more recent version of Pytorch, a trick was introduced to handle it - which is the reason we have
+    # `sdpa_mask_recent_torch`, as it allows more general `mask_function`
+    causal_mask = _vmap_for_bhqkv(mask_function, bh_indices=False)(None, None, cache_position, kv_arange)
+    causal_mask = causal_mask[None, None, :, :].expand(batch_size, -1, -1, -1)
+    if padding_mask is not None:
+        causal_mask = causal_mask * padding_mask[:, None, None, :]

    # Due to a bug in versions of torch<2.5, we need to update the mask in case a query is not attending to any
    # tokens (due to padding). See details in https://github.com/pytorch/pytorch/issues/110213
    if not _is_torch_greater_or_equal_than_2_5 and allow_torch_fix:
-        attention_mask = attention_mask | torch.all(~attention_mask, dim=-1, keepdim=True)
+        causal_mask |= torch.all(~causal_mask, dim=-1, keepdim=True)
+    return causal_mask

-    return attention_mask
+
+# We use the version with newer torch whenever possible, as it is more general and can handle arbitrary mask functions
+# (especially mask_function indexing a tensor, such as the padding mask function)
+sdpa_mask = sdpa_mask_recent_torch if _is_torch_greater_or_equal_than_2_6 else sdpa_mask_older_torch


 def eager_mask(
@ -480,7 +534,6 @@ def eager_mask(
    mask_function: Callable = causal_mask_function,
    attention_mask: Optional[torch.Tensor] = None,
    dtype: torch.dtype = torch.float32,
-    use_vmap: bool = False,
    **kwargs,
 ) -> torch.Tensor:
    """
@ -503,14 +556,10 @@ def eager_mask(
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
        dtype (`torch.dtype`, optional):
            The dtype to use for the mask. By default, `torch.float32`.
-        use_vmap (`bool`, optional):
-            Whether to use `vmap` during the mask construction or not. Allows powerful custom patterns that may not be
-            index-based (for the cost of speed performance). By default `False`.
    """
    # The masks for eager attention are simply boolean mask from sdpa, casted to 0 and -inf
    _ = kwargs.pop("allow_is_causal_skip", None)
    _ = kwargs.pop("allow_is_bidirectional_skip", None)
-    _ = kwargs.pop("allow_torch_fix", None)
    mask = sdpa_mask(
        batch_size=batch_size,
        cache_position=cache_position,
@ -521,7 +570,6 @@ def eager_mask(
        allow_is_causal_skip=False,
        allow_is_bidirectional_skip=False,
        allow_torch_fix=False,
-        use_vmap=use_vmap,
        **kwargs,
    )
    min_dtype = torch.finfo(dtype).min
@ -607,7 +655,7 @@ def flex_attention_mask(
        if not _is_torch_greater_or_equal_than_2_6 and pad_len > 0:
            attention_mask = torch.nn.functional.pad(attention_mask, value=0, pad=(0, pad_len))

-        padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset)
+        padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset, _slice=False)
        mask_function = and_masks(mask_function, padding_mask_function(padding_mask))

    # Add the offsets on top (because flex interface only allows length, not start and end indices)
@ -733,19 +781,9 @@ def _preprocess_mask_arguments(
    # If using a cache, it can give all information about mask sizes based on seen tokens
    if past_key_values is not None:
        kv_length, kv_offset = past_key_values.get_mask_sizes(cache_position, layer_idx)
-    # Otherwise, we infer based on our input
+    # Otherwise, the sizes are simply the input sizes
    else:
-        # 1. Rely on input directly
-        if attention_mask is None:
-            kv_length, kv_offset = input_embeds.shape[1], 0
-        # 2. Rely on the mask instead - needed for special cases like prefix tuning in PEFT
-        #
-        # This is a very unique and special case where an encoder utilizes a cache and expects its length
-        # to be accounted for (usually, they should never use a cache). In general, the mask should always
-        # match with the input sizes nonetheless (i.e. it does not affect others).
-        # Conclusion: "prefix tuning is evil"
-        else:
-            kv_length, kv_offset = attention_mask.shape[-1], 0
+        kv_length, kv_offset = input_embeds.shape[1], 0

    # We check the position_ids for potential packed sequence format (only if the 2D attention mask is explicitly None,
    # and we don't have past_key_values, i.e. generally a training setup)
@ -813,11 +851,6 @@ def create_causal_mask(
    mask_factory_function = causal_mask_function
    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]

-    # Defaulting to using non-vmap based mask creations except when detecting
-    # users passing custom mask functions (as we cannot guarantee that they
-    # are properly index-based as required by our implementation).
-    use_vmap = False
-
    # Do not allow skip if we are compiling (this is to match BC)
    # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
    if _is_torch_xpu_available:
@ -834,16 +867,14 @@ def create_causal_mask(
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
        allow_is_causal_skip = False
-        use_vmap = True
    if and_mask_function is not None:
        if not _is_torch_greater_or_equal_than_2_6:
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
        allow_is_causal_skip = False
-        use_vmap = True

    # If we detected packing format
-    if packed_sequence_mask is not None:
+    if packed_sequence_mask is not None and _is_torch_greater_or_equal_than_2_6:
        mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
        allow_is_causal_skip = False

@ -858,7 +889,6 @@ def create_causal_mask(
        allow_is_causal_skip=allow_is_causal_skip,  # additional kwarg for sdpa
        dtype=dtype,  # Additional kwarg for eager
        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
-        use_vmap=use_vmap,  # Short-circuit to non-vmap expansions for the mask
    )
    return causal_mask

@ -912,10 +942,6 @@ def create_bidirectional_mask(

    # Allow skipping the mask creation except we have additional masking operators (and/or masks)
    allow_is_bidirectional_skip = True
-    # Defaulting to using non-vmap based mask creations except when detecting
-    # users passing custom mask functions (as we cannot guarantee that they
-    # are properly index-based as required by our implementation).
-    use_vmap = False

    # Allow slight deviations from the base mask
    # Note that it is very important to apply this before any other deviations of the mask (such as packed sequence mask,
@ -925,13 +951,11 @@ def create_bidirectional_mask(
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
        allow_is_bidirectional_skip = False
-        use_vmap = True
    if and_mask_function is not None:
        if not _is_torch_greater_or_equal_than_2_6:
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
        allow_is_bidirectional_skip = False
-        use_vmap = True

    # We now create the mask
    attention_mask = mask_interface(
@ -946,7 +970,6 @@ def create_bidirectional_mask(
        allow_is_bidirectional_skip=allow_is_bidirectional_skip,
        dtype=dtype,  # Additional kwarg for eager
        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
-        use_vmap=use_vmap,  # Short-circuit to non-vmap expansions for the mask
    )
    return attention_mask

@ -1009,10 +1032,6 @@ def create_sliding_window_causal_mask(
    mask_factory_function = sliding_window_causal_mask_function(sliding_window)
    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]

-    # Defaulting to using non-vmap based mask creations except when detecting
-    # users passing custom mask functions (as we cannot guarantee that they
-    # are properly index-based as required by our implementation).
-    use_vmap = False
    # Do not allow skip if we are compiling (this is to match BC)
    # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
    allow_is_causal_skip = not getattr(past_key_values, "is_compileable", False)
@ -1025,16 +1044,14 @@ def create_sliding_window_causal_mask(
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
        allow_is_causal_skip = False
-        use_vmap = True
    if and_mask_function is not None:
        if not _is_torch_greater_or_equal_than_2_6:
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
        allow_is_causal_skip = False
-        use_vmap = True

    # If we detected packing format
-    if packed_sequence_mask is not None:
+    if packed_sequence_mask is not None and _is_torch_greater_or_equal_than_2_6:
        mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
        allow_is_causal_skip = False

@ -1050,7 +1067,6 @@ def create_sliding_window_causal_mask(
        local_size=sliding_window,  # Additional kwarg for sdpa
        dtype=dtype,  # Additional kwarg for eager
        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
-        use_vmap=use_vmap,  # Short-circuit to non-vmap expansions for the mask
    )
    return causal_mask

@ -1124,13 +1140,20 @@ def create_chunked_causal_mask(
        left_padding_tokens = (attention_mask.cumsum(dim=-1) == torch.zeros_like(attention_mask)).sum(dim=-1)
    else:
        left_padding_tokens = torch.zeros(batch_size, device=cache_position.device, dtype=int)
+    # Raise a warning for older versions if the problematic left-padding situation arises
+    if (
+        not _is_torch_greater_or_equal_than_2_6
+        and kv_length + kv_offset > chunk_size
+        and (left_padding_tokens > 0).any()
+    ):
+        logger.warning_once(
+            "Due to limitations of your current torch version, we cannot correctly account for the left-padding "
+            "when computing the chunked attention pattern. This will lead to a wrong attention mask for the padded "
+            "sequences. Behavior will be undefined. Please upgrade to `torch>=2.6` to solve this issue."
+        )
    mask_factory_function = chunked_causal_mask_function(chunk_size, left_padding_tokens)
    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]

-    # Defaulting to using non-vmap based mask creations except when detecting
-    # users passing custom mask functions (as we cannot guarantee that they
-    # are properly index-based as required by our implementation).
-    use_vmap = False
    # Do not allow skip if we are compiling (this is to match BC)
    # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
    allow_is_causal_skip = not getattr(past_key_values, "is_compileable", False)
@ -1143,16 +1166,14 @@ def create_chunked_causal_mask(
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
        allow_is_causal_skip = False
-        use_vmap = True
    if and_mask_function is not None:
        if not _is_torch_greater_or_equal_than_2_6:
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
        allow_is_causal_skip = False
-        use_vmap = True

    # If we detected packing format
-    if packed_sequence_mask is not None:
+    if packed_sequence_mask is not None and _is_torch_greater_or_equal_than_2_6:
        mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
        allow_is_causal_skip = False

@ -1168,7 +1189,6 @@ def create_chunked_causal_mask(
        local_size=chunk_size,  # Additional kwarg for sdpa
        dtype=dtype,  # Additional kwarg for eager
        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
-        use_vmap=use_vmap,  # Short-circuit to non-vmap expansions for the mask
    )
    return causal_mask

--- a/src/transformers/models/init.py
+++ b/src/transformers/models/init.py
@ -25,7 +25,6 @@ if TYPE_CHECKING:
    from .arcee import *
    from .aria import *
    from .audio_spectrogram_transformer import *
-    from .audioflamingo3 import *
    from .auto import *
    from .autoformer import *
    from .aya_vision import *
--- a/src/transformers/models/align/processing_align.py
+++ b/src/transformers/models/align/processing_align.py
@ -59,6 +59,9 @@ class AlignProcessor(ProcessorMixin):

    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "EfficientNetImageProcessor"
+    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
    valid_processor_kwargs = AlignProcessorKwargs

    def __init__(self, image_processor, tokenizer):
--- a/src/transformers/models/altclip/processing_altclip.py
+++ b/src/transformers/models/altclip/processing_altclip.py
@ -35,6 +35,10 @@ class AltCLIPProcessor(ProcessorMixin):
            The tokenizer is a required input.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast")
+    tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")
+
    @deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor")
    def __init__(self, image_processor=None, tokenizer=None):
        super().__init__(image_processor, tokenizer)
--- a/src/transformers/models/aria/modular_aria.py
+++ b/src/transformers/models/aria/modular_aria.py
@ -906,6 +906,10 @@ class AriaProcessor(ProcessorMixin):
            A dictionary indicating size conversions for images.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AriaImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
    def __init__(
        self,
        image_processor=None,
--- a/src/transformers/models/aria/processing_aria.py
+++ b/src/transformers/models/aria/processing_aria.py
@ -67,6 +67,10 @@ class AriaProcessor(ProcessorMixin):
            A dictionary indicating size conversions for images.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AriaImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
    def __init__(
        self,
        image_processor=None,
--- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py
+++ b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py
@ -272,9 +272,7 @@ if __name__ == "__main__":
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )
    parser.add_argument(
-        "--push_to_hub",
-        action="store_true",
-        help="Whether or not to push the converted model to the Hugging Face hub.",
+        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
    )

    args = parser.parse_args()
--- a/src/transformers/models/audioflamingo3/init.py
+++ b/src/transformers/models/audioflamingo3/init.py
@ -1,31 +0,0 @@
-# coding=utf-8
-# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
-# reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import TYPE_CHECKING
-
-from ...utils import _LazyModule
-from ...utils.import_utils import define_import_structure
-
-
-if TYPE_CHECKING:
-    from .configuration_audioflamingo3 import *
-    from .modeling_audioflamingo3 import *
-    from .processing_audioflamingo3 import *
-else:
-    import sys
-
-    _file = globals()["__file__"]
-    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
--- a/src/transformers/models/audioflamingo3/configuration_audioflamingo3.py
+++ b/src/transformers/models/audioflamingo3/configuration_audioflamingo3.py
@ -1,210 +0,0 @@
-# coding=utf-8
-# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
-# reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
-from ..auto import CONFIG_MAPPING, AutoConfig
-
-
-logger = logging.get_logger(__name__)
-
-
-class AudioFlamingo3EncoderConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of an [`AudioFlamingo3Encoder`]. It is used to instantiate an
-    AudioFlamingo3 audio encoder according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the audio encoder of the AudioFlamingo3
-    architecture.
-
-    e.g. [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf)
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    Args:
-        num_mel_bins (`int`, *optional*, defaults to 128):
-            Number of mel features used per input features. Should correspond to the value used in the
-            `AudioFlamingo3Processor` class.
-        num_hidden_layers (`int`, *optional*, defaults to 32):
-            Number of encoder layers.
-        num_attention_heads (`int`, *optional*, defaults to 20):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (`int`, *optional*, defaults to 5120):
-            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
-        layerdrop (`float`, *optional*, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556)
-            for more details.
-        activation_function (`str`, *optional*, defaults to `"gelu"`):
-            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"silu"` and `"gelu_new"` are supported.
-        hidden_size (`int`, *optional*, defaults to 1280):
-            Dimensionality of the layers.
-        dropout (`float`, *optional*, defaults to 0.0):
-            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-        activation_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for activations inside the fully connected layer.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        scale_embedding (`bool`, *optional*, defaults to `False`):
-            Scale embeddings by dividing by sqrt(hidden_size).
-        max_source_positions (`int`, *optional*, defaults to 1500):
-            The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
-
-    Example:
-
-    ```python
-    >>> from transformers import AudioFlamingo3EncoderConfig, AudioFlamingo3Encoder
-
-    >>> # Initializing an AudioFlamingo3EncoderConfig
-    >>> configuration = AudioFlamingo3EncoderConfig()
-
-    >>> # Initializing an AudioFlamingo3Encoder (with random weights)
-    >>> model = AudioFlamingo3Encoder(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-
-    model_type = "audioflamingo3_encoder"
-
-    attribute_map = {
-        "d_model": "hidden_size",
-        "encoder_layers": "num_hidden_layers",
-        "encoder_attention_heads": "num_attention_heads",
-        "encoder_ffn_dim": "intermediate_size",
-        "encoder_layerdrop": "layerdrop",
-    }
-
-    def __init__(
-        self,
-        num_mel_bins=128,
-        num_hidden_layers=32,
-        num_attention_heads=20,
-        intermediate_size=5120,
-        layerdrop=0.0,
-        activation_function="gelu",
-        hidden_size=1280,
-        dropout=0.0,
-        attention_dropout=0.0,
-        activation_dropout=0.0,
-        initializer_range=0.02,
-        scale_embedding=False,
-        max_source_positions=1500,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-
-        self.num_mel_bins = num_mel_bins
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.activation_dropout = activation_dropout
-        self.activation_function = activation_function
-        self.initializer_range = initializer_range
-        self.layerdrop = layerdrop
-        self.num_hidden_layers = num_hidden_layers
-        self.scale_embedding = scale_embedding
-        self.max_source_positions = max_source_positions
-
-
-class AudioFlamingo3Config(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of an [`AudioFlamingo3ForConditionalGeneration`]. It is used to instantiate an
-    AudioFlamingo3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
-    with the defaults will yield a similar configuration to that of the AudioFlamingo3.
-
-    e.g. [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf)
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    Args:
-        audio_config (`Union[AudioFlamingo3EncoderConfig, dict]`, *optional*, defaults to `AudioFlamingo3EncoderConfig`):
-            The config object or dictionary of the audio backbone.
-        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`):
-            The config object or dictionary of the text backbone.
-        audio_token_id (`int`, *optional*, defaults to 151669):
-            The audio token index to encode the audio prompt.
-        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
-            Activation function used in the projector.
-        projector_bias (`bool`, *optional*, defaults to `True`):
-            Whether to include bias terms in the projector.
-
-    Example:
-
-    ```python
-    >>> from transformers import AudioFlamingo3ForConditionalGeneration, AudioFlamingo3Config, AudioFlamingo3EncoderConfig, Qwen2Config
-
-    >>> # Initializing an AudioFlamingo3Encoder config
-    >>> audio_config = AudioFlamingo3EncoderConfig()
-
-    >>> # Initializing a Qwen2 config
-    >>> text_config = Qwen2Config()
-
-    >>> # Initializing an AudioFlamingo3 configuration
-    >>> configuration = AudioFlamingo3Config(audio_config, text_config)
-
-    >>> # Initializing a model from the audioflamingo3 style configuration
-    >>> model = AudioFlamingo3ForConditionalGeneration(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-
-    model_type = "audioflamingo3"
-    sub_configs = {
-        "audio_config": AudioFlamingo3EncoderConfig,
-        "text_config": AutoConfig,
-    }
-
-    def __init__(
-        self,
-        audio_config=None,
-        text_config=None,
-        audio_token_id=151669,
-        projector_hidden_act="gelu",
-        projector_bias=True,
-        **kwargs,
-    ):
-        self.audio_token_id = audio_token_id
-
-        if isinstance(audio_config, dict):
-            audio_config["model_type"] = audio_config.get("model_type", "audioflamingo3_encoder")
-            audio_config = CONFIG_MAPPING[audio_config["model_type"]](**audio_config)
-        elif audio_config is None:
-            audio_config = CONFIG_MAPPING["audioflamingo3_encoder"]()
-
-        self.audio_config = audio_config
-
-        if isinstance(text_config, dict):
-            text_config["model_type"] = text_config.get("model_type", "qwen2")
-            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
-        elif text_config is None:
-            text_config = CONFIG_MAPPING["qwen2"]()
-
-        self.text_config = text_config
-        self.projector_hidden_act = projector_hidden_act
-        self.projector_bias = projector_bias
-
-        super().__init__(**kwargs)
-
-
-__all__ = ["AudioFlamingo3Config", "AudioFlamingo3EncoderConfig"]
--- a/src/transformers/models/audioflamingo3/convert_audioflamingo3_to_hf.py
+++ b/src/transformers/models/audioflamingo3/convert_audioflamingo3_to_hf.py
@ -1,286 +0,0 @@
-# coding=utf-8
-# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
-# reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Convert AudioFlamingo3 checkpoints into a Hugging Face repository layout."""
-
-from __future__ import annotations
-
-import argparse
-import json
-import logging
-from collections import defaultdict
-from pathlib import Path
-from typing import Any
-
-import torch
-from safetensors.torch import safe_open
-
-from transformers import (
-    AudioFlamingo3Config,
-    AudioFlamingo3ForConditionalGeneration,
-    AudioFlamingo3Processor,
-    AutoTokenizer,
-    GenerationConfig,
-    Qwen2Config,
-    WhisperFeatureExtractor,
-)
-
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
-
-
-def _load_json(p: Path):
-    if not p.is_file():
-        raise FileNotFoundError(f"Missing JSON: {p}")
-    with p.open("r", encoding="utf-8") as f:
-        return json.load(f)
-
-
-def write_processor(src_root: Path, dst_root: Path):
-    llm_dir = src_root / "llm"
-
-    # fmt: off
-    tokenizer_chat_template = (
-        "{% if messages[0]['role'] != 'system' %}"
-            "{{ '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}"
-        "{% endif %}"
-        "{% for message in messages if message['content'] is not none %}"
-            "{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}"
-        "{% endfor %}"
-        "{% if add_generation_prompt %}"
-            "{{ '<|im_start|>assistant\\n' }}"
-        "{% endif %}"
-    )
-    # fmt: on
-
-    # fmt: off
-    processor_chat_template = (
-        "{% if messages[0]['role'] != 'system' %}"
-            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-        "{% endif %}"
-        "{% for m in messages if m['content'] is not none %}"
-            "<|im_start|>{{ m['role'] }}\n"
-            "{% if m['content'] is string %}"
-                "{{ m['content'] }}"
-            "{% else %}"
-                "{% set audio = namespace(found=False) %}"
-                "{% set text_buf = namespace(v='') %}"
-                "{% for c in m['content'] %}"
-                    "{% if c.get('type') == 'audio' or 'audio' in c %}"
-                        "{% set audio.found = True %}"
-                    "{% elif c.get('type') == 'text' or 'text' in c %}"
-                        "{% set text_buf.v = text_buf.v + c['text'] %}"
-                    "{% endif %}"
-                "{% endfor %}"
-                "{% if audio.found %}{{ '<sound>' }}{% endif %}{{ text_buf.v }}"
-            "{% endif %}"
-            "<|im_end|>\n"
-        "{% endfor %}"
-        "{% if add_generation_prompt %}"
-            "<|im_start|>assistant\n"
-        "{% endif %}"
-    )
-    # fmt: on
-
-    processor = AudioFlamingo3Processor(
-        feature_extractor=WhisperFeatureExtractor(feature_size=128, return_attention_mask=True),
-        tokenizer=AutoTokenizer.from_pretrained(str(llm_dir), chat_template=tokenizer_chat_template, use_fast=True),
-        chat_template=processor_chat_template,
-    )
-    processor.save_pretrained(str(dst_root))
-
-    logger.info("processor (tokenizer + preprocessor)")
-    return processor
-
-
-PREFIX_MAP = {
-    "llm": "language_model",
-    "sound_tower": "audio_tower",
-    "sound_mm_projector": "multi_modal_projector",
-}
-
-
-def _resolve_component_dir(dirpath: Path):
-    if not dirpath.is_dir():
-        return None
-    idx = dirpath / "model.safetensors.index.json"
-    mono = dirpath / "model.safetensors"
-    if idx.exists():
-        wm = _load_json(idx).get("weight_map") or {}
-        by_shard: dict[str, list[str]] = defaultdict(list)
-        for k, shard in wm.items():
-            by_shard[shard].append(k)
-        return ("sharded", dirpath, {k: sorted(v) for k, v in sorted(by_shard.items())})
-    if mono.exists():
-        return ("file", mono)
-    cands = sorted([x for x in dirpath.iterdir() if x.suffix == ".safetensors"])
-    return ("file", cands[0]) if len(cands) == 1 else None
-
-
-def merge_and_shard_weights(src_root: Path, dst_root: Path, processor: AudioFlamingo3Processor):
-    state: dict[str, Any] = {}
-    for tag in PREFIX_MAP.keys():
-        comp = _resolve_component_dir(src_root / tag)
-        if not comp:
-            continue
-
-        out_prefix = PREFIX_MAP.get(tag, tag)
-
-        if comp[0] == "file":
-            fp: Path = comp[1]
-            with safe_open(str(fp), framework="pt", device="cpu") as f:
-                for k in f.keys():
-                    if k == "__metadata__":
-                        continue
-                    state[f"{out_prefix}.{k}"] = f.get_tensor(k)
-        else:
-            base: Path = comp[1]
-            shard_map: dict[str, list[str]] = comp[2]
-            for shard, keys in shard_map.items():
-                sp = base / shard
-                with safe_open(str(sp), framework="pt", device="cpu") as f:
-                    for k in keys:
-                        state[f"{out_prefix}.{k}"] = f.get_tensor(k)
-
-    if not state:
-        raise FileNotFoundError("No tensors found in llm/, sound_tower/, or sound_mm_projector/.")
-
-    tok = processor.tokenizer
-
-    text_config = Qwen2Config(
-        bos_token_id=tok.bos_token_id,
-        eos_token_id=tok.eos_token_id,
-        pad_token_id=tok.pad_token_id,
-        vocab_size=len(tok),
-        hidden_size=3584,
-        intermediate_size=18944,
-        model_max_length=8192,
-        num_attention_heads=28,
-        num_hidden_layers=28,
-        num_key_value_heads=4,
-        rope_theta=1000000.0,
-        use_cache=False,
-    )
-    config = AudioFlamingo3Config(text_config=text_config, audio_token_id=tok.get_vocab()["<sound>"])
-    model = AudioFlamingo3ForConditionalGeneration(config).to(dtype=torch.bfloat16)
-
-    # Update state dict to new key names if necessary
-    projector_key_mapping = {
-        "multi_modal_projector.layers.0.weight": "multi_modal_projector.linear_1.weight",
-        "multi_modal_projector.layers.0.bias": "multi_modal_projector.linear_1.bias",
-        "multi_modal_projector.layers.2.weight": "multi_modal_projector.linear_2.weight",
-        "multi_modal_projector.layers.2.bias": "multi_modal_projector.linear_2.bias",
-    }
-    for old_key, new_key in projector_key_mapping.items():
-        if old_key in state:
-            state[new_key] = state.pop(old_key)
-
-    # Load weights into the instantiated model so we can push via `push_to_hub` later.
-    load_res = model.load_state_dict(state, strict=True)
-    # Enforce a clean load
-    if getattr(load_res, "missing_keys", None) and load_res.missing_keys:
-        mk = load_res.missing_keys
-        raise ValueError(f"Missing keys when loading: {mk[:10]}{' ...' if len(mk) > 10 else ''}")
-    if getattr(load_res, "unexpected_keys", None) and load_res.unexpected_keys:
-        uk = load_res.unexpected_keys
-        raise ValueError(f"Unexpected keys when loading: {uk[:10]}{' ...' if len(uk) > 10 else ''}")
-
-    generation_config = GenerationConfig(
-        bos_token_id=tok.bos_token_id,
-        eos_token_id=tok.eos_token_id,
-        pad_token_id=tok.pad_token_id,
-        max_new_tokens=2048,
-    )
-    model.generation_config = generation_config
-
-    model.save_pretrained(save_directory=str(dst_root))
-    logger.info("model.safetensors index and shards")
-    return model
-
-
-"""
-Reproducible Usage
-==================
-
-1) Download the original AudioFlamingo-3 weights from NVIDIA (requires Git LFS):
-
-```
-git lfs install
-git clone https://huggingface.co/nvidia/audio-flamingo-3
-```
-
-This will create a folder `audio-flamingo-3/` containing the original components:
-`llm/`, `sound_tower/`, and `sound_mm_projector/`.
-
-2) Convert to the Hugging Face Transformers format (locally):
-
-```
-python src/transformers/models/audioflamingo3/convert_audioflamingo3_to_hf.py \
-  --src_dir audio-flamingo-3 \
-  --dst_dir audio-flamingo-3-hf
-```
-
-3) Convert and push directly to the Hub (requires `huggingface-cli login` or `HF_TOKEN`):
-
-```
-python src/transformers/models/audioflamingo3/convert_audioflamingo3_to_hf.py \
-  --src_dir audio-flamingo-3 \
-  --dst_dir audio-flamingo-3-hf \
-  --push_to_hub <username-or-org>/audio-flamingo-3
-```
-
-This command uploads both the processor (tokenizer + feature extractor) and the converted
-model (sharded safetensors + configs) to the specified Hub repository.
-"""
-
-
-def main() -> None:
-    ap = argparse.ArgumentParser(description="Convert AudioFlamingo3 to Hugging Face format.")
-    ap.add_argument("--src_dir", required=True, help="Source model root directory")
-    ap.add_argument("--dst_dir", required=True, help="Destination directory for converted model")
-    ap.add_argument(
-        "--push_to_hub",
-        default=None,
-        type=str,
-        help=(
-            "Optional repository ID to push the converted assets to the Hugging Face Hub, "
-            "e.g. 'username/audio-flamingo-3'."
-        ),
-    )
-    args = ap.parse_args()
-
-    src_root = Path(args.src_dir).resolve()
-    if not src_root.is_dir():
-        raise FileNotFoundError(f"Source directory not found: {src_root}")
-
-    dst_root = Path(args.dst_dir).resolve()
-    if dst_root.exists():
-        raise FileExistsError(f"Destination already exists: {dst_root}")
-
-    processor = write_processor(src_root, dst_root)
-    model = merge_and_shard_weights(src_root, dst_root, processor)
-
-    # Optionally push converted assets using native push_to_hub only
-    if args.push_to_hub:
-        logger.info("Pushing processor to the Hub ...")
-        processor.push_to_hub(args.push_to_hub)
-        logger.info("Pushing model to the Hub ...")
-        model.push_to_hub(args.push_to_hub)
-
-
-if __name__ == "__main__":
-    main()
--- a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py
+++ b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py
@ -1,628 +0,0 @@
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-#           This file was automatically generated from src/transformers/models/audioflamingo3/modular_audioflamingo3.py.
-#               Do NOT edit this file manually as any edits will be overwritten by the generation of
-#             the file from the modular. If any change should be done, please apply the change to the
-#                          modular_audioflamingo3.py file directly. One of our CI enforces this.
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# coding=utf-8
-# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
-# reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-from collections.abc import Callable
-from typing import Optional, Union
-
-import torch
-from torch import nn
-
-from ...activations import ACT2FN
-from ...cache_utils import Cache, EncoderDecoderCache
-from ...generation import GenerationMixin
-from ...masking_utils import eager_mask, padding_mask_function
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
-from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import BaseModelOutput, CausalLMOutputWithPast
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
-from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
-from ..auto import AutoModel, AutoModelForCausalLM
-from .configuration_audioflamingo3 import AudioFlamingo3Config, AudioFlamingo3EncoderConfig
-
-
-logger = logging.get_logger(__name__)
-
-
-def eager_attention_forward(
-    module: nn.Module,
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    attention_mask: Optional[torch.Tensor],
-    scaling: Optional[float] = None,
-    dropout: float = 0.0,
-    **kwargs,
-):
-    if scaling is None:
-        scaling = query.size(-1) ** -0.5
-
-    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
-    if attention_mask is not None and attention_mask.ndim == 4:
-        attn_weights = attn_weights + attention_mask[:, :, :, : key.shape[-2]]
-
-    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-
-    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
-    attn_output = torch.matmul(attn_weights, value)
-    attn_output = attn_output.transpose(1, 2).contiguous()
-
-    return attn_output, attn_weights
-
-
-class AudioFlamingo3Attention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
-    def __init__(
-        self,
-        embed_dim: int,
-        num_heads: int,
-        dropout: float = 0.0,
-        is_decoder: bool = False,
-        bias: bool = True,
-        is_causal: bool = False,
-        layer_idx: Optional[int] = None,
-        config: Optional[AudioFlamingo3Config] = None,
-    ):
-        super().__init__()
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.dropout = dropout
-        self.head_dim = embed_dim // num_heads
-        self.config = config
-
-        if (self.head_dim * num_heads) != self.embed_dim:
-            raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
-                f" and `num_heads`: {num_heads})."
-            )
-        self.scaling = self.head_dim**-0.5
-        self.is_decoder = is_decoder
-        self.is_causal = is_causal
-
-        if layer_idx is None and is_decoder:
-            logger.warning_once(
-                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
-                "will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
-                "when creating this class."
-            )
-        self.layer_idx = layer_idx
-
-        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
-        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        key_value_states: Optional[torch.Tensor] = None,
-        past_key_values: Optional[Cache] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        output_attentions: bool = False,
-        cache_position: Optional[torch.Tensor] = None,
-        # TODO: we need a refactor so that the different attention modules can get their specific kwargs
-        # ATM, we have mixed things encoder, decoder, and encoder-decoder attn
-        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
-        """Input shape: Batch x Time x Channel"""
-
-        # if key_value_states are provided this layer is used as a cross-attention layer
-        # for the decoder
-        is_cross_attention = key_value_states is not None
-
-        # determine input shapes
-        bsz, tgt_len = hidden_states.shape[:-1]
-        q_input_shape = (bsz, tgt_len, -1, self.head_dim)
-
-        # Scaling is susceptible to floating point arithmetics' inprecisions
-        # which can lead to different results (this is dependent from model
-        # to model, e.g. audioflamingo3 is one such case). We therefore keep the
-        # original order of scaling to follow the original implementation
-        # and enforce no scaling (1.0) in the attention call below.
-        query_states = self.q_proj(hidden_states) * self.scaling
-        query_states = query_states.view(*q_input_shape)
-        query_states = query_states.transpose(1, 2).contiguous()
-
-        # Check is encoder-decoder model is being used. Otherwise we'll get `DynamicCache`
-        if past_key_values is not None and isinstance(past_key_values, EncoderDecoderCache):
-            is_updated = past_key_values.is_updated.get(self.layer_idx)
-            if is_cross_attention:
-                # after the first generated id, we can subsequently re-use all key/value_states from cache
-                past_key_values.is_updated[self.layer_idx] = True
-                past_key_values = past_key_values.cross_attention_cache
-            else:
-                past_key_values = past_key_values.self_attention_cache
-
-        # use key_value_states if cross attention
-        current_states = key_value_states if key_value_states is not None else hidden_states
-        if is_cross_attention and past_key_values and is_updated:
-            # reuse k,v, cross_attentions
-            key_states = past_key_values.layers[self.layer_idx].keys
-            value_states = past_key_values.layers[self.layer_idx].values
-        else:
-            key_states = self.k_proj(current_states).view(bsz, -1, self.num_heads, self.head_dim)
-            value_states = self.v_proj(current_states).view(bsz, -1, self.num_heads, self.head_dim)
-            key_states = key_states.transpose(1, 2).contiguous()
-            value_states = value_states.transpose(1, 2).contiguous()
-            if past_key_values is not None:
-                # save all key/value_states to cache to be re-used for fast auto-regressive generation
-                cache_position = cache_position if not is_cross_attention else None
-                key_states, value_states = past_key_values.update(
-                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
-                )
-
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
-
-        attn_output, attn_weights = attention_interface(
-            self,
-            query_states,
-            key_states,
-            value_states,
-            attention_mask,
-            dropout=0.0 if not self.training else self.dropout,
-            scaling=1.0,
-            output_attentions=output_attentions,
-            **kwargs,
-        )
-
-        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
-        attn_output = self.out_proj(attn_output)
-
-        return attn_output, attn_weights
-
-
-class AudioFlamingo3EncoderLayer(GradientCheckpointingLayer):
-    def __init__(self, config: AudioFlamingo3Config):
-        super().__init__()
-        self.embed_dim = config.d_model
-
-        self.self_attn = AudioFlamingo3Attention(
-            embed_dim=self.embed_dim,
-            num_heads=config.encoder_attention_heads,
-            dropout=config.attention_dropout,
-            config=config,
-        )
-        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.dropout = config.dropout
-        self.activation_fn = ACT2FN[config.activation_function]
-        self.activation_dropout = config.activation_dropout
-        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
-        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
-        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: torch.Tensor,
-        output_attentions: bool = False,
-    ) -> torch.Tensor:
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-        """
-        residual = hidden_states
-        hidden_states = self.self_attn_layer_norm(hidden_states)
-        hidden_states, attn_weights = self.self_attn(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            output_attentions=output_attentions,
-        )
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-
-        residual = hidden_states
-        hidden_states = self.final_layer_norm(hidden_states)
-        hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
-        hidden_states = self.fc2(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-
-        if hidden_states.dtype == torch.float16:
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
-            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
-
-        return hidden_states, attn_weights
-
-
-@auto_docstring
-class AudioFlamingo3PreTrainedModel(PreTrainedModel):
-    config: AudioFlamingo3Config
-    base_model_prefix = "model"
-    input_modalities = ["audio", "text"]
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["AudioFlamingo3Attention"]
-    _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn = True
-    _supports_sdpa = True
-
-    def _init_weights(self, module):
-        # important: this ported version of AudioFlamingo3 isn't meant for training from scratch - only
-        # inference and fine-tuning - so the proper init weights code has been removed
-        std = (
-            self.config.initializer_range
-            if hasattr(self.config, "initializer_range")
-            else self.config.audio_config.initializer_range
-        )
-
-        if isinstance(module, (nn.Linear, nn.Conv1d)):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.LayerNorm):
-            module.weight.data.fill_(1.0)
-            module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-
-
-@auto_docstring(
-    custom_intro="""
-    The audio model from AudioFlamingo3 without any head or projection on top.
-    """
-)
-class AudioFlamingo3Encoder(AudioFlamingo3PreTrainedModel):
-    """
-    AudioFlamingo3 encoder: Whisper encoder, average pool (time/2), then LayerNorm.
-    """
-
-    # Ignore copy
-    config: AudioFlamingo3EncoderConfig
-    main_input_name = "input_features"
-    input_modalities = "audio"
-    _no_split_modules = ["AudioFlamingo3EncoderLayer"]
-
-    def __init__(self, config: AudioFlamingo3EncoderConfig):
-        super().__init__(config)
-        self.dropout = config.dropout
-        self.layerdrop = config.encoder_layerdrop
-
-        embed_dim = config.d_model
-        self.num_mel_bins = config.num_mel_bins
-        self.padding_idx = config.pad_token_id
-        self.max_source_positions = config.max_source_positions
-        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
-
-        self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
-        self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)
-
-        self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
-        self.embed_positions.requires_grad_(False)
-
-        self.layers = nn.ModuleList([AudioFlamingo3EncoderLayer(config) for _ in range(config.encoder_layers)])
-        self.layer_norm = nn.LayerNorm(config.d_model)
-        # Ignore copy
-        self.avg_pooler = nn.AvgPool1d(2, stride=2)
-
-        self.gradient_checkpointing = False
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def _freeze_parameters(self):
-        for param in self.parameters():
-            param.requires_grad = False
-        self._requires_grad = False
-
-    def get_input_embeddings(self) -> nn.Module:
-        return self.conv1
-
-    def set_input_embeddings(self, value: nn.Module):
-        self.conv1 = value
-
-    @can_return_tuple
-    def forward(
-        self,
-        input_features: torch.Tensor,
-        input_features_mask: Optional[torch.Tensor] = None,
-    ):
-        r"""
-        Args:
-            input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
-                Log-Mel features extracted from raw audio. Use the processor/feature extractor to compute and pad
-                these features from waveform input.
-            input_features_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-        """
-
-        # Prepare attention mask for transformer layers
-        batch_size = input_features.shape[0]
-        seq_len = (input_features.shape[-1] - 1) // 2 + 1  # After conv2 downsampling
-
-        input_features_lengths = input_features_mask.sum(-1)
-        input_features_lengths = (input_features_lengths - 1) // 2 + 1  # conv2 downsampling
-        input_features_mask = torch.arange(seq_len, device=input_features.device) < input_features_lengths[:, None]
-        attention_mask = eager_mask(
-            batch_size=batch_size,
-            cache_position=torch.arange(seq_len, device=input_features.device),
-            kv_length=seq_len,
-            mask_function=padding_mask_function(input_features_mask),
-            dtype=self.conv1.weight.dtype,
-        )
-
-        # Conv front-end
-        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
-        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
-        inputs_embeds = inputs_embeds.permute(0, 2, 1)
-
-        # Add positions, dropout
-        hidden_states = inputs_embeds + self.embed_positions.weight
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-
-        # Transformer stack
-        for layer in self.layers:
-            drop = self.training and torch.rand([]) < self.layerdrop
-            if not drop:
-                hidden_states = layer(hidden_states, attention_mask)[0]
-
-        # AvgPool (time/2) + LayerNorm
-        hidden_states = hidden_states.permute(0, 2, 1)
-        hidden_states = self.avg_pooler(hidden_states).permute(0, 2, 1)
-        hidden_states = self.layer_norm(hidden_states)
-
-        return BaseModelOutput(
-            last_hidden_state=hidden_states,
-        )
-
-    # Ignore copy
-    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
-        """
-        Computes the output length of the convolutional layers and the output length of the audio encoder
-        """
-        input_lengths = (input_lengths - 1) // 2 + 1
-        output_lengths = (input_lengths - 2) // 2 + 1
-        return input_lengths, output_lengths
-
-
-class AudioFlamingo3MultiModalProjector(nn.Module):
-    """
-    Audio adaptor (small MLP) that projects AudioFlamingo3Encoder features
-    to the LLM embedding space so they can replace `<sound>` tokens.
-    """
-
-    def __init__(self, config: AudioFlamingo3Config):
-        super().__init__()
-        self.linear_1 = nn.Linear(
-            config.audio_config.hidden_size, config.text_config.hidden_size, bias=config.projector_bias
-        )
-        self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(
-            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.projector_bias
-        )
-
-    def forward(self, audio_features):
-        hidden_states = self.linear_1(audio_features)
-        hidden_states = self.act(hidden_states)
-        hidden_states = self.linear_2(hidden_states)
-        return hidden_states
-
-
-@auto_docstring(
-    custom_intro="""
-    The AudioFlamingo3 model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Qwen2 language model.
-    """
-)
-class AudioFlamingo3ForConditionalGeneration(AudioFlamingo3PreTrainedModel, GenerationMixin):
-    _tied_weights_keys = None
-    _tp_plan = None
-    _pp_plan = None
-    _keep_in_fp32_modules_strict = None
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.vocab_size = config.text_config.vocab_size
-        self.audio_tower = AutoModel.from_config(config.audio_config)
-        self.language_model = AutoModelForCausalLM.from_config(config.text_config)
-        self.multi_modal_projector = AudioFlamingo3MultiModalProjector(config)
-        # Similar to Qwen2Audio
-        if self.language_model._tied_weights_keys is not None:
-            self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys]
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.language_model.get_input_embeddings()
-
-    def set_input_embeddings(self, value):
-        self.language_model.set_input_embeddings(value)
-
-    def get_output_embeddings(self):
-        return self.language_model.get_output_embeddings()
-
-    def set_output_embeddings(self, new_embeddings):
-        self.language_model.set_output_embeddings(new_embeddings)
-
-    def set_decoder(self, decoder):
-        self.language_model.set_decoder(decoder)
-
-    def get_decoder(self):
-        return self.language_model.get_decoder()
-
-    def get_audio_features(
-        self, input_features: torch.FloatTensor, input_features_mask: torch.Tensor
-    ) -> torch.FloatTensor:
-        """
-        This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.
-        Args:
-            input_features (`torch.FloatTensor`):
-                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
-                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
-                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
-                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
-            input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
-                Mask to avoid performing attention on padded feature indices.
-
-        Returns:
-            `torch.FloatTensor`:
-                The audio embeddings.
-        """
-
-        # Encode audio
-        encoder_output = self.audio_tower(input_features, input_features_mask=input_features_mask)
-        audio_embeds = self.multi_modal_projector(encoder_output.last_hidden_state)
-
-        # Mask according to avg pooling (which is after attention blocks)
-        post_lengths = (input_features_mask.sum(-1) - 2) // 2 + 1
-        valid_mask = torch.arange(audio_embeds.shape[1], device=post_lengths.device)[None, :] < post_lengths[:, None]
-        audio_embeds = audio_embeds[valid_mask.to(audio_embeds.device)]
-        return audio_embeds
-
-    @can_return_tuple
-    @auto_docstring
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        input_features: Optional[torch.FloatTensor] = None,
-        input_features_mask: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Cache] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-        logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> CausalLMOutputWithPast:
-        r"""
-        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
-            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        Example:
-
-        ```python
-        >>> from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
-
-        >>> model_id = "nvidia/audio-flamingo-3-hf"
-        >>> processor = AutoProcessor.from_pretrained(model_id)
-        >>> model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
-
-        >>> conversations = [
-        >>>     [
-        >>>         {
-        >>>             "role": "user",
-        >>>             "content": [
-        >>>                 {"type": "text", "text": "Transcribe the input speech."},
-        >>>                 {
-        >>>                     "type": "audio",
-        >>>                     "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
-        >>>                 },
-        >>>             ],
-        >>>         }
-        >>>     ],
-        >>>     [
-        >>>         {
-        >>>             "role": "user",
-        >>>             "content": [
-        >>>                 {
-        >>>                     "type": "text",
-        >>>                     "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
-        >>>                 },
-        >>>                 {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
-        >>>             ],
-        >>>         }
-        >>>     ],
-        >>> ]
-
-        >>> inputs = processor.apply_chat_template(
-        >>>     conversations,
-        >>>     tokenize=True,
-        >>>     add_generation_prompt=True,
-        >>>     return_dict=True,
-        >>> ).to(model.device)
-
-        >>> outputs = model.generate(**inputs, max_new_tokens=500)
-
-        >>> decoded_outputs = processor.batch_decode(
-        >>>     outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
-        >>> )
-        >>> print(decoded_outputs)
-        ["The spoken content of the audio is...", "The track's calming and meditative feel can be attributed to..."]
-        ```"""
-
-        if inputs_embeds is None:
-            inputs_embeds = self.get_input_embeddings()(input_ids)
-
-        if input_features is not None and input_ids is not None:
-            audio_embeds = self.get_audio_features(input_features, input_features_mask)
-
-            # replace text-audio token placeholders with audio embeddings
-            audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
-            inputs_embeds = inputs_embeds.masked_scatter(
-                audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device)
-            )
-
-        outputs: CausalLMOutputWithPast = self.language_model(
-            inputs_embeds=inputs_embeds,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            labels=labels,
-            use_cache=use_cache,
-            cache_position=cache_position,
-            logits_to_keep=logits_to_keep,
-            **kwargs,
-        )
-        return outputs
-
-    def prepare_inputs_for_generation(self, *args, **kwargs):
-        # Overwritten -- we should not pass input_features when we are in cached decoding stage
-
-        input_features = kwargs.pop("input_features", None)
-        input_features_mask = kwargs.pop("input_features_mask", None)
-        cache_position = kwargs.get("cache_position")
-
-        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)
-
-        if cache_position is not None and cache_position[0] == 0:
-            # input_features should only be passed when we are not in cached decoding stage
-            if input_features is not None:
-                model_inputs["input_features"] = input_features
-            if input_features_mask is not None:
-                model_inputs["input_features_mask"] = input_features_mask
-
-        return model_inputs
-
-
-__all__ = ["AudioFlamingo3ForConditionalGeneration", "AudioFlamingo3PreTrainedModel", "AudioFlamingo3Encoder"]
--- a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py
+++ b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py
@ -1,307 +0,0 @@
-# coding=utf-8
-# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
-# reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Union
-
-import torch
-from torch import nn
-
-from ...activations import ACT2FN
-from ...cache_utils import Cache
-from ...masking_utils import eager_mask, padding_mask_function
-from ...modeling_outputs import BaseModelOutput, CausalLMOutputWithPast
-from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
-from ..qwen2_audio.modeling_qwen2_audio import (
-    Qwen2AudioEncoder,
-    Qwen2AudioPreTrainedModel,
-)
-from ..voxtral.modeling_voxtral import VoxtralForConditionalGeneration, VoxtralMultiModalProjector
-from ..whisper.modeling_whisper import WhisperEncoderLayer
-from .configuration_audioflamingo3 import AudioFlamingo3Config
-
-
-logger = logging.get_logger(__name__)
-
-
-class AudioFlamingo3EncoderLayer(WhisperEncoderLayer):
-    pass
-
-
-class AudioFlamingo3PreTrainedModel(Qwen2AudioPreTrainedModel):
-    pass
-
-
-@auto_docstring(
-    custom_intro="""
-    The audio model from AudioFlamingo3 without any head or projection on top.
-    """
-)
-class AudioFlamingo3Encoder(Qwen2AudioEncoder):
-    """
-    AudioFlamingo3 encoder: Whisper encoder, average pool (time/2), then LayerNorm.
-    """
-
-    @can_return_tuple
-    def forward(
-        self,
-        input_features: torch.Tensor,
-        input_features_mask: Optional[torch.Tensor] = None,
-    ):
-        r"""
-        Args:
-            input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
-                Log-Mel features extracted from raw audio. Use the processor/feature extractor to compute and pad
-                these features from waveform input.
-            input_features_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-        """
-
-        # Prepare attention mask for transformer layers
-        batch_size = input_features.shape[0]
-        seq_len = (input_features.shape[-1] - 1) // 2 + 1  # After conv2 downsampling
-
-        input_features_lengths = input_features_mask.sum(-1)
-        input_features_lengths = (input_features_lengths - 1) // 2 + 1  # conv2 downsampling
-        input_features_mask = torch.arange(seq_len, device=input_features.device) < input_features_lengths[:, None]
-        attention_mask = eager_mask(
-            batch_size=batch_size,
-            cache_position=torch.arange(seq_len, device=input_features.device),
-            kv_length=seq_len,
-            mask_function=padding_mask_function(input_features_mask),
-            dtype=self.conv1.weight.dtype,
-        )
-
-        # Conv front-end
-        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
-        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
-        inputs_embeds = inputs_embeds.permute(0, 2, 1)
-
-        # Add positions, dropout
-        hidden_states = inputs_embeds + self.embed_positions.weight
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-
-        # Transformer stack
-        for layer in self.layers:
-            drop = self.training and torch.rand([]) < self.layerdrop
-            if not drop:
-                hidden_states = layer(hidden_states, attention_mask)[0]
-
-        # AvgPool (time/2) + LayerNorm
-        hidden_states = hidden_states.permute(0, 2, 1)
-        hidden_states = self.avg_pooler(hidden_states).permute(0, 2, 1)
-        hidden_states = self.layer_norm(hidden_states)
-
-        return BaseModelOutput(
-            last_hidden_state=hidden_states,
-        )
-
-
-class AudioFlamingo3MultiModalProjector(VoxtralMultiModalProjector):
-    """
-    Audio adaptor (small MLP) that projects AudioFlamingo3Encoder features
-    to the LLM embedding space so they can replace `<sound>` tokens.
-    """
-
-    def __init__(self, config: AudioFlamingo3Config):
-        super().__init__()
-        self.linear_1 = nn.Linear(
-            config.audio_config.hidden_size, config.text_config.hidden_size, bias=config.projector_bias
-        )
-        self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(
-            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.projector_bias
-        )
-
-
-@auto_docstring(
-    custom_intro="""
-    The AudioFlamingo3 model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Qwen2 language model.
-    """
-)
-class AudioFlamingo3ForConditionalGeneration(VoxtralForConditionalGeneration):
-    _tied_weights_keys = None
-    _tp_plan = None
-    _pp_plan = None
-    _keep_in_fp32_modules_strict = None
-
-    def __init__(self, config):
-        super().__init__(config)
-        # Similar to Qwen2Audio
-        if self.language_model._tied_weights_keys is not None:
-            self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys]
-
-    def get_audio_features(
-        self, input_features: torch.FloatTensor, input_features_mask: torch.Tensor
-    ) -> torch.FloatTensor:
-        """
-        This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.
-        Args:
-            input_features (`torch.FloatTensor`):
-                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
-                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
-                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
-                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
-            input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
-                Mask to avoid performing attention on padded feature indices.
-
-        Returns:
-            `torch.FloatTensor`:
-                The audio embeddings.
-        """
-
-        # Encode audio
-        encoder_output = self.audio_tower(input_features, input_features_mask=input_features_mask)
-        audio_embeds = self.multi_modal_projector(encoder_output.last_hidden_state)
-
-        # Mask according to avg pooling (which is after attention blocks)
-        post_lengths = (input_features_mask.sum(-1) - 2) // 2 + 1
-        valid_mask = torch.arange(audio_embeds.shape[1], device=post_lengths.device)[None, :] < post_lengths[:, None]
-        audio_embeds = audio_embeds[valid_mask.to(audio_embeds.device)]
-        return audio_embeds
-
-    def get_audio_embeds(self):
-        raise NotImplementedError("This method is not supported for AudioFlamingo3ForConditionalGeneration.")
-
-    @can_return_tuple
-    @auto_docstring
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        input_features: Optional[torch.FloatTensor] = None,
-        input_features_mask: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Cache] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-        logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> CausalLMOutputWithPast:
-        r"""
-        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
-            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        Example:
-
-        ```python
-        >>> from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
-
-        >>> model_id = "nvidia/audio-flamingo-3-hf"
-        >>> processor = AutoProcessor.from_pretrained(model_id)
-        >>> model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
-
-        >>> conversations = [
-        >>>     [
-        >>>         {
-        >>>             "role": "user",
-        >>>             "content": [
-        >>>                 {"type": "text", "text": "Transcribe the input speech."},
-        >>>                 {
-        >>>                     "type": "audio",
-        >>>                     "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
-        >>>                 },
-        >>>             ],
-        >>>         }
-        >>>     ],
-        >>>     [
-        >>>         {
-        >>>             "role": "user",
-        >>>             "content": [
-        >>>                 {
-        >>>                     "type": "text",
-        >>>                     "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
-        >>>                 },
-        >>>                 {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
-        >>>             ],
-        >>>         }
-        >>>     ],
-        >>> ]
-
-        >>> inputs = processor.apply_chat_template(
-        >>>     conversations,
-        >>>     tokenize=True,
-        >>>     add_generation_prompt=True,
-        >>>     return_dict=True,
-        >>> ).to(model.device)
-
-        >>> outputs = model.generate(**inputs, max_new_tokens=500)
-
-        >>> decoded_outputs = processor.batch_decode(
-        >>>     outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
-        >>> )
-        >>> print(decoded_outputs)
-        ["The spoken content of the audio is...", "The track's calming and meditative feel can be attributed to..."]
-        ```"""
-
-        if inputs_embeds is None:
-            inputs_embeds = self.get_input_embeddings()(input_ids)
-
-        if input_features is not None and input_ids is not None:
-            audio_embeds = self.get_audio_features(input_features, input_features_mask)
-
-            # replace text-audio token placeholders with audio embeddings
-            audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
-            inputs_embeds = inputs_embeds.masked_scatter(
-                audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device)
-            )
-
-        outputs: CausalLMOutputWithPast = self.language_model(
-            inputs_embeds=inputs_embeds,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            labels=labels,
-            use_cache=use_cache,
-            cache_position=cache_position,
-            logits_to_keep=logits_to_keep,
-            **kwargs,
-        )
-        return outputs
-
-    def prepare_inputs_for_generation(self, *args, **kwargs):
-        # Overwritten -- we should not pass input_features when we are in cached decoding stage
-
-        input_features = kwargs.pop("input_features", None)
-        input_features_mask = kwargs.pop("input_features_mask", None)
-        cache_position = kwargs.get("cache_position")
-
-        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)
-
-        if cache_position is not None and cache_position[0] == 0:
-            # input_features should only be passed when we are not in cached decoding stage
-            if input_features is not None:
-                model_inputs["input_features"] = input_features
-            if input_features_mask is not None:
-                model_inputs["input_features_mask"] = input_features_mask
-
-        return model_inputs
-
-
-__all__ = ["AudioFlamingo3ForConditionalGeneration", "AudioFlamingo3PreTrainedModel", "AudioFlamingo3Encoder"]
--- a/src/transformers/models/audioflamingo3/processing_audioflamingo3.py
+++ b/src/transformers/models/audioflamingo3/processing_audioflamingo3.py
@ -1,318 +0,0 @@
-# coding=utf-8
-# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
-# reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import re
-from typing import Optional, Union
-
-import numpy as np
-
-from ...audio_utils import AudioInput, make_list_of_audio
-from ...feature_extraction_utils import BatchFeature
-from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
-from ...tokenization_utils_base import TextInput
-from ...utils import is_torch_available, logging
-
-
-if is_torch_available():
-    import torch
-
-
-logger = logging.get_logger(__name__)
-
-MAX_AUDIO_LEN = 10 * 60  # 10 minutes
-DEFAULT_TRANSCRIPTION_PROMPT = "Transcribe the input speech."
-
-
-class AudioFlamingo3ProcessorKwargs(ProcessingKwargs, total=False):
-    _defaults = {
-        "text_kwargs": {
-            "padding": True,
-        },
-        "audio_kwargs": {
-            "sampling_rate": 16000,
-            "chunk_length": 30.0,
-            "return_attention_mask": True,
-            "padding": "max_length",
-        },
-        "common_kwargs": {
-            "return_tensors": "pt",
-            "padding_side": "left",
-        },
-    }
-
-
-class AudioFlamingo3Processor(ProcessorMixin):
-    r"""
-    Constructs an AudioFlamingo3 processor which wraps an AudioFlamingo3 feature extractor and an AudioFlamingo3
-    tokenizer into a single processor.
-
-    [`AudioFlamingo3Processor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
-    [`Qwen2TokenizerFast`]. See the [`~AudioFlamingo3Processor.__call__`] for more information.
-
-    Args:
-        feature_extractor ([`WhisperFeatureExtractor`]):
-            The feature extractor is a required input.
-        tokenizer ([`Qwen2TokenizerFast`]):
-            The tokenizer is a required input.
-        chat_template (`Optional[str]`, *optional*):
-            The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
-            template will be used.
-        audio_token (`Optional[str]`, *optional*, defaults to `"<sound>"`):
-            Special token used to represent audio inputs in the chat template.
-    """
-
-    attributes = ["feature_extractor", "tokenizer"]
-    feature_extractor_class = "WhisperFeatureExtractor"
-    tokenizer_class = "Qwen2TokenizerFast"
-
-    def __init__(
-        self,
-        feature_extractor,
-        tokenizer,
-        chat_template=None,
-        audio_token="<sound>",
-    ):
-        self.audio_token = audio_token
-        self.audio_token_id = tokenizer.convert_tokens_to_ids(audio_token)
-        super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
-
-    def __call__(
-        self,
-        text: Union[TextInput, list[TextInput]],
-        audio: Optional[AudioInput] = None,
-        output_labels: Optional[bool] = False,
-        **kwargs: Unpack[AudioFlamingo3ProcessorKwargs],
-    ) -> BatchFeature:
-        r"""
-        Main method to prepare one or several text sequence(s) and audio waveform(s) for the model. This
-        method expands `<sound>` placeholders in the text based on the post-pool frame counts of the
-        audio windows, then tokenizes the provided strings as-is, and extracts log-mel features
-        with [`WhisperFeatureExtractor`]. If `audio` is `None`, no audio processing is performed and
-        the text is tokenized as-is (LM-only behavior).
-
-        Args:
-            text (`str` or `list[str]`):
-                Input sequence or batch of sequences.
-            audio (`np.ndarray` or `list[np.ndarray]`):
-                Input audio or batch of audios as NumPy arrays. If provided, there must be as many `text` inputs as
-                `audio` inputs.
-            output_labels (bool, *optional*, default=False):
-                Whether to return labels for training.
-
-        Returns:
-            [`BatchFeature`]: A dictionary with tokenized text (`input_ids`, `attention_mask`) and
-            audio features (`input_features`, `input_features_mask`).
-        """
-
-        # Merge defaults with user kwargs
-        call_kwargs = self._merge_kwargs(
-            AudioFlamingo3ProcessorKwargs,
-            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
-            **kwargs,
-        )
-
-        text_kwargs = call_kwargs["text_kwargs"]
-        audio_kwargs = call_kwargs["audio_kwargs"]
-        return_tensors = text_kwargs.get("return_tensors")
-        if return_tensors != "pt":
-            raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")
-
-        if isinstance(text, str):
-            text = [text]
-        elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
-            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
-
-        audio_inputs = {}
-        if audio is not None:
-            audio = make_list_of_audio(audio)
-            if len(text) != len(audio):
-                raise ValueError(f"Got {len(text)} text but {len(audio)} audios; they must match 1:1.")
-
-            # Determine number of chunks per sample, and flatten
-            window_size = int(audio_kwargs["sampling_rate"] * audio_kwargs["chunk_length"])
-            max_windows = int(MAX_AUDIO_LEN // audio_kwargs["chunk_length"])
-
-            per_sample_windows: list[int] = []
-            flat_chunks: list[np.ndarray] = []
-
-            for audio_el in audio:
-                n_samples = int(audio_el.shape[0])
-                n_win = max(1, (n_samples + window_size - 1) // window_size)
-                if n_win > max_windows:
-                    logger.warning(
-                        f"Audio duration ({n_samples / audio_kwargs['sampling_rate']:.1f}s) exceeds {MAX_AUDIO_LEN}s; truncating to first {MAX_AUDIO_LEN}s."
-                    )
-                    n_win = max_windows
-                per_sample_windows.append(n_win)
-
-                time_cap = min(n_samples, n_win * window_size)
-                for i in range(n_win):
-                    start = i * window_size
-                    end = min((i + 1) * window_size, time_cap)
-                    flat_chunks.append(audio_el[start:end])
-
-            # Feature extraction
-            audio_inputs = self.feature_extractor(flat_chunks, **audio_kwargs)
-            padding_mask = audio_inputs.pop("attention_mask")
-            audio_inputs["input_features_mask"] = padding_mask
-
-            # Compute sequence lengths token counting
-            audio_lenghts = torch.stack([s.sum() for s in torch.split(padding_mask.sum(-1), per_sample_windows)])
-            conv_output_lengths = (audio_lenghts - 1) // 2 + 1  # After conv2 downsampling
-            audio_tokens_lengths = (conv_output_lengths - 2) // 2 + 1  # After avg pooling
-
-            # expand audio tokens in text
-            for i, audio_length in enumerate(audio_tokens_lengths):
-                expanded = re.sub(re.escape(self.audio_token), self.audio_token * audio_length, text[i])
-                text[i] = expanded
-
-        # Tokenize
-        text_inputs = self.tokenizer(text, **text_kwargs)
-
-        data = {**text_inputs, **audio_inputs}
-        if output_labels:
-            labels = data["input_ids"].clone()
-            labels[labels == self.audio_token_id] = -100
-            labels[labels == self.tokenizer.pad_token_id] = -100
-            data["labels"] = labels
-
-        return BatchFeature(data=data, tensor_type=return_tensors)
-
-    @property
-    def model_input_names(self) -> list[str]:
-        tok_names = self.tokenizer.model_input_names
-        fea_names = self.feature_extractor.model_input_names
-        return list(dict.fromkeys(tok_names + fea_names + ["input_features_mask"]))
-
-    def apply_transcription_request(
-        self,
-        audio: Union[str, list[str], AudioInput],
-        prompt: Optional[Union[str, list[str]]] = None,
-        **kwargs: Unpack[AudioFlamingo3ProcessorKwargs],
-    ) -> BatchFeature:
-        """
-        Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.
-
-        Args:
-            audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
-                Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
-                the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
-            prompt (`str` or `list[str]`, *optional*):
-                Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
-                each sample uses `"Transcribe the input speech."`.
-            **kwargs:
-                Additional keyword arguments forwarded to [`~AudioFlamingo3Processor.apply_chat_template`] (for example
-                `text_kwargs`, `audio_kwargs`, ...).
-
-        Returns:
-            [`BatchFeature`]: Processor outputs ready to be passed to [`AudioFlamingo3ForConditionalGeneration.generate`].
-
-        """
-
-        if isinstance(audio, str):
-            audio_items: list[Union[str, np.ndarray]] = [audio]
-        elif isinstance(audio, (list, tuple)) and audio and all(isinstance(el, str) for el in audio):
-            audio_items = list(audio)
-        else:
-            audio_items = list(make_list_of_audio(audio))
-            if is_torch_available():
-                audio_items = [el.detach().cpu().numpy() if isinstance(el, torch.Tensor) else el for el in audio_items]
-
-        batch_size = len(audio_items)
-        if batch_size == 0:
-            raise ValueError("`audio` must contain at least one sample.")
-
-        if prompt is None:
-            prompts = [DEFAULT_TRANSCRIPTION_PROMPT] * batch_size
-        elif isinstance(prompt, str):
-            prompts = [prompt] * batch_size
-        elif isinstance(prompt, (list, tuple)):
-            if len(prompt) != batch_size:
-                raise ValueError(
-                    f"Received {len(prompt)} prompt(s) for {batch_size} audio sample(s); counts must match."
-                )
-            prompts = []
-            for item in prompt:
-                if item is None:
-                    prompts.append(DEFAULT_TRANSCRIPTION_PROMPT)
-                elif isinstance(item, str):
-                    prompts.append(item)
-                else:
-                    raise TypeError("Each prompt must be a string or `None`.")
-        else:
-            raise TypeError("`prompt` must be a string, a sequence of strings, or `None`.")
-
-        conversations = [
-            [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": prompt_text},
-                        {"type": "audio", "path": audio_item}
-                        if isinstance(audio_item, str)
-                        else {"type": "audio", "audio": audio_item},
-                    ],
-                }
-            ]
-            for prompt_text, audio_item in zip(prompts, audio_items)
-        ]
-
-        return self.apply_chat_template(
-            conversations,
-            tokenize=True,
-            add_generation_prompt=True,
-            return_dict=True,
-            **kwargs,
-        )
-
-    def batch_decode(self, *args, strip_prefix=False, **kwargs):
-        """
-        Forward arguments to [`~PreTrainedTokenizer.batch_decode`] and optionally remove the assistant framing the model
-        was trained to produce.
-
-        AF3 transcription requests respond with sentences such as `"The spoken content of the audio is \"...\"."`.
-        Setting `strip_prefix=True` trims the fixed prefix for just the transcription text.
-        """
-        decoded = self.tokenizer.batch_decode(*args, **kwargs)
-        if strip_prefix:
-            decoded = [self._strip_assistant_prefix_and_quotes(text) for text in decoded]
-        return decoded
-
-    def _strip_assistant_prefix_and_quotes(self, text: str) -> str:
-        """
-        Remove the assistant prefix and surrounding quotes from a decoded transcription string.
-        """
-
-        stripped = text.strip()
-
-        for prefix in (
-            "The spoken content of the audio is",
-            "The transcription of the audio is",
-        ):
-            if stripped.startswith(prefix):
-                stripped = stripped[len(prefix) :].strip()
-                break
-
-        if stripped.endswith("."):
-            stripped = stripped[:-1].strip()
-
-        if len(stripped) >= 2 and stripped[0] == stripped[-1] and stripped[0] in {"'", '"'}:
-            stripped = stripped[1:-1].strip()
-
-        return stripped
-
-
-__all__ = ["AudioFlamingo3Processor"]
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@ -45,8 +45,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
        ("aria", "AriaConfig"),
        ("aria_text", "AriaTextConfig"),
        ("audio-spectrogram-transformer", "ASTConfig"),
-        ("audioflamingo3", "AudioFlamingo3Config"),
-        ("audioflamingo3_encoder", "AudioFlamingo3EncoderConfig"),
        ("autoformer", "AutoformerConfig"),
        ("aya_vision", "AyaVisionConfig"),
        ("bamba", "BambaConfig"),
@ -225,7 +223,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
        ("layoutlm", "LayoutLMConfig"),
        ("layoutlmv2", "LayoutLMv2Config"),
        ("layoutlmv3", "LayoutLMv3Config"),
-        ("layoutxlm", "LayoutLMv2Config"),
        ("led", "LEDConfig"),
        ("levit", "LevitConfig"),
        ("lfm2", "Lfm2Config"),
@ -479,8 +476,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
        ("aria", "Aria"),
        ("aria_text", "AriaText"),
        ("audio-spectrogram-transformer", "Audio Spectrogram Transformer"),
-        ("audioflamingo3", "AudioFlamingo3"),
-        ("audioflamingo3_encoder", "AudioFlamingo3Encoder"),
        ("autoformer", "Autoformer"),
        ("aya_vision", "AyaVision"),
        ("bamba", "Bamba"),
@ -964,7 +959,6 @@ DEPRECATED_MODELS = [

 SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict[str, str](
    [
-        ("audioflamingo3_encoder", "audioflamingo3"),
        ("openai-gpt", "openai"),
        ("data2vec-audio", "data2vec"),
        ("data2vec-text", "data2vec"),
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@ -15,6 +15,7 @@
 """AutoFeatureExtractor class."""

 import importlib
+import json
 import os
 from collections import OrderedDict
 from typing import Optional, Union
@ -23,7 +24,7 @@ from typing import Optional, Union
 from ...configuration_utils import PreTrainedConfig
 from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
 from ...feature_extraction_utils import FeatureExtractionMixin
-from ...utils import CONFIG_NAME, FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, cached_file, logging, safe_load_json_file
+from ...utils import CONFIG_NAME, FEATURE_EXTRACTOR_NAME, cached_file, logging
 from .auto_factory import _LazyAutoMapping
 from .configuration_auto import (
    CONFIG_MAPPING_NAMES,
@ -40,7 +41,6 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
        ("audio-spectrogram-transformer", "ASTFeatureExtractor"),
        ("clap", "ClapFeatureExtractor"),
        ("clvp", "ClvpFeatureExtractor"),
-        ("csm", "EncodecFeatureExtractor"),
        ("dac", "DacFeatureExtractor"),
        ("data2vec-audio", "Wav2Vec2FeatureExtractor"),
        ("dia", "DiaFeatureExtractor"),
@ -49,20 +49,14 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
        ("granite_speech", "GraniteSpeechFeatureExtractor"),
        ("hubert", "Wav2Vec2FeatureExtractor"),
        ("kyutai_speech_to_text", "KyutaiSpeechToTextFeatureExtractor"),
-        ("markuplm", "MarkupLMFeatureExtractor"),
        ("mctct", "MCTCTFeatureExtractor"),
        ("mimi", "EncodecFeatureExtractor"),
        ("moonshine", "Wav2Vec2FeatureExtractor"),
        ("moshi", "EncodecFeatureExtractor"),
-        ("musicgen", "EncodecFeatureExtractor"),
-        ("musicgen_melody", "MusicgenMelodyFeatureExtractor"),
        ("parakeet_ctc", "ParakeetFeatureExtractor"),
        ("parakeet_encoder", "ParakeetFeatureExtractor"),
        ("phi4_multimodal", "Phi4MultimodalFeatureExtractor"),
        ("pop2piano", "Pop2PianoFeatureExtractor"),
-        ("qwen2_5_omni", "WhisperFeatureExtractor"),
-        ("qwen2_audio", "WhisperFeatureExtractor"),
-        ("qwen3_omni_moe", "WhisperFeatureExtractor"),
        ("seamless_m4t", "SeamlessM4TFeatureExtractor"),
        ("seamless_m4t_v2", "SeamlessM4TFeatureExtractor"),
        ("sew", "Wav2Vec2FeatureExtractor"),
@ -72,7 +66,6 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
        ("unispeech", "Wav2Vec2FeatureExtractor"),
        ("unispeech-sat", "Wav2Vec2FeatureExtractor"),
        ("univnet", "UnivNetFeatureExtractor"),
-        ("voxtral", "WhisperFeatureExtractor"),
        ("wav2vec2", "Wav2Vec2FeatureExtractor"),
        ("wav2vec2-bert", "Wav2Vec2FeatureExtractor"),
        ("wav2vec2-conformer", "Wav2Vec2FeatureExtractor"),
@ -174,10 +167,9 @@ def get_feature_extractor_config(
    feature_extractor.save_pretrained("feature-extractor-test")
    feature_extractor_config = get_feature_extractor_config("feature-extractor-test")
    ```"""
-    # Load with a priority given to the nested processor config, if available in repo
-    resolved_processor_file = cached_file(
+    resolved_config_file = cached_file(
        pretrained_model_name_or_path,
-        filename=PROCESSOR_NAME,
+        FEATURE_EXTRACTOR_NAME,
        cache_dir=cache_dir,
        force_download=force_download,
        proxies=proxies,
@ -186,37 +178,16 @@ def get_feature_extractor_config(
        local_files_only=local_files_only,
        _raise_exceptions_for_gated_repo=False,
        _raise_exceptions_for_missing_entries=False,
+        _raise_exceptions_for_connection_errors=False,
    )
-    resolved_feature_extractor_file = cached_file(
-        pretrained_model_name_or_path,
-        filename=FEATURE_EXTRACTOR_NAME,
-        cache_dir=cache_dir,
-        force_download=force_download,
-        proxies=proxies,
-        token=token,
-        revision=revision,
-        local_files_only=local_files_only,
-        _raise_exceptions_for_gated_repo=False,
-        _raise_exceptions_for_missing_entries=False,
-    )
-
-    # An empty list if none of the possible files is found in the repo
-    if not resolved_feature_extractor_file and not resolved_processor_file:
-        logger.info("Could not locate the feature extractor configuration file.")
+    if resolved_config_file is None:
+        logger.info(
+            "Could not locate the feature extractor configuration file, will try to use the model config instead."
+        )
        return {}

-    # Load feature_extractor dict. Priority goes as (nested config if found -> feature extractor config)
-    # We are downloading both configs because almost all models have a `processor_config.json` but
-    # not all of these are nested. We need to check if it was saved recently as nested or if it is legacy style
-    feature_extractor_dict = {}
-    if resolved_processor_file is not None:
-        processor_dict = safe_load_json_file(resolved_processor_file)
-        if "feature_extractor" in processor_dict:
-            feature_extractor_dict = processor_dict["feature_extractor"]
-
-    if resolved_feature_extractor_file is not None and feature_extractor_dict is None:
-        feature_extractor_dict = safe_load_json_file(resolved_feature_extractor_file)
-    return feature_extractor_dict
+    with open(resolved_config_file, encoding="utf-8") as reader:
+        return json.load(reader)


 class AutoFeatureExtractor:
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@ -15,6 +15,7 @@
 """AutoImageProcessor class."""

 import importlib
+import json
 import os
 import warnings
 from collections import OrderedDict
@ -28,14 +29,12 @@ from ...image_processing_utils_fast import BaseImageProcessorFast
 from ...utils import (
    CONFIG_NAME,
    IMAGE_PROCESSOR_NAME,
-    PROCESSOR_NAME,
    cached_file,
    is_timm_config_dict,
    is_timm_local_checkpoint,
    is_torchvision_available,
    is_vision_available,
    logging,
-    safe_load_json_file,
 )
 from ...utils.import_utils import requires
 from .auto_factory import _LazyAutoMapping
@ -63,9 +62,7 @@ else:
            ("aimv2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
            ("aimv2_vision_model", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
            ("align", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")),
-            ("altclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
            ("aria", ("AriaImageProcessor", None)),
-            ("aya_vision", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
            ("beit", ("BeitImageProcessor", "BeitImageProcessorFast")),
            ("bit", ("BitImageProcessor", "BitImageProcessorFast")),
            ("blip", ("BlipImageProcessor", "BlipImageProcessorFast")),
@ -76,8 +73,6 @@ else:
            ("clip", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
            ("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("cohere2_vision", (None, "Cohere2VisionImageProcessorFast")),
-            ("colpali", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
-            ("colqwen2", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
            ("conditional_detr", ("ConditionalDetrImageProcessor", "ConditionalDetrImageProcessorFast")),
            ("convnext", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
            ("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
@ -100,10 +95,8 @@ else:
            ("efficientformer", ("EfficientFormerImageProcessor", None)),
            ("efficientloftr", ("EfficientLoFTRImageProcessor", "EfficientLoFTRImageProcessorFast")),
            ("efficientnet", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")),
-            ("emu3", ("Emu3ImageProcessor", None)),
            ("eomt", ("EomtImageProcessor", "EomtImageProcessorFast")),
            ("flava", ("FlavaImageProcessor", "FlavaImageProcessorFast")),
-            ("florence2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
            ("focalnet", ("BitImageProcessor", "BitImageProcessorFast")),
            ("fuyu", ("FuyuImageProcessor", "FuyuImageProcessorFast")),
            ("gemma3", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")),
@ -121,13 +114,11 @@ else:
            ("ijepa", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("imagegpt", ("ImageGPTImageProcessor", "ImageGPTImageProcessorFast")),
            ("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")),
-            ("internvl", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
            ("janus", ("JanusImageProcessor", "JanusImageProcessorFast")),
            ("kosmos-2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
            ("kosmos-2.5", ("Kosmos2_5ImageProcessor", "Kosmos2_5ImageProcessorFast")),
            ("layoutlmv2", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")),
            ("layoutlmv3", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")),
-            ("layoutxlm", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessor")),
            ("levit", ("LevitImageProcessor", "LevitImageProcessorFast")),
            ("lfm2_vl", (None, "Lfm2VlImageProcessorFast")),
            ("lightglue", ("LightGlueImageProcessor", "LightGlueImageProcessorFast")),
@ -150,7 +141,6 @@ else:
            ("mobilevitv2", ("MobileViTImageProcessor", "MobileViTImageProcessorFast")),
            ("nat", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("nougat", ("NougatImageProcessor", "NougatImageProcessorFast")),
-            ("omdet-turbo", ("DetrImageProcessor", "DetrImageProcessorFast")),
            ("oneformer", ("OneFormerImageProcessor", "OneFormerImageProcessorFast")),
            ("ovis2", ("Ovis2ImageProcessor", "Ovis2ImageProcessorFast")),
            ("owlv2", ("Owlv2ImageProcessor", "Owlv2ImageProcessorFast")),
@ -165,17 +155,14 @@ else:
            ("prompt_depth_anything", ("PromptDepthAnythingImageProcessor", "PromptDepthAnythingImageProcessorFast")),
            ("pvt", ("PvtImageProcessor", "PvtImageProcessorFast")),
            ("pvt_v2", ("PvtImageProcessor", "PvtImageProcessorFast")),
-            ("qwen2_5_omni", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
            ("qwen2_5_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
            ("qwen2_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
-            ("qwen3_omni_moe", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
            ("qwen3_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
            ("regnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
            ("resnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
            ("rt_detr", ("RTDetrImageProcessor", "RTDetrImageProcessorFast")),
            ("sam", ("SamImageProcessor", "SamImageProcessorFast")),
            ("sam2", (None, "Sam2ImageProcessorFast")),
-            ("sam2_video", (None, "Sam2ImageProcessorFast")),
            ("sam_hq", ("SamImageProcessor", "SamImageProcessorFast")),
            ("segformer", ("SegformerImageProcessor", "SegformerImageProcessorFast")),
            ("seggpt", ("SegGptImageProcessor", None)),
@ -193,14 +180,12 @@ else:
            ("textnet", ("TextNetImageProcessor", "TextNetImageProcessorFast")),
            ("timesformer", ("VideoMAEImageProcessor", None)),
            ("timm_wrapper", ("TimmWrapperImageProcessor", None)),
-            ("trocr", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("tvlt", ("TvltImageProcessor", None)),
            ("tvp", ("TvpImageProcessor", "TvpImageProcessorFast")),
            ("udop", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")),
            ("upernet", ("SegformerImageProcessor", "SegformerImageProcessorFast")),
            ("van", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
            ("video_llama_3", ("VideoLlama3ImageProcessor", "VideoLlama3ImageProcessorFast")),
-            ("video_llava", ("VideoLlavaImageProcessor", None)),
            ("videomae", ("VideoMAEImageProcessor", None)),
            ("vilt", ("ViltImageProcessor", "ViltImageProcessorFast")),
            ("vipllava", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
@ -320,10 +305,9 @@ def get_image_processor_config(
    image_processor.save_pretrained("image-processor-test")
    image_processor_config = get_image_processor_config("image-processor-test")
    ```"""
-    # Load with a priority given to the nested processor config, if available in repo
-    resolved_processor_file = cached_file(
+    resolved_config_file = cached_file(
        pretrained_model_name_or_path,
-        filename=PROCESSOR_NAME,
+        IMAGE_PROCESSOR_NAME,
        cache_dir=cache_dir,
        force_download=force_download,
        proxies=proxies,
@ -332,38 +316,16 @@ def get_image_processor_config(
        local_files_only=local_files_only,
        _raise_exceptions_for_gated_repo=False,
        _raise_exceptions_for_missing_entries=False,
+        _raise_exceptions_for_connection_errors=False,
    )
-    resolved_image_processor_file = cached_file(
-        pretrained_model_name_or_path,
-        filename=IMAGE_PROCESSOR_NAME,
-        cache_dir=cache_dir,
-        force_download=force_download,
-        proxies=proxies,
-        token=token,
-        revision=revision,
-        local_files_only=local_files_only,
-        _raise_exceptions_for_gated_repo=False,
-        _raise_exceptions_for_missing_entries=False,
-    )
-
-    # An empty list if none of the possible files is found in the repo
-    if not resolved_image_processor_file and not resolved_processor_file:
-        logger.info("Could not locate the image processor configuration file.")
+    if resolved_config_file is None:
+        logger.info(
+            "Could not locate the image processor configuration file, will try to use the model config instead."
+        )
        return {}

-    # Load image_processor dict. Priority goes as (nested config if found -> image processor config)
-    # We are downloading both configs because almost all models have a `processor_config.json` but
-    # not all of these are nested. We need to check if it was saved recently as nested or if it is legacy style
-    image_processor_dict = {}
-    if resolved_processor_file is not None:
-        processor_dict = safe_load_json_file(resolved_processor_file)
-        if "image_processor" in processor_dict:
-            image_processor_dict = processor_dict["image_processor"]
-
-    if resolved_image_processor_file is not None and image_processor_dict is None:
-        image_processor_dict = safe_load_json_file(resolved_image_processor_file)
-
-    return image_processor_dict
+    with open(resolved_config_file, encoding="utf-8") as reader:
+        return json.load(reader)


 def _warning_fast_image_processor_available(fast_class):
@ -562,9 +524,10 @@ class AutoImageProcessor:
                )
                use_fast = False
            if use_fast:
-                # Check if the fast image processor class exists
-                image_processor_class_fast = get_image_processor_class_from_name(image_processor_type)
-                if image_processor_class_fast is None:
+                for image_processors in IMAGE_PROCESSOR_MAPPING_NAMES.values():
+                    if image_processor_type in image_processors:
+                        break
+                else:
                    image_processor_type = image_processor_type[:-4]
                    use_fast = False
                    logger.warning_once(
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@ -53,8 +53,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
        ("aria", "AriaModel"),
        ("aria_text", "AriaTextModel"),
        ("audio-spectrogram-transformer", "ASTModel"),
-        ("audioflamingo3", "AudioFlamingo3ForConditionalGeneration"),
-        ("audioflamingo3_encoder", "AudioFlamingo3Encoder"),
        ("autoformer", "AutoformerModel"),
        ("aya_vision", "AyaVisionModel"),
        ("bamba", "BambaModel"),
@ -447,7 +445,6 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
    [
        # Model for pre-training mapping
        ("albert", "AlbertForPreTraining"),
-        ("audioflamingo3", "AudioFlamingo3ForConditionalGeneration"),
        ("bart", "BartForConditionalGeneration"),
        ("bert", "BertForPreTraining"),
        ("big_bird", "BigBirdForPreTraining"),
@ -1162,7 +1159,6 @@ MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = OrderedDict(
 MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
    [
        # Model for Seq2Seq Causal LM mapping
-        ("audioflamingo3", "AudioFlamingo3ForConditionalGeneration"),
        ("bart", "BartForConditionalGeneration"),
        ("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"),
        ("blenderbot", "BlenderbotForConditionalGeneration"),
@ -1704,7 +1700,6 @@ MODEL_FOR_BACKBONE_MAPPING_NAMES = OrderedDict(
        ("dinov2", "Dinov2Backbone"),
        ("dinov2_with_registers", "Dinov2WithRegistersBackbone"),
        ("dinov3_convnext", "DINOv3ConvNextBackbone"),
-        ("dinov3_vit", "DINOv3ViTBackbone"),
        ("focalnet", "FocalNetBackbone"),
        ("hgnet_v2", "HGNetV2Backbone"),
        ("hiera", "HieraBackbone"),
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@ -48,7 +48,6 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
        ("align", "AlignProcessor"),
        ("altclip", "AltCLIPProcessor"),
        ("aria", "AriaProcessor"),
-        ("audioflamingo3", "AudioFlamingo3Processor"),
        ("aya_vision", "AyaVisionProcessor"),
        ("bark", "BarkProcessor"),
        ("blip", "BlipProcessor"),
@ -108,7 +107,6 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
        ("mllama", "MllamaProcessor"),
        ("mm-grounding-dino", "GroundingDinoProcessor"),
        ("moonshine", "Wav2Vec2Processor"),
-        ("omdet-turbo", "OmDetTurboProcessor"),
        ("oneformer", "OneFormerProcessor"),
        ("ovis2", "Ovis2Processor"),
        ("owlv2", "Owlv2Processor"),
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@ -72,7 +72,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
            ),
        ),
        ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
-        ("altclip", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
        ("arcee", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ("aria", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ("aya_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
@ -157,7 +156,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
        ("cohere", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
        ("cohere2", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
-        ("cohere2_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
        ("colpali", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ("colqwen2", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
        ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
@ -226,7 +224,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
            ),
        ),
        ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
-        ("donut", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
        (
            "dpr",
            (
@ -241,7 +238,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ("ernie4_5_moe", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)),
        ("esm", ("EsmTokenizer", None)),
-        ("evolla", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        (
            "exaone4",
            (
@ -256,13 +252,10 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
            ("FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None, None),
        ),
        ("flaubert", ("FlaubertTokenizer", None)),
-        ("flava", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        ("flex_olmo", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)),
-        ("florence2", ("BartTokenizer", "BartTokenizerFast" if is_tokenizers_available() else None)),
        ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)),
        ("fsmt", ("FSMTTokenizer", None)),
        ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
-        ("fuyu", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        (
            "gemma",
            (
@ -311,7 +304,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ("glm4_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
        ("glm4v", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
        ("glm4v_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
-        ("got_ocr2", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
        ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
        ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
        ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
@ -322,7 +314,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
        ("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)),
        ("granite", ("GPT2Tokenizer", None)),
-        ("granite_speech", ("GPT2Tokenizer", None)),
        ("granitemoe", ("GPT2Tokenizer", None)),
        ("granitemoehybrid", ("GPT2Tokenizer", None)),
        ("granitemoeshared", ("GPT2Tokenizer", None)),
@ -362,14 +353,11 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
            ),
        ),
        ("kosmos-2.5", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
-        ("kyutai_speech_to_text", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
        ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
        ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
        ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
        ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
        ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
-        ("lfm2", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
-        ("lfm2_vl", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
        ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
        (
            "llama",
@ -410,7 +398,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ("mamba", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
        ("mamba2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
        ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
-        ("markuplm", ("MarkupLMTokenizer", "MarkupLMTokenizerFast" if is_tokenizers_available() else None)),
        (
            "mbart",
            (
@ -497,7 +484,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
                "NllbTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
-        ("nougat", (None, "NougatTokenizerFast" if is_tokenizers_available() else None)),
        (
            "nystromformer",
            (
@ -519,7 +505,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
            ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None),
        ),
        ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
-        ("ovis2", (None, "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
        ("owlv2", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
        ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
        ("paligemma", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
@ -545,7 +530,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
                None,
            ),
        ),
-        ("perception_lm", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        (
            "persimmon",
            (
@ -555,7 +539,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ),
        ("phi", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
        ("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
-        ("phi4_multimodal", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)),
        ("phimoe", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ("phobert", ("PhobertTokenizer", None)),
        ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
@ -569,7 +552,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
            ),
        ),
        ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
-        ("pop2piano", ("Pop2PianoTokenizer", None)),
        ("prophetnet", ("ProphetNetTokenizer", None)),
        ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        (
@ -676,7 +658,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
            ),
        ),
        ("smollm3", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
-        ("smolvlm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
        ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
        ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
        ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),
@ -711,7 +692,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ("tapas", ("TapasTokenizer", None)),
        ("tapex", ("TapexTokenizer", None)),
        ("transfo-xl", ("TransfoXLTokenizer", None)),
-        ("trocr", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
        ("tvp", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        (
            "udop",
@ -727,14 +707,9 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
                "T5TokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
-        ("video_llama_3", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
        ("video_llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        ("vipllava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
-        (
-            "vision_text_dual_encoder",
-            ("PreTrainedTokenizer", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
-        ),
        ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        ("vits", ("VitsTokenizer", None)),
        (
@ -750,7 +725,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ("wav2vec2-bert", ("Wav2Vec2CTCTokenizer", None)),
        ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)),
        ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
-        ("wav2vec2_with_lm", ("Wav2Vec2CTCTokenizer", None)),
        ("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)),
        ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
        (
@ -1186,7 +1160,7 @@ class AutoTokenizer:
                The configuration corresponding to the model to register.
            slow_tokenizer_class ([`PretrainedTokenizer`], *optional*):
                The slow tokenizer to register.
-            fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*):
+            fast_tokenizer_class ([`PretrainedTokenizerFast`], *optional*):
                The fast tokenizer to register.
        """
        if slow_tokenizer_class is None and fast_tokenizer_class is None:
--- a/src/transformers/models/auto/video_processing_auto.py
+++ b/src/transformers/models/auto/video_processing_auto.py
@ -15,6 +15,7 @@
 """AutoVideoProcessor class."""

 import importlib
+import json
 import os
 from collections import OrderedDict
 from typing import TYPE_CHECKING, Optional, Union
@ -22,16 +23,7 @@ from typing import TYPE_CHECKING, Optional, Union
 # Build the list of all video processors
 from ...configuration_utils import PreTrainedConfig
 from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
-from ...utils import (
-    CONFIG_NAME,
-    IMAGE_PROCESSOR_NAME,
-    PROCESSOR_NAME,
-    VIDEO_PROCESSOR_NAME,
-    cached_file,
-    is_torchvision_available,
-    logging,
-    safe_load_json_file,
-)
+from ...utils import CONFIG_NAME, VIDEO_PROCESSOR_NAME, cached_file, is_torchvision_available, logging
 from ...utils.import_utils import requires
 from ...video_processing_utils import BaseVideoProcessor
 from .auto_factory import _LazyAutoMapping
@ -68,7 +60,6 @@ else:
            ("qwen3_vl_moe", "Qwen3VLVideoProcessor"),
            ("sam2_video", "Sam2VideoVideoProcessor"),
            ("smolvlm", "SmolVLMVideoProcessor"),
-            ("video_llama_3", "VideoLlama3VideoProcessor"),
            ("video_llava", "VideoLlavaVideoProcessor"),
            ("videomae", "VideoMAEVideoProcessor"),
            ("vjepa2", "VJEPA2VideoProcessor"),
@ -176,59 +167,24 @@ def get_video_processor_config(
    video_processor.save_pretrained("video-processor-test")
    video_processor = get_video_processor_config("video-processor-test")
    ```"""
-    # Load with a priority given to the nested processor config, if available in repo
-    resolved_processor_file = cached_file(
+    resolved_config_file = cached_file(
        pretrained_model_name_or_path,
-        filename=PROCESSOR_NAME,
+        VIDEO_PROCESSOR_NAME,
        cache_dir=cache_dir,
        force_download=force_download,
        proxies=proxies,
        token=token,
        revision=revision,
        local_files_only=local_files_only,
-        _raise_exceptions_for_gated_repo=False,
-        _raise_exceptions_for_missing_entries=False,
    )
-    resolved_video_processor_files = [
-        resolved_file
-        for filename in [VIDEO_PROCESSOR_NAME, IMAGE_PROCESSOR_NAME]
-        if (
-            resolved_file := cached_file(
-                pretrained_model_name_or_path,
-                filename=filename,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                proxies=proxies,
-                token=token,
-                revision=revision,
-                local_files_only=local_files_only,
-                _raise_exceptions_for_gated_repo=False,
-                _raise_exceptions_for_missing_entries=False,
-                _raise_exceptions_for_connection_errors=False,
-            )
+    if resolved_config_file is None:
+        logger.info(
+            "Could not locate the video processor configuration file, will try to use the model config instead."
        )
-        is not None
-    ]
-    resolved_video_processor_file = resolved_video_processor_files[0] if resolved_video_processor_files else None
-
-    # An empty list if none of the possible files is found in the repo
-    if not resolved_video_processor_file and not resolved_processor_file:
-        logger.info("Could not locate the video processor configuration file.")
        return {}

-    # Load video_processor dict. Priority goes as (nested config if found -> video processor config -> image processor config)
-    # We are downloading both configs because almost all models have a `processor_config.json` but
-    # not all of these are nested. We need to check if it was saved recebtly as nested or if it is legacy style
-    video_processor_dict = {}
-    if resolved_processor_file is not None:
-        processor_dict = safe_load_json_file(resolved_processor_file)
-        if "video_processor" in processor_dict:
-            video_processor_dict = processor_dict["video_processor"]
-
-    if resolved_video_processor_file is not None and video_processor_dict is None:
-        video_processor_dict = safe_load_json_file(resolved_video_processor_file)
-
-    return video_processor_dict
+    with open(resolved_config_file, encoding="utf-8") as reader:
+        return json.load(reader)


@requires(backends=("vision", "torchvision"))
@ -335,7 +291,7 @@ class AutoVideoProcessor:

                # Some models have different image processors, e.g. InternVL uses GotOCRImageProcessor
                # We cannot use GotOCRVideoProcessor when falling back for BC and should try to infer from config later on
-                if video_processor_class_from_name(video_processor_class_inferred) is not None:
+                if video_processor_class_inferred in VIDEO_PROCESSOR_MAPPING_NAMES.values():
                    video_processor_class = video_processor_class_inferred
            if "AutoImageProcessor" in config_dict.get("auto_map", {}):
                image_processor_auto_map = config_dict["auto_map"]["AutoImageProcessor"]
--- a/src/transformers/models/aya_vision/processing_aya_vision.py
+++ b/src/transformers/models/aya_vision/processing_aya_vision.py
@ -70,6 +70,10 @@ class AyaVisionProcessor(ProcessorMixin):
            in a chat into a tokenizable string.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
    def __init__(
        self,
        image_processor=None,
--- a/src/transformers/models/bark/processing_bark.py
+++ b/src/transformers/models/bark/processing_bark.py
@ -49,6 +49,9 @@ class BarkProcessor(ProcessorMixin):

    """

+    tokenizer_class = "AutoTokenizer"
+    attributes = ["tokenizer"]
+
    preset_shape = {
        "semantic_prompt": 1,  # 1D array of shape (X,)
        "coarse_prompt": 2,  # 2D array of shape (2,X)
--- a/src/transformers/models/blip/processing_blip.py
+++ b/src/transformers/models/blip/processing_blip.py
@ -53,6 +53,10 @@ class BlipProcessor(ProcessorMixin):
            An instance of ['BertTokenizerFast`]. The tokenizer is a required input.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast")
+    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
+
    def __init__(self, image_processor, tokenizer, **kwargs):
        tokenizer.return_token_type_ids = False
        super().__init__(image_processor, tokenizer)
--- a/src/transformers/models/blip_2/processing_blip_2.py
+++ b/src/transformers/models/blip_2/processing_blip_2.py
@ -60,6 +60,10 @@ class Blip2Processor(ProcessorMixin):
            Number of tokens used by the Qformer as queries, should be same as in model's config.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast")
+    tokenizer_class = "AutoTokenizer"
+
    def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs):
        tokenizer.return_token_type_ids = False
        if not hasattr(tokenizer, "image_token"):
--- a/src/transformers/models/bridgetower/processing_bridgetower.py
+++ b/src/transformers/models/bridgetower/processing_bridgetower.py
@ -54,6 +54,9 @@ class BridgeTowerProcessor(ProcessorMixin):
            An instance of ['RobertaTokenizerFast`]. The tokenizer is a required input.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "BridgeTowerImageProcessor"
+    tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast")
    valid_processor_kwargs = BridgeTowerProcessorKwargs

    def __init__(self, image_processor, tokenizer):
--- a/src/transformers/models/bros/convert_bros_to_pytorch.py
+++ b/src/transformers/models/bros/convert_bros_to_pytorch.py
@ -138,7 +138,7 @@ if __name__ == "__main__":
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
-        help="Whether or not to push the converted model and processor to the Hugging Face hub.",
+        help="Whether or not to push the converted model and processor to the 🤗 hub.",
    )

    args = parser.parse_args()
--- a/src/transformers/models/bros/processing_bros.py
+++ b/src/transformers/models/bros/processing_bros.py
@ -46,6 +46,8 @@ class BrosProcessor(ProcessorMixin):
            An instance of ['BertTokenizerFast`]. The tokenizer is a required input.
    """

+    attributes = ["tokenizer"]
+    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
    valid_processor_kwargs = BrosProcessorKwargs

    def __init__(self, tokenizer=None, **kwargs):
--- a/src/transformers/models/chameleon/processing_chameleon.py
+++ b/src/transformers/models/chameleon/processing_chameleon.py
@ -69,6 +69,10 @@ class ChameleonProcessor(ProcessorMixin):
            The special token used to indicate image in the text.
    """

+    attributes = ["image_processor", "tokenizer"]
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    image_processor_class = "ChameleonImageProcessor"
+
    def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = "<image>"):
        self.image_seq_length = image_seq_length
        self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
--- a/src/transformers/models/chinese_clip/processing_chinese_clip.py
+++ b/src/transformers/models/chinese_clip/processing_chinese_clip.py
@ -34,6 +34,10 @@ class ChineseCLIPProcessor(ProcessorMixin):
            The tokenizer is a required input.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = ("ChineseCLIPImageProcessor", "ChineseCLIPImageProcessorFast")
+    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
+
    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        super().__init__(image_processor, tokenizer)

--- a/src/transformers/models/clap/processing_clap.py
+++ b/src/transformers/models/clap/processing_clap.py
@ -42,6 +42,9 @@ class ClapProcessor(ProcessorMixin):
            The tokenizer is a required input.
    """

+    feature_extractor_class = "ClapFeatureExtractor"
+    tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast")
+
    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)

--- a/src/transformers/models/clip/processing_clip.py
+++ b/src/transformers/models/clip/processing_clip.py
@ -33,6 +33,10 @@ class CLIPProcessor(ProcessorMixin):
            The tokenizer is a required input.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast")
+    tokenizer_class = "AutoTokenizer"
+
    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        super().__init__(image_processor, tokenizer)

--- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py
+++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py
@ -257,9 +257,7 @@ if __name__ == "__main__":
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )
    parser.add_argument(
-        "--push_to_hub",
-        action="store_true",
-        help="Whether or not to push the converted model to the Hugging Face hub.",
+        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
    )

    args = parser.parse_args()
--- a/src/transformers/models/clipseg/processing_clipseg.py
+++ b/src/transformers/models/clipseg/processing_clipseg.py
@ -34,6 +34,10 @@ class CLIPSegProcessor(ProcessorMixin):
            The tokenizer is a required input.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = ("ViTImageProcessor", "ViTImageProcessorFast")
+    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
+
    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        super().__init__(image_processor, tokenizer)

--- a/src/transformers/models/clvp/processing_clvp.py
+++ b/src/transformers/models/clvp/processing_clvp.py
@ -38,6 +38,9 @@ class ClvpProcessor(ProcessorMixin):
            An instance of [`ClvpTokenizer`]. The tokenizer is a required input.
    """

+    feature_extractor_class = "ClvpFeatureExtractor"
+    tokenizer_class = "ClvpTokenizer"
+
    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)

--- a/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py
+++ b/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py
@ -129,7 +129,7 @@ class Cohere2VisionCausalLMOutputWithPast(ModelOutput):
@auto_docstring
 class Cohere2VisionPreTrainedModel(PreTrainedModel):
    config: Cohere2VisionConfig
-    base_model_prefix = "model"
+    base_model_prefix = ""
    input_modalities = ["image", "text"]
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"
--- a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py
+++ b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py
@ -26,7 +26,6 @@ from transformers.models.aya_vision.modeling_aya_vision import (
    AyaVisionForConditionalGeneration,
    AyaVisionModel,
    AyaVisionModelOutputWithPast,
-    AyaVisionPreTrainedModel,
 )
 from transformers.models.got_ocr2.image_processing_got_ocr2_fast import GotOcr2ImageProcessorFast

@ -90,10 +89,6 @@ class Cohere2VisionCausalLMOutputWithPast(AyaVisionCausalLMOutputWithPast):
    pass


-class Cohere2VisionPreTrainedModel(AyaVisionPreTrainedModel):
-    base_model_prefix = "model"
-
-
 class Cohere2VisionModel(AyaVisionModel):
    _checkpoint_conversion_mapping = {}

@ -345,7 +340,7 @@ class Cohere2VisionImageProcessorFast(GotOcr2ImageProcessorFast):

 __all__ = [
    "Cohere2VisionForConditionalGeneration",
-    "Cohere2VisionPreTrainedModel",
+    "Cohere2VisionPreTrainedModel",  # noqa: F822
    "Cohere2VisionModel",
    "Cohere2VisionImageProcessorFast",
 ]
--- a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py
+++ b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py
@ -47,6 +47,10 @@ class Cohere2VisionProcessor(ProcessorMixin):
            in a chat into a tokenizable string.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
    def __init__(
        self,
        image_processor=None,
--- a/src/transformers/models/colpali/processing_colpali.py
+++ b/src/transformers/models/colpali/processing_colpali.py
@ -93,6 +93,10 @@ class ColPaliProcessor(ProcessorMixin):
            A prefix to be used for the query.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = ("SiglipImageProcessor", "SiglipImageProcessorFast")
+    tokenizer_class = ("GemmaTokenizer", "GemmaTokenizerFast")
+
    def __init__(
        self,
        image_processor=None,
--- a/src/transformers/models/colqwen2/modular_colqwen2.py
+++ b/src/transformers/models/colqwen2/modular_colqwen2.py
@ -65,6 +65,9 @@ class ColQwen2Processor(ColPaliProcessor):
        query_prefix (`str`, *optional*): A prefix to be used for the query.
    """

+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
+
    def __init__(
        self,
        image_processor=None,
--- a/src/transformers/models/colqwen2/processing_colqwen2.py
+++ b/src/transformers/models/colqwen2/processing_colqwen2.py
@ -64,6 +64,11 @@ class ColQwen2Processor(ProcessorMixin):
        query_prefix (`str`, *optional*): A prefix to be used for the query.
    """

+    attributes = ["image_processor", "tokenizer"]
+
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
+
    def __init__(
        self,
        image_processor=None,
--- a/src/transformers/models/csm/processing_csm.py
+++ b/src/transformers/models/csm/processing_csm.py
@ -95,6 +95,10 @@ class CsmProcessor(ProcessorMixin):

    """

+    attributes = ["feature_extractor", "tokenizer"]
+    feature_extractor_class = "EncodecFeatureExtractor"
+    tokenizer_class = "PreTrainedTokenizerFast"
+
    def __init__(
        self,
        feature_extractor,
--- a/src/transformers/models/dac/convert_dac_checkpoint.py
+++ b/src/transformers/models/dac/convert_dac_checkpoint.py
@ -274,7 +274,7 @@ if __name__ == "__main__":
        "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
    )
    parser.add_argument(
-        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
+        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
    )
    parser.add_argument("--sample_rate", default=None, type=str, help="Sample rate used by DacFeatureExtractor")
    args = parser.parse_args()
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Luc Georges	2d9ca2766e	fix(ci): improper git sha checkout out	2025-11-07 17:11:17 +01:00
Luc Georges	df9d2a2288	refactor(benchmarks): change default lvls of cb bench config	2025-11-07 15:35:57 +00:00
Luc Georges	23c9735c4c	refactor(benchmarks): error if no benchmarks were run	2025-11-07 15:35:57 +00:00
Luc Georges	24d01637d8	test(cb): fix missing `require_read_token` import	2025-11-07 15:35:57 +00:00
Luc Georges	5d3f62c478	fix(benchmarks): revert `finally` removal but keep raising exception	2025-11-07 15:35:57 +00:00
Luc Georges	8dd6063c70	test(cb): comment out failing tests for now added a `FIXME` mark	2025-11-07 15:35:57 +00:00
Luc Georges	1b1bdc6f94	fix(benchmarks): raise the exception on failure instead of ignoring we catch the exception later on and raising it here helps debugging because it will be logged	2025-11-07 15:35:57 +00:00
Copilot	bd7bbf8c2c	fix(benchmarks): prevent version conflicts and ensure proper cleanup in continuous batching (#42063 ) * Initial plan * fix(benchmarks): ensure proper cleanup and remove transformers from requirements - Remove transformers from benchmark_v2/requirements.txt to prevent version conflicts - Add try-finally block to ensure ContinuousBatchingManager.stop() is always called - This fixes TypeError about unexpected 'streaming' argument and prevents OOM from improper cleanup Co-authored-by: McPatate <9112841+McPatate@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: McPatate <9112841+McPatate@users.noreply.github.com>	2025-11-07 15:35:56 +00:00
Luc Georges	de76311b53	fix(benchmarks): update `get_throughput` formula	2025-11-07 15:35:56 +00:00
Luc Georges	0848f9e775	tests(cb): fix attn impl	2025-11-07 15:35:56 +00:00
Luc Georges	8b376134a2	fix(benchmarks): invalid configuration when cb has attn_impl == sdpa	2025-11-07 15:35:56 +00:00
Luc Georges	d5fbaf3e95	fix(benchmarks): `stream` -> `streaming`	2025-11-07 15:35:56 +00:00
Luc Georges	b7b10bb8b1	refactor(cb): when stopping, block by default	2025-11-07 15:35:56 +00:00
Luc Georges	a69f1a87d5	refactor(ci): PR comments	2025-11-07 15:35:56 +00:00
Luc Georges	ed280fa5e4	feat(ci): add continuous batching to benchmarks	2025-11-07 15:35:56 +00:00