Compare commits

...

62 Commits

Author SHA1 Message Date
cd74917ffc Release: v4.56.2 2025-09-17 11:07:24 +02:00
29261df824 Processor load with multi-processing (#40786)
push
2025-09-17 11:07:24 +02:00
694410d3b2 [Jetmoe] Fix RoPE (#40819)
* fix

* remove prints

* why was this there...
2025-09-17 10:24:49 +02:00
240ebfe57e Fix getter regression (#40824)
* test things

* style

* move tests to a sane place
2025-09-17 10:24:30 +02:00
a55e503138 Fix config dtype parsing for Emu3 edge case (#40766)
* fix emu3 config

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>

* address comment

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>

* add comments

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>

---------

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-17 10:24:16 +02:00
91393fe4cc Release: v4.56.1 2025-09-04 22:22:18 +02:00
3ce5629f1c [Glm4.5V] fix vLLM support (#40696)
* fix

* add a test case
2025-09-04 22:20:31 +02:00
26a7e6d76e fix broken offline mode when loading tokenizer from hub (#40669)
* fix broken offline mode when loading tokenizer from hub

* formatting

* make quality

* fix import order
2025-09-04 22:20:11 +02:00
d56f9162e7 Fix backward compatibility with accelerate in Trainer (#40668) 2025-09-04 22:19:52 +02:00
e62b9aae85 Fix self.dropout_p is not defined for SamAttention/Sam2Attention (#40667)
Fix dropout_p is not defined for SamAttention/Sam2Attention
2025-09-04 22:19:31 +02:00
c58d7d7a52 fix pipeline dtype (#40638)
Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-09-04 22:19:13 +02:00
ad6b8982a4 Fix broken Llama4 accuracy in MoE part (#40609)
* Fix broken Llama4 accuracy in MoE part

Llama4 accuracy is broken by a bug in
https://github.com/huggingface/transformers/pull/39501 . It forgot to
transpose the router_scores before applying it to routed_in, causing
Llama4 to generate garbage output.

This PR fixes that issue by adding back the transpose() and adding some
comments explaining why the transpose() is needed.

Signed-off-by: Po-Han Huang <pohanh@nvidia.com>

* remove comment

---------

Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
2025-09-04 22:18:41 +02:00
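A minimal sketch of the axis mix-up described above (shapes and variable names are illustrative assumptions, not the actual Llama4 code):

```py
import torch

# Toy shapes only -- assumptions for illustration, not the real Llama4 layout.
num_tokens, num_experts, hidden = 4, 2, 8
router_scores = torch.rand(num_experts, num_tokens)       # expert-major scores
routed_in = torch.rand(num_tokens, num_experts, hidden)   # token-major hidden states

# Applying the scores without the transpose mis-aligns the expert and token axes
# (with these shapes it would not even broadcast), which is how garbage output arises.
scaled = routed_in * router_scores.transpose(0, 1).unsqueeze(-1)
print(scaled.shape)  # torch.Size([4, 2, 8])
```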
e7d351ceba Release: v4.56.0 2025-08-29 20:21:00 +02:00
1067577ad2 fix gpt-oss out shape (#40535)
* fix out shape

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* reset gpt-oss modeling

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix copies

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix tests

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-08-29 15:20:33 +00:00
7efb4c87ca Flaky CI is annoying (#40543)
* mark flaky

* and the non batch one
2025-08-29 16:47:44 +02:00
828a27fd32 Fix gpt-oss rope warning (#40550)
* fix

* fix print

* rm

* real fix

* fix

* style
2025-08-29 14:40:33 +00:00
74a24217f5 Add bfloat16 support detection for MPS in is_torch_bf16_gpu_available() (#40458)
* Add bfloat16 support detection for MPS (Apple Silicon) in is_torch_bf16_gpu_available

bfloat16 seems to have been supported for a few years now in Metal and torch.mps.

Make sure to allow it and not throw on bf16 usage with "Your setup doesn't support bf16/gpu." from TrainingArguments.

* Check bf16 support for MPS using torch method

Actually seems method exists: 5859edf113/torch/_dynamo/device_interface.py (L519)

It simply checks if you are on MacOs 14 or higher.

* Document Metal emulation for bf16 support

Add note about Metal emulation for bf16 support on M1/M2.

* Update bf16 support check for MPS backend

is_bf16_supported() not exposed even if defined on MPSInterface, use same approach as in accelerate pr.

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-08-29 14:37:15 +00:00
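A hedged sketch of the kind of check being described (an assumption about the approach, not the exact transformers/accelerate code):

```py
import platform

import torch

def mps_bf16_supported() -> bool:
    """Treat bf16 as supported on MPS when the backend is available on macOS 14+."""
    if not (torch.backends.mps.is_built() and torch.backends.mps.is_available()):
        return False
    # macOS 14 is where Metal bf16 support is assumed to be complete;
    # earlier M1/M2 systems may only emulate it.
    release = platform.mac_ver()[0]
    major = int(release.split(".")[0] or 0)
    return major >= 14

print(mps_bf16_supported())
```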
ffdd10fced Allow compression on meta device (#39039)
* disable gradient calculation for int weights

Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>

* Update src/transformers/quantizers/quantizer_compressed_tensors.py

Co-authored-by: Kyle Sayers <kylesayrs@gmail.com>

* updated model procession before/after weight loading

Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>

* fix style

Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>

* reformat

Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>

* fix style

Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>

---------

Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>
Co-authored-by: Kyle Sayers <kylesayrs@gmail.com>
2025-08-29 15:49:15 +02:00
f0e778112f Clean-up kernel loading and dispatch (#40542)
* clean

* clean imporrts

* fix imports

* oups

* more imports

* more imports

* more

* move it to integrations

* fix

* style

* fix doc
2025-08-29 14:14:38 +02:00
f68eb5f135 Redundant code removal (#40534)
redundant code
2025-08-29 11:30:23 +00:00
d888bd435d Fix typos (#40511)
Signed-off-by: cyy <cyyever@outlook.com>
2025-08-29 11:25:33 +00:00
11a6b95553 Oupsy (#40544)
fix bump!
2025-08-29 12:59:49 +02:00
b07144ac27 tokenizers bump tokenizers version (#40540)
* bump tokenizers version

* use rc0

* ?

* fml

* update
2025-08-29 12:34:41 +02:00
008c0ba8e2 Fix SeamlessM4Tv2ModelWithTextInputTest::test_retain_grad_hidden_states_attentions (#40532)
* fix

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-28 23:30:59 +02:00
89ef1b6e0b Set test_all_params_have_gradient=False for HunYuanMoEV1ModelTest (#40530)
fix

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-28 22:32:51 +02:00
2e0f1d6a37 [Qwen Omni/VL] Fix fa tests (#40528)
* fix

* style

* flaky flaky

* flaky flaky

* oopsie, we need the out of place for sure

* flaky flaky

* flaky flaky
2025-08-28 21:07:22 +02:00
68013c505a Improve Gemma3n model and tests (#39764) 2025-08-28 20:25:42 +02:00
ffcb344612 Lazy import torchcodec (#40526)
* lazy import

* parse version

* omg, we need to guard version parse as well
2025-08-28 18:57:14 +02:00
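An illustrative sketch of the guard pattern those bullets describe (helper name and semantics are assumptions, not the actual transformers utility):

```py
import importlib.metadata

from packaging import version

def is_torchcodec_available(min_version: str = "0.0.0") -> bool:
    # Lazy check: never import torchcodec at module import time.
    try:
        installed = importlib.metadata.version("torchcodec")
    except importlib.metadata.PackageNotFoundError:
        return False
    # Guard the version parse as well: dev/nightly builds may not parse cleanly.
    try:
        return version.parse(installed) >= version.parse(min_version)
    except version.InvalidVersion:
        return False

print(is_torchcodec_available())
```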
8c7f685079 Fix typo: 'casual' to 'causal' (#40374)
fix typo: 'casual' to 'causal'

Co-authored-by: demo <vamshika0210@gamil.com>
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
2025-08-28 09:17:37 -07:00
d61fab1549 skip some padding_matches_padding_free_with_position_ids for FA2 (#40521)
skip 1

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-28 17:20:07 +02:00
31336ab750 Fix mistral3 tests after "[Kosmos 2.5] Rename checkpoints" (#40523)
* fix

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-28 16:29:54 +02:00
851b8f281d [kernels] If flash attention2 is not installed / fails to import (cc on our cluster) default to kernels (#40178)
* first step if flash not installed but you set to use it

* try importing

* now default to using it

* update our tests as well

* wow yesterday I was not awake

* fixup

* style

* lol the fix was very very simple

* `RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/kernels@main#egg=kernels
` for updated dockers

* push review comments

* fix

---------

Co-authored-by: Cyril Vallez <cyril.vallez@huggingface.co>
Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
2025-08-28 16:20:25 +02:00
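An illustrative sketch of the fallback behaviour this PR describes (not the actual transformers logic; the kernel repo id is a hypothetical placeholder):

```py
def pick_attention_implementation(requested: str = "flash_attention_2") -> str:
    if requested != "flash_attention_2":
        return requested
    try:
        import flash_attn  # noqa: F401
        return "flash_attention_2"
    except ImportError:
        # flash-attn is missing or fails to import (e.g. CUDA mismatch on a cluster):
        # fall back to a Hub-hosted kernel via the `kernels` package instead of raising.
        return "kernels-community/flash-attn"  # hypothetical kernel repo id

print(pick_attention_implementation())
```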
de9e2d7a2e Skip some flex attn tests (#40519)
fix

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-28 15:43:38 +02:00
7e1aee4db6 [FA] Remaining Cleanup (#40424)
* fa cleanup

* flaky tests

* readd removed test and changeup comments to reflect the purpose

* flaky tests
2025-08-28 15:01:19 +02:00
893d89e5e6 [omni modality] support composite processor config (#38142)
* dump ugly option to check again tomorrow

* tiny update

* do not save as nested dict yet!

* fix and add tests

* fix dia audio tokenizers

* rename the flag and fix new model Evolla

* fix style

* address comments

* broken from different PRp

* fix saving layoutLM

* delete print

* delete!
2025-08-28 14:40:27 +02:00
becab2c601 Use the config for DynamicCache initialization in all modelings (#40420)
* update all

* remove the most horrible old code

* style
2025-08-28 14:32:30 +02:00
8acbbdcadf [serve] fix request_id unexpected (#40501)
* fix request-id in serving

* style

* fix
2025-08-28 14:16:28 +02:00
2300be3b41 sped up gguf tokenizer for nemotron test (#40509)
sped up tokenizer for nemotron test
2025-08-28 12:10:49 +00:00
b2b654afbf correct kes to keys. (#40489)
Signed-off-by: zhanluxianshen <zhanluxianshen@163.com>
2025-08-28 12:00:22 +00:00
476cd7bab1 [vision] Improve keypoint-matching models docs (#40497)
fix options and add inference_mode
2025-08-28 12:31:21 +01:00
1499f9e356 [Kosmos 2.5] Rename checkpoints (#40338) 2025-08-28 13:30:41 +02:00
10ddfb0be5 Add more missing arguments (#40354)
Add missing arguments

Signed-off-by: cyy <cyyever@outlook.com>
2025-08-28 12:21:51 +02:00
d10603f701 Add Apertus (#39381)
* init swissai model

* AutoModelForCausalLM

* AutoModelForCausalLM mapping

* qk norm and post ln optional

* fix wrong shape of qk norm: megatron uses head_dim

* automodel fixes

* minor fix in forward

* fix rope validation to accept llama3 scaling

* `SwissAIForTokenClassification` support

* Align `SwissAI` to v4.52.4

* Align `SwissAI` to v4.53.1

* Init CUDA xIELU

* `SwissAI*`->`Apertus*`

* ci fix

* check_docstring ignore ApertusConfig

* Licensing and placeholder tests

* Placeholder doc

* XIELU syntax

* `_xielu_python` optimization

* Fix xIELU

* [tmp] `{beta,eps}` persistent=False
until {beta,eps} saved in checkpoint

* Modular `Apertus`

* CUDA xIELU logging

* ci fix

* ci fix

* ci fix

* Update license

Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>

* Update tests/models/apertus/test_modeling_apertus.py

Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>

* `.utils.import_utils.is_torchdynamo_compiling`

* `Apertus` class ordering

* `past_key_value{->s}`, `make fix-copies`

* ci fix

* Remove unused configuration parameters

* `{beta,eps}` saved in checkpoint

* `{beta,eps}` Temporarily on CPU

* Suggestions

Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>

* ci fix

* remove fx_compatible (deprecated)

* remove `rotary_embedding_layer`

As the tests are written for a config without default scaling (which is not the case in Apertus) - besides, rope scaling is tested in other models so it's all safe.

* fully removing `Mask4DTestHard` class

Not needed (for now)

* switch to `dtype` instead of `torch_dtype`

Following this:
https://github.com/huggingface/transformers/pull/39782

* remove unused imports

* remove `cache_implementation="static"`

* +Apertus to `docs/source/en/_toctree.yml` for the doc builder

---------

Co-authored-by: Alexander Hagele <alexanderhagele@gmail.com>
Co-authored-by: dhia680 <garbayad@gmail.com>
Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
Co-authored-by: Dhia Garbaya <84809366+dhia680@users.noreply.github.com>
2025-08-28 11:55:43 +02:00
f9b9a5e884 Update quantization overview for XPU (#40331)
* update xpu quantization overview

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix aqlm tests

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix format

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* update gguf support

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix gguf tests

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix xpu gguf precision error

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* replace deprecated models

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix import org

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* update xpu ggml tests

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* revert wrong change

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix xpu tests

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* xpu optimum-quanto goes green

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix format

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
2025-08-28 09:52:59 +00:00
b824f4986f fix typo (#40484)
* fix typo

Signed-off-by: guochenxu <guochenxu@modelbest.cn>

* csm & qwen omni

Signed-off-by: guochenxu <guochenxu@modelbest.cn>

* format

Signed-off-by: guochenxu <guochenxu@modelbest.cn>

* Apply style fixes

* omni

Signed-off-by: guochenxu <guochenxu@modelbest.cn>

---------

Signed-off-by: guochenxu <guochenxu@modelbest.cn>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
2025-08-28 08:31:25 +00:00
c9ff166718 Various AMD expectations (#40510)
* AMD expectations for qwen2

* Added more detailled excpectation to smolvlm

* Added AMD expectations to TableTransformer

* Style
2025-08-28 10:15:21 +02:00
721d4aee81 Include machine type in collated reports filename (#40514) 2025-08-28 09:28:12 +02:00
98289c5546 [modular] Classes can now be defined and referenced in arbitrary order (without bringing unwanted dependencies) (#40507)
* remove future class from dependency graph

* convert all
2025-08-27 23:06:10 +02:00
e3d8fd730e docs(pixtral): Update Pixtral model card to new format (#40442)
* docs(pixtral): Update Pixtral model card to new format

* docs(pixtral): Change cuda into auto for device_map

* docs(pixtral): Apply suggestions from review

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* docs(pixtral): Apply suggestions from review, changing mistral-community into Mistral AI

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* docs(pixtral): Apply suggestions from review [!TIP] part

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* docs(pixtral): Finalize model card with tested code examples

This commit finalizes the update for the Pixtral model card.

* Fix the hfoption by the right one

* @BryanBradfo docs(pixtral): Changing the redirection of bitsandbytes

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* docs(pixtral): Add of ` to highlight the tokens

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* docs(pixtral): Move image block per final review

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-08-27 11:38:51 -07:00
821384d5d4 Fix the CI workflow of merge to main (#40503)
* fix

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-27 18:35:12 +02:00
304225aa15 Collated reports: no need to upload artifact (#40502)
No need to upload collated reports as gh artifact
2025-08-27 18:31:55 +02:00
3c343c6601 [Whisper] Add rocm expected results to certain tests (#40482)
* Add rocm expected results to certain tests

* Specify rocm version in expectations so we know origin. Improved var names

* Update test var names
2025-08-27 16:19:11 +00:00
6350636964 Fix qwen2_moe tests (#40494)
update

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-27 16:22:04 +02:00
52aaa3f500 [EfficientLoFTR] dynamic image size support (#40329)
* fix: reverted efficientloftr embeddings computation to inference time with lru cache

* fix: added dtype and device for torch ones and zeros creation

* fix: fixed embed height and width computation with aggregation

* fix: make style

* fix error message

* fix fa2 tests

---------

Co-authored-by: qubvel <qubvel@gmail.com>
2025-08-27 15:05:08 +01:00
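A rough sketch of the caching pattern the first bullet mentions (a toy placeholder, not the EfficientLoFTR implementation):

```py
from functools import lru_cache

import torch

@lru_cache(maxsize=32)
def get_2d_positions(height: int, width: int) -> torch.Tensor:
    # Computed lazily at inference time and memoized per image size,
    # so dynamic image sizes don't redo identical work.
    grid = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij")
    return torch.stack(grid, dim=-1).float()  # stand-in for the real embedding math

print(get_2d_positions(480, 640).shape)  # torch.Size([480, 640, 2])
```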
ed5dd2999c [ESM] support attention API (#40370)
* ESM supports attention API

* supports flags

* fix tests

* fix copiees

* another fixup needed after fixing tests

* fix tests and make sure Evolla copied everything

* fix

* order

* forgot about "is_causal" for fa2

* cross attention can't be causal
2025-08-27 15:39:04 +02:00
8b804311ba [modular] Remove ambiguity in all calls to parent class methods + fix dependency graph (#40456)
* fix in modular

* remove leftover print

* fix everything except when it's in assignment

* fix assignment as well

* more general

* better

* better

* better comment

* docstring

* cleaner

* remove base

* doc
2025-08-27 14:51:28 +02:00
a3afebbbbe [modular] Use multi-processing + fix model import issue (#40481)
* add mp and simplify a bit

* improve

* fix

* fix imports

* nit
2025-08-27 14:51:12 +02:00
75d6f17de6 Validate GptOssConfig rope config after it's fully initialized (#40474)
* Validate GptOssConfig rope config after it's fully initialized

Fixes #40461

* Remove whitespaces
2025-08-27 10:16:58 +01:00
80f4c0c6a0 CI when PR merged to main (#40451)
* up

* up

* up

* up

* up

* update

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-27 10:56:18 +02:00
ff8b88a948 Fix nightly torch CI (#40469)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-26 22:02:15 +02:00
74ad608a2b Not to shock AMD team by the cancelled workflow run notification ❤️ 💖 (#40467) 2025-08-26 20:53:24 +02:00
c8c7623f20 Update SegFormer model card (#40417)
* Update SegFormer model card

* Update docs/source/en/model_doc/segformer.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/segformer.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/segformer.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/segformer.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/segformer.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/segformer.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/segformer.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update the segformer model card

* Remove quantization example

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-08-26 08:27:25 -07:00
705 changed files with 5707 additions and 84230 deletions

View File

@ -41,9 +41,3 @@ jobs:
--job ${{ inputs.job }} \
--report-repo-id ${{ inputs.report_repo_id }} \
--gpu-name ${{ inputs.gpu_name }}
- name: Upload collated reports
uses: actions/upload-artifact@v4
with:
name: collated_reports_${{ env.CI_SHA }}.json
path: collated_reports_${{ env.CI_SHA }}.json

View File

@ -4,17 +4,6 @@ on:
push:
branches: [ main ]
env:
OUTPUT_SLACK_CHANNEL_ID: "C06L2SGMEEA"
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
HF_HOME: /mnt/cache
TRANSFORMERS_IS_CI: yes
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`.
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
TF_FORCE_GPU_ALLOW_GROWTH: true
jobs:
get_modified_models:
name: "Get all modified files"
@ -25,111 +14,144 @@ jobs:
- name: Check out code
uses: actions/checkout@v4
- name: Get changed files
id: changed-files
uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c
- name: Get changed files using `actions/github-script`
id: get-changed-files
uses: actions/github-script@v7
with:
files: src/transformers/models/**
script: |
let files = [];
// Only handle push events
if (context.eventName === 'push') {
const afterSha = context.payload.after;
const branchName = context.payload.ref.replace('refs/heads/', '');
let baseSha;
if (branchName === 'main') {
console.log('Push to main branch, comparing to parent commit');
// Get the parent commit of the pushed commit
const { data: commit } = await github.rest.repos.getCommit({
owner: context.repo.owner,
repo: context.repo.repo,
ref: afterSha
});
baseSha = commit.parents[0]?.sha;
if (!baseSha) {
throw new Error('No parent commit found for the pushed commit');
}
} else {
console.log(`Push to branch ${branchName}, comparing to main`);
baseSha = 'main';
}
const { data: comparison } = await github.rest.repos.compareCommits({
owner: context.repo.owner,
repo: context.repo.repo,
base: baseSha,
head: afterSha
});
// Include added, modified, and renamed files
files = comparison.files
.filter(file => file.status === 'added' || file.status === 'modified' || file.status === 'renamed')
.map(file => file.filename);
}
// Include all files under src/transformers/ (not just models subdirectory)
const filteredFiles = files.filter(file =>
file.startsWith('src/transformers/')
);
core.setOutput('changed_files', filteredFiles.join(' '));
core.setOutput('any_changed', filteredFiles.length > 0 ? 'true' : 'false');
- name: Run step if only the files listed above change
if: steps.changed-files.outputs.any_changed == 'true'
id: set-matrix
- name: Parse changed files with Python
if: steps.get-changed-files.outputs.any_changed == 'true'
env:
ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
CHANGED_FILES: ${{ steps.get-changed-files.outputs.changed_files }}
id: set-matrix
run: |
model_arrays=()
for file in $ALL_CHANGED_FILES; do
model_path="${file#*models/}"
model_path="models/${model_path%%/*}"
if grep -qFx "$model_path" utils/important_models.txt; then
# Append the file to the matrix string
model_arrays+=("$model_path")
fi
done
matrix_string=$(printf '"%s", ' "${model_arrays[@]}" | sed 's/, $//')
echo "matrix=[$matrix_string]" >> $GITHUB_OUTPUT
test_modified_files:
python3 - << 'EOF'
import os
import sys
import json
# Add the utils directory to Python path
sys.path.insert(0, 'utils')
# Import the important models list
from important_files import IMPORTANT_MODELS
print(f"Important models: {IMPORTANT_MODELS}")
# Get the changed files from the previous step
changed_files_str = os.environ.get('CHANGED_FILES', '')
changed_files = changed_files_str.split() if changed_files_str else []
# Filter to only Python files
python_files = [f for f in changed_files if f.endswith('.py')]
print(f"Python files changed: {python_files}")
result_models = set()
# Specific files that trigger all models
transformers_utils_files = [
'modeling_utils.py',
'modeling_rope_utils.py',
'modeling_flash_attention_utils.py',
'modeling_attn_mask_utils.py',
'cache_utils.py',
'masking_utils.py',
'pytorch_utils.py'
]
# Single loop through all Python files
for file in python_files:
# Check for files under src/transformers/models/
if file.startswith('src/transformers/models/'):
remaining_path = file[len('src/transformers/models/'):]
if '/' in remaining_path:
model_dir = remaining_path.split('/')[0]
if model_dir in IMPORTANT_MODELS:
result_models.add(model_dir)
print(f"Added model directory: {model_dir}")
# Check for specific files under src/transformers/ or src/transformers/generation/ files
elif file.startswith('src/transformers/generation/') or \
(file.startswith('src/transformers/') and os.path.basename(file) in transformers_utils_files):
print(f"Found core file: {file} - including all important models")
result_models.update(IMPORTANT_MODELS)
break # No need to continue once we include all models
# Convert to sorted list and create matrix
result_list = sorted(list(result_models))
print(f"Final model list: {result_list}")
if result_list:
matrix_json = json.dumps(result_list)
print(f"matrix={matrix_json}")
# Write to GITHUB_OUTPUT
with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
f.write(f"matrix={matrix_json}\n")
else:
print("matrix=[]")
with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
f.write("matrix=[]\n")
EOF
model-ci:
name: Model CI
uses: ./.github/workflows/self-scheduled.yml
needs: get_modified_models
name: Slow & FA2 tests
runs-on:
group: aws-g5-4xlarge-cache
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
if: ${{ needs.get_modified_models.outputs.matrix != '[]' && needs.get_modified_models.outputs.matrix != '' && fromJson(needs.get_modified_models.outputs.matrix)[0] != null }}
strategy:
fail-fast: false
matrix:
model-name: ${{ fromJson(needs.get_modified_models.outputs.matrix) }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Install locally transformers & other libs
run: |
apt install sudo
sudo -H pip install --upgrade pip
sudo -H pip uninstall -y transformers
sudo -H pip install -U -e ".[testing]"
MAX_JOBS=4 pip install flash-attn --no-build-isolation
pip install bitsandbytes
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Show installed libraries and their versions
run: pip freeze
- name: Run FA2 tests
id: run_fa2_tests
run:
pytest -rsfE -m "flash_attn_test" --make-reports=${{ matrix.model-name }}_fa2_tests/ tests/${{ matrix.model-name }}/test_modeling_*
- name: "Test suite reports artifacts: ${{ matrix.model-name }}_fa2_tests"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.model-name }}_fa2_tests
path: /transformers/reports/${{ matrix.model-name }}_fa2_tests
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }}
title: 🤗 Results of the FA2 tests - ${{ matrix.model-name }}
status: ${{ steps.run_fa2_tests.conclusion}}
slack_token: ${{ secrets.CI_SLACK_BOT_TOKEN }}
- name: Run integration tests
id: run_integration_tests
if: always()
run:
pytest -rsfE -k "IntegrationTest" --make-reports=tests_integration_${{ matrix.model-name }} tests/${{ matrix.model-name }}/test_modeling_*
- name: "Test suite reports artifacts: tests_integration_${{ matrix.model-name }}"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: tests_integration_${{ matrix.model-name }}
path: /transformers/reports/tests_integration_${{ matrix.model-name }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }}
title: 🤗 Results of the Integration tests - ${{ matrix.model-name }}
status: ${{ steps.run_integration_tests.conclusion}}
slack_token: ${{ secrets.CI_SLACK_BOT_TOKEN }}
- name: Tailscale # In order to be able to SSH when a test fails
if: ${{ runner.debug == '1'}}
uses: huggingface/tailscale-action@v1
with:
authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
waitForSSH: true
if: needs.get_modified_models.outputs.matrix != '' && needs.get_modified_models.outputs.matrix != '[]'
with:
job: run_models_gpu
slack_report_channel: "#transformers-ci-push"
docker: huggingface/transformers-all-latest-gpu
ci_event: push
report_repo_id: hf-internal-testing/transformers_ci_push
commit_sha: ${{ github.sha }}
models: ${{ needs.get_modified_models.outputs.matrix }}
secrets: inherit

View File

@ -12,12 +12,34 @@ on:
branches:
- run_ci_with_nightly_torch*
# Used for `push` to easily modify the target workflow runs to compare against
env:
prev_workflow_run_id: ""
other_workflow_run_id: ""
jobs:
build_nightly_torch_ci_images:
name: Build CI Docker Images with nightly torch
uses: ./.github/workflows/build-nightly-ci-docker-images.yml
secrets: inherit
setup:
name: Setup
runs-on: ubuntu-22.04
steps:
- name: Setup
run: |
mkdir "setup_values"
echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt"
echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt"
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: setup_values
path: setup_values
model-ci:
name: Model CI
needs: build_nightly_torch_ci_images

View File

@ -31,7 +31,10 @@ on:
commit_sha:
required: false
type: string
models:
default: ""
required: false
type: string
env:
HF_HOME: /mnt/cache
@ -68,7 +71,7 @@ jobs:
- name: Update clone
working-directory: /transformers
run: |
git fetch && git checkout ${{ github.sha }}
git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
- name: Cleanup
working-directory: /transformers
@ -87,7 +90,7 @@ jobs:
working-directory: /transformers/tests
run: |
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
echo "folder_slices=$(python3 ../utils/split_model_tests.py --models '${{ inputs.models }}' --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
echo "runner_map=$(python3 ../utils/get_runner_map.py)" >> $GITHUB_OUTPUT
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
@ -512,7 +515,7 @@ jobs:
run_quantization_torch_gpu,
run_extract_warnings
]
if: ${{ always() }}
if: always() && !cancelled()
uses: ./.github/workflows/slack-report.yml
with:
job: ${{ inputs.job }}

View File

@ -36,7 +36,7 @@ jobs:
send_results:
name: Send results to webhook
runs-on: ubuntu-22.04
if: always()
if: always() && !cancelled()
steps:
- name: Preliminary job status
shell: bash
@ -75,6 +75,8 @@ jobs:
SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
CI_EVENT: ${{ inputs.ci_event }}
# This `CI_TITLE` would be empty for `schedule` or `workflow_run` events.
CI_TITLE: ${{ github.event.head_commit.message }}
CI_SHA: ${{ inputs.commit_sha || github.sha }}
CI_TEST_JOB: ${{ inputs.job }}
SETUP_STATUS: ${{ inputs.setup_status }}
@ -91,7 +93,7 @@ jobs:
python utils/notification_service.py "${{ inputs.quantization_matrix }}"
else
python utils/notification_service.py "${{ inputs.folder_slices }}"
fi
fi
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
- name: Failure table artifacts

View File

@ -32,7 +32,10 @@ RUN python3 -m pip uninstall -y flax jax
RUN python3 -m pip install --no-cache-dir -U timm
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git || echo "Don't install detectron2 with nightly torch"
RUN python3 -m pip install --no-cache-dir pytesseract
RUN python3 -m pip install -U "itsdangerous<2.1.0"
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
@ -41,6 +44,8 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/pef
# For bettertransformer
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum
# For kernels
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/kernels@main#egg=kernels
# For video model testing
RUN python3 -m pip install --no-cache-dir av
@ -52,7 +57,7 @@ RUN python3 -m pip install --no-cache-dir bitsandbytes
RUN python3 -m pip install --no-cache-dir quanto
# After using A10 as CI runner, let's run FA2 tests
RUN python3 -m pip uninstall -y ninja && python3 -m pip install --no-cache-dir ninja && python3 -m pip install flash-attn --no-cache-dir --no-build-isolation
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip uninstall -y ninja && python3 -m pip install --no-cache-dir ninja && python3 -m pip install flash-attn --no-cache-dir --no-build-isolation || echo "Don't install FA2 with nightly torch"
# TODO (ydshieh): check this again
# `quanto` will install `ninja` which leads to many `CUDA error: an illegal memory access ...` in some model tests

View File

@ -17,6 +17,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
jupyter \
tensorflow \
torch
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/kernels@main#egg=kernels
RUN git clone https://github.com/NVIDIA/apex
RUN cd apex && \

View File

@ -373,6 +373,8 @@
- sections:
- local: model_doc/albert
title: ALBERT
- local: model_doc/apertus
title: Apertus
- local: model_doc/arcee
title: Arcee
- local: model_doc/bamba

View File

@ -15,6 +15,7 @@ rendered properly in your Markdown viewer.
-->
# Caching
Imagine you're having a conversation with someone, and instead of remembering what they previously said, they have to start from scratch every time you respond. This would be slow and inefficient, right?
You can extend this analogy to transformer models. Autoregressive model generation can be slow because it makes a prediction one token at a time. Each new prediction is dependent on all the previous context.
@ -107,7 +108,7 @@ model_id = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map=device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
past_key_values = DynamicCache()
past_key_values = DynamicCache(config=model.config)
messages = [{"role": "user", "content": "Hello, what's your name."}]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device)
@ -138,7 +139,7 @@ The cache position tracks where to insert new tokens in the attention cache. It
Cache position is used internally for two purposes:
1. Selecting new tokens to process in the input sequence and ensuring only tokens that haven't been cached yet are passed to the model's `forward`.
2. Storing key/value pairs at the correct positions in the cache. This is especially important for fixed-size caches, like [`StaticCache`], that pre-allocates a specific cache length.
2. Storing key/value pairs at the correct positions in the cache. This is especially important for fixed-size caches, that pre-allocates a specific cache length.
The generation loop usually takes care of the cache position, but if you're writing a custom generation method, it is important that cache positions are accurate since they are used to write and read key/value states into fixed slots.
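Pulling the hunk's pieces together, a minimal end-to-end sketch of the new initialization pattern (the prompt and generation length are arbitrary; the checkpoint is the one used by the surrounding docs):

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

model_id = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The cache is now created from the model config instead of starting empty.
past_key_values = DynamicCache(config=model.config)

inputs = tokenizer("Hello, what's your name?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20, past_key_values=past_key_values)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```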

View File

@ -33,6 +33,7 @@ Add the `gguf_file` parameter to [`~PreTrainedModel.from_pretrained`] to specify
```py
# pip install gguf
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"

View File

@ -227,7 +227,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
user_prompts = ["Hello, what's your name?", "Btw, yesterday I was on a rock concert."]
past_key_values = DynamicCache()
past_key_values = DynamicCache(config=model.config)
messages = []
for prompt in user_prompts:

View File

@ -0,0 +1,100 @@
<!--Copyright 2025 The HuggingFace Team and the Swiss AI Initiative. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
</div>
</div>
# Apertus
[Apertus](https://www.swiss-ai.org) is a family of large language models from the Swiss AI Initiative.
> [!TIP]
> Coming soon
The example below demonstrates how to generate text with [`Pipeline`] or the [`AutoModel`], and from the command line.
<hfoptions id="usage">
<hfoption id="Pipeline">
```py
import torch
from transformers import pipeline
pipeline = pipeline(
task="text-generation",
model="swiss-ai/Apertus-8B",
dtype=torch.bfloat16,
device=0
)
pipeline("Plants create energy through a process known as")
```
</hfoption>
<hfoption id="AutoModel">
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
"swiss-ai/Apertus-8B",
)
model = AutoModelForCausalLM.from_pretrained(
"swiss-ai/Apertus-8B",
dtype=torch.bfloat16,
device_map="auto",
attn_implementation="sdpa"
)
input_ids = tokenizer("Plants create energy through a process known as", return_tensors="pt").to("cuda")
output = model.generate(**input_ids)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers CLI">
```bash
echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model swiss-ai/Apertus-8B --device 0
```
</hfoption>
</hfoptions>
## ApertusConfig
[[autodoc]] ApertusConfig
## ApertusModel
[[autodoc]] ApertusModel
- forward
## ApertusForCausalLM
[[autodoc]] ApertusForCausalLM
- forward
## ApertusForTokenClassification
[[autodoc]] ApertusForTokenClassification
- forward

View File

@ -45,7 +45,7 @@ results = keypoint_matcher([url_0, url_1], threshold=0.9)
print(results[0])
# {'keypoint_image_0': {'x': ..., 'y': ...}, 'keypoint_image_1': {'x': ..., 'y': ...}, 'score': ...}
```
<hfoption id="AutoModel">
</hfoption>
<hfoption id="AutoModel">
```py
@ -65,7 +65,7 @@ processor = AutoImageProcessor.from_pretrained("zju-community/efficientloftr")
model = AutoModelForKeypointMatching.from_pretrained("zju-community/efficientloftr")
inputs = processor(images, return_tensors="pt")
with torch.no_grad():
with torch.inference_mode():
outputs = model(**inputs)
# Post-process to get keypoints and matches
@ -92,7 +92,8 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
# EfficientLoFTR requires pairs of images
images = [image1, image2]
inputs = processor(images, return_tensors="pt")
outputs = model(**inputs)
with torch.inference_mode():
outputs = model(**inputs)
# Extract matching information
keypoints = outputs.keypoints # Keypoints in both images

View File

@ -150,7 +150,7 @@ visualizer("LLMs generate text through a process known as")
)
input_text = "LLMs generate text through a process known as"
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
past_key_values = DynamicCache()
past_key_values = DynamicCache(config=model.config)
outputs = model.generate(**input_ids, max_new_tokens=50, past_key_values=past_key_values)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

View File

@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
*This model was released on 2023-09-23 and added to Hugging Face Transformers on 2025-08-19.*
*This model was released on {release_date} and added to Hugging Face Transformers on 2025-08-19.*
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
@ -48,14 +48,14 @@ import requests
from PIL import Image, ImageDraw
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration, infer_device
repo = "ydshieh/kosmos-2.5"
device = f"{infer_device()}:0"
repo = "microsoft/kosmos-2.5"
device = "cuda:0"
dtype = torch.bfloat16
model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, dtype=dtype)
processor = AutoProcessor.from_pretrained(repo)
# sample image
url = "https://huggingface.co/ydshieh/kosmos-2.5/resolve/main/receipt_00008.png"
url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
image = Image.open(requests.get(url, stream=True).raw)
prompt = "<md>"
@ -87,14 +87,14 @@ import requests
from PIL import Image, ImageDraw
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration, infer_device
repo = "ydshieh/kosmos-2.5"
device = f"{infer_device()}:0"
repo = "microsoft/kosmos-2.5"
device = "cuda:0"
dtype = torch.bfloat16
model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, dtype=dtype)
processor = AutoProcessor.from_pretrained(repo)
# sample image
url = "https://huggingface.co/ydshieh/kosmos-2.5/resolve/main/receipt_00008.png"
url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
image = Image.open(requests.get(url, stream=True).raw)
# bs = 1
@ -160,12 +160,52 @@ image.save("output.png")
</hfoptions>
## Example
**Markdown Task:** For usage instructions, please refer to [md.py](https://huggingface.co/ydshieh/kosmos-2.5/blob/main/md.py).
## Chat version
**OCR Task:** For usage instructions, please refer to [ocr.py](https://huggingface.co/ydshieh/kosmos-2.5/blob/main/ocr.py).
The authors also released Kosmos-2.5 Chat, which is a chat version optimized for document understanding. You can use it like so:
```python
import re
import torch
import requests
from PIL import Image, ImageDraw
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
repo = "microsoft/kosmos-2.5-chat"
device = "cuda:0"
dtype = torch.bfloat16
model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo,
device_map=device,
torch_dtype=dtype,
attn_implementation="flash_attention_2")
processor = AutoProcessor.from_pretrained(repo)
# sample image
url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
image = Image.open(requests.get(url, stream=True).raw)
question = "What is the sub total of the receipt?"
template = "<md>A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
prompt = template.format(question)
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
generated_ids = model.generate(
**inputs,
max_new_tokens=1024,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(generated_text[0])
```
## Kosmos2_5Config

View File

@ -47,6 +47,8 @@ results = keypoint_matcher([url_0, url_1], threshold=0.9)
print(results[0])
# {'keypoint_image_0': {'x': ..., 'y': ...}, 'keypoint_image_1': {'x': ..., 'y': ...}, 'score': ...}
```
</hfoption>
<hfoption id="AutoModel">
```py
@ -66,7 +68,7 @@ processor = AutoImageProcessor.from_pretrained("ETH-CVG/lightglue_superpoint")
model = AutoModel.from_pretrained("ETH-CVG/lightglue_superpoint")
inputs = processor(images, return_tensors="pt")
with torch.no_grad():
with torch.inference_mode():
outputs = model(**inputs)
# Post-process to get keypoints and matches
@ -93,7 +95,8 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
# LightGlue requires pairs of images
images = [image1, image2]
inputs = processor(images, return_tensors="pt")
outputs = model(**inputs)
with torch.inference_mode():
outputs = model(**inputs)
# Extract matching information
keypoints0 = outputs.keypoints0 # Keypoints in first image

View File

@ -25,7 +25,7 @@ rendered properly in your Markdown viewer.
# OPT
[OPT](https://huggingface.co/papers/2205.01068) is a suite of open-source decoder-only pre-trained transformers whose parameters range from 125M to 175B. OPT models are designed for casual language modeling and aim to enable responsible and reproducible research at scale. OPT-175B is comparable in performance to GPT-3 with only 1/7th the carbon footprint.
[OPT](https://huggingface.co/papers/2205.01068) is a suite of open-source decoder-only pre-trained transformers whose parameters range from 125M to 175B. OPT models are designed for causal language modeling and aim to enable responsible and reproducible research at scale. OPT-175B is comparable in performance to GPT-3 with only 1/7th the carbon footprint.
You can find all the original OPT checkpoints under the [OPT](https://huggingface.co/collections/facebook/opt-66ed00e15599f02966818844) collection.

View File

@ -15,74 +15,126 @@ rendered properly in your Markdown viewer.
-->
*This model was released on 2024-09-17 and added to Hugging Face Transformers on 2024-09-14.*
# Pixtral
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
## Overview
# Pixtral
The [Pixtral](https://huggingface.co/papers/2410.07073) model was released by the Mistral AI team in a [blog post](https://mistral.ai/news/pixtral-12b/). Pixtral is a multimodal version of [Mistral](mistral), incorporating a 400 million parameter vision encoder trained from scratch.
The intro from the blog says the following:
*Pixtral is trained to understand both natural images and documents, achieving 52.5% on the MMMU reasoning benchmark, surpassing a number of larger models. The model shows strong abilities in tasks such as chart and figure understanding, document question answering, multimodal reasoning and instruction following. Pixtral is able to ingest images at their natural resolution and aspect ratio, giving the user flexibility on the number of tokens used to process an image. Pixtral is also able to process any number of images in its long context window of 128K tokens. Unlike previous open-source models, Pixtral does not compromise on text benchmark performance to excel in multimodal tasks.*
[Pixtral](https://huggingface.co/papers/2410.07073) is a multimodal model trained to understand natural images and documents. It accepts images in their natural resolution and aspect ratio without resizing or padding due to its 2D RoPE embeddings. In addition, Pixtral has a long 128K token context window for processing a large number of images. Pixtral couples a 400M vision encoder with a 12B Mistral Nemo decoder.
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/pixtral_architecture.webp"
alt="drawing" width="600"/>
<small> Pixtral architecture. Taken from the <a href="https://mistral.ai/news/pixtral-12b/">blog post.</a> </small>
Tips:
You can find all the original Pixtral checkpoints under the [Mistral AI](https://huggingface.co/mistralai/models?search=pixtral) organization.
- Pixtral is a multimodal model, taking images and text as input, and producing text as output.
- This model follows the [Llava](llava) architecture. The model uses [`PixtralVisionModel`] for its vision encoder, and [`MistralForCausalLM`] for its language decoder.
- The main contribution is the 2d ROPE (rotary position embeddings) on the images, and support for arbitrary image sizes (the images are not padded together nor are they resized).
- Similar to [Llava](llava), the model internally replaces the `[IMG]` token placeholders by image embeddings from the vision encoder. The format for one or multiple prompts is the following:
```
"<s>[INST][IMG]\nWhat are the things I should be cautious about when I visit this place?[/INST]"
```
Then, the processor will replace each `[IMG]` token with a number of `[IMG]` tokens that depend on the height and the width of each image. Each *row* of the image is separated by an `[IMG_BREAK]` token, and each image is separated by an `[IMG_END]` token. It's advised to use the `apply_chat_template` method of the processor, which takes care of all of this and formats the text for you. If you're using `transformers>=4.49.0`, you can also get a vectorized output from `apply_chat_template`. See the [usage section](#usage) for more info.
> [!TIP]
> This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [ArthurZ](https://huggingface.co/ArthurZ).
> Click on the Pixtral models in the right sidebar for more examples of how to apply Pixtral to different vision and language tasks.
<hfoptions id="usage">
This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [ArthurZ](https://huggingface.co/ArthurZ). The original code can be found [here](https://github.com/vllm-project/vllm/pull/8377).
## Usage
At inference time, it's advised to use the processor's `apply_chat_template` method, which correctly formats the prompt for the model:
<hfoption id="AutoModel">
```python
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
model_id = "mistral-community/pixtral-12b"
model = LlavaForConditionalGeneration.from_pretrained(model_id, dtype="auto", device_map="auto")
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id, device_map="auto")
url_dog = "https://picsum.photos/id/237/200/300"
url_mountain = "https://picsum.photos/seed/picsum/200/300"
chat = [
{
"role": "user", "content": [
{"type": "text", "content": "Can this animal"},
{"type": "image", "url": "https://picsum.photos/id/237/200/300"},
{"type": "image", "url": url_dog},
{"type": "text", "content": "live here?"},
{"type": "image", "url": "https://picsum.photos/seed/picsum/200/300"}
{"type": "image", "url" : url_mountain}
]
}
]
inputs = processor.apply_chat_template(
chat,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt"
).to(model.device)
inputs = processor.apply_chat_template(chat, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device)
generate_ids = model.generate(**inputs, max_new_tokens=500)
output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
```
</hfoption>
</hfoptions>
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the model to 4-bits.
```python
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig
model_id = "mistral-community/pixtral-12b"
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)
model = LlavaForConditionalGeneration.from_pretrained(
model_id,
quantization_config=quantization_config,
device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id)
dog_url = "https://picsum.photos/id/237/200/300"
mountain_url = "https://picsum.photos/seed/picsum/200/300"
dog_image = Image.open(requests.get(dog_url, stream=True).raw)
mountain_image = Image.open(requests.get(mountain_url, stream=True).raw)
chat = [
{
"role": "user", "content": [
{"type": "text", "text": "Can this animal"},
{"type": "image"},
{"type": "text", "text": "live here?"},
{"type": "image"}
]
}
]
prompt = processor.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inputs = processor(text=prompt, images=[dog_image, mountain_image], return_tensors="pt")
inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)
inputs = {k: v.to(model.device) for k, v in inputs.items()}
generate_ids = model.generate(**inputs, max_new_tokens=100)
output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(output)
```
## Notes
- Pixtral uses [`PixtralVisionModel`] as the vision encoder and [`MistralForCausalLM`] for its language decoder.
- The model internally replaces `[IMG]` token placeholders with image embeddings.
```py
"<s>[INST][IMG]\nWhat are the things I should be cautious about when I visit this place?[/INST]"
```
The `[IMG]` tokens are replaced with a number of `[IMG]` tokens that depend on the height and width of each image. Each row of the image is separated by a `[IMG_BREAK]` token and each image is separated by a `[IMG_END]` token. Use the [`~Processor.apply_chat_template`] method to handle these tokens for you.
## PixtralVisionConfig
[[autodoc]] PixtralVisionConfig

View File

@ -1,113 +1,97 @@
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
⚠️ Note that this file is in Markdown but contains specific syntax
for our doc-builder (similar to MDX) that may not render properly
in your Markdown viewer.
-->
*This model was released on 2021-05-31 and added to Hugging Face Transformers on 2021-10-28.*
# SegFormer
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
## Overview
# SegFormer
The SegFormer model was proposed in [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://huggingface.co/papers/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping
Luo. The model consists of a hierarchical Transformer encoder and a lightweight all-MLP decode head to achieve great
results on image segmentation benchmarks such as ADE20K and Cityscapes.
[SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://huggingface.co/papers/2105.15203) is a semantic segmentation model that combines a hierarchical Transformer encoder (Mix Transformer, MiT) with a lightweight all-MLP decoder. It avoids positional encodings and complex decoders and achieves state-of-the-art performance on benchmarks like ADE20K and Cityscapes. This simple and lightweight design is more efficient and scalable.
The abstract from the paper is the following:
*We present SegFormer, a simple, efficient yet powerful semantic segmentation framework which unifies Transformers with
lightweight multilayer perception (MLP) decoders. SegFormer has two appealing features: 1) SegFormer comprises a novel
hierarchically structured Transformer encoder which outputs multiscale features. It does not need positional encoding,
thereby avoiding the interpolation of positional codes which leads to decreased performance when the testing resolution
differs from training. 2) SegFormer avoids complex decoders. The proposed MLP decoder aggregates information from
different layers, and thus combining both local attention and global attention to render powerful representations. We
show that this simple and lightweight design is the key to efficient segmentation on Transformers. We scale our
approach up to obtain a series of models from SegFormer-B0 to SegFormer-B5, reaching significantly better performance
and efficiency than previous counterparts. For example, SegFormer-B4 achieves 50.3% mIoU on ADE20K with 64M parameters,
being 5x smaller and 2.2% better than the previous best method. Our best model, SegFormer-B5, achieves 84.0% mIoU on
Cityscapes validation set and shows excellent zero-shot robustness on Cityscapes-C.*
The figure below illustrates the architecture of SegFormer. Taken from the [original paper](https://huggingface.co/papers/2105.15203).
The figure below illustrates the architecture of SegFormer.
<img width="600" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/segformer_architecture.png"/>
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/NVlabs/SegFormer).
You can find all the original SegFormer checkpoints under the [NVIDIA](https://huggingface.co/nvidia/models?search=segformer) organization.
## Usage tips
> [!TIP]
> This model was contributed by [nielsr](https://huggingface.co/nielsr).
>
> Click on the SegFormer models in the right sidebar for more examples of how to apply SegFormer to different vision tasks.
- SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decoder head.
[`SegformerModel`] is the hierarchical Transformer encoder (which in the paper is also referred to
as Mix Transformer or MiT). [`SegformerForSemanticSegmentation`] adds the all-MLP decoder head on
top to perform semantic segmentation of images. In addition, there's
[`SegformerForImageClassification`] which can be used to - you guessed it - classify images. The
authors of SegFormer first pre-trained the Transformer encoder on ImageNet-1k to classify images. They then discarded the
classification head, replaced it with the all-MLP decode head, and fine-tuned the whole model on
ADE20K, Cityscapes and COCO-stuff, which are important benchmarks for semantic segmentation. All checkpoints can be
found on the [hub](https://huggingface.co/models?other=segformer).
- The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and
fine-tuning on custom data). One can also check out the [blog post](https://huggingface.co/blog/fine-tune-segformer) introducing SegFormer and illustrating how it can be fine-tuned on custom data.
- One can also check out [this interactive demo on Hugging Face Spaces](https://huggingface.co/spaces/chansung/segformer-tf-transformers)
to try out a SegFormer model on custom images.
- SegFormer works on any input size, as it pads the input to be divisible by `config.patch_sizes`.
- One can use [`SegformerImageProcessor`] to prepare images and corresponding segmentation maps
for the model. Note that this image processor is fairly basic and does not include all data augmentations used in
the original paper. The original preprocessing pipelines (for the ADE20k dataset for instance) can be found [here](https://github.com/NVlabs/SegFormer/blob/master/local_configs/_base_/datasets/ade20k_repeat.py). The most
important preprocessing step is that images and segmentation maps are randomly cropped and padded to the same size,
such as 512x512 or 640x640, after which they are normalized.
- One additional thing to keep in mind is that one can initialize [`SegformerImageProcessor`] with
`do_reduce_labels` set to `True` or `False`. In some datasets (like ADE20k), the 0 index is used in the annotated
segmentation maps for background. However, ADE20k doesn't include the "background" class in its 150 labels.
Therefore, `do_reduce_labels` is used to reduce all labels by 1, and to make sure no loss is computed for the
background class (i.e. it replaces 0 in the annotated maps by 255, which is the *ignore_index* of the loss function
used by [`SegformerForSemanticSegmentation`]). However, other datasets use the 0 index as
background class and include this class as part of all labels. In that case, `do_reduce_labels` should be set to
`False`, as loss should also be computed for the background class.
- As with most models, SegFormer comes in different sizes, the details of which can be found in the table below
(taken from Table 7 of the [original paper](https://huggingface.co/papers/2105.15203)).
| **Model variant** | **Depths** | **Hidden sizes** | **Decoder hidden size** | **Params (M)** | **ImageNet-1k Top 1** |
| :---------------: | ------------- | ------------------- | :---------------------: | :------------: | :-------------------: |
| MiT-b0 | [2, 2, 2, 2] | [32, 64, 160, 256] | 256 | 3.7 | 70.5 |
| MiT-b1 | [2, 2, 2, 2] | [64, 128, 320, 512] | 256 | 14.0 | 78.7 |
| MiT-b2 | [3, 4, 6, 3] | [64, 128, 320, 512] | 768 | 25.4 | 81.6 |
| MiT-b3 | [3, 4, 18, 3] | [64, 128, 320, 512] | 768 | 45.2 | 83.1 |
| MiT-b4 | [3, 8, 27, 3] | [64, 128, 320, 512] | 768 | 62.6 | 83.6 |
| MiT-b5 | [3, 6, 40, 3] | [64, 128, 320, 512] | 768 | 82.0 | 83.8 |
Note that MiT in the above table refers to the Mix Transformer encoder backbone introduced in SegFormer. For
SegFormer's results on the segmentation datasets like ADE20k, refer to the [paper](https://huggingface.co/papers/2105.15203).
The example below demonstrates semantic segmentation with [`Pipeline`] or the [`AutoModel`] class.
<hfoptions id="usage">
<hfoption id="Pipeline">
```python
import torch
from transformers import pipeline
pipeline = pipeline(task="image-segmentation", model="nvidia/segformer-b0-finetuned-ade-512-512", torch_dtype=torch.float16)
pipeline("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg")
```
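Continuing from the snippet above, the pipeline returns one prediction per detected segment; a minimal sketch of inspecting them, assuming each entry carries a class `label` and a PIL `mask` as in the image-segmentation pipeline output:

```python
predictions = pipeline("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg")
for prediction in predictions:
    # Each entry is a dict with the class name and a PIL.Image mask for that segment.
    print(prediction["label"], prediction["mask"].size)
```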
</hfoption>
<hfoption id="AutoModel">
```python
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, AutoModelForSemanticSegmentation
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
processor = AutoProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model = AutoModelForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits  # shape (batch_size, num_labels, height/4, width/4)
```
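To go from the low-resolution logits to a per-pixel class map at the original image size, the image processor's post-processing can be used. A minimal sketch, continuing from the snippet above and assuming the resolved processor is SegFormer's image processor (which exposes `post_process_semantic_segmentation`) and that the checkpoint provides an `id2label` mapping:

```python
# Upsample the logits to the input resolution and take the argmax per pixel.
segmentation = processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
print(segmentation.shape)  # (height, width) tensor of predicted class indices
print(model.config.id2label[int(segmentation[0, 0])])  # class name predicted for the top-left pixel
```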
</hfoption>
</hfoptions>
## Notes
- SegFormer works with **any input size**, padding inputs to be divisible by `config.patch_sizes`.
- The most important preprocessing step is to randomly crop and pad all images to the same size (such as 512x512 or 640x640) and normalize afterwards.
- Some datasets (like ADE20k) use the `0` index in the annotated segmentation maps for the background, but don't include a "background" class in their labels. The `do_reduce_labels` argument in [`SegformerImageProcessor`] reduces all labels by `1`. To make sure no loss is computed for the background class, it replaces `0` in the annotated maps with `255`, which is the `ignore_index` of the loss function.
Other datasets do include and label a background class, in which case `do_reduce_labels` should be `False` so the loss is also computed for it.
```python
from transformers import SegformerImageProcessor
processor = SegformerImageProcessor(do_reduce_labels=True)
```
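During fine-tuning, the same processor can prepare image/segmentation-map pairs in one call. A minimal sketch with dummy NumPy inputs standing in for a real dataset sample (shapes and label count are illustrative):

```python
import numpy as np
from transformers import SegformerImageProcessor

processor = SegformerImageProcessor(do_reduce_labels=True)

# Dummy 512x512 RGB image and segmentation map standing in for a real dataset sample.
image = np.random.randint(0, 256, (512, 512, 3), dtype=np.uint8)
segmentation_map = np.random.randint(0, 151, (512, 512), dtype=np.uint8)

# "labels" holds the (label-reduced) segmentation map, ready for SegformerForSemanticSegmentation.
inputs = processor(images=image, segmentation_maps=segmentation_map, return_tensors="pt")
print(inputs["pixel_values"].shape, inputs["labels"].shape)
```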
## Resources
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SegFormer.
<PipelineTag pipeline="image-classification"/>
- [`SegformerForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
- [Image classification task guide](../tasks/image_classification)
Semantic segmentation:
- [`SegformerForSemanticSegmentation`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/semantic-segmentation).
- A blog on fine-tuning SegFormer on a custom dataset can be found [here](https://huggingface.co/blog/fine-tune-segformer).
- More demo notebooks on SegFormer (both inference + fine-tuning on a custom dataset) can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer).
- [Semantic segmentation task guide](../tasks/semantic_segmentation)
- [Original SegFormer code (NVlabs)](https://github.com/NVlabs/SegFormer)
- [Fine-tuning blog post](https://huggingface.co/blog/fine-tune-segformer)
- [Tutorial notebooks (Niels Rogge)](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer)
- [Hugging Face demo space](https://huggingface.co/spaces/chansung/segformer-tf-transformers)
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
## SegformerConfig
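A minimal sketch of building a randomly initialized SegFormer from a configuration. The values mirror the MiT-b0 row of the table above; `num_labels=150` matches ADE20K and is only an illustrative choice:

```python
from transformers import SegformerConfig, SegformerForSemanticSegmentation

# MiT-b0-like encoder: 4 stages with the listed depths and hidden sizes, and a 256-dim all-MLP decode head.
config = SegformerConfig(
    depths=[2, 2, 2, 2],
    hidden_sizes=[32, 64, 160, 256],
    decoder_hidden_size=256,
    num_labels=150,
)
model = SegformerForSemanticSegmentation(config)
```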

View File

@ -68,7 +68,7 @@ processor = AutoImageProcessor.from_pretrained("magic-leap-community/superglue_o
model = AutoModel.from_pretrained("magic-leap-community/superglue_outdoor")
inputs = processor(images, return_tensors="pt")
with torch.no_grad():
with torch.inference_mode():
outputs = model(**inputs)
# Post-process to get keypoints and matches
@ -95,7 +95,8 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
# SuperGlue requires pairs of images
images = [image1, image2]
inputs = processor(images, return_tensors="pt")
outputs = model(**inputs)
with torch.inference_mode():
outputs = model(**inputs)
# Extract matching information
keypoints0 = outputs.keypoints0 # Keypoints in first image
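For context, a minimal sketch of iterating over the post-processed matches, assuming the `processed_outputs` returned by `post_process_keypoint_matching` in the hunk above, where each entry exposes `keypoints0`, `keypoints1` and `matching_scores`:

```python
# Each processed output pairs keypoints in image 0 with their match in image 1 plus a confidence score.
for match in processed_outputs:
    for kp0, kp1, score in zip(match["keypoints0"], match["keypoints1"], match["matching_scores"]):
        print(f"{kp0.tolist()} -> {kp1.tolist()} (score {float(score):.2f})")
```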

View File

@ -21,10 +21,10 @@ Model users still import and use the single-file interface they've grown familia
A linter "unravels" the modular file into a `modeling.py` file to preserve the single model, single file directory structure (modeling, processor, etc.). Inheritance is flattened to only a **single** level.
Run the command below to automatically generate a `modeling.py` file from a modular file.
Run the command below to automatically generate a `modeling.py` file from a modular file (assuming the lowercase snake_case name of the model you want to convert is `your_model`).
```bash
python utils/modular_model_converter.py --files-to-parse src/transformers/models/<your_model>/modular_<your_model>.py
python utils/modular_model_converter.py your_model
```
For example:
@ -35,12 +35,6 @@ For example:
You should be able to write everything (tokenizer, image processor, model, config, etc.) in a modular file, and the corresponding single files are generated from it.
Run the command below to ensure the generated content matches `modular_<your_model>.py`.
```bash
python utils/check_modular_conversion.py --files src/transformers/models/<your_model>/modular_<your_model>.py
```
The example below demonstrates how a model can be added with significantly fewer lines of code with Modular Transformers.
### BERT and RoBERTa
@ -412,17 +406,17 @@ class MyNewDummyModel(DummyModel):
del self.attribute
```
## Explicit super() calls
## Calling parent methods without unravelling their definition
If you still want to inherit from `DummyModel` but don't want to remove the `self.attribute`, be explicit about which class' `super()` you're calling. The example below shows how to call the `super()` of `nn.Module` (unraveled code shown on the right)
If you want to inherit from a module `DummyModule` and call `super()` WITHOUT unravelling the parent's code (that is, you want to call `super()` on the parent of the *generated* class), be explicit about which class' `super()` you're calling. The example below shows how to call the `super()` of `nn.Module` (unraveled code shown on the right). In this example, since `DummyModule` is itself an `nn.Module`, it makes sense to call `nn.Module.__init__(self)`, which matches the original intention. It is then unravelled as `super()` in `MyNewDummyModule` to follow Python best practices.
```py
class MyNewDummyModel(DummyModel, nn.Module): | class MyNewDummyModel(nn.Module):
|
def __init__(self, config: MyNewDummyConfig): | def __init__(self, config: MyNewDummyConfig):
nn.Module.__init__(config) | super().__init__()
self.foo = config.foo | self.foo = config.foo
... | ...
class MyNewDummyModule(DummyModule): | class MyNewDummyModule(nn.Module):
|
def __init__(self): | def __init__(self):
nn.Module.__init__(self) | super().__init__()
self.foo = config.foo | self.foo = config.foo
... | ...
```
## Deleting unused methods

View File

@ -24,23 +24,23 @@ Use the Space below to help you pick a quantization method depending on your har
| Quantization Method | On the fly quantization | CPU | CUDA GPU | ROCm GPU | Metal (Apple Silicon) | Intel GPU | Torch compile() | Bits | PEFT Fine Tuning | Serializable with 🤗Transformers | 🤗Transformers Support | Link to library |
|-------------------------------------------|----------------------|-----------------|----------|-----------|------------------------------------|-----------------|-----------------|--------------|------------------|-----------------------------|-------------------------|---------------------------------------------|
| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM |
| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 🟢 | 1/2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM |
| [AutoRound](./auto_round) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 🔴 | 2/3/4/8 | 🔴 | 🟢 | 🟢 | https://github.com/intel/auto-round |
| [AWQ](./awq) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ |
| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 | 🟢 | 🟡 | 🔴 | 🟡 | 🟢 | 4/8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes |
| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1/8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors |
| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
| [FP-Quant](./fp_quant) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 4 | 🔴 | 🟢 | 🟢 | https://github.com/IST-DASLab/FP-Quant |
| [GGUF / GGML (llama.cpp)](../gguf) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1/8 | 🔴 | [See Notes](../gguf) | [See Notes](../gguf) | https://github.com/ggerganov/llama.cpp |
| [GGUF / GGML (llama.cpp)](../gguf) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 🔴 | 1/8 | 🔴 | [See Notes](../gguf) | [See Notes](../gguf) | https://github.com/ggerganov/llama.cpp |
| [GPTQModel](./gptq) | 🔴 | 🟢 | 🟢 | 🟢 | 🟢 | 🟢 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/ModelCloud/GPTQModel |
| [AutoGPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
| [HIGGS](./higgs) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2/4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute |
| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
| [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2/4/8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto |
| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 🟢 | 1/8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
| [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 🟢 | 2/4/8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto |
| [FBGEMM_FP8](./fbgemm_fp8) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
| [torchao](./torchao) | 🟢 | 🟢 | 🟢 | 🔴 | 🟡 | 🔴 | | 4/8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
| [torchao](./torchao) | 🟢 | 🟢 | 🟢 | 🔴 | 🟡 | 🟢 | | 4/8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
| [VPTQ](./vptq) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1/8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ |
| [FINEGRAINED_FP8](./finegrained_fp8) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | |
| [FINEGRAINED_FP8](./finegrained_fp8) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🟢 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | |
| [SpQR](./spqr) | 🔴 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 3 | 🔴 | 🟢 | 🟢 | https://github.com/Vahe1994/SpQR/ |
| [Quark](./quark) | 🔴 | 🟢 | 🟢 | 🟢 | 🟢 | 🟢 | ? | 2/4/6/8/9/16 | 🔴 | 🔴 | 🟢 | https://quark.docs.amd.com/latest/ |
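As one concrete way to act on the table above, a minimal sketch of on-the-fly 4-bit loading with bitsandbytes (the model id is a placeholder; this requires a CUDA GPU with `bitsandbytes` installed):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit quantization applied while the weights are loaded; no pre-quantized checkpoint needed.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # placeholder model id
    quantization_config=quantization_config,
    device_map="auto",
)
```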

View File

@ -107,7 +107,7 @@ model_id = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map=device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
past_key_values = DynamicCache()
past_key_values = DynamicCache(config=model.config)
messages = [{"role": "user", "content": "Hello, what's your name."}]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device)
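For context, a minimal sketch of how the cache prepared above would typically be consumed during generation (`max_new_tokens` is an illustrative value):

```python
# Reuse the pre-built cache across the generation call so past keys/values are not recomputed.
outputs = model.generate(**inputs, past_key_values=past_key_values, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```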

View File

@ -29,7 +29,7 @@ BioGPT는 Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon,
## 사용 팁 [[usage-tips]]
- BioGPT는 절대적 위치 임베딩(absolute position embedding)을 사용하므로, 입력을 왼쪽이 아닌 오른쪽에서 패딩하는 것이 권장됩니다.
- BioGPT는 인과적 언어 모델링(Casual Langague Modeling, CLM) 목표로 학습되었기 때문에, 다음 토큰을 예측하는 데 강력한 성능을 보입니다. 이 기능을 활용하여 BioGPT는 구문적으로 일관된 텍스트를 생성할 수 있으며, 예시 스크립트 `run_generation.py`에서 이를 확인할 수 있습니다.
- BioGPT는 인과적 언어 모델링(Causal Language Modeling, CLM) 목표로 학습되었기 때문에, 다음 토큰을 예측하는 데 강력한 성능을 보입니다. 이 기능을 활용하여 BioGPT는 구문적으로 일관된 텍스트를 생성할 수 있으며, 예시 스크립트 `run_generation.py`에서 이를 확인할 수 있습니다.
- 이 모델은 `past_key_values`(PyTorch 용)를 입력으로 받을 수 있는데, 이는 이전에 계산된 키/값 어텐션 쌍입니다. 이 값을 사용하면 텍스트 생성 중 이미 계산된 값을 다시 계산하지 않도록 할 수 있습니다. PyTorch에서 `past_key_values` 인수는 BioGptForCausalLM.forward() 메소드에서 자세히 설명되어 있습니다.
### Scaled Dot Product Attention(SDPA) 사용 [[using-scaled-dot-product-attention-sdpa]]

View File

@ -60,7 +60,7 @@ from transformers.utils import check_min_version, send_example_telemetry
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
Array = Any
Dataset = datasets.arrow_dataset.Dataset

View File

@ -59,7 +59,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt")

View File

@ -55,7 +55,7 @@ from transformers.utils import check_min_version, send_example_telemetry
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
Array = Any
Dataset = datasets.arrow_dataset.Dataset

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

View File

@ -541,7 +541,7 @@ class DummyBertEncoder(nn.Module):
use_cache = False
if use_cache and self.config.is_decoder and past_key_values is None:
past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
if use_cache and self.config.is_decoder and isinstance(past_key_values, tuple):
logger.warning_once(

View File

@ -544,7 +544,7 @@ class RobertaEncoder(nn.Module):
use_cache = False
if use_cache and self.config.is_decoder and past_key_values is None:
past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
if use_cache and self.config.is_decoder and isinstance(past_key_values, tuple):
logger.warning_once(

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "datasets[audio]>=1.14.0",
# "evaluate",
# "librosa",
@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "torch>=1.5.0",
# "torchvision>=0.6.0",
# "datasets>=1.8.0",
@ -63,7 +63,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")

View File

@ -14,7 +14,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "accelerate>=0.12.0",
# "torch>=1.5.0",
# "torchvision>=0.6.0",
@ -68,7 +68,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

View File

@ -14,7 +14,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "accelerate>=0.12.0",
# "torch>=1.5.0",
# "torchvision>=0.6.0",
@ -61,7 +61,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
logger = get_logger(__name__)

View File

@ -14,7 +14,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "torch>=1.5.0",
# "torchvision>=0.6.0",
# "datasets>=1.8.0",
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

View File

@ -14,7 +14,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "torch>=1.5.0",
# "torchvision>=0.6.0",
# "datasets>=1.8.0",
@ -56,7 +56,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

View File

@ -14,7 +14,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "torch>=1.5.0",
# "torchvision>=0.6.0",
# "datasets>=1.8.0",
@ -61,7 +61,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

View File

@ -14,7 +14,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "albumentations >= 1.4.16",
# "timm",
# "datasets",
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

View File

@ -14,7 +14,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "albumentations >= 1.4.16",
# "timm",
# "datasets",
@ -63,7 +63,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "albumentations >= 1.4.16",
# "accelerate >= 0.12.0",
# "torch >= 1.3",
@ -69,7 +69,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "albumentations >= 1.4.16",
# "accelerate >= 0.12.0",
# "torch >= 1.3",
@ -71,7 +71,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
logger = get_logger(__name__)

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "albumentations >= 1.4.16",
# "accelerate >= 0.12.0",
# "torch >= 1.3",
@ -72,7 +72,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "albumentations >= 1.4.16",
# "accelerate >= 0.12.0",
# "torch >= 1.3",
@ -74,7 +74,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
logger = get_logger(__name__)

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "albumentations >= 1.4.16",
# "accelerate >= 0.12.0",
# "torch >= 1.3",
@ -68,7 +68,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "albumentations >= 1.4.16",
# "accelerate >= 0.12.0",
# "torch >= 1.3",
@ -71,7 +71,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
logger = get_logger(__name__)
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "albumentations >= 1.4.16",
# "accelerate >= 0.12.0",
# "torch >= 1.3",
@ -61,7 +61,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "accelerate >= 0.12.0",
# "sentencepiece != 0.1.92",
# "protobuf",
@ -57,7 +57,7 @@ from transformers.utils import check_min_version, send_example_telemetry
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
logger = logging.getLogger(__name__)

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "accelerate >= 0.12.0",
# "sentencepiece != 0.1.92",
# "protobuf",
@ -65,7 +65,7 @@ from transformers.utils import check_min_version, send_example_telemetry
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
logger = get_logger(__name__)
# You should update this to your particular problem to have better documentation of `model_type`

View File

@ -14,7 +14,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "albumentations >= 1.4.16",
# "timm",
# "datasets>=4.0",
@ -59,7 +59,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt")

View File

@ -14,7 +14,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "albumentations >= 1.4.16",
# "timm",
# "datasets>=4.0",
@ -63,7 +63,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
logging.basicConfig(level=logging.INFO)
logger = get_logger(__name__)

View File

@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -45,7 +45,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -14,7 +14,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "datasets >= 2.0.0",
# "torch >= 1.3",
# "accelerate",
@ -62,7 +62,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")

View File

@ -14,7 +14,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "datasets >= 2.0.0",
# "torch >= 1.3",
# "accelerate",
@ -62,7 +62,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
logger = get_logger(__name__)

View File

@ -14,7 +14,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "datasets[audio] >= 1.12.0",
# "torch >= 1.5",
# "torchaudio",

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "datasets[audio] >= 1.18.0",
# "torch >= 1.5",
# "torchaudio",
@ -61,7 +61,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "datasets[audio] >= 1.18.0",
# "torch >= 1.5",
# "torchaudio",
@ -64,7 +64,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "datasets[audio] >= 1.18.0",
# "torch >= 1.5",
# "torchaudio",
@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "accelerate >= 0.12.0",
# "datasets >= 1.8.0",
# "sentencepiece != 0.1.92",
@ -67,7 +67,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "accelerate >= 0.12.0",
# "datasets >= 1.8.0",
# "sentencepiece != 0.1.92",
@ -71,7 +71,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "accelerate >= 0.12.0",
# "datasets >= 1.8.0",
# "sentencepiece != 0.1.92",
@ -61,7 +61,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "accelerate >= 0.12.0",
# "datasets >= 1.8.0",
# "sentencepiece != 0.1.92",
@ -63,7 +63,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@ -14,7 +14,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "accelerate >= 0.12.0",
# "datasets >= 1.8.0",
# "sentencepiece != 0.1.92",
@ -63,7 +63,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
logger = get_logger(__name__)

View File

@ -16,7 +16,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "accelerate >= 0.12.0",
# "datasets >= 1.8.0",
# "sentencepiece != 0.1.92",
@ -62,7 +62,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@ -16,7 +16,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "accelerate >= 0.21.0",
# "sentencepiece != 0.1.92",
# "protobuf",

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "accelerate >= 0.12.0",
# "seqeval",
# "datasets >= 1.8.0",
@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "accelerate >= 0.12.0",
# "seqeval",
# "datasets >= 1.8.0",
@ -67,7 +67,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "accelerate >= 0.12.0",
# "datasets >= 1.8.0",
# "sentencepiece != 0.1.92",
@ -66,7 +66,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

View File

@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "transformers==4.56.2",
# "accelerate >= 0.12.0",
# "datasets >= 1.8.0",
# "sentencepiece != 0.1.92",
@ -71,7 +71,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

View File

@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version(
"datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt"

View File

@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

View File

@ -49,7 +49,7 @@ from transformers.utils import check_min_version, send_example_telemetry
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
logger = logging.getLogger(__name__)

View File

@ -61,7 +61,7 @@ except (ModuleNotFoundError, ImportError):
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
logger = logging.getLogger(__name__)

View File

@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
# region Checking dependencies
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -46,7 +46,7 @@ from transformers.utils import check_min_version, send_example_telemetry
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
task_to_keys = {
"cola": ("sentence", None),

View File

@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
# region Dependencies and constants
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.56.0.dev0")
check_min_version("4.56.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -189,7 +189,7 @@ _deps = [
"timeout-decorator",
"tiktoken",
"timm<=1.0.19,!=1.0.18",
"tokenizers>=0.21,<0.22",
"tokenizers>=0.22.0,<=0.23.0",
"torch>=2.2",
"torchaudio",
"torchvision",
@ -463,7 +463,7 @@ install_requires = [
setup(
name="transformers",
version="4.56.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
version="4.56.2", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
author_email="transformers@huggingface.co",
description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",

View File

@ -18,7 +18,7 @@
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
# in the namespace without actually importing anything (and especially none of the backends).
__version__ = "4.56.0.dev0"
__version__ = "4.56.2"
from pathlib import Path
from typing import TYPE_CHECKING

View File

@ -19,6 +19,7 @@ import torch
from torch import Tensor, nn
from .utils import logging
from .utils.import_utils import is_torchdynamo_compiling
logger = logging.get_logger(__name__)
@ -185,6 +186,100 @@ class ClassInstantier(OrderedDict):
return cls(**kwargs)
class XIELUActivation(nn.Module):
"""
Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010
If the user has installed the nickjbrowning/XIELU wheel, we import xIELU CUDA
Otherwise, we emit a single warning and use xIELU Python
"""
def __init__(
self,
alpha_p_init=0.8,
alpha_n_init=0.8,
beta=0.5,
eps=-1e-6,
dtype=torch.bfloat16,
with_vector_loads=False,
):
super().__init__()
self.alpha_p = nn.Parameter(torch.log(torch.exp(torch.tensor(alpha_p_init, dtype=dtype)) - 1).unsqueeze(0))
self.alpha_n = nn.Parameter(
torch.log(torch.exp(torch.tensor(alpha_n_init - beta, dtype=dtype)) - 1).unsqueeze(0)
)
self.register_buffer("beta", torch.tensor(beta, dtype=dtype))
self.register_buffer("eps", torch.tensor(eps, dtype=dtype))
self.with_vector_loads = with_vector_loads
# Temporary until xIELU CUDA fully implemented
self._beta_scalar = float(self.beta.detach().cpu().float().item())
self._eps_scalar = float(self.eps.detach().cpu().float().item())
self._xielu_cuda_obj = None
try:
import xielu.ops # noqa: F401
self._xielu_cuda_obj = torch.classes.xielu.XIELU()
msg = "Using experimental xIELU CUDA."
try:
from torch._dynamo import allow_in_graph
self._xielu_cuda_fn = allow_in_graph(self._xielu_cuda)
msg += " Enabled torch._dynamo for xIELU CUDA."
except Exception as err:
msg += f" Could not enable torch._dynamo for xIELU ({err}) - this may result in slower performance."
self._xielu_cuda_fn = self._xielu_cuda
logger.warning_once(msg)
except Exception as err:
logger.warning_once(
"CUDA-fused xIELU not available (%s) falling back to a Python version.\n"
"For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`",
str(err),
)
def _xielu_python(self, x: Tensor) -> Tensor:
alpha_p = nn.functional.softplus(self.alpha_p)
alpha_n = self.beta + nn.functional.softplus(self.alpha_n)
return torch.where(
x > 0,
alpha_p * x * x + self.beta * x,
(torch.expm1(torch.min(x, self.eps)) - x) * alpha_n + self.beta * x,
)
def _xielu_cuda(self, x: Tensor) -> Tensor:
"""Firewall function to prevent torch.compile from seeing .item() calls"""
original_shape = x.shape
# CUDA kernel expects 3D tensors, reshape if needed
while x.dim() < 3:
x = x.unsqueeze(0)
if x.dim() > 3:
x = x.view(-1, 1, x.size(-1))
if original_shape != x.shape:
logger.warning_once(
"Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).",
original_shape,
x.shape,
)
result = self._xielu_cuda_obj.forward(
x,
self.alpha_p,
self.alpha_n,
# Temporary until xIELU CUDA fully implemented -> self.{beta,eps}.item()
self._beta_scalar,
self._eps_scalar,
self.with_vector_loads,
)
return result.view(original_shape)
def forward(self, input: Tensor) -> Tensor:
if self._xielu_cuda_obj is not None and input.is_cuda:
if not is_torchdynamo_compiling():
return self._xielu_cuda_fn(input)
else:
logger.warning_once("torch._dynamo is compiling, using Python version of xIELU.")
return self._xielu_python(input)
ACT2CLS = {
"gelu": GELUActivation,
"gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}),
@ -206,6 +301,7 @@ ACT2CLS = {
"swish": nn.SiLU,
"tanh": nn.Tanh,
"prelu": nn.PReLU,
"xielu": XIELUActivation,
}
ACT2FN = ClassInstantier(ACT2CLS)
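With the registration above, configs that set `hidden_act="xielu"` resolve to this class through `ACT2FN`. A minimal sketch of looking it up directly, assuming a Transformers version that includes this addition:

```python
import torch
from transformers.activations import ACT2FN

# ClassInstantier instantiates XIELUActivation on lookup; it falls back to the Python
# implementation if the optional CUDA wheel is not installed.
act = ACT2FN["xielu"]
x = torch.randn(2, 8, dtype=torch.bfloat16)
print(act(x).shape)  # torch.Size([2, 8])
```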

View File

@ -17,6 +17,7 @@ and remove unnecessary dependencies.
"""
import base64
import importlib
import io
import os
import warnings
@ -25,6 +26,7 @@ from typing import Any, Optional, Sequence, Union
import numpy as np
import requests
from packaging import version
from .utils import (
is_librosa_available,
@ -46,8 +48,7 @@ if is_librosa_available():
import soxr
if is_torchcodec_available():
from torchcodec.decoders import AudioDecoder
TORCHCODEC_VERSION = version.parse(importlib.metadata.version("torchcodec"))
AudioInput = Union[np.ndarray, "torch.Tensor", Sequence[np.ndarray], Sequence["torch.Tensor"]] # noqa: F821
@ -71,8 +72,8 @@ def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None)
if isinstance(audio, str):
# Try to load with `torchcodec` but do not enforce users to install it. If not found
# fallback to `librosa`. If using an audio-only model, most probably `torchcodec` won't be
# needed.
if is_torchcodec_available():
# needed. Do not raise any errors if not installed or versions do not match
if is_torchcodec_available() and TORCHCODEC_VERSION >= version.parse("0.3.0"):
audio = load_audio_torchcodec(audio, sampling_rate=sampling_rate)
else:
audio = load_audio_librosa(audio, sampling_rate=sampling_rate, timeout=timeout)
@ -99,7 +100,9 @@ def load_audio_torchcodec(audio: Union[str, np.ndarray], sampling_rate=16000) ->
Returns:
`np.ndarray`: A numpy array representing the audio.
"""
requires_backends(load_audio, ["torchcodec"])
# Lazy import so that issues in torchcodec compatibility don't crash the whole library
requires_backends(load_audio_torchcodec, ["torchcodec"])
from torchcodec.decoders import AudioDecoder
# Set `num_channels` to `1` which is what most models expects and the default in librosa
decoder = AudioDecoder(audio, sample_rate=sampling_rate, num_channels=1)
@ -123,7 +126,7 @@ def load_audio_librosa(audio: Union[str, np.ndarray], sampling_rate=16000, timeo
Returns:
`np.ndarray`: A numpy array representing the audio.
"""
requires_backends(load_audio, ["librosa"])
requires_backends(load_audio_librosa, ["librosa"])
# Load audio from URL (e.g https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav)
if audio.startswith("http://") or audio.startswith("https://"):

View File

@ -99,7 +99,7 @@ class DynamicLayer(CacheLayerMixin):
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Update the key and value caches in-place, and return the necessary kes and value states.
Update the key and value caches in-place, and return the necessary keys and value states.
Args:
key_states (`torch.Tensor`): The new key states to cache.
@ -182,7 +182,7 @@ class DynamicSlidingWindowLayer(DynamicLayer):
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Update the key and value caches in-place, and return the necessary kes and value states.
Update the key and value caches in-place, and return the necessary keys and value states.
Args:
key_states (`torch.Tensor`): The new key states to cache.
@ -303,7 +303,7 @@ class StaticLayer(CacheLayerMixin):
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Update the key and value caches in-place, and return the necessary kes and value states.
Update the key and value caches in-place, and return the necessary keys and value states.
Args:
key_states (`torch.Tensor`): The new key states to cache.
@ -378,7 +378,7 @@ class SlidingWindowLayer(StaticLayer):
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Update the key and value caches in-place, and return the necessary kes and value states.
Update the key and value caches in-place, and return the necessary keys and value states.
Args:
key_states (`torch.Tensor`): The new key states to cache.
@ -457,7 +457,7 @@ class ChunkedSlidingLayer(SlidingWindowLayer):
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Update the key and value caches in-place, and return the necessary kes and value states.
Update the key and value caches in-place, and return the necessary keys and value states.
Args:
key_states (`torch.Tensor`): The new key states to cache.
@ -566,7 +566,7 @@ class QuantizedLayer(DynamicLayer):
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Update the key and value caches in-place, and return the necessary kes and value states.
Update the key and value caches in-place, and return the necessary keys and value states.
Args:
key_states (`torch.Tensor`): The new key states to cache.
@ -996,7 +996,6 @@ class DynamicCache(Cache):
>>> past_key_values = DynamicCache(config=model.config)
>>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
>>> outputs.past_key_values # access cache filled with key/values from generation
DynamicCache()
```
"""
@ -1018,6 +1017,9 @@ class DynamicCache(Cache):
"sliding_attention" if sliding_window is not None else "full_attention"
for _ in range(config.num_hidden_layers)
]
# Some models have shared layers thus no cache is needed for them (e.g. Gemma3n)
if hasattr(config, "num_kv_shared_layers"):
layer_types = layer_types[: -config.num_kv_shared_layers]
for layer_type in layer_types:
if layer_type in ("sliding_attention", "chunked_attention"):
@ -1129,6 +1131,9 @@ class StaticCache(Cache):
layer_types = ["chunked_attention" for _ in range(config.num_hidden_layers)]
else:
layer_types = ["full_attention" for _ in range(config.num_hidden_layers)]
# Some models have shared layers thus no cache is needed for them (e.g. Gemma3n)
if hasattr(config, "num_kv_shared_layers"):
layer_types = layer_types[: -config.num_kv_shared_layers]
layers = []
for layer_type in layer_types:
@ -1223,8 +1228,8 @@ class EncoderDecoderCache(Cache):
>>> inputs = processor(audio=YOUR-AUDIO, return_tensors="pt")
>>> # Prepare cache classes for encoder and decoder and pass it to model's forward
>>> self_attention_cache = DynamicCache()
>>> cross_attention_cache = DynamicCache()
>>> self_attention_cache = DynamicCache(config=self.config)
>>> cross_attention_cache = DynamicCache(config=self.config)
>>> past_key_values = EncoderDecoderCache(self_attention_cache, cross_attention_cache)
>>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
>>> outputs.past_key_values # access cache filled with key/values from generation

View File

@ -129,7 +129,6 @@ class RichInterface:
text = ""
async for token in await stream:
outputs = token.choices[0].delta.content
request_id = token.id
if not outputs:
continue
@ -168,7 +167,7 @@ class RichInterface:
self._console.print()
return text, request_id
return text
def input(self) -> str:
"""Gets user input from the console."""
@ -700,8 +699,6 @@ class ChatCommand(BaseTransformersCLICommand):
interface.clear()
chat = self.clear_chat_history(args.system_prompt)
request_id = None
# Starts the session with a minimal help message at the top, so that a user doesn't get stuck
interface.print_help(minimal=True)
while True:
@ -733,13 +730,12 @@ class ChatCommand(BaseTransformersCLICommand):
chat,
stream=True,
extra_body={
"request_id": request_id,
"generation_config": generation_config.to_json_string(),
"model": model,
},
)
model_output, request_id = await interface.stream_output(stream)
model_output = await interface.stream_output(stream)
chat.append({"role": "assistant", "content": model_output})

View File

@ -1058,7 +1058,9 @@ class PretrainedConfig(PushToHubMixin):
if d.get("dtype") is not None:
if isinstance(d["dtype"], dict):
d["dtype"] = {k: str(v).split(".")[-1] for k, v in d["dtype"].items()}
elif not isinstance(d["dtype"], str):
# models like Emu3 can have "dtype" as token in config's vocabulary map,
# so we also exclude int type here to avoid error in this special case.
elif not isinstance(d["dtype"], (str, int)):
d["dtype"] = str(d["dtype"]).split(".")[1]
for value in d.values():
if isinstance(value, dict):
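For reference, a minimal illustration of the string conversion the branch above performs on a real torch dtype, and why plain ints are now skipped:

```python
import torch

# A torch dtype serializes to the part after the dot, e.g. torch.bfloat16 -> "bfloat16".
print(str(torch.bfloat16).split(".")[1])  # bfloat16
# Emu3-style configs can legitimately carry an int under the "dtype" key; such values are left untouched.
```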

View File

@ -91,7 +91,7 @@ deps = {
"timeout-decorator": "timeout-decorator",
"tiktoken": "tiktoken",
"timm": "timm<=1.0.19,!=1.0.18",
"tokenizers": "tokenizers>=0.21,<0.22",
"tokenizers": "tokenizers>=0.22.0,<=0.23.0",
"torch": "torch>=2.2",
"torchaudio": "torchaudio",
"torchvision": "torchvision",

View File

@ -27,9 +27,9 @@ import numpy as np
from .dynamic_module_utils import custom_object_save
from .utils import (
FEATURE_EXTRACTOR_NAME,
PROCESSOR_NAME,
PushToHubMixin,
TensorType,
cached_file,
copy_func,
download_url,
is_flax_available,
@ -44,6 +44,7 @@ from .utils import (
logging,
requires_backends,
)
from .utils.hub import cached_file
if TYPE_CHECKING:
@ -505,19 +506,28 @@ class FeatureExtractionMixin(PushToHubMixin):
feature_extractor_file = FEATURE_EXTRACTOR_NAME
try:
# Load from local folder or from cache or download from model Hub and cache
resolved_feature_extractor_file = cached_file(
pretrained_model_name_or_path,
feature_extractor_file,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
resume_download=resume_download,
local_files_only=local_files_only,
subfolder=subfolder,
token=token,
user_agent=user_agent,
revision=revision,
)
resolved_feature_extractor_files = [
resolved_file
for filename in [feature_extractor_file, PROCESSOR_NAME]
if (
resolved_file := cached_file(
pretrained_model_name_or_path,
filename=filename,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
resume_download=resume_download,
local_files_only=local_files_only,
subfolder=subfolder,
token=token,
user_agent=user_agent,
revision=revision,
_raise_exceptions_for_missing_entries=False,
)
)
is not None
]
resolved_feature_extractor_file = resolved_feature_extractor_files[0]
except OSError:
# Raise any environment error raised by `cached_file`. It will have a helpful error message adapted to
# the original exception.
@ -536,6 +546,7 @@ class FeatureExtractionMixin(PushToHubMixin):
with open(resolved_feature_extractor_file, encoding="utf-8") as reader:
text = reader.read()
feature_extractor_dict = json.loads(text)
feature_extractor_dict = feature_extractor_dict.get("feature_extractor", feature_extractor_dict)
except json.JSONDecodeError:
raise OSError(

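The resolution pattern above (try the type-specific config file first, then fall back to a unified processor config) can be sketched in isolation as below. Here `resolve` stands in for `cached_file(..., _raise_exceptions_for_missing_entries=False)`; the filenames are the usual defaults, and the same pattern is reused for image processors further down.

from typing import Callable, Optional

def first_resolvable(candidates: list[str], resolve: Callable[[str], Optional[str]]) -> Optional[str]:
    # Keep every candidate that resolves to a local path and return the first hit, if any.
    resolved = [path for name in candidates if (path := resolve(name)) is not None]
    return resolved[0] if resolved else None

# e.g. first_resolvable(["preprocessor_config.json", "processor_config.json"], my_resolver)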
View File

@ -191,7 +191,7 @@ class RequestState:
f"query_length={len(self.prompt_ids)}",
f"remaining_tokens={len(self.remaining_prompt_ids)}",
f"kv_length={self.position_offset}",
f"full_prompt_lenght={len(self.full_prompt_ids)}",
f"full_prompt_length={len(self.full_prompt_ids)}",
f"allocated_blocks={self.allocated_blocks}",
f"generated_tokens={self.static_outputs}",
]

View File

@ -1998,7 +1998,7 @@ class GenerationMixin(ContinuousMixin):
elif "dynamic" in generation_config.cache_implementation:
model_kwargs[cache_name] = DynamicCache(**dynamic_cache_kwargs)
# Use DynamicCache() instance by default. This will avoid back and forth from legacy format that
# Use DynamicCache instance by default. This will avoid back and forth from legacy format that
# keeps copying the cache thus using much more memory
else:
model_kwargs[cache_name] = (

View File

@ -26,14 +26,15 @@ from .feature_extraction_utils import BatchFeature as BaseBatchFeature
from .image_utils import is_valid_image, load_image
from .utils import (
IMAGE_PROCESSOR_NAME,
PROCESSOR_NAME,
PushToHubMixin,
cached_file,
copy_func,
download_url,
is_offline_mode,
is_remote_url,
logging,
)
from .utils.hub import cached_file
ImageProcessorType = TypeVar("ImageProcessorType", bound="ImageProcessingMixin")
@ -329,19 +330,28 @@ class ImageProcessingMixin(PushToHubMixin):
image_processor_file = image_processor_filename
try:
# Load from local folder or from cache or download from model Hub and cache
resolved_image_processor_file = cached_file(
pretrained_model_name_or_path,
image_processor_file,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
resume_download=resume_download,
local_files_only=local_files_only,
token=token,
user_agent=user_agent,
revision=revision,
subfolder=subfolder,
)
resolved_image_processor_files = [
resolved_file
for filename in [image_processor_file, PROCESSOR_NAME]
if (
resolved_file := cached_file(
pretrained_model_name_or_path,
filename=filename,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
resume_download=resume_download,
local_files_only=local_files_only,
token=token,
user_agent=user_agent,
revision=revision,
subfolder=subfolder,
_raise_exceptions_for_missing_entries=False,
)
)
is not None
]
resolved_image_processor_file = resolved_image_processor_files[0]
except OSError:
# Raise any environment error raised by `cached_file`. It will have a helpful error message adapted to
# the original exception.
@ -360,6 +370,7 @@ class ImageProcessingMixin(PushToHubMixin):
with open(resolved_image_processor_file, encoding="utf-8") as reader:
text = reader.read()
image_processor_dict = json.loads(text)
image_processor_dict = image_processor_dict.get("image_processor", image_processor_dict)
except json.JSONDecodeError:
raise OSError(

View File

@ -854,7 +854,7 @@ class Seq2SeqLMDecoderExportableModuleWithStaticCache(torch.nn.Module):
head_dim = getattr(self.config, "head_dim", self.config.hidden_size // self.config.num_attention_heads)
num_heads = getattr(self.config, "num_key_value_heads", self.config.num_attention_heads)
self.static_cache.early_initialization(batch_size, num_heads, head_dim, torch.float32, model_device)
self.cache = EncoderDecoderCache(self.static_cache, DynamicCache())
self.cache = EncoderDecoderCache(self.static_cache, DynamicCache(config=self.config))
register_dynamic_cache_export_support()
@ -1051,7 +1051,7 @@ def export_with_dynamic_cache(
{
"input_ids": example_input_ids,
"attention_mask": example_attention_mask,
"past_key_values": DynamicCache(),
"past_key_values": DynamicCache(config=model.config),
"use_cache": True,
},
strict=False,

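For context, a hedged usage sketch of the updated constructor with a decoder-only model; the checkpoint name is only an example:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Hello", return_tensors="pt")
past_key_values = DynamicCache(config=model.config)  # the cache is now built from the model config
with torch.no_grad():
    outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
print(type(outputs.past_key_values).__name__)  # DynamicCache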
View File

@ -11,7 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union
import re
from functools import partial
from typing import Optional, Union
from ..modeling_flash_attention_utils import lazy_import_flash_attention
from .flash_attention import flash_attention_forward
try:
@ -19,12 +24,13 @@ try:
Device,
LayerRepository,
Mode,
get_kernel,
register_kernel_mapping,
replace_kernel_forward_from_hub,
use_kernel_forward_from_hub,
)
_hub_kernels_available = True
_kernels_available = True
_KERNEL_MAPPING: dict[str, dict[Union[Device, str], LayerRepository]] = {
"MultiScaleDeformableAttention": {
@ -82,8 +88,9 @@ try:
register_kernel_mapping(_KERNEL_MAPPING)
except ImportError:
_kernels_available = False
# Stub to make decorators in transformers work when `kernels`
# is not installed.
def use_kernel_forward_from_hub(*args, **kwargs):
@ -104,16 +111,66 @@ except ImportError:
def register_kernel_mapping(*args, **kwargs):
raise RuntimeError("register_kernel_mapping requires `kernels` to be installed. Run `pip install kernels`.")
_hub_kernels_available = False
def is_kernel(attn_implementation: Optional[str]) -> bool:
"""Check whether `attn_implementation` matches a kernel pattern from the hub."""
return (
attn_implementation is not None
and re.search(r"^[^/:]+/[^/:]+(?:@[^/:]+)?(?::[^/:]+)?$", attn_implementation) is not None
)
def is_hub_kernels_available():
return _hub_kernels_available
def load_and_register_kernel(attn_implementation: str) -> None:
"""Load and register the kernel associated to `attn_implementation`."""
if not is_kernel(attn_implementation):
return
if not _kernels_available:
raise ImportError("`kernels` is not installed. Please install it with `pip install kernels`.")
# Need to be imported here as otherwise we have a circular import in `modeling_utils`
from ..masking_utils import ALL_MASK_ATTENTION_FUNCTIONS
from ..modeling_utils import ALL_ATTENTION_FUNCTIONS
attention_wrapper = None
# FIXME: @ArthurZucker this is dirty, did not want to do a lot of extra work
actual_attn_name = attn_implementation
if "|" in attn_implementation:
attention_wrapper, actual_attn_name = attn_implementation.split("|")
# `transformers` has wrappers for sdpa, paged, flash, flex, etc.
attention_wrapper = ALL_ATTENTION_FUNCTIONS.get(attention_wrapper)
# Extract repo_id and kernel_name from the string
if ":" in actual_attn_name:
repo_id, kernel_name = actual_attn_name.split(":")
kernel_name = kernel_name.strip()
else:
repo_id = actual_attn_name
kernel_name = None
repo_id = repo_id.strip()
# extract the rev after the @ if it exists
repo_id, _, rev = repo_id.partition("@")
repo_id = repo_id.strip()
rev = rev.strip() if rev else None
# Load the kernel from hub
try:
kernel = get_kernel(repo_id, revision=rev)
except Exception as e:
raise ValueError(f"An error occured while trying to load from '{repo_id}': {e}.")
# correctly wrap the kernel
if hasattr(kernel, "flash_attn_varlen_func"):
if attention_wrapper is None:
attention_wrapper = flash_attention_forward
kernel_function = partial(attention_wrapper, implementation=kernel)
lazy_import_flash_attention(kernel)
elif kernel_name is not None:
kernel_function = getattr(kernel, kernel_name)
# Register the kernel as a valid attention
ALL_ATTENTION_FUNCTIONS.register(attn_implementation, kernel_function)
ALL_MASK_ATTENTION_FUNCTIONS.register(attn_implementation, ALL_MASK_ATTENTION_FUNCTIONS["flash_attention_2"])
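A standalone sketch (not the library helper itself) of how the `wrapper|org/repo@rev:kernel_name` string is pulled apart above, together with the pattern check used by `is_kernel`:

import re

_KERNEL_RE = re.compile(r"^[^/:]+/[^/:]+(?:@[^/:]+)?(?::[^/:]+)?$")  # same pattern as is_kernel

def parse_attn_implementation(attn_implementation: str):
    """Split 'wrapper|org/repo@rev:kernel_name' into (wrapper, repo_id, rev, kernel_name)."""
    wrapper, actual = None, attn_implementation
    if "|" in actual:
        wrapper, actual = actual.split("|")
    kernel_name = None
    if ":" in actual:
        actual, kernel_name = actual.split(":")
        kernel_name = kernel_name.strip()
    repo_id, _, rev = actual.strip().partition("@")
    return wrapper, repo_id.strip(), (rev.strip() or None), kernel_name

print(bool(_KERNEL_RE.match("kernels-community/flash-attn@main:flash_attn_varlen_func")))  # True
print(bool(_KERNEL_RE.match("sdpa")))                                                      # False: not a hub kernel
print(parse_attn_implementation("sdpa|kernels-community/flash-attn@main:flash_attn_varlen_func"))
# ('sdpa', 'kernels-community/flash-attn', 'main', 'flash_attn_varlen_func')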
__all__ = [
"LayerRepository",
"is_hub_kernels_available",
"use_kernel_forward_from_hub",
"register_kernel_mapping",
"replace_kernel_forward_from_hub",

View File

@ -126,10 +126,10 @@ def _lazy_define_process_function(flash_function):
def lazy_import_flash_attention(implementation: Optional[str]):
"""
Lazy loading flash attention and returning the respective functions + flags back
Lazily import flash attention and return the respective functions + flags.
NOTE: For fullgraph, this needs to be called before compile while no fullgraph can
can work without preloading. See `_check_and_adjust_attn_implementation` in `modeling_utils`.
NOTE: For fullgraph, this needs to be called before compile, while no fullgraph can
work without preloading. See `load_and_register_kernel` in `integrations.hub_kernels`.
"""
global _flash_fn, _flash_varlen_fn, _pad_fn, _unpad_fn
if any(k is None for k in [_flash_fn, _flash_varlen_fn, _pad_fn, _unpad_fn]):
@ -313,17 +313,13 @@ def _upad_input(
)
def prepare_fa_kwargs_from_position_ids(position_ids, is_packed_sequence: bool = True):
def prepare_fa_kwargs_from_position_ids(position_ids):
"""
This function returns all the necessary kwargs to call `flash_attn_varlen_func`
extracted from position_ids. The `position_ids` can be either packed sequence or
the usual padded position ids, for example in inference time.
This function returns all the necessary kwargs to call `flash_attn_varlen_func` extracted from position_ids.
Arguments:
position_ids (`torch.Tensor`):
Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
is_packed_sequence (`bool`, *optional*, defaults to `True`):
Whether the input position ids are a packed sequence or not.
Return:
(cu_seqlens_q, cu_seqlens_k) (`tuple[int]`):
@ -333,52 +329,35 @@ def prepare_fa_kwargs_from_position_ids(position_ids, is_packed_sequence: bool =
Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query,
`max_seqlen_in_batch_k` for the source sequence i.e. key/value).
"""
# If the lengths are not equal, most probably we are in decoding stage with cache
# In that case the position ids will not always start with `0` and we need a better way to infer
# cumulative seq lengths.
tensor_kwargs = {"dtype": torch.int32, "device": position_ids.device}
if not is_packed_sequence:
last_position_ids = position_ids[:, -1]
q_len = (
torch.ones(position_ids.size(0), **tensor_kwargs)
if position_ids.shape[-1] == 1
else last_position_ids.add(1)
)
cu_seq_lens_q = torch.cat([torch.zeros(1, **tensor_kwargs), q_len.cumsum(0).to(torch.int32)], 0)
cu_seq_lens_k = torch.cat(
[torch.zeros(1, **tensor_kwargs), last_position_ids.add(1).cumsum(0).to(torch.int32)], 0
)
max_length_q = int(q_len.max())
max_length_k = int(last_position_ids.max()) + 1
else:
position_ids = position_ids.view(-1)
indices_q = (position_ids == 0).nonzero().view(-1)
position_ids = position_ids.view(-1)
indices_q = (position_ids == 0).nonzero().view(-1)
cu_seq_lens_q = torch.cat(
(
indices_q.to(**tensor_kwargs),
torch.tensor(position_ids.size(), **tensor_kwargs),
)
cu_seq_lens_q = torch.cat(
(
indices_q.to(**tensor_kwargs),
torch.tensor(position_ids.size(), **tensor_kwargs),
)
cu_seq_lens_k = cu_seq_lens_q
)
cu_seq_lens_k = cu_seq_lens_q
# https://github.com/Dao-AILab/flash-attention/blob/2dd8078adc1d9b74e315ee99718c0dea0de8eeb6/flash_attn/flash_attn_interface.py#L1423-L1424
# We should use cu_seq_lens instead of position_ids to get the max length since position_ids is not always increasing
# for some models (e.g. qwen2-vl).
max_length_q = cu_seq_lens_q.diff().max()
# NOTE: With torch compile, this will cause a graph break if you don't set
# `TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1` in the environment or call
# `torch._dynamo.config.capture_scalar_outputs = True` before doing the forward pass.
# This is a limitation of flash attention API, as the function `flash_attn_varlen_func`
# requires `max_length_q`, `max_length_k` to be passed as `int` and not `torch.Tensor`.
max_length_q = max_length_q.item()
max_length_k = max_length_q
# https://github.com/Dao-AILab/flash-attention/blob/2dd8078adc1d9b74e315ee99718c0dea0de8eeb6/flash_attn/flash_attn_interface.py#L1423-L1424
# We should use cu_seq_lens instead of position_ids to get the max length since position_ids is not always increasing
# for some models (e.g. qwen2-vl).
max_length_q = cu_seq_lens_q.diff().max()
# NOTE: With torch compile, this will cause a graph break if you don't set
# `TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1` in the environment or call
# `torch._dynamo.config.capture_scalar_outputs = True` before doing the forward pass.
# This is a limitation of flash attention API, as the function `flash_attn_varlen_func`
# requires `max_length_q`, `max_length_k` to be passed as `int` and not `torch.Tensor`.
max_length_q = max_length_q.item()
max_length_k = max_length_q
return (cu_seq_lens_q, cu_seq_lens_k), (max_length_q, max_length_k)
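Illustrative only: how the cumulative sequence lengths consumed by `flash_attn_varlen_func` fall out of packed position ids, mirroring the logic above.

import torch

# Three sequences of lengths 3, 2 and 4 packed into one row; each restart is marked by position id 0.
position_ids = torch.tensor([[0, 1, 2, 0, 1, 0, 1, 2, 3]]).view(-1)
indices_q = (position_ids == 0).nonzero().view(-1)  # tensor([0, 3, 5])
cu_seq_lens = torch.cat(
    (indices_q.to(torch.int32), torch.tensor(position_ids.size(), dtype=torch.int32))
)  # tensor([0, 3, 5, 9], dtype=torch.int32)
max_len = int(cu_seq_lens.diff().max())  # 4, the longest packed sequence
print(cu_seq_lens, max_len)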
def _prepare_from_posids(query, key, value, position_ids, query_length):
def _prepare_from_posids(query, key, value, position_ids):
"""
This function returns necessary arguments to call `flash_attn_varlen_func`.
All three query, key, value states will be flattened.
@ -394,8 +373,6 @@ def _prepare_from_posids(query, key, value, position_ids, query_length):
Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
position_ids (`torch.Tensor`):
Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
query_length (`int`):
Sequence length of the input queries.
Return:
query (`torch.Tensor`):
@ -409,16 +386,11 @@ def _prepare_from_posids(query, key, value, position_ids, query_length):
(max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`tuple[int]`):
Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
"""
kv_length = key.shape[1]
is_packed_sequence = query_length == kv_length
query = query.contiguous().view(-1, query.size(-2), query.size(-1))
key = key.contiguous().view(-1, key.size(-2), key.size(-1))
value = value.contiguous().view(-1, value.size(-2), value.size(-1))
(cu_seq_lens_q, cu_seq_lens_k), (max_length_q, max_length_k) = prepare_fa_kwargs_from_position_ids(
position_ids, is_packed_sequence=is_packed_sequence
)
(cu_seq_lens_q, cu_seq_lens_k), (max_length_q, max_length_k) = prepare_fa_kwargs_from_position_ids(position_ids)
return (query, key, value, (cu_seq_lens_q, cu_seq_lens_k), (max_length_q, max_length_k))
@ -660,7 +632,7 @@ def _flash_attention_forward(
elif is_fa_with_varlen_kwargs or is_fa_with_position_ids:
if cu_seq_lens_q is None or cu_seq_lens_k is None:
q, k, v, (cu_seq_lens_q, cu_seq_lens_k), (max_length_q, max_length_k) = _prepare_from_posids(
query_states, key_states, value_states, position_ids, query_length=query_length
query_states, key_states, value_states, position_ids
)
else:
q = query_states.reshape(-1, query_states.size(-2), query_states.size(-1))

View File

@ -44,12 +44,6 @@ from torch import Tensor, nn
from torch.distributions import constraints
from torch.utils.checkpoint import checkpoint
from transformers.utils import is_torchao_available
if is_torchao_available():
from torchao.quantization import Int4WeightOnlyConfig
from .configuration_utils import PretrainedConfig
from .distributed import DistributedConfig
from .dynamic_module_utils import custom_object_save
@ -61,6 +55,7 @@ from .integrations.eager_paged import eager_paged_attention_forward
from .integrations.flash_attention import flash_attention_forward
from .integrations.flash_paged import paged_attention_forward
from .integrations.flex_attention import flex_attention_forward
from .integrations.hub_kernels import is_kernel, load_and_register_kernel
from .integrations.sdpa_attention import sdpa_attention_forward
from .integrations.sdpa_paged import sdpa_attention_paged_forward
from .integrations.tensor_parallel import (
@ -73,17 +68,8 @@ from .integrations.tensor_parallel import (
verify_tp_plan,
)
from .loss.loss_utils import LOSS_MAPPING
from .masking_utils import ALL_MASK_ATTENTION_FUNCTIONS
from .modeling_flash_attention_utils import lazy_import_flash_attention
from .pytorch_utils import ( # noqa: F401
Conv1D,
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
id_tensor_storage,
prune_conv1d_layer,
prune_layer,
prune_linear_layer,
)
from .pytorch_utils import id_tensor_storage
from .quantizers import HfQuantizer
from .quantizers.auto import get_hf_quantizer
from .quantizers.quantizers_utils import get_module_from_name
@ -124,6 +110,7 @@ from .utils import (
is_torch_npu_available,
is_torch_xla_available,
is_torch_xpu_available,
is_torchao_available,
logging,
)
from .utils.generic import _CAN_RECORD_REGISTRY, GeneralInterface, OutputRecorder
@ -138,9 +125,8 @@ from .utils.import_utils import (
from .utils.quantization_config import BitsAndBytesConfig, QuantizationMethod
XLA_USE_BF16 = os.environ.get("XLA_USE_BF16", "0").upper()
XLA_DOWNCAST_BF16 = os.environ.get("XLA_DOWNCAST_BF16", "0").upper()
if is_torchao_available():
from torchao.quantization import Int4WeightOnlyConfig
if is_accelerate_available():
from accelerate import dispatch_model, infer_auto_device_map
@ -164,32 +150,14 @@ if is_safetensors_available():
from safetensors.torch import load_file as safe_load_file
from safetensors.torch import save_file as safe_save_file
if is_peft_available():
from .utils import find_adapter_config_file
if is_kernels_available():
from kernels import get_kernel
logger = logging.get_logger(__name__)
_init_weights = True
_is_quantized = False
_is_ds_init_called = False
_torch_distributed_available = torch.distributed.is_available()
_is_dtensor_available = _torch_distributed_available and is_torch_greater_or_equal("2.5")
if _is_dtensor_available:
from torch.distributed.tensor import DTensor
def is_local_dist_rank_0():
return (
torch.distributed.is_available()
and torch.distributed.is_initialized()
and int(os.environ.get("LOCAL_RANK", "-1")) == 0
)
if is_sagemaker_mp_enabled():
import smdistributed.modelparallel.torch as smp
from smdistributed.modelparallel import __version__ as SMP_VERSION
@ -198,11 +166,24 @@ if is_sagemaker_mp_enabled():
else:
IS_SAGEMAKER_MP_POST_1_10 = False
if is_peft_available():
from .utils import find_adapter_config_file
logger = logging.get_logger(__name__)
XLA_USE_BF16 = os.environ.get("XLA_USE_BF16", "0").upper()
XLA_DOWNCAST_BF16 = os.environ.get("XLA_DOWNCAST_BF16", "0").upper()
SpecificPreTrainedModelType = TypeVar("SpecificPreTrainedModelType", bound="PreTrainedModel")
_init_weights = True
_is_quantized = False
_is_ds_init_called = False
def is_local_dist_rank_0():
return (
torch.distributed.is_available()
and torch.distributed.is_initialized()
and int(os.environ.get("LOCAL_RANK", "-1")) == 0
)
TORCH_INIT_FUNCTIONS = {
"uniform_": nn.init.uniform_,
@ -2792,61 +2773,45 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
`str`: The final attention implementation to use, including potential fallbacks from sdpa to eager, or from
None to sdpa (to potentially eager).
"""
# Register kernel if relevant
if attn_implementation is not None and re.match(
r"^[^/:]+/[^/:]+(?:@[^/:]+)?(?::[^/:]+)?$", attn_implementation
applicable_attn_implementation = attn_implementation
# If FA not installed, do not fail but use kernels instead
if (
applicable_attn_implementation == "flash_attention_2"
and self._supports_flash_attn
and not is_flash_attn_2_available()
and is_kernels_available()
):
if not is_kernels_available():
raise ValueError("kernels is not installed. Please install it with `pip install kernels`.")
attention_wrapper = None
# FIXME: @ArthurZucker this is dirty, did not want to do a lof of extra work
actual_attn_name = attn_implementation
if "|" in attn_implementation:
attention_wrapper, actual_attn_name = attn_implementation.split("|")
# `transformers` has wrapper for sdpa, paged, flash, flex etc.
attention_wrapper = ALL_ATTENTION_FUNCTIONS.get(attention_wrapper)
# Extract repo_id and kernel_name from the string
if ":" in actual_attn_name:
repo_id, kernel_name = actual_attn_name.split(":")
kernel_name = kernel_name.strip()
else:
repo_id = actual_attn_name
kernel_name = None
repo_id = repo_id.strip()
# extract the rev after the @ if it exists
repo_id, _, rev = repo_id.partition("@")
repo_id = repo_id.strip()
rev = rev.strip() if rev else None
applicable_attn_implementation = "kernels-community/flash-attn"
if is_kernel(applicable_attn_implementation):
try:
kernel = get_kernel(repo_id, revision=rev)
if hasattr(kernel, "flash_attn_varlen_func"):
if attention_wrapper is None:
attention_wrapper = flash_attention_forward
kernel_function = partial(attention_wrapper, implementation=kernel)
lazy_import_flash_attention(kernel)
elif kernel_name is not None:
kernel_function = getattr(kernel, kernel_name)
ALL_ATTENTION_FUNCTIONS.register(attn_implementation, kernel_function)
ALL_MASK_ATTENTION_FUNCTIONS.register(
attn_implementation, ALL_MASK_ATTENTION_FUNCTIONS["flash_attention_2"]
)
load_and_register_kernel(applicable_attn_implementation)
# log that we used kernel fallback if successful
if attn_implementation == "flash_attention_2":
logger.warning_once(
"You do not have `flash_attn` installed, using `kernels-community/flash-attn` from the `kernels` "
"library instead!"
)
except Exception as e:
if attn_implementation == "flash_attention_2":
self._flash_attn_2_can_dispatch() # will fail as fa2 is not available but raise the proper exception
logger.warning_once(
f"Could not find a kernel repository '{repo_id}' compatible with your device in the hub: {e}. Using "
"default attention implementation instead (sdpa if available, eager otherwise)."
f"Could not find a kernel matching `{applicable_attn_implementation}` compatible with your device in the "
f"hub:\n{e}.\nUsing default attention implementation instead (sdpa if available, eager otherwise)."
)
try:
self._sdpa_can_dispatch(is_init_check)
attn_implementation = "sdpa"
applicable_attn_implementation = "sdpa"
except (ValueError, ImportError) as e:
attn_implementation = "eager"
applicable_attn_implementation = "eager"
else:
attn_implementation = self.get_correct_attn_implementation(attn_implementation, is_init_check)
applicable_attn_implementation = self.get_correct_attn_implementation(
applicable_attn_implementation, is_init_check
)
# preload flash attention here to allow compile with fullgraph
if attn_implementation.startswith("flash_attention"):
lazy_import_flash_attention(attn_implementation)
if applicable_attn_implementation.startswith("flash_attention"):
lazy_import_flash_attention(applicable_attn_implementation)
return attn_implementation
return applicable_attn_implementation
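As a usage sketch, the hub-kernel string form accepted above can be passed directly as `attn_implementation`; the checkpoint below is illustrative and `kernels` must be installed:

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",  # illustrative checkpoint
    attn_implementation="kernels-community/flash-attn",  # resolved and registered via load_and_register_kernel
)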
def get_correct_attn_implementation(self, requested_attention: Optional[str], is_init_check: bool = False) -> str:
applicable_attention = "sdpa" if requested_attention is None else requested_attention
@ -3035,11 +3000,14 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
if hasattr(self, "model"):
inner = self.model
if hasattr(inner, "get_decoder"):
# See: https://github.com/huggingface/transformers/issues/40815
if hasattr(inner, "get_decoder") and type(inner) is not type(self):
return inner.get_decoder()
return inner
return None # raise AttributeError(f"{self.__class__.__name__} has no decoder; override `get_decoder()` if needed.")
# If this is a base transformer model (no decoder/model attributes), return self
# This handles cases like MistralModel which is itself the decoder
return self
def set_decoder(self, decoder):
"""
@ -3058,7 +3026,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
self.model = decoder
return
return # raise AttributeError(f"{self.__class__.__name__} cannot accept a decoder; override `set_decoder()`.")
return
def _init_weights(self, module):
"""

View File

@ -1,269 +0,0 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import gc
import os
import re
from typing import Optional
import torch
from huggingface_hub import snapshot_download
from safetensors import safe_open
from transformers import (
Aimv2Config,
Aimv2Model,
Aimv2VisionConfig,
Aimv2VisionModel,
AutoImageProcessor,
AutoProcessor,
)
ORIGINAL_TO_CONVERTED_KEY_MAPPING_VISION_MODEL = {
# Embeddings
r"preprocessor.patchifier.proj": r"embeddings.patch_embed",
r"preprocessor.pos_embed": r"embeddings.position_embedding.weight",
r"preprocessor.patchifier.norm.weight": r"embeddings.rms_norm.weight",
# Encoder Layers
r"trunk.blocks.(\d+).attn.qkv": r"encoder.layers.\1.attention.qkv",
r"trunk.blocks.(\d+).attn.proj": r"encoder.layers.\1.attention.out_proj",
r"trunk.blocks.(\d+).mlp.fc1": r"encoder.layers.\1.ffn.gate_proj",
r"trunk.blocks.(\d+).mlp.fc2": r"encoder.layers.\1.ffn.down_proj",
r"trunk.blocks.(\d+).mlp.fc3": r"encoder.layers.\1.ffn.up_proj",
# Normalization Layers
r"trunk.blocks.(\d+).norm_1": r"encoder.layers.\1.rms_norm1",
r"trunk.blocks.(\d+).norm_2": r"encoder.layers.\1.rms_norm2",
# Final Norm
r"trunk.post_trunk_norm": r"rms_norm",
}
ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
# Vision Embeddings
r"image_encoder.preprocessor.patchifier.proj": r"vision_model.embeddings.patch_embed",
r"image_encoder.preprocessor.pos_embed": r"vision_model.embeddings.position_embedding.weight",
r"image_encoder.preprocessor.patchifier.norm.weight": r"vision_model.embeddings.rms_norm.weight",
# Vision Encoder Layers
r"image_encoder.trunk.blocks.(\d+).attn.qkv": r"vision_model.encoder.layers.\1.attention.qkv",
r"image_encoder.trunk.blocks.(\d+).attn.proj": r"vision_model.encoder.layers.\1.attention.out_proj",
r"image_encoder.trunk.blocks.(\d+).mlp.fc1": r"vision_model.encoder.layers.\1.ffn.gate_proj",
r"image_encoder.trunk.blocks.(\d+).mlp.fc2": r"vision_model.encoder.layers.\1.ffn.down_proj",
r"image_encoder.trunk.blocks.(\d+).mlp.fc3": r"vision_model.encoder.layers.\1.ffn.up_proj",
# Normalization Layers
r"image_encoder.trunk.blocks.(\d+).norm_1": r"vision_model.encoder.layers.\1.rms_norm1",
r"image_encoder.trunk.blocks.(\d+).norm_2": r"vision_model.encoder.layers.\1.rms_norm2",
r"image_encoder.trunk.post_trunk_norm": r"vision_model.rms_norm",
r"image_projector": r"visual_projection",
# Vision Head
r"image_encoder.head.cls_token": r"vision_model.head.cls_token",
r"image_encoder.head.k": r"vision_model.head.k_proj",
r"image_encoder.head.v": r"vision_model.head.v_proj",
r"image_encoder.head.linear": r"vision_model.head.output_proj",
# Text Embeddings
r"text_encoder.preprocessor.text_embedding.weight": r"text_model.embeddings.token_embedding.weight",
r"text_encoder.preprocessor.positional_embedding": r"text_model.embeddings.position_embedding.weight",
# Text Encoder Layers
r"text_encoder.trunk.blocks.(\d+).attn.qkv": r"text_model.encoder.layers.\1.attention.qkv",
r"text_encoder.trunk.blocks.(\d+).attn.proj": r"text_model.encoder.layers.\1.attention.out_proj",
r"text_encoder.trunk.blocks.(\d+).mlp.fc1": r"text_model.encoder.layers.\1.ffn.gate_proj",
r"text_encoder.trunk.blocks.(\d+).mlp.fc2": r"text_model.encoder.layers.\1.ffn.down_proj",
r"text_encoder.trunk.blocks.(\d+).mlp.fc3": r"text_model.encoder.layers.\1.ffn.up_proj",
# Text Normalization Layers
r"text_encoder.trunk.blocks.(\d+).norm_1": r"text_model.encoder.layers.\1.rms_norm1",
r"text_encoder.trunk.blocks.(\d+).norm_2": r"text_model.encoder.layers.\1.rms_norm2",
r"text_encoder.trunk.post_trunk_norm": r"text_model.rms_norm",
r"text_projector": r"text_projection",
r"log_logit_scale": r"logit_scale",
}
def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> dict[str, torch.Tensor]:
# Download only the model.safetensors file
directory_path = snapshot_download(
repo_id=model_id,
revision=revision,
allow_patterns=["model.safetensors"],
)
original_state_dict = {}
safetensor_path = f"{directory_path}/model.safetensors"
with safe_open(safetensor_path, framework="pt", device="cpu") as f:
for key in f.keys():
original_state_dict[key] = f.get_tensor(key)
return original_state_dict
def convert_old_keys_to_new_keys(state_dict_keys: dict, ORIGINAL_TO_CONVERTED_KEY_MAPPING: dict):
"""Converts state dict keys from the old format to the new format."""
output_dict = {}
if state_dict_keys is not None:
old_text = "\n".join(state_dict_keys)
new_text = old_text
for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items():
if replacement is None:
new_text = re.sub(pattern, "", new_text) # an empty line
continue
new_text = re.sub(pattern, replacement, new_text)
output_dict = dict(zip(old_text.split("\n"), new_text.split("\n")))
return output_dict
def split_qkv_tensor(key, tensor):
"""Splits a qkv tensor into separate q, k, v tensors and updates the key accordingly."""
new_keys = ["q_proj", "k_proj", "v_proj"]
split_size = tensor.shape[0] // 3
split_tensors = torch.split(tensor, split_size, dim=0)
return {key.replace("qkv", new_key): split_tensors[i] for i, new_key in enumerate(new_keys)}
def get_model_config_mapping(model_id: str):
"""Determines the correct model, config, and key mappings based on the checkpoint name."""
if model_id == "apple/aimv2-large-patch14-224-lit":
return Aimv2Model, Aimv2Config, ORIGINAL_TO_CONVERTED_KEY_MAPPING
else:
return Aimv2VisionModel, Aimv2VisionConfig, ORIGINAL_TO_CONVERTED_KEY_MAPPING_VISION_MODEL
def write_model(
hf_repo_id: str,
output_dir: str,
safe_serialization: bool = True,
):
"""
Converts a model checkpoint to Hugging Face format and saves it.
Args:
hf_repo_id (str): The Hugging Face repo ID to load from.
output_dir (str): The directory to save the converted model.
safe_serialization (bool): Whether to use safe serialization.
Returns:
model: The reloaded Hugging Face model.
"""
os.makedirs(output_dir, exist_ok=True)
# Get the appropriate model, config, and key mapping
model_class, config_class, key_mapping = get_model_config_mapping(hf_repo_id)
# Load config and original state dict
config = config_class.from_pretrained(hf_repo_id)
# Checkpoint `apple/aimv2-large-patch14-224-lit` uses AttentionPoolingHead hence set the required attr in config.
if hf_repo_id != "apple/aimv2-large-patch14-224-lit":
config.use_head = False
if hf_repo_id == "apple/aimv2-large-patch14-native":
config.is_native = True
original_state_dict = load_original_state_dict(hf_repo_id)
print("Converting model...")
state_dict = {}
result = convert_old_keys_to_new_keys(original_state_dict, key_mapping)
all_keys = list(original_state_dict.keys())
for key in all_keys:
value = original_state_dict[key]
new_key = result.pop(key)
if "qkv" in new_key:
qkv_state_dict = split_qkv_tensor(new_key, value)
state_dict.update(qkv_state_dict)
else:
state_dict[new_key] = value
# Check if position embeddings exist before squeezing
if new_key.endswith("position_embedding.weight"):
state_dict[new_key] = value.squeeze(0)
print(f"Loading the checkpoint in a {model_class.__name__}.")
model = model_class(config)
model.load_state_dict(state_dict, strict=True, assign=True)
print("Checkpoint loaded successfully.")
print("Saving the model.")
model.save_pretrained(output_dir, safe_serialization=safe_serialization)
del state_dict, model
gc.collect()
print("Reloading the model to check if it's saved correctly.")
model = model_class.from_pretrained(output_dir, device_map="auto")
print("Model reloaded successfully.")
return model
def write_image_processor(hf_repo_id: str, output_dir: str):
if hf_repo_id == "apple/aimv2-large-patch14-224-lit":
image_processor = AutoProcessor.from_pretrained(hf_repo_id, use_fast=True)
else:
image_processor = AutoImageProcessor.from_pretrained(hf_repo_id, use_fast=True)
image_processor.save_pretrained(output_dir)
return image_processor
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--hf_repo_id",
default="apple/aimv2-large-patch14-224",
help="Location of official weights from apple on HF",
)
parser.add_argument(
"--output_dir",
default="aimv2_model",
help="Location to write the converted model and processor",
)
parser.add_argument(
"--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`."
)
parser.add_argument(
"--push_to_hub",
action=argparse.BooleanOptionalAction,
help="Whether or not to push the converted model to the huggingface hub.",
)
parser.add_argument(
"--hub_repo_id",
default=None,
help="Huggingface hub repo to write the converted model and processor",
)
args = parser.parse_args()
model = write_model(
hf_repo_id=args.hf_repo_id,
output_dir=args.output_dir,
safe_serialization=args.safe_serialization,
)
image_processor = write_image_processor(
hf_repo_id=args.hf_repo_id,
output_dir=args.output_dir,
)
if args.push_to_hub:
print("Pushing to hub...")
model.push_to_hub(args.hub_repo_id)
image_processor.push_to_hub(args.hub_repo_id)
if __name__ == "__main__":
main()

View File

@ -613,11 +613,11 @@ class Aimv2TextModel(Aimv2PreTrainedModel):
@auto_docstring
class Aimv2Model(CLIPModel, nn.Module):
class Aimv2Model(CLIPModel):
_supports_flash_attn = True
def __init__(self, config: Aimv2Config):
nn.Module().__init__(config)
PreTrainedModel.__init__(self, config)
self.projection_dim = config.projection_dim
self.vision_embed_dim = config.vision_config.hidden_size

View File

@ -1,62 +0,0 @@
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert ALBERT checkpoint."""
import argparse
import torch
from ...utils import logging
from . import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
# Initialise PyTorch model
config = AlbertConfig.from_json_file(albert_config_file)
print(f"Building PyTorch model from configuration: {config}")
model = AlbertForPreTraining(config)
# Load weights from tf checkpoint
load_tf_weights_in_albert(model, config, tf_checkpoint_path)
# Save pytorch-model
print(f"Save PyTorch model to {pytorch_dump_path}")
torch.save(model.state_dict(), pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--albert_config_file",
default=None,
type=str,
required=True,
help=(
"The config json file corresponding to the pre-trained ALBERT model. \n"
"This specifies the model architecture."
),
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path)

View File

@ -1,389 +0,0 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert ALIGN checkpoints from the original repository."""
import argparse
import os
import align
import numpy as np
import requests
import tensorflow as tf
import torch
from PIL import Image
from tokenizer import Tokenizer
from transformers import (
AlignConfig,
AlignModel,
AlignProcessor,
BertConfig,
BertTokenizer,
EfficientNetConfig,
EfficientNetImageProcessor,
)
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def preprocess(image):
image = tf.image.resize(image, (346, 346))
image = tf.image.crop_to_bounding_box(image, (346 - 289) // 2, (346 - 289) // 2, 289, 289)
return image
def get_align_config():
vision_config = EfficientNetConfig.from_pretrained("google/efficientnet-b7")
vision_config.image_size = 289
vision_config.hidden_dim = 640
vision_config.id2label = {"0": "LABEL_0", "1": "LABEL_1"}
vision_config.label2id = {"LABEL_0": 0, "LABEL_1": 1}
vision_config.depthwise_padding = []
text_config = BertConfig()
config = AlignConfig.from_text_vision_configs(
text_config=text_config, vision_config=vision_config, projection_dim=640
)
return config
# We will verify our results on an image of cute cats
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
def get_processor():
image_processor = EfficientNetImageProcessor(
do_center_crop=True,
rescale_factor=1 / 127.5,
rescale_offset=True,
do_normalize=False,
include_top=False,
resample=Image.BILINEAR,
)
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
tokenizer.model_max_length = 64
processor = AlignProcessor(image_processor=image_processor, tokenizer=tokenizer)
return processor
# here we list all keys to be renamed (original name on the left, our name on the right)
def rename_keys(original_param_names):
# EfficientNet image encoder
block_names = [v.split("_")[0].split("block")[1] for v in original_param_names if v.startswith("block")]
block_names = list(set(block_names))
block_names = sorted(block_names)
num_blocks = len(block_names)
block_name_mapping = {b: str(i) for b, i in zip(block_names, range(num_blocks))}
rename_keys = []
rename_keys.append(("stem_conv/kernel:0", "embeddings.convolution.weight"))
rename_keys.append(("stem_bn/gamma:0", "embeddings.batchnorm.weight"))
rename_keys.append(("stem_bn/beta:0", "embeddings.batchnorm.bias"))
rename_keys.append(("stem_bn/moving_mean:0", "embeddings.batchnorm.running_mean"))
rename_keys.append(("stem_bn/moving_variance:0", "embeddings.batchnorm.running_var"))
for b in block_names:
hf_b = block_name_mapping[b]
rename_keys.append((f"block{b}_expand_conv/kernel:0", f"encoder.blocks.{hf_b}.expansion.expand_conv.weight"))
rename_keys.append((f"block{b}_expand_bn/gamma:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.weight"))
rename_keys.append((f"block{b}_expand_bn/beta:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.bias"))
rename_keys.append(
(f"block{b}_expand_bn/moving_mean:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_mean")
)
rename_keys.append(
(f"block{b}_expand_bn/moving_variance:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_var")
)
rename_keys.append(
(f"block{b}_dwconv/depthwise_kernel:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_conv.weight")
)
rename_keys.append((f"block{b}_bn/gamma:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.weight"))
rename_keys.append((f"block{b}_bn/beta:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.bias"))
rename_keys.append(
(f"block{b}_bn/moving_mean:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_mean")
)
rename_keys.append(
(f"block{b}_bn/moving_variance:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_var")
)
rename_keys.append((f"block{b}_se_reduce/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.weight"))
rename_keys.append((f"block{b}_se_reduce/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.bias"))
rename_keys.append((f"block{b}_se_expand/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.weight"))
rename_keys.append((f"block{b}_se_expand/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.bias"))
rename_keys.append(
(f"block{b}_project_conv/kernel:0", f"encoder.blocks.{hf_b}.projection.project_conv.weight")
)
rename_keys.append((f"block{b}_project_bn/gamma:0", f"encoder.blocks.{hf_b}.projection.project_bn.weight"))
rename_keys.append((f"block{b}_project_bn/beta:0", f"encoder.blocks.{hf_b}.projection.project_bn.bias"))
rename_keys.append(
(f"block{b}_project_bn/moving_mean:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_mean")
)
rename_keys.append(
(f"block{b}_project_bn/moving_variance:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_var")
)
key_mapping = {}
for item in rename_keys:
if item[0] in original_param_names:
key_mapping[item[0]] = "vision_model." + item[1]
# BERT text encoder
rename_keys = []
old = "tf_bert_model/bert"
new = "text_model"
for i in range(12):
rename_keys.append(
(
f"{old}/encoder/layer_._{i}/attention/self/query/kernel:0",
f"{new}.encoder.layer.{i}.attention.self.query.weight",
)
)
rename_keys.append(
(
f"{old}/encoder/layer_._{i}/attention/self/query/bias:0",
f"{new}.encoder.layer.{i}.attention.self.query.bias",
)
)
rename_keys.append(
(
f"{old}/encoder/layer_._{i}/attention/self/key/kernel:0",
f"{new}.encoder.layer.{i}.attention.self.key.weight",
)
)
rename_keys.append(
(
f"{old}/encoder/layer_._{i}/attention/self/key/bias:0",
f"{new}.encoder.layer.{i}.attention.self.key.bias",
)
)
rename_keys.append(
(
f"{old}/encoder/layer_._{i}/attention/self/value/kernel:0",
f"{new}.encoder.layer.{i}.attention.self.value.weight",
)
)
rename_keys.append(
(
f"{old}/encoder/layer_._{i}/attention/self/value/bias:0",
f"{new}.encoder.layer.{i}.attention.self.value.bias",
)
)
rename_keys.append(
(
f"{old}/encoder/layer_._{i}/attention/output/dense/kernel:0",
f"{new}.encoder.layer.{i}.attention.output.dense.weight",
)
)
rename_keys.append(
(
f"{old}/encoder/layer_._{i}/attention/output/dense/bias:0",
f"{new}.encoder.layer.{i}.attention.output.dense.bias",
)
)
rename_keys.append(
(
f"{old}/encoder/layer_._{i}/attention/output/LayerNorm/gamma:0",
f"{new}.encoder.layer.{i}.attention.output.LayerNorm.weight",
)
)
rename_keys.append(
(
f"{old}/encoder/layer_._{i}/attention/output/LayerNorm/beta:0",
f"{new}.encoder.layer.{i}.attention.output.LayerNorm.bias",
)
)
rename_keys.append(
(
f"{old}/encoder/layer_._{i}/intermediate/dense/kernel:0",
f"{new}.encoder.layer.{i}.intermediate.dense.weight",
)
)
rename_keys.append(
(
f"{old}/encoder/layer_._{i}/intermediate/dense/bias:0",
f"{new}.encoder.layer.{i}.intermediate.dense.bias",
)
)
rename_keys.append(
(f"{old}/encoder/layer_._{i}/output/dense/kernel:0", f"{new}.encoder.layer.{i}.output.dense.weight")
)
rename_keys.append(
(f"{old}/encoder/layer_._{i}/output/dense/bias:0", f"{new}.encoder.layer.{i}.output.dense.bias")
)
rename_keys.append(
(f"{old}/encoder/layer_._{i}/output/LayerNorm/gamma:0", f"{new}.encoder.layer.{i}.output.LayerNorm.weight")
)
rename_keys.append(
(f"{old}/encoder/layer_._{i}/output/LayerNorm/beta:0", f"{new}.encoder.layer.{i}.output.LayerNorm.bias")
)
rename_keys.append((f"{old}/embeddings/word_embeddings/weight:0", f"{new}.embeddings.word_embeddings.weight"))
rename_keys.append(
(f"{old}/embeddings/position_embeddings/embeddings:0", f"{new}.embeddings.position_embeddings.weight")
)
rename_keys.append(
(f"{old}/embeddings/token_type_embeddings/embeddings:0", f"{new}.embeddings.token_type_embeddings.weight")
)
rename_keys.append((f"{old}/embeddings/LayerNorm/gamma:0", f"{new}.embeddings.LayerNorm.weight"))
rename_keys.append((f"{old}/embeddings/LayerNorm/beta:0", f"{new}.embeddings.LayerNorm.bias"))
rename_keys.append((f"{old}/pooler/dense/kernel:0", f"{new}.pooler.dense.weight"))
rename_keys.append((f"{old}/pooler/dense/bias:0", f"{new}.pooler.dense.bias"))
rename_keys.append(("dense/kernel:0", "text_projection.weight"))
rename_keys.append(("dense/bias:0", "text_projection.bias"))
rename_keys.append(("dense/bias:0", "text_projection.bias"))
rename_keys.append(("temperature:0", "temperature"))
for item in rename_keys:
if item[0] in original_param_names:
key_mapping[item[0]] = item[1]
return key_mapping
def replace_params(hf_params, tf_params, key_mapping):
list(hf_params.keys())
for key, value in tf_params.items():
if key not in key_mapping:
continue
hf_key = key_mapping[key]
if "_conv" in key and "kernel" in key:
new_hf_value = torch.from_numpy(value).permute(3, 2, 0, 1)
elif "embeddings" in key:
new_hf_value = torch.from_numpy(value)
elif "depthwise_kernel" in key:
new_hf_value = torch.from_numpy(value).permute(2, 3, 0, 1)
elif "kernel" in key:
new_hf_value = torch.from_numpy(np.transpose(value))
elif "temperature" in key:
new_hf_value = value
elif "bn/gamma" in key or "bn/beta" in key:
new_hf_value = torch.from_numpy(np.transpose(value)).squeeze()
else:
new_hf_value = torch.from_numpy(value)
# Replace HF parameters with original TF model parameters
hf_params[hf_key].copy_(new_hf_value)
@torch.no_grad()
def convert_align_checkpoint(checkpoint_path, pytorch_dump_folder_path, save_model, push_to_hub):
"""
Copy/paste/tweak model's weights to our ALIGN structure.
"""
# Load original model
seq_length = 64
tok = Tokenizer(seq_length)
original_model = align.Align("efficientnet-b7", "bert-base", 640, seq_length, tok.get_vocab_size())
original_model.compile()
original_model.load_weights(checkpoint_path)
tf_params = original_model.trainable_variables
tf_non_train_params = original_model.non_trainable_variables
tf_params = {param.name: param.numpy() for param in tf_params}
for param in tf_non_train_params:
tf_params[param.name] = param.numpy()
tf_param_names = list(tf_params.keys())
# Load HuggingFace model
config = get_align_config()
hf_model = AlignModel(config).eval()
hf_params = hf_model.state_dict()
# Create src-to-dst parameter name mapping dictionary
print("Converting parameters...")
key_mapping = rename_keys(tf_param_names)
replace_params(hf_params, tf_params, key_mapping)
# Initialize processor
processor = get_processor()
inputs = processor(
images=prepare_img(), text="A picture of a cat", padding="max_length", max_length=64, return_tensors="pt"
)
# HF model inference
hf_model.eval()
with torch.no_grad():
outputs = hf_model(**inputs)
hf_image_features = outputs.image_embeds.detach().numpy()
hf_text_features = outputs.text_embeds.detach().numpy()
# Original model inference
original_model.trainable = False
tf_image_processor = EfficientNetImageProcessor(
do_center_crop=True,
do_rescale=False,
do_normalize=False,
include_top=False,
resample=Image.BILINEAR,
)
image = tf_image_processor(images=prepare_img(), return_tensors="tf", data_format="channels_last")["pixel_values"]
text = tok(tf.constant(["A picture of a cat"]))
image_features = original_model.image_encoder(image, training=False)
text_features = original_model.text_encoder(text, training=False)
image_features = tf.nn.l2_normalize(image_features, axis=-1)
text_features = tf.nn.l2_normalize(text_features, axis=-1)
# Check whether original and HF model outputs match -> np.allclose
if not np.allclose(image_features, hf_image_features, atol=1e-3):
raise ValueError("The predicted image features are not the same.")
if not np.allclose(text_features, hf_text_features, atol=1e-3):
raise ValueError("The predicted text features are not the same.")
print("Model outputs match!")
if save_model:
# Create folder to save model
if not os.path.isdir(pytorch_dump_folder_path):
os.mkdir(pytorch_dump_folder_path)
# Save converted model and image processor
hf_model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
# Push model and image processor to hub
print("Pushing converted ALIGN to the hub...")
processor.push_to_hub("align-base")
hf_model.push_to_hub("align-base")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--checkpoint_path",
default="./weights/model-weights",
type=str,
help="Path to the pretrained TF ALIGN checkpoint.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default="hf_model",
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument("--save_model", action="store_true", help="Save model to local")
parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")
args = parser.parse_args()
convert_align_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub)

View File

@ -0,0 +1,32 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team and the Swiss AI Initiative. All rights reserved.
#
# This code is based on HuggingFace's LLaMA implementation in this library.
# It has been modified from its original forms to accommodate the architectural
# differences made by the Swiss AI Initiative that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_apertus import *
from .modeling_apertus import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

Some files were not shown because too many files have changed in this diff.