solve unequal cropping

use existing methods, add default image
add an unnormalize image method
2025-10-20 17:13:56 +08:00 · 2025-08-11 19:20:28 +02:00 · 2025-08-11 16:44:06 +02:00 · 2025-08-11 16:43:27 +02:00 · 2025-08-06 19:19:09 +02:00 · 2025-08-06 19:17:38 +02:00
1287 changed files with 57816 additions and 25029 deletions
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@ -18,10 +18,6 @@ jobs:
      notebook_folder: transformers_doc
      languages: ar de en es fr hi it ko pt tr zh ja te
      custom_container: huggingface/transformers-doc-builder
-      # Temporary pin to work around datasets exception in the docbuilder.Remove after docker images and main have
-      # the right dependencies (which **should** be the case by 2025-07-20). See
-      # https://github.com/huggingface/transformers/actions/runs/16365952006/job/46243081358?pr=38545
-      pre_command: uv pip install datasets>=2.15.0
    secrets:
      token: ${{ secrets.HUGGINGFACE_PUSH }}
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@ -15,7 +15,3 @@ jobs:
      pr_number: ${{ github.event.number }}
      package: transformers
      languages: en
-      # Temporary pin to work around datasets exception in the docbuilder. Remove after docker images and main have
-      # the right dependencies (which **should** be the case by 2025-07-20). See
-      # https://github.com/huggingface/transformers/actions/runs/16365952006/job/46243081358?pr=38545
-      pre_command: uv pip install datasets>=2.15.0
--- a/.github/workflows/doctest_job.yml
+++ b/.github/workflows/doctest_job.yml
@ -31,7 +31,7 @@ jobs:
      group: aws-g5-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@ -18,7 +18,7 @@ jobs:
      group: aws-g5-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      job_splits: ${{ steps.set-matrix.outputs.job_splits }}
      split_keys: ${{ steps.set-matrix.outputs.split_keys }}
--- a/.github/workflows/pr_build_doc_with_comment.yml
+++ b/.github/workflows/pr_build_doc_with_comment.yml
@ -0,0 +1,134 @@
+name: PR - build doc via comment
+on:  # runs when a new comment is created on an issue / pull request
+  issue_comment:
+    types:
+      - created
+    branches-ignore:  # NOTE(review): confirm GitHub honours branch filters for `issue_comment` events
+      - main
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.issue.number }}-${{ startsWith(github.event.comment.body, 'build-doc') }}  # keyed by workflow, issue number and whether the comment starts with `build-doc`, so other comments land in a different group
+  cancel-in-progress: true  # a newer run in the same group cancels the one in progress
+permissions: {}  # empty by default; jobs that call the API declare their own `permissions`
+
+
+jobs:
+  get-pr-number:  # gate: issue must be open, actor must be in the allow-list, comment must start with `build-doc`
+    name: Get PR number
+    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam"]'), github.actor) && (startsWith(github.event.comment.body, 'build-doc')) }}
+    uses: ./.github/workflows/get-pr-number.yml  # local reusable workflow; later jobs read its PR_NUMBER output
+
+  get-pr-info:  # later jobs read PR_HEAD_SHA, PR_MERGE_COMMIT_DATE and PR_MERGE_COMMIT_TIMESTAMP from this job's outputs
+    name: Get PR commit SHA
+    needs: get-pr-number
+    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}  # skip when no PR number was produced
+    uses: ./.github/workflows/get-pr-info.yml
+    with:
+      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
+
+  verity_pr_commit:  # NOTE(review): no job in this workflow lists this one under `needs`, so a failure here does not stop build-doc - confirm intended
+    name: Verify PR commit corresponds to a specific event by comparing timestamps
+    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != '' }}
+    runs-on: ubuntu-22.04
+    needs: [get-pr-number, get-pr-info]  # `needs.<job>` is only populated for direct dependencies; the `if` above reads get-pr-number
+    env:
+      COMMENT_DATE: ${{ github.event.comment.created_at }}
+      PR_MERGE_COMMIT_DATE: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
+      PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
+    steps:
+      - run: |  # fail when the triggering comment is not strictly newer than the PR merge commit
+          COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
+          echo "COMMENT_DATE: $COMMENT_DATE"
+          echo "PR_MERGE_COMMIT_DATE: $PR_MERGE_COMMIT_DATE"
+          echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
+          echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
+          if [ "$COMMENT_TIMESTAMP" -le "$PR_MERGE_COMMIT_TIMESTAMP" ]; then
+            echo "Last commit on the pull request is newer than the issue comment triggering this run! Abort!";
+            exit 1;
+          fi
+
+  create_run:  # posts a `pending` commit status (context `custom-doc-build`) on the PR head SHA
+    name: Create run
+    needs: [get-pr-number, get-pr-info]
+    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != '' }}
+    permissions:
+      statuses: write  # required by the POST to the commit-status endpoint below
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Create Run
+        id: create_run
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Create a commit status (pending) for a run of this workflow. The status has to be updated later in `update_run_status`.
+          # See https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status
+          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
+        run: |
+          gh api \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            repos/${{ github.repository }}/statuses/${{ needs.get-pr-info.outputs.PR_HEAD_SHA }} \
+            -f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Custom doc building job" -f "context=custom-doc-build"
+
+  reply_to_comment:  # posts a comment on the PR that links to this workflow run
+    name: Reply to the comment
+    if: ${{ needs.create_run.result == 'success' }}  # only reply once the pending status was created
+    needs: [get-pr-number, create_run]
+    permissions:
+      pull-requests: write
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Reply to the comment
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
+        run: |
+          gh api \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
+            -f "body=[Building docs for all languages...](${{ env.GITHUB_RUN_URL }})"
+
+  build-doc:  # delegates the documentation build to the doc-builder reusable workflow
+    name: Build doc
+    needs: [get-pr-number, get-pr-info]
+    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != '' }}
+    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main  # referenced at `main`, not pinned to a commit
+    with:
+      commit_sha: ${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}  # the PR head commit reported by get-pr-info
+      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
+      package: transformers
+      languages: ar de en es fr hi it ko pt tr zh ja te
+
+  update_run_status:  # resolves the `custom-doc-build` commit status that create_run left as `pending`
+    name: Update Check Run Status
+    needs: [ get-pr-info, create_run, build-doc ]
+    permissions:
+      statuses: write
+    if: ${{ always() && needs.create_run.result == 'success' }}  # always(): still runs when build-doc failed, so the status never stays pending
+    runs-on: ubuntu-22.04
+    env:
+      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
+      STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.build-doc.result) }}  # judge build-doc; create_run is always 'success' whenever this job runs
+    steps:
+      - name: Get `build-doc` job status
+        run: |
+          echo "${{ needs.build-doc.result }}"
+          echo "$STATUS_OK"
+          if [ "$STATUS_OK" = "true" ]; then
+            echo "STATUS=success" >> "$GITHUB_ENV"
+          else
+            echo "STATUS=failure" >> "$GITHUB_ENV"
+          fi
+
+      - name: Update PR commit statuses
+        run: |
+          echo "${{ needs.build-doc.result }}"
+          echo "${{ env.STATUS }}"
+          gh api \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            repos/${{ github.repository }}/statuses/${{ needs.get-pr-info.outputs.PR_HEAD_SHA }} \
+            -f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Custom doc building job" -f "context=custom-doc-build"
--- a/.github/workflows/pr_run_slow_ci.yml
+++ b/.github/workflows/pr_run_slow_ci.yml
@ -16,28 +16,6 @@ jobs:
    with:
      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}

-  # We only need to verify the timestamp if the workflow is triggered by `issue_comment`.
-  verity_pr_commit:
-    name: Verity PR commit corresponds to a specific event by comparing timestamps
-    if: ${{ github.event.comment.created_at != '' }}
-    runs-on: ubuntu-22.04
-    needs: get-pr-info
-    env:
-      COMMENT_DATE: ${{ github.event.comment.created_at }}
-      PR_MERGE_COMMIT_DATE: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
-      PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
-    steps:
-      - run: |
-          COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
-          echo "COMMENT_DATE: $COMMENT_DATE"
-          echo "PR_MERGE_COMMIT_DATE: $PR_MERGE_COMMIT_DATE"
-          echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
-          echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
-          if [ $COMMENT_TIMESTAMP -le $PR_MERGE_COMMIT_TIMESTAMP ]; then
-            echo "Last commit on the pull request is newer than the issue comment triggering this run! Abort!";
-            exit -1;
-          fi
-
  get-jobs:
    name: Get test files to run
    runs-on: ubuntu-22.04
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -29,7 +29,7 @@ jobs:
    runs-on: ubuntu-22.04
    name: Get PR number
    # For security: only allow team members to run
-    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
+    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
    outputs:
      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
    steps:
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@ -36,7 +36,7 @@ jobs:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-all-latest-gpu-push-ci
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      test_map: ${{ steps.set-matrix.outputs.test_map }}
@ -136,7 +136,7 @@ jobs:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-all-latest-gpu-push-ci
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    env:
      # For the meaning of these environment variables, see the job `Setup`
      CI_BRANCH_PUSH: ${{ github.event.ref }}
@ -362,7 +362,7 @@ jobs:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    env:
      # For the meaning of these environment variables, see the job `Setup`
      CI_BRANCH_PUSH: ${{ github.event.ref }}
--- a/.github/workflows/self-scheduled-amd-mi325-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi325-caller.yml
@ -0,0 +1,63 @@
+name: Self-hosted runner scale set (AMD mi325 scheduled CI caller)
+
+# Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml
+# For example, 1gpu scale set: amd-mi325-ci-1gpu
+#              2gpu scale set: amd-mi325-ci-2gpu
+
+on:
+  workflow_run:  # chained trigger: fires when the workflow named below completes on `main`
+    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:  # also runs on pushes to branches matching the pattern below
+    branches:
+      - run_amd_scheduled_ci_caller*
+
+jobs:  # every job calls the same reusable workflow; the jobs differ in `job` and, for deepspeed-ci, in `docker`
+  model-ci:
+    name: Model CI
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
+    with:
+      job: run_models_gpu
+      slack_report_channel: "#amd-hf-ci"
+      runner_scale_set: amd-mi325-ci
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi325
+      report_repo_id: optimum-amd/transformers_daily_ci
+    secrets: inherit
+
+  torch-pipeline:
+    name: Torch pipeline CI
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
+    with:
+      job: run_pipelines_torch_gpu
+      slack_report_channel: "#amd-hf-ci"
+      runner_scale_set: amd-mi325-ci
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi325
+      report_repo_id: optimum-amd/transformers_daily_ci
+    secrets: inherit
+
+  example-ci:
+    name: Example CI
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
+    with:
+      job: run_examples_gpu
+      slack_report_channel: "#amd-hf-ci"
+      runner_scale_set: amd-mi325-ci
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi325
+      report_repo_id: optimum-amd/transformers_daily_ci
+    secrets: inherit
+
+  deepspeed-ci:
+    name: DeepSpeed CI
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
+    with:
+      job: run_torch_cuda_extensions_gpu
+      slack_report_channel: "#amd-hf-ci"
+      runner_scale_set: amd-mi325-ci
+      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu  # the only job that uses the deepspeed image
+      ci_event: Scheduled CI (AMD) - mi325
+      report_repo_id: optimum-amd/transformers_daily_ci
+    secrets: inherit
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -55,7 +55,7 @@ jobs:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
@ -219,7 +219,7 @@ jobs:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
--- a/README.md
+++ b/README.md
@ -44,7 +44,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ja.md">日本語</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_hd.md">हिन्दी</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ru.md">Русский</a> |
-        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Рortuguês</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Português</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_te.md">తెలుగు</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
@ -242,7 +242,7 @@ pipeline(

 - This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
 - The training API is optimized to work with PyTorch models provided by Transformers. For generic machine learning loops, you should use another library like [Accelerate](https://huggingface.co/docs/accelerate).
- The [example scripts]((https://github.com/huggingface/transformers/tree/main/examples)) are only *examples*. They may not necessarily work out-of-the-box on your specific use case and you'll need to adapt the code for it to work.
+- The [example scripts](https://github.com/huggingface/transformers/tree/main/examples) are only *examples*. They may not necessarily work out-of-the-box on your specific use case and you'll need to adapt the code for it to work.

 ## 100 projects using Transformers

@ -280,8 +280,8 @@ Expand each modality below to see a few example models for various use cases.
 - Automatic mask generation with [SAM](https://huggingface.co/facebook/sam-vit-base)
 - Depth estimation with [DepthPro](https://huggingface.co/apple/DepthPro-hf)
 - Image classification with [DINO v2](https://huggingface.co/facebook/dinov2-base)
- Keypoint detection with [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor)
- Keypoint matching with [SuperGlue](https://huggingface.co/magic-leap-community/superglue)
+- Keypoint detection with [SuperPoint](https://huggingface.co/magic-leap-community/superpoint)
+- Keypoint matching with [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor)
 - Object detection with [RT-DETRv2](https://huggingface.co/PekingU/rtdetr_v2_r50vd)
 - Pose Estimation with [VitPose](https://huggingface.co/usyd-community/vitpose-base-simple)
 - Universal segmentation with [OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_swin_large)
--- a/conftest.py
+++ b/conftest.py
@ -23,13 +23,12 @@ from os.path import abspath, dirname, join
 import _pytest
 import pytest

-from transformers.testing_utils import HfDoctestModule, HfDocTestParser
+from transformers.testing_utils import HfDoctestModule, HfDocTestParser, is_torch_available


 NOT_DEVICE_TESTS = {
    "test_tokenization",
    "test_tokenization_mistral_common",
-    "test_processor",
    "test_processing",
    "test_beam_constraints",
    "test_configuration_utils",
@ -128,3 +127,10 @@ class CustomOutputChecker(OutputChecker):
 doctest.OutputChecker = CustomOutputChecker
 _pytest.doctest.DoctestModule = HfDoctestModule
 doctest.DocTestParser = HfDocTestParser
+
+if is_torch_available():
+    import torch
+
+    # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
+    # We set it to `False` for CI. See https://github.com/pytorch/pytorch/issues/157274#issuecomment-3090791615
+    torch.backends.cudnn.allow_tf32 = False
--- a/docker/consistency.dockerfile
+++ b/docker/consistency.dockerfile
@ -5,7 +5,7 @@ ARG REF=main
 RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython
-RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir --upgrade 'torch<2.8' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 # tensorflow pin matching setup.py
 RUN uv pip install --no-cache-dir pypi-kenlm
 RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"
--- a/docker/custom-tokenizers.dockerfile
+++ b/docker/custom-tokenizers.dockerfile
@ -16,7 +16,7 @@ RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
 RUN make install -j 10


-RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache --upgrade 'torch<2.8' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
 # spacy is not used so not tested. Causes to failures. TODO fix later
--- a/docker/examples-torch.dockerfile
+++ b/docker/examples-torch.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git ffmpeg
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch<2.8' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
 RUN uv pip uninstall transformers
--- a/docker/exotic-models.dockerfile
+++ b/docker/exotic-models.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv &&  uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch<2.8' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir  --no-deps timm accelerate
 RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
 # RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels
--- a/docker/pipeline-torch.dockerfile
+++ b/docker/pipeline-torch.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch<2.8' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
 RUN uv pip uninstall transformers
--- a/docker/torch-jax-light.dockerfile
+++ b/docker/torch-jax-light.dockerfile
@ -6,7 +6,7 @@ RUN apt-get update &&  apt-get install -y libsndfile1-dev espeak-ng time git g++
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
 RUN uv pip install --no-deps accelerate
-RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch<2.8' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,audio,sklearn,sentencepiece,vision,testing]"


--- a/docker/torch-light.dockerfile
+++ b/docker/torch-light.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs ffmpeg
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch<2.8' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
 RUN uv pip uninstall transformers
--- a/docker/torch-tf-light.dockerfile
+++ b/docker/torch-tf-light.dockerfile
@ -7,7 +7,7 @@ RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-de
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
 RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch<2.8' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN git lfs install

 RUN uv pip install --no-cache-dir pypi-kenlm
--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@ -1,11 +1,8 @@
-FROM rocm/pytorch:rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.6.0
+FROM rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive

-ARG TORCH_VISION='0.21.0'
-ARG TORCH_AUDIO='2.6.0'
-
 RUN apt update && \
    apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip python3-dev ffmpeg git-lfs && \
    apt clean && \
@ -23,9 +20,12 @@ WORKDIR /
 ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

-RUN python3 -m pip install --no-cache-dir torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO
-RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]
+# On ROCm, torchcodec is required to decode audio files
+# RUN python3 -m pip install --no-cache-dir torchcodec
+# Install transformers
+RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video,audio]

+# Remove tensorflow and flax as they are no longer supported by transformers
 RUN python3 -m pip uninstall -y tensorflow flax

 # When installing in editable mode, `transformers` is not recognized as a package.
@ -36,4 +36,4 @@ RUN cd transformers && python3 setup.py develop
 RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y

 # `kernels` may causes many failing tests
-RUN python3 -m pip uninstall -y kernels
+RUN python3 -m pip uninstall -y kernels
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@ -78,6 +78,10 @@ RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submod
 # RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1
 # RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git

+# Add fp-quant for quantization testing
+# Requires py3.11 but our CI runs on 3.9
+# RUN python3 -m pip install --no-cache-dir "fp-quant>=0.1.6"
+
 # Add compressed-tensors for quantization testing
 RUN python3 -m pip install --no-cache-dir compressed-tensors

--- a/docs/source/ar/custom_models.md
+++ b/docs/source/ar/custom_models.md
@ -280,7 +280,7 @@ resnet50d.model.load_state_dict(pretrained_model.state_dict())
 الآن لإرسال النموذج إلى Hub، تأكد من تسجيل الدخول. إما تشغيل في المحطة الأوامر الطرفية الخاصة بك:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 أو من دفتر ملاحظات:
--- a/docs/source/ar/llm_tutorial_optimization.md
+++ b/docs/source/ar/llm_tutorial_optimization.md
@ -13,11 +13,11 @@

 في هذا الدليل، سنستعرض التقنيات الفعالة لتُحسِّن من كفاءة نشر نماذج اللغة الكبيرة:

-1. سنتناول تقنية "دقة أقل" التي أثبتت الأبحاث فعاليتها في تحقيق مزايا حسابية دون التأثير بشكل ملحوظ على أداء النموذج عن طريق العمل بدقة رقمية أقل [8 بت و4 بت](/main_classes/quantization.md).
+1. سنتناول تقنية "دقة أقل" التي أثبتت الأبحاث فعاليتها في تحقيق مزايا حسابية دون التأثير بشكل ملحوظ على أداء النموذج عن طريق العمل بدقة رقمية أقل [8 بت و4 بت](/main_classes/quantization).

 2.  **اFlash Attention:** إن Flash Attention وهي نسخة مُعدَّلة من خوارزمية الانتباه التي لا توفر فقط نهجًا أكثر كفاءة في استخدام الذاكرة، ولكنها تحقق أيضًا كفاءة متزايدة بسبب الاستخدام الأمثل لذاكرة GPU.

-3.  **الابتكارات المعمارية:** حيث تم اقتراح هياكل متخصصة تسمح باستدلال أكثر فعالية نظرًا لأن نماذج اللغة الكبيرة يتم نشرها دائمًا بنفس الطريقة أثناء عملية الاستدلال، أي توليد النص التنبؤي التلقائي مع سياق الإدخال الطويل، فقد تم اقتراح بنيات نموذج متخصصة تسمح بالاستدلال الأكثر كفاءة. أهم تقدم في بنيات النماذج هنا هو [عذر](https://huggingface.co/papers/2108.12409)، [الترميز الدوار](https://huggingface.co/papers/2104.09864)، [الاهتمام متعدد الاستعلامات (MQA)](https://huggingface.co/papers/1911.02150) و [مجموعة الانتباه بالاستعلام (GQA)]((https://huggingface.co/papers/2305.13245)).
+3.  **الابتكارات المعمارية:** حيث تم اقتراح هياكل متخصصة تسمح باستدلال أكثر فعالية نظرًا لأن نماذج اللغة الكبيرة يتم نشرها دائمًا بنفس الطريقة أثناء عملية الاستدلال، أي توليد النص التنبؤي التلقائي مع سياق الإدخال الطويل، فقد تم اقتراح بنيات نموذج متخصصة تسمح بالاستدلال الأكثر كفاءة. أهم تقدم في بنيات النماذج هنا هو [عذر](https://huggingface.co/papers/2108.12409)، [الترميز الدوار](https://huggingface.co/papers/2104.09864)، [الاهتمام متعدد الاستعلامات (MQA)](https://huggingface.co/papers/1911.02150) و [مجموعة الانتباه بالاستعلام (GQA)](https://huggingface.co/papers/2305.13245).

 على مدار هذا الدليل، سنقدم تحليلًا للتوليد التنبؤي التلقائي من منظور المُوتِّرات. نتعمق في مزايا وعيوب استخدام دقة أقل، ونقدم استكشافًا شاملاً لخوارزميات الانتباه الأحدث، ونناقش بنيات نماذج نماذج اللغة الكبيرة المحسنة. سندعم الشرح بأمثلة عملية تُبرِز كل تحسين على حدة.

--- a/docs/source/ar/model_sharing.md
+++ b/docs/source/ar/model_sharing.md
@ -41,7 +41,7 @@ picture-in-picture" allowfullscreen></iframe>
 قبل مشاركة نموذج على Hub، ستحتاج إلى بيانات اعتماد حساب Hugging Face الخاصة بك.  إذا كنت تستخدم منصة الأوامر، فقم بتشغيل الأمر التالي في بيئة افتراضية حيث تم تثبيت 🤗 Transformers. سيقوم هذا الأمر بتخزين رمز الدخول الخاص بك في مجلد تخزين المؤقت لـ Hugging Face (`~/.cache/` بشكل افتراضي):

 ```bash
-huggingface-cli login
+hf auth login
 ```

 إذا كنت تستخدم دفتر ملاحظات مثل Jupyter أو Colaboratory، فتأكد من تثبيت مكتبة [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library). تسمح لك هذه المكتبة بالتفاعل برمجيًا مع Hub.
--- a/docs/source/ar/run_scripts.md
+++ b/docs/source/ar/run_scripts.md
@ -324,7 +324,7 @@ python examples/pytorch/summarization/run_summarization.py
 يمكن لجميع النصوص البرمجية رفع نموذجك النهائي إلى [مركز النماذج](https://huggingface.co/models). تأكد من تسجيل الدخول إلى Hugging Face قبل البدء:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 ثم أضف المعلمة `push_to_hub` إلى النص البرمجي . ستقوم هذه المعلمة بإنشاء مستودع باستخدام اسم مستخدم Hugging Face واسم المجلد المحدد في `output_dir`.
--- a/docs/source/de/model_sharing.md
+++ b/docs/source/de/model_sharing.md
@ -56,7 +56,7 @@ Dateien lassen sich auch in einem Repository leicht bearbeiten, und Sie können
 Bevor Sie ein Modell für den Hub freigeben, benötigen Sie Ihre Hugging Face-Anmeldedaten. Wenn Sie Zugang zu einem Terminal haben, führen Sie den folgenden Befehl in der virtuellen Umgebung aus, in der 🤗 Transformers installiert ist. Dadurch werden Ihre Zugangsdaten in Ihrem Hugging Face-Cache-Ordner (standardmäßig `~/.cache/`) gespeichert:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 Wenn Sie ein Notebook wie Jupyter oder Colaboratory verwenden, stellen Sie sicher, dass Sie die [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) Bibliothek installiert haben. Diese Bibliothek ermöglicht Ihnen die programmatische Interaktion mit dem Hub.
--- a/docs/source/de/run_scripts.md
+++ b/docs/source/de/run_scripts.md
@ -324,7 +324,7 @@ python examples/pytorch/summarization/run_summarization.py
 Alle Skripte können Ihr endgültiges Modell in den [Model Hub](https://huggingface.co/models) hochladen. Stellen Sie sicher, dass Sie bei Hugging Face angemeldet sind, bevor Sie beginnen:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 Dann fügen Sie dem Skript das Argument `push_to_hub` hinzu. Mit diesem Argument wird ein Repository mit Ihrem Hugging Face-Benutzernamen und dem in `output_dir` angegebenen Ordnernamen erstellt.
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -72,8 +72,6 @@
      title: Caching
    - local: kv_cache
      title: KV cache strategies
-    - local: serving
-      title: Serving
    - local: llm_tutorial_optimization
      title: Getting the most out of LLMs
    - local: perplexity
@ -91,6 +89,18 @@
    - local: chat_extras
      title: Tools and RAG
    title: Chat with models
+  - sections:
+      - local: serving
+        title: Serving LLMs, VLMs, and other chat-based models
+      - local: jan
+        title: Jan
+      - local: cursor
+        title: Cursor
+      - local: tiny_agents
+        title: Tiny-Agents CLI and MCP tools
+      - local: open_webui
+        title: Open WebUI
+    title: Serving
  - sections:
    - local: perf_torch_compile
      title: torch.compile
@ -105,6 +115,8 @@
    title: Agents
  - local: tools
    title: Tools
+  - local: transformers_as_backend
+    title: Inference server backends
  title: Inference
 - isExpanded: false
  sections:
@ -177,6 +189,8 @@
    title: FBGEMM
  - local: quantization/finegrained_fp8
    title: Fine-grained FP8
+  - local: quantization/fp_quant
+    title: FP-Quant
  - local: gguf
    title: GGUF
  - local: quantization/gptq
@ -449,6 +463,8 @@
        title: ErnieM
      - local: model_doc/esm
        title: ESM
+      - local: model_doc/exaone4
+        title: EXAONE-4.0
      - local: model_doc/falcon
        title: Falcon
      - local: model_doc/falcon3
@ -495,6 +511,8 @@
        title: GPT2
      - local: model_doc/gpt_bigcode
        title: GPTBigCode
+      - local: model_doc/gpt_oss
+        title: GptOss
      - local: model_doc/gptsan-japanese
        title: GPTSAN Japanese
      - local: model_doc/gpt-sw3
@ -693,6 +711,8 @@
        title: XLM-V
      - local: model_doc/xlnet
        title: XLNet
+      - local: model_doc/xlstm
+        title: xLSTM
      - local: model_doc/yoso
        title: YOSO
      - local: model_doc/zamba
@ -721,6 +741,10 @@
        title: DAB-DETR
      - local: model_doc/deepseek_v2
        title: DeepSeek-V2
+      - local: model_doc/deepseek_vl
+        title: DeepseekVL
+      - local: model_doc/deepseek_vl_hybrid
+        title: DeepseekVLHybrid
      - local: model_doc/deformable_detr
        title: Deformable DETR
      - local: model_doc/deit
@ -747,6 +771,8 @@
        title: DPT
      - local: model_doc/efficientformer
        title: EfficientFormer
+      - local: model_doc/efficientloftr
+        title: EfficientLoFTR
      - local: model_doc/efficientnet
        title: EfficientNet
      - local: model_doc/eomt
@ -957,6 +983,8 @@
        title: CLIPSeg
      - local: model_doc/clvp
        title: CLVP
+      - local: model_doc/cohere2_vision
+        title: Cohere2Vision
      - local: model_doc/colpali
        title: ColPali
      - local: model_doc/colqwen2
@ -969,6 +997,8 @@
        title: Donut
      - local: model_doc/emu3
        title: Emu3
+      - local: model_doc/evolla
+        title: Evolla
      - local: model_doc/flava
        title: FLAVA
      - local: model_doc/gemma3
@ -1033,6 +1063,8 @@
        title: Mistral3
      - local: model_doc/mllama
        title: mllama
+      - local: model_doc/mm-grounding-dino
+        title: MM Grounding DINO
      - local: model_doc/nougat
        title: Nougat
      - local: model_doc/omdet-turbo
--- a/docs/source/en/attention_interface.md
+++ b/docs/source/en/attention_interface.md
@ -72,6 +72,34 @@ model(torch.ones(1, 5, dtype=int))
 and it will stop printing the statements, as it now uses the `sdpa` attention.  
 This allows to quickly change an attention function, without needing to reload the model!

+## Different attention per backbone in multimodal models
+
+For multimodal models, different attention functions may work better for each backbone module. For example, some vision backbones perform better in fp32, but are incompatible with FlashAttention. To continue using FlashAttention while keeping the vision encoder in fp32, create a dict and map each config to an attention implementation as shown below.
+
+```python
+from transformers import AutoModelForImageTextToText
+
+model_id = "facebook/chameleon-7b"
+
+attention_implementation_per_backbone = {"vision_config": "sdpa", "text_config": "flash_attention_2"}
+model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation=attention_implementation_per_backbone)
+
+# NOTE: keys in the attention implementation have to be the same as the sub-config names
+for key in attention_implementation_per_backbone:
+    assert key in model.config.sub_configs, f"Invalid key in `attention_implementation`"
+
+# You can omit certain backbones - the default attention function (SDPA) will be used
+# This is equivalent to the previous example
+model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation={"text_config": "flash_attention_2"})
+
+
+# Set the same attention implementation for all backbones with single string, same as in non-multimodal models
+model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation="eager")
+
+# Alternatively use a dict with an empty key for global configuration
+model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation={"": "eager"})
+```
+
 ## What about new args needed in my custom attention function?

 But indeed, what if the new function requires a new arg to be properly used? It's no issue! Models supporting the
--- a/docs/source/en/cache_explanation.md
+++ b/docs/source/en/cache_explanation.md
@ -82,22 +82,18 @@ When you use Transformers' [`Cache`] class, the self-attention module performs s

 ## Cache storage implementation

-The actual storage of key-value pairs varies between cache implementations. As an example, consider the [`DynamicCache`].
+Caches are structured as a list of layers, where each layer contains a key and value cache. The key and value caches are tensors with the shape `[batch_size, num_heads, seq_len, head_dim]`.

+Layers can be of different types (e.g. `DynamicLayer`, `StaticLayer`, `SlidingWindowLayer`), which mostly changes how sequence length is handled and how the cache is updated.

-In [`DynamicCache`], the key-value pairs are stored as two lists of tensors. Each tensor in the lists have the shape `[batch_size, num_heads, seq_len, head_dim]`.
- `key_cache`: A list of tensors, one for each layer.
- `value_cache`: A list of tensors, one for each layer.
+The simplest is a `DynamicLayer` that grows as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token:

-When new tokens are processed:
-
-1. For each layer, the new key and value states are concatenated with the existing cache.
 ```py
-self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
-self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
+cache.layers[idx].keys = torch.cat([cache.layers[idx].keys, key_states], dim=-2)
+cache.layers[idx].values = torch.cat([cache.layers[idx].values, value_states], dim=-2)
 ```

-2. The cache grows dynamically as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token.
+Other layer types like `StaticLayer` and `SlidingWindowLayer` have a fixed sequence length that is set when the cache is created. This makes them compatible with `torch.compile`. In the case of `SlidingWindowLayer`, existing tokens are shifted out of the cache when a new token is added.

 The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token.

@ -132,6 +128,34 @@ for _ in range(max_new_tokens):
 print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
 "[INST] Hello, what's your name. [/INST]  Hello! My name is LLaMA,"
 ```
+
+## Cache position
+
+The cache position tracks where to insert new tokens in the attention cache. It represents the *absolute* position of each token in the context, independent of padding or batch structure. Suppose you already cached `N` tokens and are now processing `K` new tokens. The cache position for the new tokens will range from `N` to `N + K - 1`. In other words, you're processing tokens at positions `[N, N + 1, N + 2, ..., N + K - 1]`.
+
+Cache position is used internally for two purposes:
+
+1. Selecting new tokens to process in the input sequence and ensuring only tokens that haven’t been cached yet are passed to the model's `forward`.
+2. Storing key/value pairs at the correct positions in the cache. This is especially important for fixed-size caches, like [`StaticCache`], which pre-allocates a specific cache length.
+
+The generation loop usually takes care of the cache position, but if you're writing a custom generation method, it is important that cache positions are accurate since they are used to write and read key/value states into fixed slots.
+
+
+```py
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
+
+model_id = "meta-llama/Llama-2-7b-chat-hf"
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+messages = [{"role": "user", "content": "You are a helpful assistant."}]
+inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda:0")
+generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=10)
+
+```
+
+
 ## Legacy cache format

 Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format is dynamic because it grows as text is generated, similar to [`DynamicCache`].
@ -157,4 +181,4 @@ generation_outputs = model.generate(**inputs, return_dict_in_generate=True, retu

 cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values)
 legacy_format_cache = cache.to_legacy_cache()
-```
+```
--- a/docs/source/en/chat_templating_multimodal.md
+++ b/docs/source/en/chat_templating_multimodal.md
@ -111,6 +111,7 @@ Some vision models also support video inputs. The message format is very similar

 - The content `"type"` should be `"video"` to indicate the content is a video.
 - For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord).
+- In addition to loading videos from a URL or file path, you can also pass decoded video data directly. This is useful if you’ve already preprocessed or decoded video frames elsewhere in memory (e.g., using OpenCV, decord, or torchvision). You don't need to save the video to a file or host it at a URL.

 > [!WARNING]
 > Loading a video from `"url"` is only supported by the PyAV or Decord backends.
@ -137,6 +138,52 @@ messages = [
 ]
 ```

+### Example: Passing decoded video objects
+```python
+import numpy as np
+
+video_object1 = np.random.randint(0, 255, size=(16, 224, 224, 3), dtype=np.uint8)
+
+messages = [
+    {
+        "role": "system",
+        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "video", "video": video_object1},
+            {"type": "text", "text": "What do you see in this video?"}
+        ],
+    },
+]
+```
+You can also use the existing `load_video()` function to load a video, edit the video in memory, and pass it in the messages.
+```python
+
+# Make sure a video backend library (pyav, decord, or torchvision) is available.
+from transformers.video_utils import load_video
+
+# load a video file in memory for testing
+video_object2, _ = load_video(
+    "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"
+)
+
+messages = [
+    {
+        "role": "system",
+        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "video", "video": video_object2},
+            {"type": "text", "text": "What do you see in this video?"}
+        ],
+    },
+]
+```
+
 Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content. There are a few extra parameters to include in [`~ProcessorMixin.apply_chat_template`] that controls the sampling process.

 The `video_load_backend` parameter refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html).
--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@ -27,7 +27,7 @@ This guide shows you how to quickly start chatting with Transformers from the co

 ## chat CLI

-After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.
+After you've [installed Transformers](./installation), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.

 ```bash
 transformers chat Qwen/Qwen2.5-0.5B-Instruct
@ -158,4 +158,4 @@ The easiest solution for improving generation speed is to either quantize a mode
 You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token per `forward` pass. This significantly alleviates the bandwidth bottleneck and improves generation speed.

 > [!TIP]
-> Parameters may not be active for every generated token in MoE models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe.md), and [DBRX](./model_doc/dbrx). As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because parameters become activated with each new speculated token.
+> Parameters may not be active for every generated token in MoE models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe), and [DBRX](./model_doc/dbrx). As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because parameters become activated with each new speculated token.
--- a/docs/source/en/cursor.md
+++ b/docs/source/en/cursor.md
@ -0,0 +1,42 @@
+# Using Cursor as a client of transformers serve
+
+This example shows how to use `transformers serve` as a local LLM provider for [Cursor](https://cursor.com/), the popular IDE. In this particular case, requests to `transformers serve` will come from an external IP (Cursor's server IPs), which requires some additional setup. Furthermore, some of Cursor's requests require [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/CORS), which is disabled by default for security reasons.
+
+To launch a server with CORS enabled, run
+
+```shell
+transformers serve --enable-cors
+```
+
+You'll also need to expose your server to external IPs. A potential solution is to use [`ngrok`](https://ngrok.com/), which has a permissive free tier. After setting up your `ngrok` account and authenticating on your server machine, you run
+
+```shell
+ngrok http [port]
+```
+
+where `port` is the port used by `transformers serve` (`8000` by default). On the terminal where you launched `ngrok`, you'll see an HTTPS address in the "Forwarding" row, as in the image below. This is the address to send requests to.
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_ngrok.png"/>
+</h3>
+
+You're now ready to set things up on the app side! In Cursor, while you can't set a new provider, you can change the endpoint for OpenAI requests in the model selection settings. First, navigate to "Settings" > "Cursor Settings", "Models" tab, and expand the "API Keys" collapsible. To set your `transformers serve` endpoint, follow this order:
+1. Unselect ALL models in the list above (e.g. `gpt4`, ...);
+2. Add and select the model you want to use (e.g. `Qwen/Qwen3-4B`);
+3. Add some random text to OpenAI API Key. This field won't be used, but it can’t be empty;
+4. Add the https address from `ngrok` to the "Override OpenAI Base URL" field, appending `/v1` to the address (i.e. `https://(...).ngrok-free.app/v1`);
+5. Hit "Verify".
+
+After you follow these steps, your "Models" tab should look like the image below. Your server should also have received a few requests from the verification step.
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor.png"/>
+</h3>
+
+You are now ready to use your local model in Cursor! For instance, if you toggle the AI Pane, you can select the model you added and ask it questions about your local files.
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor_chat.png"/>
+</h3>
+
+
--- a/docs/source/en/custom_models.md
+++ b/docs/source/en/custom_models.md
@ -271,7 +271,7 @@ The model is ready to be pushed to the Hub now. Log in to your Hugging Face acco
 <hfoption id="huggingface-CLI">

 ```bash
-huggingface-cli login
+hf auth login
 ```

 </hfoption>
--- a/docs/source/en/internal/generation_utils.md
+++ b/docs/source/en/internal/generation_utils.md
@ -356,66 +356,93 @@ A [`Constraint`] can be used to force the generation to include specific tokens

 ## Caches

-[[autodoc]] Cache
-    - update
-
-[[autodoc]] CacheConfig
-	- update
-
-[[autodoc]] QuantizedCacheConfig
-	- validate
-
-[[autodoc]] DynamicCache
+[[autodoc]] CacheLayerMixin
    - update
    - get_seq_length
+    - get_mask_sizes
+    - get_max_cache_shape
+    - reset
    - reorder_cache
+
+[[autodoc]] DynamicLayer
+    - update
+    - crop
+    - batch_repeat_interleave
+    - batch_select_indices
+
+[[autodoc]] StaticLayer
+    - update
+
+[[autodoc]] SlidingWindowLayer
+    - update
+
+[[autodoc]] CacheProcessor
+    - pre_update
+    - post_update
+
+[[autodoc]] OffloadedCacheProcessor
+    - pre_update
+
+[[autodoc]] QuantizedCacheProcessor
+    - post_update
+
+[[autodoc]] QuantoQuantizedCacheProcessor
+    - post_update
+
+[[autodoc]] HQQQuantizedCacheProcessor
+    - post_update
+
+[[autodoc]] Cache
+    - update
+    - get_seq_length
+    - get_mask_sizes
+    - get_max_cache_shape
+    - reset
+    - reorder_cache
+    - crop
+    - batch_repeat_interleave
+    - batch_select_indices
+
+[[autodoc]] DynamicCache
    - to_legacy_cache
    - from_legacy_cache

 [[autodoc]] QuantizedCache
-    - update
-    - get_seq_length

 [[autodoc]] QuantoQuantizedCache

+[[autodoc]] QuantoQuantizedCacheProcessor
+
 [[autodoc]] HQQQuantizedCache

+[[autodoc]] HQQQuantizedCacheProcessor
+
 [[autodoc]] OffloadedCache
-    - update
-    - prefetch_layer
-    - evict_previous_layer

 [[autodoc]] StaticCache
-    - update
-    - get_seq_length
-    - reset

 [[autodoc]] OffloadedStaticCache
-    - update
-    - get_seq_length
-    - reset

 [[autodoc]] HybridCache
-    - update
-    - get_seq_length
-    - reset
+
+[[autodoc]] HybridChunkedCache

 [[autodoc]] SlidingWindowCache
-    - update
-    - reset

 [[autodoc]] EncoderDecoderCache
-    - get_seq_length
    - to_legacy_cache
    - from_legacy_cache
-    - reset
-    - reorder_cache

 [[autodoc]] MambaCache
    - update_conv_state
    - update_ssm_state
    - reset

+[[autodoc]] CacheConfig
+
+[[autodoc]] QuantizedCacheConfig
+
+
 ## Watermark Utils

 [[autodoc]] WatermarkingConfig
--- a/docs/source/en/jan.md
+++ b/docs/source/en/jan.md
@ -0,0 +1,32 @@
+# Jan: using the serving API as a local LLM provider
+
+This example shows how to use `transformers serve` as a local LLM provider for the [Jan](https://jan.ai/) app. Jan is a ChatGPT-alternative graphical interface, fully running on your machine. The requests to `transformers serve` come directly from the local app -- while this section focuses on Jan, you can extrapolate some instructions to other apps that make local requests.
+
+## Running models locally
+
+To connect `transformers serve` with Jan, you'll need to set up a new model provider ("Settings" > "Model Providers"). Click on "Add Provider", and set a new name. In your new model provider page, all you need to set is the "Base URL" to the following pattern:
+
+```shell
+http://[host]:[port]/v1
+```
+
+where `host` and `port` are the `transformers serve` CLI parameters (`localhost:8000` by default). After setting this up, you should be able to see some models in the "Models" section after hitting "Refresh". Make sure you add some text in the "API key" text field too -- this data is not actually used, but the field can't be empty. Your custom model provider page should look like this:
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_jan_model_providers.png"/>
+</h3>
+
+You are now ready to chat!
+
+> [!TIP]
+> You can add any `transformers`-compatible model to Jan through `transformers serve`. In the custom model provider you created, click on the "+" button in the "Models" section and add its Hub repository name, e.g. `Qwen/Qwen3-4B`.
+
+## Running models on a separate machine
+
+To conclude this example, let's look into a more advanced use-case. If you have a beefy machine to serve models with, but prefer using Jan on a different device, you need to add port forwarding. If you have `ssh` access from your Jan machine into your server, this can be accomplished by typing the following into your Jan machine's terminal:
+
+```
+ssh -N -f -L 8000:localhost:8000 your_server_account@your_server_IP -p port_to_ssh_into_your_server
+```
+
+Port forwarding is not Jan-specific: you can use it to connect `transformers serve` running in a different machine with an app of your choice.
--- a/docs/source/en/kv_cache.md
+++ b/docs/source/en/kv_cache.md
@ -134,7 +134,7 @@ The [`QuantizedCache`] reduces memory requirements by quantizing the KV values t
 > [!WARNING]
 > Quantizing the cache can harm latency if the context length is short and there is enough GPU memory available for generation without enabling cache quantization. Try to find a balance between memory efficiency and latency.

-Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [`GenerationConfig`], and indicate the quantization backend in [`QuantizedCacheConfig`]. Any additional quantization related parameters should also be passed either as a dict or an instance of [`QuantizedCacheConfig`]. You should use the default values for these additional parameters unless you're running out-of-memory. In that case, consider decreasing the residual length.
+Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [`GenerationConfig`]. The quantization backend, as well as any additional quantization-related parameters, should be passed as a dict. You should use the default values for these additional parameters unless you're running out-of-memory. In that case, consider decreasing the residual length.

 <hfoptions id="quantized-cache">
 <hfoption id="HQQQuantizedCache">
@ -143,7 +143,7 @@ For [`HQQQuantizedCache`], we recommend setting the `axis-key` and `axis-value`

 ```py
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache, QuantizedCacheConfig
+from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
@ -161,7 +161,7 @@ For [`QuantoQuantizedCache`], we recommend setting the `axis-key` and `axis-valu

 ```py
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache, QuantizedCacheConfig
+from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
@ -275,7 +275,6 @@ from transformers.cache_utils import (
    StaticCache,
    SlidingWindowCache,
    QuantoQuantizedCache,
-    QuantizedCacheConfig,
 )

 model_id = "meta-llama/Llama-2-7b-chat-hf"
--- a/docs/source/en/llm_optims.md
+++ b/docs/source/en/llm_optims.md
@ -341,7 +341,7 @@ A known issue with transformer models is that the self-attention mechanism grows

 FlashAttention and [FlashAttention-2](./perf_infer_gpu_one#flashattention-2) break up the attention computation into smaller chunks and reduces the number of intermediate read/write operations to the GPU memory to speed up inference. FlashAttention-2 improves on the original FlashAttention algorithm by also parallelizing over sequence length dimension and better partitioning work on the hardware to reduce synchronization and communication overhead.

-To use FlashAttention-2, set [attn_implementation](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.PreTrainedModel.from_pretrained.attn_implementation) to `"flash_attention_2"` in [`~PreTrainedModel.from_pretrained`].
+To use FlashAttention-2, set [attn_implementation](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.PreTrainedModel.from_pretrained.attn_implementation) to `"flash_attention_2"` in [`~PreTrainedModel.from_pretrained`], or call `model.set_attention_implementation("flash_attention_2")` after the model is loaded to dynamically update the [attention interface](./attention_interface).

 ```py
 from transformers import AutoModelForCausalLM, BitsAndBytesConfig
@ -353,6 +353,14 @@ model = AutoModelForCausalLM.from_pretrained(
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
 )
+
+# Change the model's attention dynamically after loading
+model = AutoModelForCausalLM.from_pretrained(
+    "google/gemma-2b",
+    quantization_config=quant_config,
+    torch_dtype=torch.bfloat16
+)
+model.set_attention_implementation("flash_attention_2")
 ```

 ### PyTorch scaled dot product attention
@ -360,7 +368,7 @@ model = AutoModelForCausalLM.from_pretrained(
 Scaled dot product attention (SDPA) is automatically enabled in PyTorch 2.0 and it supports FlashAttention, xFormers, and PyTorch's C++ implementation. SDPA chooses the most performant attention algorithm if you're using a CUDA backend. For other backends, SDPA defaults to the PyTorch C++ implementation.

 > [!TIP]
-> SDPA automaticallysupports FlashAttention-2 as long as you have the latest PyTorch version installed.
+> SDPA automatically supports FlashAttention-2 as long as you have the latest PyTorch version installed.

 Use the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to explicitly enable or disable any of the four attention algorithms. For example, use `SDPBackend.FLASH_ATTENTION` to enable FlashAttention.

--- a/docs/source/en/llm_tutorial.md
+++ b/docs/source/en/llm_tutorial.md
@ -148,9 +148,9 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
 | Option name | Type | Simplified description |
 |---|---|---|
 | `max_new_tokens` | `int` | Controls the maximum generation length. Be sure to define it, as it usually defaults to a small value. |
-| `do_sample` | `bool` | Defines whether generation will sample the next token (`True`), or is greedy instead (`False`). Most use cases should set this flag to `True`. Check [this guide](./generation_strategies.md) for more information. |
+| `do_sample` | `bool` | Defines whether generation will sample the next token (`True`), or is greedy instead (`False`). Most use cases should set this flag to `True`. Check [this guide](./generation_strategies) for more information. |
 | `temperature` | `float` | How unpredictable the next selected token will be. High values (`>0.8`) are good for creative tasks, low values (e.g. `<0.4`) for tasks that require "thinking". Requires `do_sample=True`. |
-| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies.md) for more information. |
+| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies) for more information. |
 | `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
 | `eos_token_id` | `list[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |

--- a/docs/source/en/llm_tutorial_optimization.md
+++ b/docs/source/en/llm_tutorial_optimization.md
@ -23,11 +23,11 @@ The crux of these challenges lies in augmenting the computational and memory cap

 In this guide, we will go over the effective techniques for efficient LLM deployment:

-1.  **Lower Precision:** Research has shown that operating at reduced numerical precision, namely [8-bit and 4-bit](./main_classes/quantization.md) can achieve computational advantages without a considerable decline in model performance.
+1.  **Lower Precision:** Research has shown that operating at reduced numerical precision, namely [8-bit and 4-bit](./main_classes/quantization) can achieve computational advantages without a considerable decline in model performance.

 2.  **Flash Attention:** Flash Attention is a variation of the attention algorithm that not only provides a more memory-efficient approach but also realizes increased efficiency due to optimized GPU memory utilization.

-3.  **Architectural Innovations:** Considering that LLMs are always deployed in the same way during inference, namely autoregressive text generation with a long input context, specialized model architectures have been proposed that allow for more efficient inference. The most important advancement in model architectures hereby are [Alibi](https://huggingface.co/papers/2108.12409), [Rotary embeddings](https://huggingface.co/papers/2104.09864), [Multi-Query Attention (MQA)](https://huggingface.co/papers/1911.02150) and [Grouped-Query-Attention (GQA)]((https://huggingface.co/papers/2305.13245)).
+3.  **Architectural Innovations:** Considering that LLMs are always deployed in the same way during inference, namely autoregressive text generation with a long input context, specialized model architectures have been proposed that allow for more efficient inference. The most important advancement in model architectures hereby are [Alibi](https://huggingface.co/papers/2108.12409), [Rotary embeddings](https://huggingface.co/papers/2104.09864), [Multi-Query Attention (MQA)](https://huggingface.co/papers/1911.02150) and [Grouped-Query-Attention (GQA)](https://huggingface.co/papers/2305.13245).

 Throughout this guide, we will offer an analysis of auto-regressive generation from a tensor's perspective. We delve into the pros and cons of adopting lower precision, provide a comprehensive exploration of the latest attention algorithms, and discuss improved LLM architectures. While doing so, we run practical examples showcasing each of the feature improvements.

--- a/docs/source/en/main_classes/callback.md
+++ b/docs/source/en/main_classes/callback.md
@ -33,6 +33,7 @@ By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] wi
  it's the second one).
 - [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
  or tensorboardX).
+- [`~integrations.TrackioCallback`] if [trackio](https://github.com/gradio-app/trackio) is installed.
 - [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
 - [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed.
 - [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
@ -72,6 +73,9 @@ Here is the list of the available [`TrainerCallback`] in the library:

 [[autodoc]] integrations.TensorBoardCallback

+[[autodoc]] integrations.TrackioCallback
+    - setup
+
 [[autodoc]] integrations.WandbCallback
    - setup

--- a/docs/source/en/main_classes/quantization.md
+++ b/docs/source/en/main_classes/quantization.md
@ -65,6 +65,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide.

 [[autodoc]] HqqConfig

+## Mxfp4Config
+
+[[autodoc]] Mxfp4Config
+
 ## FbgemmFp8Config

 [[autodoc]] FbgemmFp8Config
@ -93,6 +97,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide.

 [[autodoc]] QuarkConfig

+## FPQuantConfig
+
+[[autodoc]] FPQuantConfig
+
 ## AutoRoundConfig

 [[autodoc]] AutoRoundConfig
--- a/docs/source/en/model_doc/auto.md
+++ b/docs/source/en/model_doc/auto.md
@ -258,6 +258,10 @@ The following auto classes are available for the following computer vision tasks

 [[autodoc]] AutoModelForKeypointDetection

+### AutoModelForKeypointMatching
+
+[[autodoc]] AutoModelForKeypointMatching
+
 ### AutoModelForMaskedImageModeling

 [[autodoc]] AutoModelForMaskedImageModeling
--- a/docs/source/en/model_doc/barthez.md
+++ b/docs/source/en/model_doc/barthez.md
@ -14,49 +14,81 @@ rendered properly in your Markdown viewer.

 -->

-# BARThez
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZC
D9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+        ">
+    </div>
 </div>

-## Overview
+# BARThez

-The BARThez model was proposed in [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://huggingface.co/papers/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis on 23 Oct,
-2020.
+[BARThez](https://huggingface.co/papers/2010.12321) is a [BART](./bart) model designed for French language tasks. Unlike existing French BERT models, BARThez includes a pretrained encoder-decoder, allowing it to generate text as well. This model is also available as a multilingual variant, mBARThez, which was created by continuing to pretrain a multilingual BART on a French corpus.

-The abstract of the paper:
+You can find all of the original BARThez checkpoints under the [BARThez](https://huggingface.co/collections/dascim/barthez-670920b569a07aa53e3b6887) collection.
+
+> [!TIP]
+> This model was contributed by [moussakam](https://huggingface.co/moussakam).
+> Refer to the [BART](./bart) docs for more usage examples.


-*Inductive transfer learning, enabled by self-supervised learning, have taken the entire Natural Language Processing
-(NLP) field by storm, with models such as BERT and BART setting new state of the art on countless natural language
-understanding tasks. While there are some notable exceptions, most of the available models and research have been
-conducted for the English language. In this work, we introduce BARThez, the first BART model for the French language
-(to the best of our knowledge). BARThez was pretrained on a very large monolingual French corpus from past research
-that we adapted to suit BART's perturbation schemes. Unlike already existing BERT-based French language models such as
-CamemBERT and FlauBERT, BARThez is particularly well-suited for generative tasks, since not only its encoder but also
-its decoder is pretrained. In addition to discriminative tasks from the FLUE benchmark, we evaluate BARThez on a novel
-summarization dataset, OrangeSum, that we release with this paper. We also continue the pretraining of an already
-pretrained multilingual BART on BARThez's corpus, and we show that the resulting model, which we call mBARTHez,
-provides a significant boost over vanilla BARThez, and is on par with or outperforms CamemBERT and FlauBERT.*
+The example below demonstrates how to predict the `<mask>` token with [`Pipeline`], [`AutoModel`], and from the command line.

-This model was contributed by [moussakam](https://huggingface.co/moussakam). The Authors' code can be found [here](https://github.com/moussaKam/BARThez).
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-<Tip> 
+```py
+import torch
+from transformers import pipeline

-BARThez implementation is the same as BART, except for tokenization. Refer to [BART documentation](bart) for information on 
-configuration classes and their parameters. BARThez-specific tokenizers are documented below.  
+pipeline = pipeline(
+    task="fill-mask",
+    model="moussaKam/barthez",
+    torch_dtype=torch.float16,
+    device=0
+)
+pipeline("Les plantes produisent <mask> grâce à un processus appelé photosynthèse.")
+```

-</Tip>
+</hfoption>
+<hfoption id="AutoModel">

-## Resources
+```py
+import torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer

- BARThez can be fine-tuned on sequence-to-sequence tasks in a similar way as BART, check:
-  [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md).
+tokenizer = AutoTokenizer.from_pretrained(
+    "moussaKam/barthez",
+)
+model = AutoModelForMaskedLM.from_pretrained(
+    "moussaKam/barthez",
+    torch_dtype=torch.float16,
+    device_map="auto",
+)
+inputs = tokenizer("Les plantes produisent <mask> grâce à un processus appelé photosynthèse.", return_tensors="pt").to("cuda")

+with torch.no_grad():
+    outputs = model(**inputs)
+    predictions = outputs.logits
+
+masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
+predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
+predicted_token = tokenizer.decode(predicted_token_id)
+
+print(f"The predicted token is: {predicted_token}")
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "Les plantes produisent <mask> grâce à un processus appelé photosynthèse." | transformers run --task fill-mask --model moussaKam/barthez --device 0
+```
+
+</hfoption>
+</hfoptions>

 ## BarthezTokenizer

--- a/docs/source/en/model_doc/clap.md
+++ b/docs/source/en/model_doc/clap.md
@ -14,25 +14,50 @@ rendered properly in your Markdown viewer.

 -->

-# CLAP
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+  <div class="flex flex-wrap space-x-1">
+    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+  </div>
 </div>

-## Overview
+# CLAP

-The CLAP model was proposed in [Large Scale Contrastive Language-Audio pretraining with
-feature fusion and keyword-to-caption augmentation](https://huggingface.co/papers/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
+[CLAP (Contrastive Language-Audio Pretraining)](https://huggingface.co/papers/2211.06687) is a multimodal model that combines audio data with natural language descriptions through contrastive learning.

-CLAP (Contrastive Language-Audio Pretraining) is a neural network trained on a variety of (audio, text) pairs. It can be instructed in to predict the most relevant text snippet, given an audio, without directly optimizing for the task. The CLAP model uses a SWINTransformer to get audio features from a log-Mel spectrogram input, and a RoBERTa model to get text features. Both the text and audio features are then projected to a latent space with identical dimension. The dot product between the projected audio and text features is then used as a similar score.
+It incorporates feature fusion and keyword-to-caption augmentation to process variable-length audio inputs and to improve performance. CLAP doesn't require task-specific training data and can learn meaningful audio representations through natural language.

-The abstract from the paper is the following:
+You can find all the original CLAP checkpoints under the [CLAP](https://huggingface.co/collections/laion/clap-contrastive-language-audio-pretraining-65415c0b18373b607262a490) collection.

-*Contrastive learning has shown remarkable success in the field of multimodal representation learning. In this paper, we propose a pipeline of contrastive language-audio pretraining to develop an audio representation by combining audio data with natural language descriptions. To accomplish this target, we first release LAION-Audio-630K, a large collection of 633,526 audio-text pairs from different data sources. Second, we construct a contrastive language-audio pretraining model by considering different audio encoders and text encoders. We incorporate the feature fusion mechanism and keyword-to-caption augmentation into the model design to further enable the model to process audio inputs of variable lengths and enhance the performance. Third, we perform comprehensive experiments to evaluate our model across three tasks: text-to-audio retrieval, zero-shot audio classification, and supervised audio classification. The results demonstrate that our model achieves superior performance in text-to-audio retrieval task. In audio classification tasks, the model achieves state-of-the-art performance in the zeroshot setting and is able to obtain performance comparable to models' results in the non-zero-shot setting. LAION-Audio-6*
+> [!TIP]
+> This model was contributed by [ybelkada](https://huggingface.co/ybelkada) and [ArthurZ](https://huggingface.co/ArthurZ).
+>
+> Click on the CLAP models in the right sidebar for more examples of how to apply CLAP to different audio retrieval and classification tasks.

-This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArthurZ) .
-The original code can be found [here](https://github.com/LAION-AI/Clap).
+The example below demonstrates how to extract text embeddings with the [`AutoModel`] class.
+
+<hfoptions id="usage">
+<hfoption id="AutoModel">
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModel
+
+model = AutoModel.from_pretrained("laion/clap-htsat-unfused", torch_dtype=torch.float16, device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
+
+texts = ["the sound of a cat", "the sound of a dog", "music playing"]
+
+inputs = tokenizer(texts, padding=True, return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    text_features = model.get_text_features(**inputs)
+
+print(f"Text embeddings shape: {text_features.shape}")
+print(f"Text embeddings: {text_features}")
+```
+
+</hfoption>
+</hfoptions>

 ## ClapConfig

--- a/docs/source/en/model_doc/cohere2.md
+++ b/docs/source/en/model_doc/cohere2.md
@ -1,43 +1,115 @@
-# Cohere
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
+    </div>
 </div>

-## Overview
-[C4AI Command R7B](https://cohere.com/blog/command-r7b) is an open weights research release of a 7B billion parameter model developed by Cohere and Cohere For AI. It has advanced capabilities optimized for various use cases, including reasoning, summarization, question answering, and code. The model is trained to perform sophisticated tasks including Retrieval Augmented Generation (RAG) and tool use. The model also has powerful agentic capabilities that can use and combine multiple tools over multiple steps to accomplish more difficult tasks. It obtains top performance on enterprise-relevant code use cases. C4AI Command R7B is a multilingual model trained on 23 languages.

-The model features three layers with sliding window attention (window size 4096) and ROPE for efficient local context modeling and relative positional encoding. A fourth layer uses global attention without positional embeddings, enabling unrestricted token interactions across the entire sequence.
+# Cohere2

-The model has been trained on 23 languages: English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Arabic, Chinese, Russian, Polish, Turkish, Vietnamese, Dutch, Czech, Indonesian, Ukrainian, Romanian, Greek, Hindi, Hebrew, and Persian.
+[Cohere Command R7B](https://cohere.com/blog/command-r7b) is an open weights research release of a 7 billion parameter model. It is a multilingual model trained on 23 languages and has a context window of 128k. The model features three layers with sliding window attention and RoPE for efficient local context modeling and relative positional encoding. A fourth layer uses global attention without positional embeddings, enabling unrestricted token interactions across the entire sequence.

-## Usage tips
-The model and tokenizer can be loaded via:
+This model is optimized for speed, cost-performance, and compute resources.
+
+You can find all the original Command-R checkpoints under the [Command Models](https://huggingface.co/collections/CohereForAI/command-models-67652b401665205e17b192ad) collection.
+
+
+> [!TIP]
+> Click on the Cohere models in the right sidebar for more examples of how to apply Cohere to different language tasks.
+
+The example below demonstrates how to generate text with [`Pipeline`] or the [`AutoModel`] class, and from the command line.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">

 ```python
-# pip install transformers
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(
+    task="text-generation", 
+    model="CohereLabs/c4ai-command-r7b-12-2024",
+    torch_dtype=torch.float16,
+    device_map=0
+)
+
+messages = [
+    {"role": "user", "content": "Hello, can you please help me book a hotel in Japan?"},
+]
+pipeline(messages)
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```python
+import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM

-model_id = "CohereForAI/c4ai-command-r7b-12-2024"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id)
+tokenizer = AutoTokenizer.from_pretrained("CohereLabs/c4ai-command-r7b-12-2024")
+model = AutoModelForCausalLM.from_pretrained(
+    "CohereLabs/c4ai-command-r7b-12-2024", 
+    torch_dtype=torch.float16, 
+    device_map="auto", 
+    attn_implementation="sdpa"
+)

-# Format message with the command-r chat template
-messages = [{"role": "user", "content": "Hello, how are you?"}]
-input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
-
-gen_tokens = model.generate(
+# format message with the Command-R chat template
+messages = [{"role": "user", "content": "Hello, can you please help me book a hotel in Japan?"}]
+input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
+output = model.generate(
    input_ids,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.3,
+    cache_implementation="static",
+)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+# pip install -U flash-attn --no-build-isolation
+transformers chat CohereLabs/c4ai-command-r7b-12-2024 --torch_dtype auto --attn_implementation flash_attention_2
+```
+
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to 4 bits.
+
+```python
+import torch
+from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
+
+bnb_config = BitsAndBytesConfig(load_in_4bit=True)
+tokenizer = AutoTokenizer.from_pretrained("CohereLabs/c4ai-command-r7b-12-2024")
+model = AutoModelForCausalLM.from_pretrained(
+    "CohereLabs/c4ai-command-r7b-12-2024", 
+    torch_dtype=torch.float16, 
+    device_map="auto", 
+    quantization_config=bnb_config, 
+    attn_implementation="sdpa"
 )

-gen_text = tokenizer.decode(gen_tokens[0])
-print(gen_text)
+# format message with the Command-R chat template
+messages = [{"role": "user", "content": "Hello, can you please help me book a hotel in Japan?"}]
+input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
+output = model.generate(
+    input_ids,
+    max_new_tokens=100,
+    do_sample=True,
+    temperature=0.3,
+    cache_implementation="static",
+)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 ## Cohere2Config
--- a/docs/source/en/model_doc/cohere2_vision.md
+++ b/docs/source/en/model_doc/cohere2_vision.md
@ -0,0 +1,123 @@
+# Command A Vision
+
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
+</div>
+
+## Overview
+
+Command A Vision is a state-of-the-art multimodal model designed to seamlessly integrate visual and textual information for a wide range of applications. By combining advanced computer vision techniques with natural language processing capabilities, Command A Vision enables users to analyze, understand, and generate insights from both visual and textual data.
+
+The model excels at tasks including image captioning, visual question answering, document understanding, and chart understanding. This makes it a versatile tool for AI practitioners. Its ability to process complex visual and textual inputs makes it useful in settings where text-only representations are imprecise or unavailable, like real-world image understanding and graphics-heavy document processing.
+
+Command A Vision is built upon a robust architecture that leverages the latest advancements in VLMs. It's highly performant and efficient, even when dealing with large-scale datasets. The model's flexibility makes it suitable for a wide range of use cases, from content moderation and image search to medical imaging analysis and robotics.
+
+## Usage tips
+
+The model and image processor can be loaded as follows:
+
+<hfoptions id="usage">
+<hfoption id="AutoModel">
+
+```python
+import torch
+
+from transformers import AutoProcessor, AutoModelForImageTextToText
+
+model_id = "CohereLabs/command-a-vision-07-2025"
+
+processor = AutoProcessor.from_pretrained(model_id)
+model = AutoModelForImageTextToText.from_pretrained(
+    model_id, device_map="auto", torch_dtype=torch.float16
+)
+
+# Format message with the Command-A-Vision chat template
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "url": "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg",
+            },
+            {"type": "text", "text": "what is in this image?"},
+        ],
+    },
+]
+
+inputs = processor.apply_chat_template(
+    messages,
+    padding=True,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+).to(model.device)
+
+gen_tokens = model.generate(
+    **inputs,
+    max_new_tokens=300,
+    do_sample=True,
+    temperature=0.3,
+)
+
+print(
+    processor.tokenizer.decode(
+        gen_tokens[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
+    )
+)
+```
+
+</hfoption>
+<hfoption id="Pipeline">
+
+```python
+from transformers import pipeline
+
+pipe = pipeline(model="CohereLabs/command-a-vision-07-2025", task="image-text-to-text", device_map="auto")
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "url": "https://media.istockphoto.com/id/458012057/photo/istanbul-turkey.jpg?s=612x612&w=0&k=20&c=qogAOVvkpfUyqLUMr_XJQyq-HkACXyYUSZbKhBlPrxo=",
+            },
+            {"type": "text", "text": "Where was this taken ?"},
+        ],
+    },
+]
+
+outputs = pipe(text=messages, max_new_tokens=300, return_full_text=False)
+
+print(outputs)
+```
+</hfoption>
+</hfoptions>
+
+## Cohere2VisionConfig
+
+[[autodoc]] Cohere2VisionConfig
+
+## Cohere2VisionForConditionalGeneration
+
+[[autodoc]] Cohere2VisionForConditionalGeneration
+    - forward
+
+## Cohere2VisionModel
+
+[[autodoc]] Cohere2VisionModel
+    - forward
+
+## Cohere2VisionImageProcessorFast
+
+[[autodoc]] Cohere2VisionImageProcessorFast
+    - preprocess
+
+## Cohere2VisionProcessor
+
+[[autodoc]] Cohere2VisionProcessor
--- a/docs/source/en/model_doc/colpali.md
+++ b/docs/source/en/model_doc/colpali.md
@ -95,7 +95,7 @@ images = [

 Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

-The example below uses [bitsandbytes](../quantization/bitsandbytes.md) to quantize the weights to int4.
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to int4.

 ```python
 import requests
--- a/docs/source/en/model_doc/colqwen2.md
+++ b/docs/source/en/model_doc/colqwen2.md
@ -99,7 +99,7 @@ images = [

 Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

-The example below uses [bitsandbytes](../quantization/bitsandbytes.md) to quantize the weights to int4.
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to int4.

 ```python
 import requests
--- a/docs/source/en/model_doc/csm.md
+++ b/docs/source/en/model_doc/csm.md
@ -21,7 +21,7 @@ rendered properly in your Markdown viewer.
 The Conversational Speech Model (CSM) is the first open-source contextual text-to-speech model [released by Sesame](https://www.sesame.com/research/crossing_the_uncanny_valley_of_voice). It is designed to generate natural-sounding speech with or without conversational context. This context typically consists of multi-turn dialogue between speakers, represented as sequences of text and corresponding spoken audio.

 **Model Architecture:**
-CSM is composed of two LLaMA-style auto-regressive transformer decoders: a backbone decoder that predicts the first codebook token and a depth decoder that generates the remaining tokens. It uses the pretrained codec model [Mimi](./mimi.md), introduced by Kyutai, to encode speech into discrete codebook tokens and decode them back into audio.
+CSM is composed of two LLaMA-style auto-regressive transformer decoders: a backbone decoder that predicts the first codebook token and a depth decoder that generates the remaining tokens. It uses the pretrained codec model [Mimi](./mimi), introduced by Kyutai, to encode speech into discrete codebook tokens and decode them back into audio.

 The original csm-1b checkpoint is available under the [Sesame](https://huggingface.co/sesame/csm-1b) organization on Hugging Face.

--- a/docs/source/en/model_doc/deepseek_vl.md
+++ b/docs/source/en/model_doc/deepseek_vl.md
@ -0,0 +1,224 @@
+<!--Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
+# DeepseekVL
+
+[Deepseek-VL](https://arxiv.org/abs/2403.05525) was introduced by the DeepSeek AI team. It is a vision-language model (VLM) designed to process both text and images for generating contextually relevant responses. The model leverages [LLaMA](./llama) as its text encoder, while [SigLip](./siglip) is used for encoding images.
+
+You can find all the original Deepseek-VL checkpoints under the [DeepSeek-community](https://huggingface.co/deepseek-community) organization.
+
+> [!TIP]
+> Click on the Deepseek-VL models in the right sidebar for more examples of how to apply Deepseek-VL to different vision and language tasks.
+
+The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+import torch
+from transformers import pipeline
+
+pipe = pipeline(
+    task="image-text-to-text",
+    model="deepseek-community/deepseek-vl-1.3b-chat",
+    device=0,
+    torch_dtype=torch.float16
+)
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+            },
+            { "type": "text", "text": "Describe this image."},
+        ]
+    }
+]
+
+pipe(text=messages, max_new_tokens=20, return_full_text=False)
+```
+</hfoption>
+
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import DeepseekVLForConditionalGeneration, AutoProcessor
+
+model = DeepseekVLForConditionalGeneration.from_pretrained(
+    "deepseek-community/deepseek-vl-1.3b-chat",
+    torch_dtype=torch.float16,
+    device_map="auto",
+    attn_implementation="sdpa"
+)
+
+processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-1.3b-chat")
+
+messages = [
+    {
+        "role":"user",
+        "content":[
+            {
+                "type":"image",
+                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+            },
+            {
+                "type":"text",
+                "text":"Describe this image."
+            }
+        ]
+    }
+
+]
+
+inputs = processor.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt"
+).to(model.device, dtype=model.dtype)
+
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [
+    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+]
+output_text = processor.batch_decode(
+    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+
+print(output_text)
+```
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.
+
+```python
+import torch
+from transformers import TorchAoConfig, DeepseekVLForConditionalGeneration, AutoProcessor
+
+quantization_config = TorchAoConfig(
+    "int4_weight_only",
+    group_size=128
+)
+
+model = DeepseekVLForConditionalGeneration.from_pretrained(
+    "deepseek-community/deepseek-vl-1.3b-chat",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=quantization_config
+)
+```
+## Notes
+
+- Do inference with multiple images in a single conversation.
+    ```py
+    import torch
+    from transformers import DeepseekVLForConditionalGeneration, AutoProcessor
+
+    model = DeepseekVLForConditionalGeneration.from_pretrained(
+        "deepseek-community/deepseek-vl-1.3b-chat",
+        torch_dtype=torch.float16,
+        device_map="auto",
+        attn_implementation="sdpa"
+    )
+
+    processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-1.3b-chat")
+
+    messages = [
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What’s the difference between"},
+                    {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+                    {"type": "text", "text": " and "},
+                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
+                ]
+            }
+        ],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "url": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"},
+                    {"type": "text", "text": "What do you see in this image?"}
+                ]
+            }
+        ]
+    ]
+
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        padding=True,
+        truncation=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt"
+    ).to(model.device, dtype=model.dtype)
+
+    generated_ids = model.generate(**inputs, max_new_tokens=128)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+
+    print(output_text)
+    ```
+
+## DeepseekVLConfig
+
+[[autodoc]] DeepseekVLConfig
+
+## DeepseekVLProcessor
+
+[[autodoc]] DeepseekVLProcessor
+
+## DeepseekVLImageProcessor
+
+[[autodoc]] DeepseekVLImageProcessor
+
+## DeepseekVLImageProcessorFast
+
+[[autodoc]] DeepseekVLImageProcessorFast
+
+## DeepseekVLModel
+
+[[autodoc]] DeepseekVLModel
+    - forward
+
+## DeepseekVLForConditionalGeneration
+
+[[autodoc]] DeepseekVLForConditionalGeneration
+    - forward
--- a/docs/source/en/model_doc/deepseek_vl_hybrid.md
+++ b/docs/source/en/model_doc/deepseek_vl_hybrid.md
@ -0,0 +1,223 @@
+<!--Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
+# DeepseekVLHybrid
+
+[Deepseek-VL-Hybrid](https://arxiv.org/abs/2403.05525) was introduced by the DeepSeek AI team. It is a vision-language model (VLM) designed to process both text and images for generating contextually relevant responses. The model leverages [LLaMA](./llama) as its text encoder, while [SigLip](./siglip) is used for encoding low-resolution images and [SAM (Segment Anything Model)](./sam) is incorporated to handle high-resolution image encoding, enhancing the model’s ability to process fine-grained visual details. In other words, Deepseek-VL-Hybrid is the variant of [Deepseek-VL](./deepseek_vl) that adds this SAM-based high-resolution image encoder.
+
+You can find all the original Deepseek-VL-Hybrid checkpoints under the [DeepSeek-community](https://huggingface.co/deepseek-community) organization.
+
+> [!TIP]
+> Click on the Deepseek-VL-Hybrid models in the right sidebar for more examples of how to apply Deepseek-VL-Hybrid to different vision and language tasks.
+
+The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+import torch
+from transformers import pipeline
+
+pipe = pipeline(
+    task="image-text-to-text",
+    model="deepseek-community/deepseek-vl-7b-chat",
+    device=0,
+    torch_dtype=torch.float16
+)
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+            },
+            { "type": "text", "text": "Describe this image."},
+        ]
+    }
+]
+
+pipe(text=messages, max_new_tokens=20, return_full_text=False)
+```
+</hfoption>
+
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import DeepseekVLHybridForConditionalGeneration, AutoProcessor
+
+model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
+    "deepseek-community/deepseek-vl-7b-chat",
+    torch_dtype=torch.float16,
+    device_map="auto",
+    attn_implementation="sdpa"
+)
+
+processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-7b-chat")
+
+messages = [
+    {
+        "role":"user",
+        "content":[
+            {
+                "type":"image",
+                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+            },
+            {
+                "type":"text",
+                "text":"Describe this image."
+            }
+        ]
+    }
+
+]
+
+inputs = processor.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt"
+).to(model.device, dtype=model.dtype)
+
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [
+    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+]
+output_text = processor.batch_decode(
+    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+
+print(output_text)
+```
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.
+
+```python
+import torch
+from transformers import TorchAoConfig, DeepseekVLHybridForConditionalGeneration, AutoProcessor
+
+quantization_config = TorchAoConfig(
+    "int4_weight_only",
+    group_size=128
+)
+
+model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
+    "deepseek-community/deepseek-vl-7b-chat",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=quantization_config
+)
+```
+## Notes
+
+- Do inference with multiple images in a single conversation.
+    ```py
+    import torch
+    from transformers import DeepseekVLHybridForConditionalGeneration, AutoProcessor
+
+    model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
+        "deepseek-community/deepseek-vl-7b-chat",
+        torch_dtype=torch.float16,
+        device_map="auto",
+        attn_implementation="sdpa"
+    )
+
+    processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-7b-chat")
+
+    messages = [
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What’s the difference between"},
+                    {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+                    {"type": "text", "text": " and "},
+                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
+                ]
+            }
+        ],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "url": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"},
+                    {"type": "text", "text": "What do you see in this image?"}
+                ]
+            }
+        ]
+    ]
+
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        padding=True,
+        truncation=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt"
+    ).to(model.device, dtype=model.dtype)
+
+    generated_ids = model.generate(**inputs, max_new_tokens=128)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+
+    print(output_text)
+    ```
+
+## DeepseekVLHybridConfig
+
+[[autodoc]] DeepseekVLHybridConfig
+
+## DeepseekVLHybridProcessor
+
+[[autodoc]] DeepseekVLHybridProcessor
+
+## DeepseekVLHybridImageProcessor
+
+[[autodoc]] DeepseekVLHybridImageProcessor
+
+## DeepseekVLHybridImageProcessorFast
+
+[[autodoc]] DeepseekVLHybridImageProcessorFast
+
+## DeepseekVLHybridModel
+
+[[autodoc]] DeepseekVLHybridModel
+    - forward
+
+## DeepseekVLHybridForConditionalGeneration
+
+[[autodoc]] DeepseekVLHybridForConditionalGeneration
+    - forward
--- a/docs/source/en/model_doc/detr.md
+++ b/docs/source/en/model_doc/detr.md
@ -14,132 +14,122 @@ rendered properly in your Markdown viewer.

 -->

-# DETR
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+	<div class="flex flex-wrap space-x-1">
+		<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+	</div>
 </div>

-## Overview
+# DETR

-The DETR model was proposed in [End-to-End Object Detection with Transformers](https://huggingface.co/papers/2005.12872) by
-Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov and Sergey Zagoruyko. DETR
-consists of a convolutional backbone followed by an encoder-decoder Transformer which can be trained end-to-end for
-object detection. It greatly simplifies a lot of the complexity of models like Faster-R-CNN and Mask-R-CNN, which use
-things like region proposals, non-maximum suppression procedure and anchor generation. Moreover, DETR can also be
-naturally extended to perform panoptic segmentation, by simply adding a mask head on top of the decoder outputs.
+[DETR](https://huggingface.co/papers/2005.12872) consists of a convolutional backbone followed by an encoder-decoder Transformer which can be trained end-to-end for object detection. It greatly simplifies a lot of the complexity of models like Faster-R-CNN and Mask-R-CNN, which use things like region proposals, non-maximum suppression procedure and anchor generation. Moreover, DETR can also be naturally extended to perform panoptic segmentation, by simply adding a mask head on top of the decoder outputs.

-The abstract from the paper is the following:
+You can find all the original DETR checkpoints under the [AI at Meta](https://huggingface.co/facebook/models?search=detr) organization.

-*We present a new method that views object detection as a direct set prediction problem. Our approach streamlines the
-detection pipeline, effectively removing the need for many hand-designed components like a non-maximum suppression
-procedure or anchor generation that explicitly encode our prior knowledge about the task. The main ingredients of the
-new framework, called DEtection TRansformer or DETR, are a set-based global loss that forces unique predictions via
-bipartite matching, and a transformer encoder-decoder architecture. Given a fixed small set of learned object queries,
-DETR reasons about the relations of the objects and the global image context to directly output the final set of
-predictions in parallel. The new model is conceptually simple and does not require a specialized library, unlike many
-other modern detectors. DETR demonstrates accuracy and run-time performance on par with the well-established and
-highly-optimized Faster RCNN baseline on the challenging COCO object detection dataset. Moreover, DETR can be easily
-generalized to produce panoptic segmentation in a unified manner. We show that it significantly outperforms competitive
-baselines.*
+> [!TIP]
+> This model was contributed by [nielsr](https://huggingface.co/nielsr).
+>
+> Click on the DETR models in the right sidebar for more examples of how to apply DETR to different object detection and segmentation tasks.

-This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/facebookresearch/detr).
+The example below demonstrates how to perform object detection with the [`Pipeline`] or the [`AutoModel`] class.

-## How DETR works
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```python
+from transformers import pipeline
+import torch
+
+pipeline = pipeline(
+    "object-detection", 
+    model="facebook/detr-resnet-50",
+    torch_dtype=torch.float16,
+    device_map=0
+)
+
+pipeline("http://images.cocodataset.org/val2017/000000039769.jpg")
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```python
+from transformers import AutoImageProcessor, AutoModelForObjectDetection
+from PIL import Image
+import requests
+import torch
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
+model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50")
+
+# prepare image for the model
+inputs = image_processor(images=image, return_tensors="pt")
+
+with torch.no_grad():
+    outputs = model(**inputs)
+
+results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3)
+
+for result in results:
+    for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
+        score, label = score.item(), label_id.item()
+        box = [round(i, 2) for i in box.tolist()]
+        print(f"{model.config.id2label[label]}: {score:.2f} {box}")
+```
+
+</hfoption>
+</hfoptions>
+
+<details>
+<summary>How DETR works</summary>

 Here's a TLDR explaining how [`~transformers.DetrForObjectDetection`] works:

-First, an image is sent through a pre-trained convolutional backbone (in the paper, the authors use
-ResNet-50/ResNet-101). Let's assume we also add a batch dimension. This means that the input to the backbone is a
-tensor of shape `(batch_size, 3, height, width)`, assuming the image has 3 color channels (RGB). The CNN backbone
-outputs a new lower-resolution feature map, typically of shape `(batch_size, 2048, height/32, width/32)`. This is
-then projected to match the hidden dimension of the Transformer of DETR, which is `256` by default, using a
-`nn.Conv2D` layer. So now, we have a tensor of shape `(batch_size, 256, height/32, width/32).` Next, the
-feature map is flattened and transposed to obtain a tensor of shape `(batch_size, seq_len, d_model)` =
-`(batch_size, width/32*height/32, 256)`. So a difference with NLP models is that the sequence length is actually
-longer than usual, but with a smaller `d_model` (which in NLP is typically 768 or higher).
+First, an image is sent through a pre-trained convolutional backbone (in the paper, the authors use ResNet-50/ResNet-101). Let's assume we also add a batch dimension. This means that the input to the backbone is a tensor of shape `(batch_size, 3, height, width)`, assuming the image has 3 color channels (RGB). The CNN backbone outputs a new lower-resolution feature map, typically of shape `(batch_size, 2048, height/32, width/32)`. This is then projected to match the hidden dimension of the Transformer of DETR, which is `256` by default, using an `nn.Conv2d` layer. So now, we have a tensor of shape `(batch_size, 256, height/32, width/32)`. Next, the feature map is flattened and transposed to obtain a tensor of shape `(batch_size, seq_len, d_model)` = `(batch_size, width/32*height/32, 256)`. So a difference with NLP models is that the sequence length is actually longer than usual, but with a smaller `d_model` (which in NLP is typically 768 or higher).

-Next, this is sent through the encoder, outputting `encoder_hidden_states` of the same shape (you can consider
-these as image features). Next, so-called **object queries** are sent through the decoder. This is a tensor of shape
-`(batch_size, num_queries, d_model)`, with `num_queries` typically set to 100 and initialized with zeros.
-These input embeddings are learnt positional encodings that the authors refer to as object queries, and similarly to
-the encoder, they are added to the input of each attention layer. Each object query will look for a particular object
-in the image. The decoder updates these embeddings through multiple self-attention and encoder-decoder attention layers
-to output `decoder_hidden_states` of the same shape: `(batch_size, num_queries, d_model)`. Next, two heads
-are added on top for object detection: a linear layer for classifying each object query into one of the objects or "no
-object", and a MLP to predict bounding boxes for each query.
+Next, this is sent through the encoder, outputting `encoder_hidden_states` of the same shape (you can consider these as image features). Next, so-called **object queries** are sent through the decoder. This is a tensor of shape `(batch_size, num_queries, d_model)`, with `num_queries` typically set to 100 and initialized with zeros. These input embeddings are learnt positional encodings that the authors refer to as object queries, and similarly to the encoder, they are added to the input of each attention layer. Each object query will look for a particular object in the image. The decoder updates these embeddings through multiple self-attention and encoder-decoder attention layers to output `decoder_hidden_states` of the same shape: `(batch_size, num_queries, d_model)`. Next, two heads are added on top for object detection: a linear layer for classifying each object query into one of the objects or "no object", and an MLP to predict bounding boxes for each query.

-The model is trained using a **bipartite matching loss**: so what we actually do is compare the predicted classes +
-bounding boxes of each of the N = 100 object queries to the ground truth annotations, padded up to the same length N
-(so if an image only contains 4 objects, 96 annotations will just have a "no object" as class and "no bounding box" as
-bounding box). The [Hungarian matching algorithm](https://en.wikipedia.org/wiki/Hungarian_algorithm) is used to find
-an optimal one-to-one mapping of each of the N queries to each of the N annotations. Next, standard cross-entropy (for
-the classes) and a linear combination of the L1 and [generalized IoU loss](https://giou.stanford.edu/) (for the
-bounding boxes) are used to optimize the parameters of the model.
+The model is trained using a **bipartite matching loss**: so what we actually do is compare the predicted classes + bounding boxes of each of the N = 100 object queries to the ground truth annotations, padded up to the same length N (so if an image only contains 4 objects, 96 annotations will just have a "no object" as class and "no bounding box" as bounding box). The [Hungarian matching algorithm](https://en.wikipedia.org/wiki/Hungarian_algorithm) is used to find an optimal one-to-one mapping of each of the N queries to each of the N annotations. Next, standard cross-entropy (for the classes) and a linear combination of the L1 and [generalized IoU loss](https://giou.stanford.edu/) (for the bounding boxes) are used to optimize the parameters of the model.

-DETR can be naturally extended to perform panoptic segmentation (which unifies semantic segmentation and instance
-segmentation). [`~transformers.DetrForSegmentation`] adds a segmentation mask head on top of
-[`~transformers.DetrForObjectDetection`]. The mask head can be trained either jointly, or in a two steps process,
-where one first trains a [`~transformers.DetrForObjectDetection`] model to detect bounding boxes around both
-"things" (instances) and "stuff" (background things like trees, roads, sky), then freeze all the weights and train only
-the mask head for 25 epochs. Experimentally, these two approaches give similar results. Note that predicting boxes is
-required for the training to be possible, since the Hungarian matching is computed using distances between boxes.
+DETR can be naturally extended to perform panoptic segmentation (which unifies semantic segmentation and instance segmentation). [`~transformers.DetrForSegmentation`] adds a segmentation mask head on top of [`~transformers.DetrForObjectDetection`]. The mask head can be trained either jointly, or in a two-step process, where one first trains a [`~transformers.DetrForObjectDetection`] model to detect bounding boxes around both "things" (instances) and "stuff" (background things like trees, roads, sky), then freezes all the weights and trains only the mask head for 25 epochs. Experimentally, these two approaches give similar results. Note that predicting boxes is required for the training to be possible, since the Hungarian matching is computed using distances between boxes.

-## Usage tips
+</details>

- DETR uses so-called **object queries** to detect objects in an image. The number of queries determines the maximum
-  number of objects that can be detected in a single image, and is set to 100 by default (see parameter
-  `num_queries` of [`~transformers.DetrConfig`]). Note that it's good to have some slack (in COCO, the
-  authors used 100, while the maximum number of objects in a COCO image is ~70).
- The decoder of DETR updates the query embeddings in parallel. This is different from language models like GPT-2,
-  which use autoregressive decoding instead of parallel. Hence, no causal attention mask is used.
- DETR adds position embeddings to the hidden states at each self-attention and cross-attention layer before projecting
-  to queries and keys. For the position embeddings of the image, one can choose between fixed sinusoidal or learned
-  absolute position embeddings. By default, the parameter `position_embedding_type` of
-  [`~transformers.DetrConfig`] is set to `"sine"`.
- During training, the authors of DETR did find it helpful to use auxiliary losses in the decoder, especially to help
-  the model output the correct number of objects of each class. If you set the parameter `auxiliary_loss` of
-  [`~transformers.DetrConfig`] to `True`, then prediction feedforward neural networks and Hungarian losses
-  are added after each decoder layer (with the FFNs sharing parameters).
- If you want to train the model in a distributed environment across multiple nodes, then one should update the
-  _num_boxes_ variable in the _DetrLoss_ class of _modeling_detr.py_. When training on multiple nodes, this should be
-  set to the average number of target boxes across all nodes, as can be seen in the original implementation [here](https://github.com/facebookresearch/detr/blob/a54b77800eb8e64e3ad0d8237789fcbf2f8350c5/models/detr.py#L227-L232).
- [`~transformers.DetrForObjectDetection`] and [`~transformers.DetrForSegmentation`] can be initialized with
-  any convolutional backbone available in the [timm library](https://github.com/rwightman/pytorch-image-models).
-  Initializing with a MobileNet backbone for example can be done by setting the `backbone` attribute of
-  [`~transformers.DetrConfig`] to `"tf_mobilenetv3_small_075"`, and then initializing the model with that
-  config.
- DETR resizes the input images such that the shortest side is at least a certain amount of pixels while the longest is
-  at most 1333 pixels. At training time, scale augmentation is used such that the shortest side is randomly set to at
-  least 480 and at most 800 pixels. At inference time, the shortest side is set to 800. One can use
-  [`~transformers.DetrImageProcessor`] to prepare images (and optional annotations in COCO format) for the
-  model. Due to this resizing, images in a batch can have different sizes. DETR solves this by padding images up to the
-  largest size in a batch, and by creating a pixel mask that indicates which pixels are real/which are padding.
-  Alternatively, one can also define a custom `collate_fn` in order to batch images together, using
-  [`~transformers.DetrImageProcessor.pad_and_create_pixel_mask`].
- The size of the images will determine the amount of memory being used, and will thus determine the `batch_size`.
-  It is advised to use a batch size of 2 per GPU. See [this Github thread](https://github.com/facebookresearch/detr/issues/150) for more info.
+## Notes

-There are three ways to instantiate a DETR model (depending on what you prefer):
+- DETR uses so-called **object queries** to detect objects in an image. The number of queries determines the maximum number of objects that can be detected in a single image, and is set to 100 by default (see parameter `num_queries` of [`~transformers.DetrConfig`]). Note that it's good to have some slack (in COCO, the authors used 100, while the maximum number of objects in a COCO image is ~70).
+- The decoder of DETR updates the query embeddings in parallel. This is different from language models like GPT-2, which use autoregressive decoding instead of parallel. Hence, no causal attention mask is used.
+- DETR adds position embeddings to the hidden states at each self-attention and cross-attention layer before projecting to queries and keys. For the position embeddings of the image, one can choose between fixed sinusoidal or learned absolute position embeddings. By default, the parameter `position_embedding_type` of [`~transformers.DetrConfig`] is set to `"sine"`.
+- During training, the authors of DETR did find it helpful to use auxiliary losses in the decoder, especially to help the model output the correct number of objects of each class. If you set the parameter `auxiliary_loss` of [`~transformers.DetrConfig`] to `True`, then prediction feedforward neural networks and Hungarian losses are added after each decoder layer (with the FFNs sharing parameters).
+- If you want to train the model in a distributed environment across multiple nodes, then one should update the _num_boxes_ variable in the _DetrLoss_ class of _modeling_detr.py_. When training on multiple nodes, this should be set to the average number of target boxes across all nodes, as can be seen in the original implementation [here](https://github.com/facebookresearch/detr/blob/a54b77800eb8e64e3ad0d8237789fcbf2f8350c5/models/detr.py#L227-L232).
+- [`~transformers.DetrForObjectDetection`] and [`~transformers.DetrForSegmentation`] can be initialized with any convolutional backbone available in the [timm library](https://github.com/rwightman/pytorch-image-models). Initializing with a MobileNet backbone for example can be done by setting the `backbone` attribute of [`~transformers.DetrConfig`] to `"tf_mobilenetv3_small_075"`, and then initializing the model with that config.
+- DETR resizes the input images such that the shortest side is at least a certain amount of pixels while the longest is at most 1333 pixels. At training time, scale augmentation is used such that the shortest side is randomly set to at least 480 and at most 800 pixels. At inference time, the shortest side is set to 800. One can use [`~transformers.DetrImageProcessor`] to prepare images (and optional annotations in COCO format) for the model. Due to this resizing, images in a batch can have different sizes. DETR solves this by padding images up to the largest size in a batch, and by creating a pixel mask that indicates which pixels are real/which are padding. Alternatively, one can also define a custom `collate_fn` in order to batch images together, using [`~transformers.DetrImageProcessor.pad_and_create_pixel_mask`].
+- The size of the images will determine the amount of memory being used, and will thus determine the `batch_size`. It is advised to use a batch size of 2 per GPU. See [this Github thread](https://github.com/facebookresearch/detr/issues/150) for more info.

-Option 1: Instantiate DETR with pre-trained weights for entire model
-```py
->>> from transformers import DetrForObjectDetection
+There are three other ways to instantiate a DETR model (depending on what you prefer):

->>> model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
+- Option 1: Instantiate DETR with pre-trained weights for entire model
+```python
+from transformers import DetrForObjectDetection
+
+model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
 ```

-Option 2: Instantiate DETR with randomly initialized weights for Transformer, but pre-trained weights for backbone
-```py
->>> from transformers import DetrConfig, DetrForObjectDetection
+- Option 2: Instantiate DETR with randomly initialized weights for Transformer, but pre-trained weights for backbone
+```python
+from transformers import DetrConfig, DetrForObjectDetection

->>> config = DetrConfig()
->>> model = DetrForObjectDetection(config)
+config = DetrConfig()
+model = DetrForObjectDetection(config)
 ```
-Option 3: Instantiate DETR with randomly initialized weights for backbone + Transformer
-```py
->>> config = DetrConfig(use_pretrained_backbone=False)
->>> model = DetrForObjectDetection(config)
+
+- Option 3: Instantiate DETR with randomly initialized weights for backbone + Transformer
+```python
+config = DetrConfig(use_pretrained_backbone=False)
+model = DetrForObjectDetection(config)
 ```

 As a summary, consider the following table:
@ -153,24 +143,12 @@ As a summary, consider the following table:
 | **Postprocessing** (i.e. converting the output of the model to Pascal VOC format) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] |
 | **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_tupes="bbox"` or `"segm"`, `PanopticEvaluator` |

-In short, one should prepare the data either in COCO detection or COCO panoptic format, then use
-[`~transformers.DetrImageProcessor`] to create `pixel_values`, `pixel_mask` and optional
-`labels`, which can then be used to train (or fine-tune) a model. For evaluation, one should first convert the
-outputs of the model using one of the postprocessing methods of [`~transformers.DetrImageProcessor`]. These can
-be provided to either `CocoEvaluator` or `PanopticEvaluator`, which allow you to calculate metrics like
-mean Average Precision (mAP) and Panoptic Quality (PQ). The latter objects are implemented in the [original repository](https://github.com/facebookresearch/detr). See the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR) for more info regarding evaluation.
+- In short, one should prepare the data either in COCO detection or COCO panoptic format, then use [`~transformers.DetrImageProcessor`] to create `pixel_values`, `pixel_mask` and optional `labels`, which can then be used to train (or fine-tune) a model. 
+- For evaluation, one should first convert the outputs of the model using one of the postprocessing methods of [`~transformers.DetrImageProcessor`]. These can be provided to either `CocoEvaluator` or `PanopticEvaluator`, which allow you to calculate metrics like mean Average Precision (mAP) and Panoptic Quality (PQ). The latter objects are implemented in the [original repository](https://github.com/facebookresearch/detr). See the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR) for more info regarding evaluation.

 ## Resources

-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DETR.
-
-<PipelineTag pipeline="object-detection"/>
-
- All example notebooks illustrating fine-tuning [`DetrForObjectDetection`] and [`DetrForSegmentation`] on a custom dataset can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR).
- Scripts for finetuning [`DetrForObjectDetection`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection).
- See also: [Object detection task guide](../tasks/object_detection).
-
-If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+- Refer to these [notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR) for examples of fine-tuning [`DetrForObjectDetection`] and [`DetrForSegmentation`] on a custom dataset.

 ## DetrConfig

--- a/docs/source/en/model_doc/dia.md
+++ b/docs/source/en/model_doc/dia.md
@ -26,14 +26,14 @@ rendered properly in your Markdown viewer.

 ## Overview

-Dia is an opensource text-to-speech (TTS) model (1.6B parameters) developed by [Nari Labs](https://huggingface.co/nari-labs).
-It can generate highly realistic dialogue from transcript including nonverbal communications such as laughter and coughing.
+Dia is an open-source text-to-speech (TTS) model (1.6B parameters) developed by [Nari Labs](https://huggingface.co/nari-labs).
+It can generate highly realistic dialogue from transcript including non-verbal communications such as laughter and coughing.
 Furthermore, emotion and tone control is also possible via audio conditioning (voice cloning).

 **Model Architecture:**
 Dia is an encoder-decoder transformer based on the original transformer architecture. However, some more modern features such as
 rotational positional embeddings (RoPE) are also included. For its text portion (encoder), a byte tokenizer is utilized while
-for the audio portion (decoder), a pretrained codec model [DAC](./dac.md) is used - DAC encodes speech into discrete codebook
+for the audio portion (decoder), a pretrained codec model [DAC](./dac) is used - DAC encodes speech into discrete codebook
 tokens and decodes them back into audio.

 ## Usage Tips
--- a/docs/source/en/model_doc/efficientloftr.md
+++ b/docs/source/en/model_doc/efficientloftr.md
@ -0,0 +1,149 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the MIT License; you may not use this file except in compliance with
+the License.
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
+    </div>
+</div>
+
+# EfficientLoFTR
+
+[EfficientLoFTR](https://huggingface.co/papers/2403.04765) is an efficient detector-free local feature matching method that produces semi-dense matches across images with sparse-like speed. It builds upon the original [LoFTR](https://huggingface.co/papers/2104.00680) architecture but introduces significant improvements for both efficiency and accuracy. The key innovation is an aggregated attention mechanism with adaptive token selection that makes the model ~2.5× faster than LoFTR while achieving higher accuracy. EfficientLoFTR can even surpass state-of-the-art efficient sparse matching pipelines like [SuperPoint](./superpoint) + [LightGlue](./lightglue) in terms of speed, making it suitable for large-scale or latency-sensitive applications such as image retrieval and 3D reconstruction.
+
+> [!TIP]
+> This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
+>
+> Click on the EfficientLoFTR models in the right sidebar for more examples of how to apply EfficientLoFTR to different computer vision tasks.
+
+The example below demonstrates how to match keypoints between two images with the [`AutoModel`] class.
+
+<hfoptions id="usage">
+<hfoption id="AutoModel">
+
+```py
+from transformers import AutoImageProcessor, AutoModelForKeypointMatching
+import torch
+from PIL import Image
+import requests
+
+url_image1 = "https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_98169888_3347710852.jpg"
+image1 = Image.open(requests.get(url_image1, stream=True).raw)
+url_image2 = "https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_26757027_6717084061.jpg"
+image2 = Image.open(requests.get(url_image2, stream=True).raw)
+
+images = [image1, image2]
+
+processor = AutoImageProcessor.from_pretrained("zju-community/efficientloftr")
+model = AutoModelForKeypointMatching.from_pretrained("zju-community/efficientloftr")
+
+inputs = processor(images, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+# Post-process to get keypoints and matches
+image_sizes = [[(image.height, image.width) for image in images]]
+processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
+```
+
+</hfoption>
+</hfoptions>
+
+## Notes
+
+- EfficientLoFTR is designed for efficiency while maintaining high accuracy. It uses an aggregated attention mechanism with adaptive token selection to reduce computational overhead compared to the original LoFTR.
+
+    ```py
+    from transformers import AutoImageProcessor, AutoModelForKeypointMatching
+    import torch
+    from PIL import Image
+    import requests
+    
+    processor = AutoImageProcessor.from_pretrained("zju-community/efficientloftr")
+    model = AutoModelForKeypointMatching.from_pretrained("zju-community/efficientloftr")
+    
+    # EfficientLoFTR requires pairs of images
+    images = [image1, image2]
+    inputs = processor(images, return_tensors="pt")
+    outputs = model(**inputs)
+    
+    # Extract matching information
+    keypoints = outputs.keypoints        # Keypoints in both images
+    matches = outputs.matches            # Matching indices 
+    matching_scores = outputs.matching_scores  # Confidence scores
+    ```
+
+- The model produces semi-dense matches, offering a good balance between the density of matches and computational efficiency. It excels in handling large viewpoint changes and texture-poor scenarios.
+
+- For better visualization and analysis, use the [`~EfficientLoFTRImageProcessor.post_process_keypoint_matching`] method to get matches in a more readable format.
+
+    ```py
+    # Process outputs for visualization
+    image_sizes = [[(image.height, image.width) for image in images]]
+    processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
+    
+    for i, output in enumerate(processed_outputs):
+        print(f"For the image pair {i}")
+        for keypoint0, keypoint1, matching_score in zip(
+                output["keypoints0"], output["keypoints1"], output["matching_scores"]
+        ):
+            print(f"Keypoint at {keypoint0.numpy()} matches with keypoint at {keypoint1.numpy()} with score {matching_score}")
+    ```
+
+- Visualize the matches between the images using the built-in plotting functionality.
+
+    ```py
+    # Easy visualization using the built-in plotting method
+    visualized_images = processor.visualize_keypoint_matching(images, processed_outputs)
+    ```
+
+- EfficientLoFTR uses a novel two-stage correlation layer that achieves accurate subpixel correspondences, improving upon the original LoFTR's fine correlation module.
+
+<div class="flex justify-center">
+    <img src="https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/2nJZQlFToCYp_iLurvcZ4.png">
+</div>
+
+## Resources
+
+- Refer to the [original EfficientLoFTR repository](https://github.com/zju3dv/EfficientLoFTR) for more examples and implementation details.
+- [EfficientLoFTR project page](https://zju3dv.github.io/efficientloftr/) with interactive demos and additional information.
+
+## EfficientLoFTRConfig
+
+[[autodoc]] EfficientLoFTRConfig
+
+## EfficientLoFTRImageProcessor
+
+[[autodoc]] EfficientLoFTRImageProcessor
+
+- preprocess
+- post_process_keypoint_matching
+- visualize_keypoint_matching
+
+<frameworkcontent>
+<pt>
+## EfficientLoFTRModel
+
+[[autodoc]] EfficientLoFTRModel
+
+- forward
+
+## EfficientLoFTRForKeypointMatching
+
+[[autodoc]] EfficientLoFTRForKeypointMatching
+
+- forward
+
+</pt>
+</frameworkcontent>
--- a/docs/source/en/model_doc/encodec.md
+++ b/docs/source/en/model_doc/encodec.md
@ -47,7 +47,8 @@ Here is a quick example of how to encode and decode an audio using this model:
 >>> inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt")

 >>> encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])
->>> audio_values = model.decode(encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs["padding_mask"])[0]
+>>> # `encoder_outputs.audio_codes` contains discrete codes
+>>> audio_values = model.decode(**encoder_outputs, padding_mask=inputs["padding_mask"])[0]
 >>> # or the equivalent with a forward pass
 >>> audio_values = model(inputs["input_values"], inputs["padding_mask"]).audio_values
 ```
--- a/docs/source/en/model_doc/ernie.md
+++ b/docs/source/en/model_doc/ernie.md
@ -14,29 +14,83 @@ rendered properly in your Markdown viewer.

 -->

-# ERNIE
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
+    </div>
 </div>

-## Overview
-ERNIE is a series of powerful models proposed by baidu, especially in Chinese tasks,
-including [ERNIE1.0](https://huggingface.co/papers/1904.09223), [ERNIE2.0](https://ojs.aaai.org/index.php/AAAI/article/view/6428),
-[ERNIE3.0](https://huggingface.co/papers/2107.02137), [ERNIE-Gram](https://huggingface.co/papers/2010.12148), [ERNIE-health](https://huggingface.co/papers/2110.07244), etc.
+# ERNIE

-These models are contributed by [nghuyong](https://huggingface.co/nghuyong) and the official code can be found in [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) (in PaddlePaddle).
+[ERNIE1.0](https://arxiv.org/abs/1904.09223), [ERNIE2.0](https://ojs.aaai.org/index.php/AAAI/article/view/6428),
+[ERNIE3.0](https://arxiv.org/abs/2107.02137), [ERNIE-Gram](https://arxiv.org/abs/2010.12148), [ERNIE-health](https://arxiv.org/abs/2110.07244) are a series of powerful models proposed by Baidu that perform especially well on Chinese tasks.

-### Usage example
-Take `ernie-1.0-base-zh` as an example:
+ERNIE (Enhanced Representation through kNowledge IntEgration) is designed to learn language representations enhanced by knowledge masking strategies, which include entity-level masking and phrase-level masking.

-```Python
-from transformers import AutoTokenizer, AutoModel
-tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh")
-model = AutoModel.from_pretrained("nghuyong/ernie-1.0-base-zh")
+Other ERNIE models released by baidu can be found at [Ernie 4.5](./ernie4_5), and [Ernie 4.5 MoE](./ernie4_5_moe).
+
+> [!TIP]
+> This model was contributed by [nghuyong](https://huggingface.co/nghuyong), and the official code can be found in [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) (in PaddlePaddle).
+>
+> Click on the ERNIE models in the right sidebar for more examples of how to apply ERNIE to different language tasks.
+
+The example below demonstrates how to predict the `[MASK]` token with [`Pipeline`], [`AutoModel`], and from the command line.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+from transformers import pipeline
+
+pipeline = pipeline(
+    task="fill-mask",
+    model="nghuyong/ernie-3.0-xbase-zh"
+)
+
+pipeline("巴黎是[MASK]国的首都。")
 ```

-### Model checkpoints
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "nghuyong/ernie-3.0-xbase-zh",
+)
+model = AutoModelForMaskedLM.from_pretrained(
+    "nghuyong/ernie-3.0-xbase-zh",
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
+inputs = tokenizer("巴黎是[MASK]国的首都。", return_tensors="pt").to(model.device)
+
+with torch.no_grad():
+    outputs = model(**inputs)
+    predictions = outputs.logits
+
+masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
+predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
+predicted_token = tokenizer.decode(predicted_token_id)
+
+print(f"The predicted token is: {predicted_token}")
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "巴黎是[MASK]国的首都。" | transformers run --task fill-mask --model nghuyong/ernie-3.0-xbase-zh --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+## Notes
+
+Model variants are available in different sizes and languages.

 |     Model Name      | Language |           Description           |
 |:-------------------:|:--------:|:-------------------------------:|
@ -51,18 +105,11 @@ model = AutoModel.from_pretrained("nghuyong/ernie-1.0-base-zh")
 |   ernie-health-zh   | Chinese  | Layer:12, Heads:12, Hidden:768  |
 |    ernie-gram-zh    | Chinese  | Layer:12, Heads:12, Hidden:768  |

-You can find all the supported models from huggingface's model hub: [huggingface.co/nghuyong](https://huggingface.co/nghuyong), and model details from paddle's official
-repo: [PaddleNLP](https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers/ERNIE/contents.html)
-and [ERNIE](https://github.com/PaddlePaddle/ERNIE/blob/repro).
-
 ## Resources

- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Causal language modeling task guide](../tasks/language_modeling)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)
+You can find all the supported models from huggingface's model hub: [huggingface.co/nghuyong](https://huggingface.co/nghuyong), and model details from paddle's official
+repo: [PaddleNLP](https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers/ERNIE/contents.html)
+and [ERNIE's legacy branch](https://github.com/PaddlePaddle/ERNIE/tree/legacy/develop).

 ## ErnieConfig

@ -116,4 +163,4 @@ and [ERNIE](https://github.com/PaddlePaddle/ERNIE/blob/repro).
 ## ErnieForQuestionAnswering

 [[autodoc]] ErnieForQuestionAnswering
-    - forward
+    - forward
--- a/docs/source/en/model_doc/ernie4_5.md
+++ b/docs/source/en/model_doc/ernie4_5.md
@ -29,9 +29,9 @@ rendered properly in your Markdown viewer.

 The Ernie 4.5 model was released in the [Ernie 4.5 Model Family](https://ernie.baidu.com/blog/posts/ernie4.5/) release by baidu.
 This family of models contains multiple different architectures and model sizes. This model in specific targets the base text
-model without mixture of experts (moe) with 0.3B parameters in total. It uses the standard [Llama](./llama.md) at its core.
+model without mixture of experts (moe) with 0.3B parameters in total. It uses the standard [Llama](./llama) at its core.

-Other models from the family can be found at [Ernie 4.5 MoE](./ernie4_5_moe.md).
+Other models from the family can be found at [Ernie 4.5 Moe](./ernie4_5_moe).

 <div class="flex justify-center">
    <img src="https://ernie.baidu.com/blog/posts/ernie4.5/overview.png"/>
--- a/docs/source/en/model_doc/ernie4_5_moe.md
+++ b/docs/source/en/model_doc/ernie4_5_moe.md
@ -23,17 +23,17 @@ rendered properly in your Markdown viewer.
    </div>
 </div>

-# Ernie 4.5 MoE
+# Ernie 4.5 Moe

 ## Overview

-The Ernie 4.5 MoE model was released in the [Ernie 4.5 Model Family](https://ernie.baidu.com/blog/posts/ernie4.5/) release by baidu.
+The Ernie 4.5 Moe model was released in the [Ernie 4.5 Model Family](https://ernie.baidu.com/blog/posts/ernie4.5/) release by baidu.
 This family of models contains multiple different architectures and model sizes. This model in specific targets the base text
 model with mixture of experts (moe) - one with 21B total, 3B active parameters and another one with 300B total, 47B active parameters.
-It uses the standard [Llama](./llama.md) at its core combined with a specialized MoE based on [Mixtral](./mixtral.md) with additional shared
+It uses the standard [Llama](./llama) at its core combined with a specialized MoE based on [Mixtral](./mixtral) with additional shared
 experts.

-Other models from the family can be found at [Ernie 4.5](./ernie4_5.md).
+Other models from the family can be found at [Ernie 4.5](./ernie4_5).

 <div class="flex justify-center">
    <img src="https://ernie.baidu.com/blog/posts/ernie4.5/overview.png"/>
@ -167,17 +167,17 @@ This model was contributed by [Anton Vlasjuk](https://huggingface.co/AntonV).
 The original code can be found [here](https://github.com/PaddlePaddle/ERNIE).


-## Ernie4_5_MoEConfig
+## Ernie4_5_MoeConfig

-[[autodoc]] Ernie4_5_MoEConfig
+[[autodoc]] Ernie4_5_MoeConfig

-## Ernie4_5_MoEModel
+## Ernie4_5_MoeModel

-[[autodoc]] Ernie4_5_MoEModel
+[[autodoc]] Ernie4_5_MoeModel
    - forward

-## Ernie4_5_MoEForCausalLM
+## Ernie4_5_MoeForCausalLM

-[[autodoc]] Ernie4_5_MoEForCausalLM
+[[autodoc]] Ernie4_5_MoeForCausalLM
    - forward
    - generate
--- a/docs/source/en/model_doc/evolla.md
+++ b/docs/source/en/model_doc/evolla.md
@ -0,0 +1,95 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Evolla
+
+## Overview
+
+The Evolla model was proposed in [Decoding the Molecular Language of Proteins with Evolla](https://doi.org/10.1101/2025.01.05.630192) by [Zhou et al.](https://doi.org/10.1101/2025.01.05.630192).
+
+Evolla is an advanced 80-billion-parameter protein-language generative model designed to decode the molecular language of proteins. It integrates information from protein sequences, structures, and user queries to generate precise and contextually nuanced insights into protein function. Trained on an unprecedented AI-generated dataset of 546 million protein question-answer pairs and 150 billion word tokens, Evolla significantly advances research in proteomics and functional genomics, providing expert-level insights and shedding light on the molecular logic encoded in proteins.
+
+The abstract from the paper is the following:
+
+*Proteins, nature’s intricate molecular machines, are the products of billions of years of evolution and play fundamental roles in sustaining life. Yet, deciphering their molecular language - that is, understanding how protein sequences and structures encode and determine biological functions - remains a corner-stone challenge in modern biology. Here, we introduce Evolla, an 80 billion frontier protein-language generative model designed to decode the molecular language of proteins. By integrating information from protein sequences, structures, and user queries, Evolla generates precise and contextually nuanced insights into protein function. A key innovation of Evolla lies in its training on an unprecedented AI-generated dataset: 546 million protein question-answer pairs and 150 billion word tokens, designed to reflect the immense complexity and functional diversity of proteins. Post-pretraining, Evolla integrates Direct Preference Optimization (DPO) to refine the model based on preference signals and Retrieval-Augmented Generation (RAG) for external knowledge incorporation, improving response quality and relevance. To evaluate its performance, we propose a novel framework, Instructional Response Space (IRS), demonstrating that Evolla delivers expert-level insights, advancing research in proteomics and functional genomics while shedding light on the molecular logic encoded in proteins. The online demo is available at http://www.chat-protein.com/.*
+
+Examples:
+
+```python
+processor = EvollaProcessor.from_pretrained("westlake-repl/Evolla-10B-DPO-hf")
+model = EvollaForProteinText2Text.from_pretrained("westlake-repl/Evolla-10B-DPO-hf")
+# aa_seq should have same length as foldseek
+protein_inputs = [
+    {
+        
+        "aa_seq": "MATGGRRG...",
+        "foldseek": "###lqpfd...", # hashtag means the low-confidence foldseek tokens
+    },
+    {
+        "aa_seq": "MLPGLALL...",
+        "foldseek": "dfwwkwad...",
+    }
+]
+message_list = [
+    [
+        {
+            "role": "system",
+            "content": "You are an AI expert that can answer any questions about protein.",
+        },
+        {"role": "user", "content": "What is the function of this protein?"},
+    ],
+    [
+        {
+            "role": "system",
+            "content": "You are an AI expert that can answer any questions about protein.",
+        },
+        {"role": "user", "content": "What is the function of this protein?"},
+    ]
+]
+input_dict = processor(
+    protein_inputs, message_list, return_tensors="pt", text_max_length=512, protein_max_length=1024
+)
+with torch.no_grad():
+    generated_ids = model.generate(**input_dict)
+generated_texts = processor.batch_decode(
+    generated_ids, skip_special_tokens=True
+)
+```
+
+Tips:
+
+- This model was contributed by [Xibin Bayes Zhou](https://huggingface.co/XibinBayesZhou).
+- The original code can be found [here](https://github.com/westlake-repl/Evolla).
+
+
+## EvollaConfig
+
+[[autodoc]] EvollaConfig
+
+## EvollaModel
+
+[[autodoc]] EvollaModel
+    - forward
+
+## EvollaForProteinText2Text
+
+[[autodoc]] EvollaForProteinText2Text
+    - forward
+
+## EvollaProcessor
+
+[[autodoc]] EvollaProcessor
+    - __call__
--- a/docs/source/en/model_doc/exaone4.md
+++ b/docs/source/en/model_doc/exaone4.md
@ -0,0 +1,208 @@
+<!--Copyright 2025 The LG AI Research and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# EXAONE 4
+
+## Overview
+
+The **[EXAONE 4.0](https://github.com/LG-AI-EXAONE/EXAONE-4.0)** model is a language model that integrates a **Non-reasoning mode** and a **Reasoning mode** to achieve both the excellent usability of [EXAONE 3.5](https://github.com/LG-AI-EXAONE/EXAONE-3.5) and the advanced reasoning abilities of [EXAONE Deep](https://github.com/LG-AI-EXAONE/EXAONE-Deep). To pave the way for the agentic AI era, EXAONE 4.0 incorporates essential features such as agentic tool use, and its multilingual capabilities are extended
+to support Spanish in addition to English and Korean.
+
+The EXAONE 4.0 model series consists of two sizes: a mid-size **32B** model optimized for high performance, and a small-size **1.2B** model designed for on-device applications.
+
+In the EXAONE 4.0 architecture, we apply new architectural changes compared to previous EXAONE models as below:
+
+1. **Hybrid Attention**: For the 32B model, we adopt a hybrid attention scheme, which combines *Local attention (sliding window attention)* with *Global attention (full attention)* in a 3:1 ratio. We do not use RoPE (Rotary Positional Embedding) for global attention for better global context understanding.
+2. **QK-Reorder-Norm**: We reorder the LayerNorm position from the traditional Pre-LN scheme by applying LayerNorm directly to the attention and MLP outputs, and we add RMS normalization right after the Q and K projection. It helps yield better performance on downstream tasks despite consuming more computation.
+
+For more details, please refer to our [technical report](https://arxiv.org/abs/2507.11407), [HuggingFace paper](https://huggingface.co/papers/2507.11407), [blog](https://www.lgresearch.ai/blog/view?seq=576), and [GitHub](https://github.com/LG-AI-EXAONE/EXAONE-4.0).
+
+All model weights, including quantized versions, are available at [Hugging Face Collections](https://huggingface.co/collections/LGAI-EXAONE/exaone-40-686b2e0069800c835ed48375).
+
+
+## Model Details
+
+### Model Specifications
+
+| Model Configuration | 32B | 1.2B |
+|:-------------------|:-----:|:------:|
+| d_model | 5,120 | 2,048 |
+| Number of layers | 64 | 30 |
+| Normalization | QK-Reorder-LN | QK-Reorder-LN |
+| Non-linearity | SwiGLU | SwiGLU |
+| Feedforward dimension | 27,392 | 4,096 |
+| Attention type | Hybrid (3:1 Local-Global) | Global |
+| Head type | GQA | GQA |
+| Number of heads | 40 | 32 |
+| Number of KV heads | 8 | 8 |
+| Head size | 128 | 64 |
+| Max sequence length | 131,072 | 65,536 |
+| RoPE theta | 1,000,000 | 1,000,000 |
+| Tokenizer | BBPE | BBPE |
+| Vocab size | 102,400 | 102,400 |
+| Tied word embedding | False | True |
+| Knowledge cut-off | Nov. 2024 | Nov. 2024 |
+
+
+## Usage tips
+
+### Non-reasoning mode
+
+For general use, you can use the EXAONE 4.0 models with the following example:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_name = "LGAI-EXAONE/EXAONE-4.0-32B"
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype="bfloat16",
+    device_map="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+# choose your prompt
+prompt = "Explain how wonderful you are"
+prompt = "Explica lo increíble que eres"
+prompt = "너가 얼마나 대단한지 설명해 봐"
+
+messages = [
+    {"role": "user", "content": prompt}
+]
+input_ids = tokenizer.apply_chat_template(
+    messages,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_tensors="pt"
+)
+
+output = model.generate(
+    input_ids.to(model.device),
+    max_new_tokens=128,
+    do_sample=False,
+)
+print(tokenizer.decode(output[0]))
+```
+
+### Reasoning mode
+
+The EXAONE 4.0 models have reasoning capabilities for handling complex problems. You can activate reasoning mode by using the `enable_thinking=True` argument with the tokenizer, which opens a reasoning block that starts with a `<think>` tag without closing it.
+
+```python
+messages = [
+    {"role": "user", "content": "Which one is bigger, 3.12 vs 3.9?"}
+]
+input_ids = tokenizer.apply_chat_template(
+    messages,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_tensors="pt",
+    enable_thinking=True,
+)
+
+output = model.generate(
+    input_ids.to(model.device),
+    max_new_tokens=128,
+    do_sample=True,
+    temperature=0.6,
+    top_p=0.95
+)
+print(tokenizer.decode(output[0]))
+```
+
+> [!IMPORTANT]
+> Generation in reasoning mode is sensitive to sampling parameters, so please refer to the [Usage Guideline](https://github.com/LG-AI-EXAONE/EXAONE-4.0#usage-guideline) on the official GitHub page for better quality.
+
+### Agentic tool use
+
+The EXAONE 4.0 models can be used as agents with their tool calling capabilities. You can provide tool schemas to the model for effective tool calling.
+
+```python
+import random
+
+def roll_dice(max_num: int):
+    return random.randint(1, max_num)
+
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "roll_dice",
+            "description": "Roll a dice with the number 1 to N. User can select the number N.",
+            "parameters": {
+                "type": "object",
+                "required": ["max_num"],
+                "properties": {
+                    "max_num": {
+                        "type": "int",
+                        "description": "Max number of the dice"
+                    }
+                }
+            }
+        }
+    }
+]
+
+messages = [
+    {"role": "user", "content": "Roll D6 dice twice!"}
+]
+input_ids = tokenizer.apply_chat_template(
+    messages,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_tensors="pt",
+    tools=tools,
+)
+
+output = model.generate(
+    input_ids.to(model.device),
+    max_new_tokens=1024,
+    do_sample=True,
+    temperature=0.6,
+    top_p=0.95,
+)
+print(tokenizer.decode(output[0]))
+```
+
+## Exaone4Config
+
+[[autodoc]] Exaone4Config
+
+## Exaone4Model
+
+[[autodoc]] Exaone4Model
+    - forward
+
+## Exaone4ForCausalLM
+
+[[autodoc]] Exaone4ForCausalLM
+    - forward
+
+## Exaone4ForSequenceClassification
+
+[[autodoc]] Exaone4ForSequenceClassification
+    - forward
+
+## Exaone4ForTokenClassification
+
+[[autodoc]] Exaone4ForTokenClassification
+    - forward
+
+## Exaone4ForQuestionAnswering
+
+[[autodoc]] Exaone4ForQuestionAnswering
+    - forward
--- a/docs/source/en/model_doc/gemma3n.md
+++ b/docs/source/en/model_doc/gemma3n.md
@ -30,7 +30,7 @@ Gemma3n is a multimodal model with pretrained and instruction-tuned variants, av
 large portions of the language model architecture are shared with prior Gemma releases, there are many new additions in
 this model, including [Alternating Updates][altup] (AltUp), [Learned Augmented Residual Layer][laurel] (LAuReL),
 [MatFormer][matformer], Per-Layer Embeddings (PLE), [Activation Sparsity with Statistical Top-k][spark-transformer], and KV cache sharing. The language model uses
-a similar attention pattern to [Gemma 3](./gemma3.md) with alternating 4 local sliding window self-attention layers for
+a similar attention pattern to [Gemma 3](./gemma3) with alternating 4 local sliding window self-attention layers for
 every global self-attention layer with a maximum context length of 32k tokens. Gemma 3n introduces
 [MobileNet v5][mobilenetv5] as the vision encoder, using a default resolution of 768x768 pixels, and adds a newly
 trained audio encoder based on the [Universal Speech Model][usm] (USM) architecture.
--- a/docs/source/en/model_doc/gpt_oss.md
+++ b/docs/source/en/model_doc/gpt_oss.md
@ -0,0 +1,58 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZC
D9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+        ">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
+# GptOss
+
+## Overview
+
+The GptOss model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
+<INSERT SHORT SUMMARY HERE>
+
+The abstract from the paper is the following:
+
+*<INSERT PAPER ABSTRACT HERE>*
+
+Tips:
+
+<INSERT TIPS ABOUT MODEL HERE>
+
+This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
+The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
+
+
+## GptOssConfig
+
+[[autodoc]] GptOssConfig
+
+## GptOssModel
+
+[[autodoc]] GptOssModel
+    - forward
+
+## GptOssForCausalLM
+
+[[autodoc]] GptOssForCausalLM
+    - forward
--- a/docs/source/en/model_doc/granitemoehybrid.md
+++ b/docs/source/en/model_doc/granitemoehybrid.md
@ -48,6 +48,32 @@ for i in output:

 This HF implementation is contributed by [Sukriti Sharma](https://huggingface.co/SukritiSharma) and [Alexander Brooks](https://huggingface.co/abrooks9944).

+## Notes
+
+- `GraniteMoeHybridForCausalLM` supports padding-free training which concatenates distinct training examples while still processing inputs as separate batches. It can significantly accelerate training by [~2x](https://github.com/huggingface/transformers/pull/35861#issue-2807873129) (depending on model and data distribution) and reduce memory usage if there are examples of varying lengths by avoiding unnecessary compute and memory overhead from padding tokens.
+
+  Padding-free training requires the `flash-attn`, `mamba-ssm`, and `causal-conv1d` packages and the following arguments must be passed to the model in addition to `input_ids` and `labels`.
+
+  - `position_ids: torch.LongTensor`: the position index of each token in each sequence.
+  - `seq_idx: torch.IntTensor`: the index of each sequence in the batch.
+  - Each of the [`FlashAttentionKwargs`]
+    - `cu_seq_lens_q: torch.LongTensor`: the cumulative sequence lengths of all queries.
+    - `cu_seq_lens_k: torch.LongTensor`: the cumulative sequence lengths of all keys.
+    - `max_length_q: int`: the longest query length in the batch.
+    - `max_length_k: int`: the longest key length in the batch.
+
+  The `attention_mask` inputs should not be provided. The [`DataCollatorWithFlattening`] programmatically generates the set of additional arguments above using `return_seq_idx=True` and `return_flash_attn_kwargs=True`. See the [Improving Hugging Face Training Efficiency Through Packing with Flash Attention](https://huggingface.co/blog/packing-with-FA2) blog post for additional information.
+
+  ```python
+  from transformers import DataCollatorWithFlattening
+
+  # Example of using padding-free training
+  data_collator = DataCollatorWithFlattening(
+      tokenizer=tokenizer,
+      return_seq_idx=True,
+      return_flash_attn_kwargs=True
+  )
+  ```

 ## GraniteMoeHybridConfig

@ -61,4 +87,4 @@ This HF implementation is contributed by [Sukriti Sharma](https://huggingface.co
 ## GraniteMoeHybridForCausalLM

 [[autodoc]] GraniteMoeHybridForCausalLM
-    - forward
+    - forward
--- a/docs/source/en/model_doc/idefics2.md
+++ b/docs/source/en/model_doc/idefics2.md
@ -169,9 +169,9 @@ model = Idefics2ForConditionalGeneration.from_pretrained(

 ## Shrinking down Idefics2 using quantization

-As the Idefics2 model has 8 billion parameters, that would require about 16GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization.md). If the model is quantized to 4 bits (or half a byte per parameter), that requires only about 3.5GB of RAM.
+As the Idefics2 model has 8 billion parameters, that would require about 16GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization). If the model is quantized to 4 bits (or half a byte per parameter), that requires only about 3.5GB of RAM.

-Quantizing a model is as simple as passing a `quantization_config` to the model. One can change the code snippet above with the changes below. We'll leverage the BitsAndyBytes quantization (but refer to [this page](../quantization.md) for other quantization methods):
+Quantizing a model is as simple as passing a `quantization_config` to the model. One can change the code snippet above with the changes below. We'll leverage the BitsAndBytes quantization (but refer to [this page](../quantization) for other quantization methods):

 ```diff
 + from transformers import BitsAndBytesConfig
@ -193,7 +193,7 @@ model = Idefics2ForConditionalGeneration.from_pretrained(

 A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Idefics2. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.

- A notebook on how to fine-tune Idefics2 on a custom dataset using the [Trainer](../main_classes/trainer.md) can be found [here](https://colab.research.google.com/drive/1NtcTgRbSBKN7pYD3Vdx1j9m8pt3fhFDB?usp=sharing). It supports both full fine-tuning as well as (quantized) LoRa.
+- A notebook on how to fine-tune Idefics2 on a custom dataset using the [Trainer](../main_classes/trainer) can be found [here](https://colab.research.google.com/drive/1NtcTgRbSBKN7pYD3Vdx1j9m8pt3fhFDB?usp=sharing). It supports both full fine-tuning as well as (quantized) LoRa.
 - A script regarding how to fine-tune Idefics2 using the TRL library can be found [here](https://gist.github.com/edbeeching/228652fc6c2b29a1641be5a5778223cb).
 - Demo notebook regarding fine-tuning Idefics2 for JSON extraction use cases can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Idefics2). 🌎

--- a/docs/source/en/model_doc/janus.md
+++ b/docs/source/en/model_doc/janus.md
@ -44,11 +44,11 @@ Here is the example of visual understanding with a single image.
 > Note that the model has been trained with a specific prompt format for chatting. Use `processor.apply_chat_template(my_conversation_dict)` to correctly format your prompts.

 ```python
-import torch  
-from PIL import Image  
-import requests  
+import torch
+from PIL import Image
+import requests

-from transformers import JanusForConditionalGeneration, JanusProcessor  
+from transformers import JanusForConditionalGeneration, JanusProcessor

 model_id = "deepseek-community/Janus-Pro-1B"
 # Prepare Input for generation.
@ -64,7 +64,7 @@ messages = [

 # Set generation mode to `text` to perform text generation.
 processor = JanusProcessor.from_pretrained(model_id)
-model = JanusForConditionalGeneration.from_pretrained(model_id,     
+model = JanusForConditionalGeneration.from_pretrained(model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto")

@ -209,6 +209,10 @@ for i, image in enumerate(images['pixel_values']):

 [[autodoc]] JanusImageProcessor

+## JanusImageProcessorFast
+
+[[autodoc]] JanusImageProcessorFast
+
 ## JanusVisionModel

 [[autodoc]] JanusVisionModel
--- a/docs/source/en/model_doc/lightglue.md
+++ b/docs/source/en/model_doc/lightglue.md
@ -107,7 +107,7 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size

    ```py
    # Easy visualization using the built-in plotting method
-    processor.plot_keypoint_matching(images, processed_outputs)
+    processor.visualize_keypoint_matching(images, processed_outputs)
    ```

 <div class="flex justify-center">
@ -128,7 +128,7 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size

 - preprocess
 - post_process_keypoint_matching
- plot_keypoint_matching
+- visualize_keypoint_matching

 <frameworkcontent>
 <pt>
--- a/docs/source/en/model_doc/mask2former.md
+++ b/docs/source/en/model_doc/mask2former.md
@ -77,4 +77,12 @@ The resource should ideally demonstrate something new instead of duplicating an
    - encode_inputs
    - post_process_semantic_segmentation
    - post_process_instance_segmentation
+    - post_process_panoptic_segmentation
+
+## Mask2FormerImageProcessorFast
+
+[[autodoc]] Mask2FormerImageProcessorFast
+    - preprocess
+    - post_process_semantic_segmentation
+    - post_process_instance_segmentation
    - post_process_panoptic_segmentation
--- a/docs/source/en/model_doc/maskformer.md
+++ b/docs/source/en/model_doc/maskformer.md
@ -76,6 +76,14 @@ This model was contributed by [francesco](https://huggingface.co/francesco). The
    - post_process_instance_segmentation
    - post_process_panoptic_segmentation

+## MaskFormerImageProcessorFast
+
+[[autodoc]] MaskFormerImageProcessorFast
+    - preprocess
+    - post_process_semantic_segmentation
+    - post_process_instance_segmentation
+    - post_process_panoptic_segmentation
+
 ## MaskFormerFeatureExtractor

 [[autodoc]] MaskFormerFeatureExtractor
--- a/docs/source/en/model_doc/mgp-str.md
+++ b/docs/source/en/model_doc/mgp-str.md
@ -33,7 +33,7 @@ alt="drawing" width="600"/>

 <small> MGP-STR architecture. Taken from the <a href="https://huggingface.co/papers/2209.03592">original paper</a>. </small>

-MGP-STR is trained on two synthetic datasets [MJSynth]((http://www.robots.ox.ac.uk/~vgg/data/text/)) (MJ) and [SynthText](http://www.robots.ox.ac.uk/~vgg/data/scenetext/) (ST) without fine-tuning on other datasets. It achieves state-of-the-art results on six standard Latin scene text benchmarks, including 3 regular text datasets (IC13, SVT, IIIT) and 3 irregular ones (IC15, SVTP, CUTE).
+MGP-STR is trained on two synthetic datasets [MJSynth](http://www.robots.ox.ac.uk/~vgg/data/text/) (MJ) and [SynthText](http://www.robots.ox.ac.uk/~vgg/data/scenetext/) (ST) without fine-tuning on other datasets. It achieves state-of-the-art results on six standard Latin scene text benchmarks, including 3 regular text datasets (IC13, SVT, IIIT) and 3 irregular ones (IC15, SVTP, CUTE).
 This model was contributed by [yuekun](https://huggingface.co/yuekun). The original code can be found [here](https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/OCR/MGP-STR).

 ## Inference example
--- a/docs/source/en/model_doc/mimi.md
+++ b/docs/source/en/model_doc/mimi.md
@ -14,30 +14,29 @@ rendered properly in your Markdown viewer.

 -->

-# Mimi
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
 </div>

-## Overview
+# Mimi

-The Mimi model was proposed in [Moshi: a speech-text foundation model for real-time dialogue](https://kyutai.org/Moshi.pdf) by Alexandre Défossez, Laurent Mazaré, Manu Orsini, Amélie Royer, Patrick Pérez, Hervé Jégou, Edouard Grave and Neil Zeghidour. Mimi is a high-fidelity audio codec model developed by the Kyutai team, that combines semantic and acoustic information into audio tokens running at 12Hz and a bitrate of 1.1kbps. In other words, it can be used to map audio waveforms into “audio tokens”, known as “codebooks”.
+[Mimi](https://huggingface.co/papers/2410.00037) is a neural audio codec model with pretrained and quantized variants, designed for efficient speech representation and compression. The model operates at 1.1 kbps with a 12 Hz frame rate and uses a convolutional encoder-decoder architecture combined with a residual vector quantizer of 16 codebooks. Mimi outputs dual token streams, i.e., semantic and acoustic, to balance linguistic richness with high-fidelity reconstruction. Key features include a causal streaming encoder for low-latency use, dual-path tokenization for flexible downstream generation, and integration readiness with large speech models like Moshi.

-The abstract from the paper is the following:
+You can find the original Mimi checkpoints under the [Kyutai](https://huggingface.co/kyutai/models?search=mimi) organization.

-*We introduce Moshi, a speech-text foundation model and full-duplex spoken dialogue framework. Current systems for spoken dialogue rely on pipelines of independent components, namely voice activity detection, speech recognition, textual dialogue and text-to-speech. Such frameworks cannot emulate the experience of real conversations. First, their complexity induces a latency of several seconds between interactions. Second, text being the intermediate modality for dialogue, non-linguistic information that modifies meaning— such as emotion or non-speech sounds— is lost in the interaction. Finally, they rely on a segmentation into speaker turns, which does not take into account overlapping speech, interruptions and interjections. Moshi solves these independent issues altogether by casting spoken dialogue as speech-to-speech generation. Starting from a text language model backbone, Moshi generates speech as tokens from the residual quantizer of a neural audio codec, while modeling separately its own speech and that of the user into parallel streams. This allows for the removal of explicit speaker turns, and the modeling of arbitrary conversational dynamics. We moreover extend the hierarchical semantic-to-acoustic token generation of previous work to first predict time-aligned text tokens as a prefix to audio tokens. Not only this “Inner Monologue” method significantly improves the linguistic quality of generated speech, but we also illustrate how it can provide streaming speech recognition and text-to-speech. Our resulting model is the first real-time full-duplex spoken large language model, with a theoretical latency of 160ms, 200ms in practice, and is available at github.com/kyutai-labs/moshi.* 
+> [!TIP]
+> This model was contributed by [ylacombe](https://huggingface.co/ylacombe).
+>
+> Click on the Mimi models in the right sidebar for more examples of how to apply Mimi.

-Its architecture is based on [Encodec](model_doc/encodec) with several major differences:
-* it uses a much lower frame-rate.
-* it uses additional transformers for encoding and decoding for better latent contextualization
-* it uses a different quantization scheme: one codebook is dedicated to semantic projection.
+The example below demonstrates how to encode and decode audio with the [`AutoModel`] class.

-## Usage example 
-
-Here is a quick example of how to encode and decode an audio using this model:
+<hfoptions id="usage">
+<hfoption id="AutoModel">

 ```python 
 >>> from datasets import load_dataset, Audio
@ -59,9 +58,8 @@ Here is a quick example of how to encode and decode an audio using this model:
 >>> audio_values = model(inputs["input_values"], inputs["padding_mask"]).audio_values
 ```

-This model was contributed by [Yoach Lacombe (ylacombe)](https://huggingface.co/ylacombe).
-The original code can be found [here](https://github.com/kyutai-labs/moshi).
-
+</hfoption>
+</hfoptions>

 ## MimiConfig

@ -72,4 +70,4 @@ The original code can be found [here](https://github.com/kyutai-labs/moshi).
 [[autodoc]] MimiModel
    - decode
    - encode
-    - forward
+    - forward
--- a/docs/source/en/model_doc/minimax.md
+++ b/docs/source/en/model_doc/minimax.md
@ -115,9 +115,9 @@ The Flash Attention-2 model uses also a more memory efficient cache slicing mech

 ## Shrinking down MiniMax using quantization

-As the MiniMax model has 456 billion parameters, that would require about 912GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization.md). If the model is quantized to 4 bits (or half a byte per parameter), about 228 GB of RAM is required.
+As the MiniMax model has 456 billion parameters, that would require about 912GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization). If the model is quantized to 4 bits (or half a byte per parameter), about 228 GB of RAM is required.

-Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the bitsandbytes quantization library (but refer to [this page](../quantization.md) for alternative quantization methods):
+Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the bitsandbytes quantization library (but refer to [this page](../quantization) for alternative quantization methods):

 ```python
 >>> import torch
--- a/docs/source/en/model_doc/mistral3.md
+++ b/docs/source/en/model_doc/mistral3.md
@ -13,116 +13,125 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.

 -->
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&amp;logo=pytorch&amp;logoColor=white">
+    </div>
+</div>

-# Mistral3
+# Mistral 3

-## Overview
+[Mistral 3](https://mistral.ai/news/mistral-small-3) is a latency-optimized model that uses far fewer layers to reduce the time per forward pass. This model adds vision understanding and supports long context lengths of up to 128K tokens without compromising performance.

-Building upon Mistral Small 3 (2501), Mistral Small 3.1 (2503) adds state-of-the-art vision understanding and enhances long context capabilities up to 128k tokens without compromising text performance. With 24 billion parameters, this model achieves top-tier capabilities in both text and vision tasks.
+You can find the original Mistral 3 checkpoints under the [Mistral AI](https://huggingface.co/mistralai/models?search=mistral-small-3) organization.

-It is ideal for:
- Fast-response conversational agents.
- Low-latency function calling.
- Subject matter experts via fine-tuning.
- Local inference for hobbyists and organizations handling sensitive data.
- Programming and math reasoning.
- Long document understanding.
- Visual understanding.

-This model was contributed by [cyrilvallez](https://huggingface.co/cyrilvallez) and [yonigozlan](https://huggingface.co/yonigozlan).
+> [!TIP]
+> This model was contributed by [cyrilvallez](https://huggingface.co/cyrilvallez) and [yonigozlan](https://huggingface.co/yonigozlan).
+> Click on the Mistral3 models in the right sidebar for more examples of how to apply Mistral3 to different tasks.

-The original code can be found [here](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/pixtral.py) and [here](https://github.com/mistralai/mistral-common).
+The example below demonstrates how to generate text for an image with [`Pipeline`] and the [`AutoModel`] class.

-## Usage example
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-### Inference with Pipeline
+```py
+import torch
+from transformers import pipeline

-Here is how you can use the `image-text-to-text` pipeline to perform inference with the `Mistral3` models in just a few lines of code:
-```python
->>> from transformers import pipeline
+messages = [
+    {"role": "user",
+        "content":[
+            {"type": "image",
+            "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",},
+            {"type": "text", "text": "Describe this image."}
+        ,]
+    ,}
+,]

->>> messages = [
-...     {
-...         "role": "user",
-...         "content": [
-...             {
-...                 "type": "image",
-...                 "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
-...             },
-...             {"type": "text", "text": "Describe this image."},
-...         ],
-...     },
-... ]
+pipeline = pipeline(
+    task="image-text-to-text", 
+    model="mistralai/Mistral-Small-3.1-24B-Instruct-2503", 
+    torch_dtype=torch.bfloat16,
+    device=0
+)
+outputs = pipeline(text=messages, max_new_tokens=50, return_full_text=False)

->>> pipe = pipeline("image-text-to-text", model="mistralai/Mistral-Small-3.1-24B-Instruct-2503", torch_dtype=torch.bfloat16)
->>> outputs = pipe(text=messages, max_new_tokens=50, return_full_text=False)
->>> outputs[0]["generated_text"]
+outputs[0]["generated_text"]
 'The image depicts a vibrant and lush garden scene featuring a variety of wildflowers and plants. The central focus is on a large, pinkish-purple flower, likely a Greater Celandine (Chelidonium majus), with a'
 ```
-### Inference on a single image
+</hfoption>
+<hfoption id="AutoModel">

-This example demonstrates how to perform inference on a single image with the Mistral3 models using chat templates.
+```py
+import torch
+from transformers import AutoProcessor, AutoModelForImageTextToText 

-```python
->>> from transformers import AutoProcessor, AutoModelForImageTextToText
->>> import torch
+torch_device = "cuda"
+model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+processor = AutoProcessor.from_pretrained(model_checkpoint)
+model = AutoModelForImageTextToText.from_pretrained(
+    model_checkpoint, 
+    device_map=torch_device, 
+    torch_dtype=torch.bfloat16
+)

->>> torch_device = "cuda"
->>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
->>> processor = AutoProcessor.from_pretrained(model_checkpoint)
->>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
+messages = [
+    {"role": "user",
+        "content":[
+            {"type": "image",
+            "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",},
+            {"type": "text", "text": "Describe this image."}
+        ,]
+    ,}
+,]

->>> messages = [
-...     {
-...         "role": "user",
-...         "content": [
-...             {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
-...             {"type": "text", "text": "Describe this image"},
-...         ],
-...     }
-... ]
+inputs = processor.apply_chat_template(
+    messages, 
+    add_generation_prompt=True, 
+    tokenize=True, return_dict=True, 
+    return_tensors="pt").to(model.device, dtype=torch.bfloat16)

->>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+generate_ids = model.generate(**inputs, max_new_tokens=20)
+decoded_output = processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)

->>> generate_ids = model.generate(**inputs, max_new_tokens=20)
->>> decoded_output = processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
-
->>> decoded_output
-"The image depicts two cats lying on a pink blanket. The larger cat, which appears to be an"...
+decoded_output
+'The image depicts a vibrant and lush garden scene featuring a variety of wildflowers and plants. The central focus is on a large, pinkish-purple flower, likely a Greater Celandine (Chelidonium majus), with a'
 ```
+</hfoption>
+</hfoptions>

-### Text-only generation
-This example shows how to generate text using the Mistral3 model without providing any image input.
+## Notes 

+- Mistral 3 supports text-only generation. 
+```py 
+from transformers import AutoProcessor, AutoModelForImageTextToText
+import torch

-````python
->>> from transformers import AutoProcessor, AutoModelForImageTextToText
->>> import torch
+torch_device = "cuda"
+model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+processor = AutoProcessor.from_pretrained(model_checkpoint)
+model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)

->>> torch_device = "cuda"
->>> model_checkpoint = ".mistralai/Mistral-Small-3.1-24B-Instruct-2503"
->>> processor = AutoProcessor.from_pretrained(model_checkpoint)
->>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
+SYSTEM_PROMPT = "You are a conversational agent that always answers straight to the point, always end your accurate response with an ASCII drawing of a cat."
+user_prompt = "Give me 5 non-formal ways to say 'See you later' in French."

->>> SYSTEM_PROMPT = "You are a conversational agent that always answers straight to the point, always end your accurate response with an ASCII drawing of a cat."
->>> user_prompt = "Give me 5 non-formal ways to say 'See you later' in French."
+messages = [
+    {"role": "system", "content": SYSTEM_PROMPT},
+    {"role": "user", "content": user_prompt},
+]

->>> messages = [
-...    {"role": "system", "content": SYSTEM_PROMPT},
-...    {"role": "user", "content": user_prompt},
-... ]
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+inputs = processor(text=text, return_tensors="pt").to(0, dtype=torch.float16)
+generate_ids = model.generate(**inputs, max_new_tokens=50, do_sample=False)
+decoded_output = processor.batch_decode(generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)[0]

->>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
->>> inputs = processor(text=text, return_tensors="pt").to(0, dtype=torch.float16)
->>> generate_ids = model.generate(**inputs, max_new_tokens=50, do_sample=False)
->>> decoded_output = processor.batch_decode(generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)[0]
-
->>> print(decoded_output)
+print(decoded_output)
 "1. À plus tard!
-2. Salut, à plus!
-3. À toute!
-4. À la prochaine!
-5. Je me casse, à plus!
+ 2. Salut, à plus!
+ 3. À toute!
+ 4. À la prochaine!
+ 5. Je me casse, à plus!

 ```
 /\_/\
@ -131,98 +140,93 @@ This example shows how to generate text using the Mistral3 model without providi
 ```"
 ````

-### Batched image and text inputs
-Mistral3 models also support batched image and text inputs.
+- Mistral 3 accepts batched image and text inputs. 
+```py
+from transformers import AutoProcessor, AutoModelForImageTextToText
+import torch

-```python
->>> from transformers import AutoProcessor, AutoModelForImageTextToText
->>> import torch
+torch_device = "cuda"
+model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+processor = AutoProcessor.from_pretrained(model_checkpoint)
+model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)

->>> torch_device = "cuda"
->>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
->>> processor = AutoProcessor.from_pretrained(model_checkpoint)
->>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
-
->>> messages = [
-...     [
-...         {
-...             "role": "user",
-...             "content": [
-...                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
-...                 {"type": "text", "text": "Write a haiku for this image"},
-...             ],
-...         },
-...     ],
-...     [
-...         {
-...             "role": "user",
-...             "content": [
-...                 {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
-...                 {"type": "text", "text": "Describe this image"},
-...             ],
-...         },
-...     ],
-... ]
+messages = [
+     [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
+                 {"type": "text", "text": "Write a haiku for this image"},
+             ],
+         },
+     ],
+     [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
+                 {"type": "text", "text": "Describe this image"},
+             ],
+         },
+     ],
+ ]


->>> inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)

->>> output = model.generate(**inputs, max_new_tokens=25)
+output = model.generate(**inputs, max_new_tokens=25)

->>> decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
->>> decoded_outputs
+decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
+decoded_outputs
 ["Write a haiku for this imageCalm waters reflect\nWhispers of the forest's breath\nPeace on wooden path"
 , "Describe this imageThe image depicts a vibrant street scene in what appears to be a Chinatown district. The focal point is a traditional Chinese"]
 ```

-### Batched multi-image input and quantization with BitsAndBytes
-This implementation of the Mistral3 models supports batched text-images inputs with different number of images for each text.
-This example also how to use `BitsAndBytes` to load the model in 4bit quantization.
+- Mistral 3 also supports batched image and text inputs with a different number of images for each text. The example below quantizes the model with bitsandbytes.

-```python
->>> from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
->>> import torch
+```py 
+from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
+import torch

->>> torch_device = "cuda"
->>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
->>> processor = AutoProcessor.from_pretrained(model_checkpoint)
->>> quantization_config = BitsAndBytesConfig(load_in_4bit=True)
->>> model = AutoModelForImageTextToText.from_pretrained(
-...     model_checkpoint, quantization_config=quantization_config
-... )
+torch_device = "cuda"
+model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+processor = AutoProcessor.from_pretrained(model_checkpoint)
+quantization_config = BitsAndBytesConfig(load_in_4bit=True)
+model = AutoModelForImageTextToText.from_pretrained(
+     model_checkpoint, quantization_config=quantization_config
+ )

->>> messages = [
-...     [
-...         {
-...             "role": "user",
-...             "content": [
-...                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
-...                 {"type": "text", "text": "Write a haiku for this image"},
-...             ],
-...         },
-...     ],
-...     [
-...         {
-...             "role": "user",
-...             "content": [
-...                 {"type": "image", "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
-...                 {"type": "image", "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"},
-...                 {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
-...             ],
-...         },
-...     ],
->>> ]
+messages = [
+     [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
+                 {"type": "text", "text": "Write a haiku for this image"},
+             ],
+         },
+     ],
+     [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
+                 {"type": "image", "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"},
+                 {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
+             ],
+         },
+     ],
+ ]

->>> inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)

->>> output = model.generate(**inputs, max_new_tokens=25)
+output = model.generate(**inputs, max_new_tokens=25)

->>> decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
->>> decoded_outputs
+decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
+decoded_outputs
 ["Write a haiku for this imageSure, here is a haiku inspired by the image:\n\nCalm lake's wooden path\nSilent forest stands guard\n", "These images depict two different landmarks. Can you identify them? Certainly! The images depict two iconic landmarks:\n\n1. The first image shows the Statue of Liberty in New York City."]
 ```

-
 ## Mistral3Config

 [[autodoc]] Mistral3Config
--- a/docs/source/en/model_doc/mixtral.md
+++ b/docs/source/en/model_doc/mixtral.md
@ -146,9 +146,9 @@ The Flash Attention-2 model uses also a more memory efficient cache slicing mech

 ## Shrinking down Mixtral using quantization

-As the Mixtral model has 45 billion parameters, that would require about 90GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization.md). If the model is quantized to 4 bits (or half a byte per parameter), a single A100 with 40GB of RAM is enough to fit the entire model, as in that case only about 27 GB of RAM is required.
+As the Mixtral model has 45 billion parameters, that would require about 90GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization). If the model is quantized to 4 bits (or half a byte per parameter), a single A100 with 40GB of RAM is enough to fit the entire model, as in that case only about 27 GB of RAM is required.

-Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the bitsandbytes quantization library (but refer to [this page](../quantization.md) for alternative quantization methods):
+Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the bitsandbytes quantization library (but refer to [this page](../quantization) for alternative quantization methods):

 ```python
 >>> import torch
--- a/docs/source/en/model_doc/mm-grounding-dino.md
+++ b/docs/source/en/model_doc/mm-grounding-dino.md
@ -0,0 +1,124 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
+# MM Grounding DINO
+
+The [MM Grounding DINO](https://arxiv.org/abs/2401.02361) model was proposed in [An Open and Comprehensive Pipeline for Unified Object Grounding and Detection](https://arxiv.org/abs/2401.02361) by Xiangyu Zhao, Yicheng Chen, Shilin Xu, Xiangtai Li, Xinjiang Wang, Yining Li, Haian Huang.
+
+MM Grounding DINO builds on [Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino) by refining the contrastive class head and removing the parameter sharing in the decoder, which improves zero-shot detection performance on both COCO (50.6(+2.2) AP) and LVIS (31.9(+11.8) val AP and 41.4(+12.6) minival AP).
+
+You can find all the original MM Grounding DINO checkpoints under the [MM Grounding DINO](https://huggingface.co/collections/openmmlab-community/mm-grounding-dino-688cbde05b814c4e2832f9df) collection. This model also supports LLMDet inference. You can find LLMDet checkpoints under the [LLMDet](https://huggingface.co/collections/iSEE-Laboratory/llmdet-688475906dc235d5f1dc678e) collection.
+
+> [!TIP]
+> Click on the MM Grounding DINO models in the right sidebar for more examples of how to apply MM Grounding DINO to different zero-shot object detection tasks.
+
+The example below demonstrates how to detect objects in an image from free-text labels with the [`AutoModelForZeroShotObjectDetection`] class.
+
+<hfoptions id="usage">
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor
+from transformers.image_utils import load_image
+
+
+# Prepare processor and model
+model_id = "openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
+
+# Prepare inputs
+image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = load_image(image_url)
+text_labels = [["a cat", "a remote control"]]
+inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device)
+
+# Run inference
+with torch.no_grad():
+    outputs = model(**inputs)
+
+# Postprocess outputs
+results = processor.post_process_grounded_object_detection(
+    outputs,
+    threshold=0.4,
+    target_sizes=[(image.height, image.width)]
+)
+
+# Retrieve the first image result
+result = results[0]
+for box, score, labels in zip(result["boxes"], result["scores"], result["labels"]):
+    box = [round(x, 2) for x in box.tolist()]
+    print(f"Detected {labels} with confidence {round(score.item(), 3)} at location {box}")
+```
+
+</hfoption>
+</hfoptions>
+
+## Notes
+
+- Here's a table of models and their object detection performance results on COCO (results from [official repo](https://github.com/open-mmlab/mmdetection/blob/main/configs/mm_grounding_dino/README.md)):
+
+    |                                                              Model                                                             | Backbone |      Pre-Train Data      |   Style   |  COCO mAP  |
+    | ------------------------------------------------------------------------------------------------------------------------------ | -------- | ------------------------ | --------- | ---------- |
+    |  [mm_grounding_dino_tiny_o365v1_goldg](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg)                       |  Swin-T  |        O365,GoldG        | Zero-shot | 50.4(+2.3) |
+    |  [mm_grounding_dino_tiny_o365v1_goldg_grit](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_grit)             |  Swin-T  |     O365,GoldG,GRIT      | Zero-shot | 50.5(+2.1) |
+    |  [mm_grounding_dino_tiny_o365v1_goldg_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det)           |  Swin-T  |     O365,GoldG,V3Det     | Zero-shot | 50.6(+2.2) |
+    |  [mm_grounding_dino_tiny_o365v1_goldg_grit_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_grit_v3det) |  Swin-T  |  O365,GoldG,GRIT,V3Det   | Zero-shot | 50.4(+2.0) |
+    |  [mm_grounding_dino_base_o365v1_goldg_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_base_o365v1_goldg_v3det)           |  Swin-B  |     O365,GoldG,V3Det     | Zero-shot |    52.5    |
+    |  [mm_grounding_dino_base_all](https://huggingface.co/openmmlab-community/mm_grounding_dino_base_all)                                         |  Swin-B  |         O365,ALL         |     -     |    59.5    |
+    |  [mm_grounding_dino_large_o365v2_oiv6_goldg](https://huggingface.co/openmmlab-community/mm_grounding_dino_large_o365v2_oiv6_goldg)           |  Swin-L  | O365V2,OpenImageV6,GoldG | Zero-shot |    53.0    |
+    |  [mm_grounding_dino_large_all](https://huggingface.co/openmmlab-community/mm_grounding_dino_large_all)                                       |  Swin-L  |  O365V2,OpenImageV6,ALL  |     -     |    60.3    |
+
+- Here's a table of MM Grounding DINO tiny models and their object detection performance on LVIS (results from [official repo](https://github.com/open-mmlab/mmdetection/blob/main/configs/mm_grounding_dino/README.md)):
+
+    |                                                              Model                                                             |    Pre-Train Data     | MiniVal APr | MiniVal APc | MiniVal APf | MiniVal AP  | Val1.0 APr | Val1.0 APc | Val1.0 APf |  Val1.0 AP  |
+    | ------------------------------------------------------------------------------------------------------------------------------ | --------------------- | ----------- | ----------- | ----------- | ----------- | ---------- | ---------- | ---------- | ----------- |
+    |  [mm_grounding_dino_tiny_o365v1_goldg](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg)                       |      O365,GoldG       |    28.1     |    30.2     |    42.0     | 35.7(+6.9)  |    17.1    |    22.4    |    36.5    | 27.0(+6.9)  |
+    |  [mm_grounding_dino_tiny_o365v1_goldg_grit](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_grit)             |    O365,GoldG,GRIT    |    26.6     |    32.4     |    41.8     | 36.5(+7.7)  |    17.3    |    22.6    |    36.4    | 27.1(+7.0)  |
+    |  [mm_grounding_dino_tiny_o365v1_goldg_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det)           |   O365,GoldG,V3Det    |    33.0     |    36.0     |    45.9     | 40.5(+11.7) |    21.5    |    25.5    |    40.2    | 30.6(+10.5) |
+    |  [mm_grounding_dino_tiny_o365v1_goldg_grit_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_grit_v3det) | O365,GoldG,GRIT,V3Det |    34.2     |    37.4     |    46.2     | 41.4(+12.6) |    23.6    |    27.6    |    40.5    | 31.9(+11.8) |
+
+
+- This implementation also supports inference for [LLMDet](https://github.com/iSEE-Laboratory/LLMDet). Here's a table of LLMDet models and their performance on LVIS (results from [official repo](https://github.com/iSEE-Laboratory/LLMDet)):
+
+    |                             Model                         | Pre-Train Data            |  MiniVal APr | MiniVal APc | MiniVal APf | MiniVal AP  | Val1.0 APr | Val1.0 APc | Val1.0 APf |  Val1.0 AP  |
+    | --------------------------------------------------------- | -------------------------------------------- | ------------ | ----------- | ----------- | ----------- | ---------- | ---------- | ---------- | ----------- |
+    | [llmdet_tiny](https://huggingface.co/iSEE-Laboratory/llmdet_tiny)   | (O365,GoldG,GRIT,V3Det) + GroundingCap-1M    | 44.7         | 37.3        | 39.5        | 50.7        | 34.9       | 26.0       | 30.1       | 44.3        |
+    | [llmdet_base](https://huggingface.co/iSEE-Laboratory/llmdet_base)   | (O365,GoldG,V3Det) + GroundingCap-1M         | 48.3         | 40.8        | 43.1        | 54.3        | 38.5       | 28.2       | 34.3       | 47.8        |
+    | [llmdet_large](https://huggingface.co/iSEE-Laboratory/llmdet_large) | (O365V2,OpenImageV6,GoldG) + GroundingCap-1M | 51.1         | 45.1        | 46.1        | 56.6        | 42.0       | 31.6       | 38.8       | 50.2        |
+
+
+## MMGroundingDinoConfig
+
+[[autodoc]] MMGroundingDinoConfig
+
+## MMGroundingDinoModel
+
+[[autodoc]] MMGroundingDinoModel
+    - forward
+
+## MMGroundingDinoForObjectDetection
+
+[[autodoc]] MMGroundingDinoForObjectDetection
+    - forward
--- a/docs/source/en/model_doc/modernbert-decoder.md
+++ b/docs/source/en/model_doc/modernbert-decoder.md
@ -24,14 +24,18 @@ rendered properly in your Markdown viewer.

 # ModernBERT Decoder

-ModernBERT Decoder is the same architecture as [ModernBERT](https://huggingface.co/papers/2412.13663) but trained from scratch with a causal language modeling (CLM) objective. This allows for using the same architecture for comparing encoders and decoders. This is the decoder architecture implementation of ModernBERT, designed for autoregressive text generation tasks.
+ModernBERT Decoder has the same architecture as [ModernBERT](https://huggingface.co/papers/2412.13663) but it is trained from scratch with a causal language modeling objective from the [Ettin paper](https://huggingface.co/papers/2507.11412). This allows for using the same architecture to compare encoders and decoders. This model is the decoder architecture implementation of ModernBERT, designed for autoregressive text generation tasks.

-Like the encoder version, ModernBERT Decoder incorporates modern architectural improvements such as rotary positional embeddings to support sequences of up to 8192 tokens, unpadding to avoid wasting compute on padding tokens, GeGLU layers, and alternating attention patterns. However, it uses causal (unidirectional) attention to enable autoregressive generation.
+ModernBERT Decoder uses sliding window attention and rotary positional embeddings for efficiency and to handle longer sequences.
+
+You can find all the original ModernBERT Decoder checkpoints under the [jhu-clsp](https://huggingface.co/collections/jhu-clsp/encoders-vs-decoders-the-ettin-suite-686303e16142257eed8e6aeb) collection.

 > [!TIP]
+> This model was contributed by [orionw](https://huggingface.co/orionweller).
+>
 > Click on the ModernBERT Decoder models in the right sidebar for more examples of how to apply ModernBERT Decoder to different text generation tasks.

-The example below demonstrates how to use ModernBERT Decoder for text generation with [`Pipeline`], [`AutoModel`], and from the command line.
+The example below demonstrates how to use ModernBERT Decoder for text generation with [`Pipeline`], [`AutoModel`] (with and without quantization), and from the command line. 

 <hfoptions id="usage">
 <hfoption id="Pipeline">
@ -42,7 +46,7 @@ from transformers import pipeline

 generator = pipeline(
    task="text-generation",
-    model="blab-jhu/test-32m-dec",
+    model="jhu-clsp/ettin-decoder-17m",
    torch_dtype=torch.float16,
    device=0
 )
@ -51,7 +55,7 @@ generator("The future of artificial intelligence is", max_length=50, num_return_
 # For sequence classification
 classifier = pipeline(
    task="text-classification",
-    model="blab-jhu/test-32m-dec",
+    model="jhu-clsp/ettin-decoder-17m",
    torch_dtype=torch.float16,
    device=0
 )
@ -65,9 +69,9 @@ classifier("This movie is really great!")
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer

-tokenizer = AutoTokenizer.from_pretrained("blab-jhu/test-32m-dec")
+tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/ettin-decoder-17m")
 model = AutoModelForCausalLM.from_pretrained(
-    "blab-jhu/test-32m-dec",
+    "jhu-clsp/ettin-decoder-17m",
    torch_dtype=torch.float16,
    device_map="auto",
 )
@ -92,7 +96,7 @@ print(f"Generated text: {generated_text}")
 from transformers import AutoModelForSequenceClassification

 classifier_model = AutoModelForSequenceClassification.from_pretrained(
-    "blab-jhu/test-32m-dec",
+    "jhu-clsp/ettin-decoder-17m",
    torch_dtype=torch.float16,
    device_map="auto",
    num_labels=2
@ -111,15 +115,53 @@ print(f"Prediction probabilities: {predictions}")
 ```

 </hfoption>
+
+<hfoption id="AutoModel (w/quantization)">
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True,
+)
+
+tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/ettin-decoder-1b")
+model = AutoModelForCausalLM.from_pretrained(
+    "jhu-clsp/ettin-decoder-1b",
+    torch_dtype=torch.float16,
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+prompt = "The future of artificial intelligence is"
+inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    outputs = model.generate(
+        **inputs,
+        max_length=50,
+        num_return_sequences=1,
+        temperature=0.7,
+        do_sample=True,
+        pad_token_id=tokenizer.eos_token_id
+    )
+
+generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+print(f"Generated text: {generated_text}")
+```
+</hfoption>
+
 <hfoption id="transformers CLI">

 ```bash
-echo "The future of artificial intelligence is" | transformers run --task text-generation --model your-username/modernbert-decoder-base --device 0
+echo "The future of artificial intelligence is" | transformers run --task text-generation --model jhu-clsp/ettin-decoder-17m --device 0
 ```

 </hfoption>
 </hfoptions>

+
 ## ModernBertDecoderConfig

 [[autodoc]] ModernBertDecoderConfig
@ -142,14 +184,5 @@ echo "The future of artificial intelligence is" | transformers run --task text-g
 [[autodoc]] ModernBertDecoderForSequenceClassification
    - forward

-### Usage tips
-
-The ModernBertDecoder model can be fine-tuned for various text generation tasks using the HuggingFace Transformers library. It supports efficient inference with features like:
-
- **Causal attention**: Ensures autoregressive generation by masking future tokens
- **Sliding window attention**: Alternates between local and global attention patterns for efficiency
- **Rotary positional embeddings**: Enables handling of longer sequences up to 8000 tokens
- **FlashAttention support**: Optimized attention computation for faster training and inference
-
 </pt>
 </frameworkcontent>
--- a/docs/source/en/model_doc/modernbert.md
+++ b/docs/source/en/model_doc/modernbert.md
@ -115,6 +115,11 @@ echo -e "Plants create [MASK] through a process known as photosynthesis." | tran
 [[autodoc]] ModernBertForTokenClassification
    - forward

+## ModernBertForMultipleChoice
+
+[[autodoc]] ModernBertForMultipleChoice
+    - forward
+
 ## ModernBertForQuestionAnswering

 [[autodoc]] ModernBertForQuestionAnswering
--- a/docs/source/en/model_doc/mt5.md
+++ b/docs/source/en/model_doc/mt5.md
@ -14,54 +14,115 @@ rendered properly in your Markdown viewer.

 -->

-# mT5
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZC
D9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC">
+    </div>
 </div>

-## Overview
+# mT5

-The mT5 model was presented in [mT5: A massively multilingual pre-trained text-to-text transformer](https://huggingface.co/papers/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya
-Siddhant, Aditya Barua, Colin Raffel.
+[mT5](https://huggingface.co/papers/2010.11934) is a multilingual variant of [T5](./t5), trained on 101 languages. It also incorporates a simple technique to prevent "accidental translation", where the model incorrectly translates its predictions into the wrong language.

-The abstract from the paper is the following:
+You can find all the original mT5 checkpoints under the [mT5](https://huggingface.co/collections/google/mt5-release-65005f1a520f8d7b4d039509) collection.

-*The recent "Text-to-Text Transfer Transformer" (T5) leveraged a unified text-to-text format and scale to attain
-state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a
-multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail
-the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual
-benchmarks. We also describe a simple technique to prevent "accidental translation" in the zero-shot setting, where a
-generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model
-checkpoints used in this work are publicly available.*
+> [!TIP]
+> This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten).
+>
+> Click on the mT5 models in the right sidebar for more examples of how to apply mT5 to different language tasks.

-Note: mT5 was only pre-trained on [mC4](https://huggingface.co/datasets/mc4) excluding any supervised training.
-Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 model.
-Since mT5 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
-fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
+The example below demonstrates how to summarize text with [`Pipeline`], [`AutoModel`], and from the command line.

-Google has released the following variants:
+<hfoptions id="usage">
+<hfoption id="Pipeline">

- [google/mt5-small](https://huggingface.co/google/mt5-small)
+```python
+import torch
+from transformers import pipeline

- [google/mt5-base](https://huggingface.co/google/mt5-base)
+pipeline = pipeline(
+    task="text2text-generation",
+    model="csebuetnlp/mT5_multilingual_XLSum",
+    torch_dtype=torch.float16,
+    device=0
+)
+pipeline("""Plants are remarkable organisms that produce their own food using a method called photosynthesis.
+This process involves converting sunlight, carbon dioxide, and water into glucose, which provides energy for growth.
+Plants play a crucial role in sustaining life on Earth by generating oxygen and serving as the foundation of most ecosystems.""")
+```

- [google/mt5-large](https://huggingface.co/google/mt5-large)
+</hfoption>
+<hfoption id="AutoModel">

- [google/mt5-xl](https://huggingface.co/google/mt5-xl)
+```python
+import torch
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

- [google/mt5-xxl](https://huggingface.co/google/mt5-xxl).
+tokenizer = AutoTokenizer.from_pretrained(
+    "csebuetnlp/mT5_multilingual_XLSum"
+)
+model = AutoModelForSeq2SeqLM.from_pretrained(
+    "csebuetnlp/mT5_multilingual_XLSum",
+    torch_dtype=torch.float16,
+    device_map="auto",
+)

-This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The original code can be
-found [here](https://github.com/google-research/multilingual-t5).
+input_text = """Plants are remarkable organisms that produce their own food using a method called photosynthesis.
+This process involves converting sunlight, carbon dioxide, and water into glucose, which provides energy for growth.
+Plants play a crucial role in sustaining life on Earth by generating oxygen and serving as the foundation of most ecosystems."""
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

-## Resources
+output = model.generate(**input_ids, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```

- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "Plants are remarkable organisms that produce their own food using a method called photosynthesis." | transformers run --task text2text-generation --model csebuetnlp/mT5_multilingual_XLSum --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
+
+```python
+import torch
+from transformers import BitsAndBytesConfig, AutoModelForSeq2SeqLM, AutoTokenizer
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_quant_type="nf4"
+)
+model = AutoModelForSeq2SeqLM.from_pretrained(
+    "csebuetnlp/mT5_multilingual_XLSum",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "csebuetnlp/mT5_multilingual_XLSum"
+)
+input_text = """Plants are remarkable organisms that produce their own food using a method called photosynthesis.
+This process involves converting sunlight, carbon dioxide, and water into glucose, which provides energy for growth.
+Plants play a crucial role in sustaining life on Earth by generating oxygen and serving as the foundation of most ecosystems."""
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+output = model.generate(**input_ids, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+## Notes
+
+- mT5 must be fine-tuned for downstream tasks because it was only pretrained on the [mC4](https://huggingface.co/datasets/mc4) dataset, without any supervised training.

 ## MT5Config

--- a/docs/source/en/model_doc/olmoe.md
+++ b/docs/source/en/model_doc/olmoe.md
@ -14,27 +14,89 @@ rendered properly in your Markdown viewer.

 -->

-# OLMoE
-
+<div style="float: right;">
 <div class="flex flex-wrap space-x-1">
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
 </div>
+</div>

-## Overview
+# OLMoE

-The OLMoE model was proposed in [OLMoE: Open Mixture-of-Experts Language Models](https://huggingface.co/papers/2409.02060) by Niklas Muennighoff, Luca Soldaini, Dirk Groeneveld, Kyle Lo, Jacob Morrison, Sewon Min, Weijia Shi, Pete Walsh, Oyvind Tafjord, Nathan Lambert, Yuling Gu, Shane Arora, Akshita Bhagia, Dustin Schwenk, David Wadden, Alexander Wettig, Binyuan Hui, Tim Dettmers, Douwe Kiela, Ali Farhadi, Noah A. Smith, Pang Wei Koh, Amanpreet Singh, Hannaneh Hajishirzi.
+[OLMoE](https://huggingface.co/papers/2409.02060) is a sparse Mixture-of-Experts (MoE) language model with 7B parameters, of which only 1B are used per input token. It has similar inference costs to dense models but trains ~3x faster. OLMoE uses fine-grained routing with 64 small experts in each layer and a dropless token-based routing algorithm.

-OLMoE is a series of **O**pen **L**anguage **Mo**dels using sparse **M**ixture-**o**f-**E**xperts designed to enable the science of language models. We release all code, checkpoints, logs, and details involved in training these models.
+You can find all the original OLMoE checkpoints under the [OLMoE](https://huggingface.co/collections/allenai/olmoe-november-2024-66cf678c047657a30c8cd3da) collection.

-The abstract from the paper is the following:
+> [!TIP]
+> This model was contributed by [Muennighoff](https://hf.co/Muennighoff).
+>
+> Click on the OLMoE models in the right sidebar for more examples of how to apply OLMoE to different language tasks.

-*We introduce OLMoE, a fully open, state-of-the-art language model leveraging sparse Mixture-of-Experts (MoE). OLMoE-1B-7B has 7 billion (B) parameters but uses only 1B per input token. We pretrain it on 5 trillion tokens and further adapt it to create OLMoE-1B-7B-Instruct. Our models outperform all available models with similar active parameters, even surpassing larger ones like Llama2-13B-Chat and DeepSeekMoE-16B. We present various experiments on MoE training, analyze routing in our model showing high specialization, and open-source all aspects of our work: model weights, training data, code, and logs.*
+The example below demonstrates how to generate text with [`Pipeline`] or the [`AutoModel`] class.

-This model was contributed by [Muennighoff](https://hf.co/Muennighoff).
-The original code can be found [here](https://github.com/allenai/OLMoE).
+<hfoptions id="usage">
+<hfoption id="Pipeline">

+```py
+import torch
+from transformers import pipeline
+
+pipe = pipeline(
+    task="text-generation",
+    model="allenai/OLMoE-1B-7B-0125",
+    torch_dtype=torch.float16,
+    device=0,
+)
+
+result = pipe("Dionysus is the god of")
+print(result)
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+model = AutoModelForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924", attn_implementation="sdpa", torch_dtype="auto", device_map="auto").to(device)
+tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0924")
+
+inputs = tokenizer("Bitcoin is", return_tensors="pt")
+inputs = {k: v.to(device) for k, v in inputs.items()}
+output = model.generate(**inputs, max_length=64)
+print(tokenizer.decode(output[0]))
+```
+
+</hfoption>
+</hfoptions>
+
+## Quantization
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4 bits.
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+quantization_config = BitsAndBytesConfig(
+   load_in_4bit=True,
+   bnb_4bit_compute_dtype=torch.float16,
+   bnb_4bit_use_double_quant=True,
+   bnb_4bit_quant_type="nf4"
+)
+
+model = AutoModelForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924", attn_implementation="sdpa", torch_dtype="auto", device_map="auto", quantization_config=quantization_config).to(device)
+tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0924")
+
+inputs = tokenizer("Bitcoin is", return_tensors="pt")
+inputs = {k: v.to(device) for k, v in inputs.items()}
+output = model.generate(**inputs, max_length=64)
+print(tokenizer.decode(output[0]))
+```

 ## OlmoeConfig

--- a/docs/source/en/model_doc/oneformer.md
+++ b/docs/source/en/model_doc/oneformer.md
@ -38,7 +38,7 @@ This model was contributed by [Jitesh Jain](https://huggingface.co/praeclarumjj3

 ## Usage tips

-  OneFormer requires two inputs during inference: *image* and *task token*. 
+-  OneFormer requires two inputs during inference: *image* and *task token*.
 - During training, OneFormer only uses panoptic annotations.
 - If you want to train the model in a distributed environment across multiple nodes, then one should update the
  `get_num_masks` function inside the `OneFormerLoss` class of `modeling_oneformer.py`. When training on multiple nodes, this should be
@ -69,7 +69,14 @@ The resource should ideally demonstrate something new instead of duplicating an

 [[autodoc]] OneFormerImageProcessor
    - preprocess
-    - encode_inputs
+    - post_process_semantic_segmentation
+    - post_process_instance_segmentation
+    - post_process_panoptic_segmentation
+
+## OneFormerImageProcessorFast
+
+[[autodoc]] OneFormerImageProcessorFast
+    - preprocess
    - post_process_semantic_segmentation
    - post_process_instance_segmentation
    - post_process_panoptic_segmentation
@ -87,4 +94,3 @@ The resource should ideally demonstrate something new instead of duplicating an

 [[autodoc]] OneFormerForUniversalSegmentation
    - forward
-    
--- a/docs/source/en/model_doc/opt.md
+++ b/docs/source/en/model_doc/opt.md
@ -1,194 +1,101 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+           <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+           <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhp
ZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+">
+           <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+           <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>

 # OPT

-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
+[OPT](https://huggingface.co/papers/2205.01068) is a suite of open-source decoder-only pre-trained transformers whose parameters range from 125M to 175B. OPT models are designed for causal language modeling and aim to enable responsible and reproducible research at scale. OPT-175B is comparable in performance to GPT-3 with only 1/7th the carbon footprint.

-## Overview
+You can find all the original OPT checkpoints under the [OPT](https://huggingface.co/collections/facebook/opt-66ed00e15599f02966818844) collection.

-The OPT model was proposed in [Open Pre-trained Transformer Language Models](https://huggingface.co/papers/2205.01068) by Meta AI.
-OPT is a series of open-sourced large causal language models which perform similar in performance to GPT3.
+> [!TIP]
+> This model was contributed by [ArthurZ](https://huggingface.co/ArthurZ), [ybelkada](https://huggingface.co/ybelkada), and [patrickvonplaten](https://huggingface.co/patrickvonplaten).
+>
+> Click on the OPT models in the right sidebar for more examples of how to apply OPT to different language tasks.

-The abstract from the paper is the following:
+The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.

-*Large language models, which are often trained for hundreds of thousands of compute days, have shown remarkable capabilities for zero- and few-shot learning. Given their computational cost, these models are difficult to replicate without significant capital. For the few that are available through APIs, no access is granted to the full model weights, making them difficult to study. We present Open Pre-trained Transformers (OPT), a suite of decoder-only pre-trained transformers ranging from 125M to 175B parameters, which we aim to fully and responsibly share with interested researchers. We show that OPT-175B is comparable to GPT-3, while requiring only 1/7th the carbon footprint to develop. We are also releasing our logbook detailing the infrastructure challenges we faced, along with code for experimenting with all of the released models.*

-This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ), [Younes Belkada](https://huggingface.co/ybelkada), and [Patrick Von Platen](https://huggingface.co/patrickvonplaten).
-The original code can be found [here](https://github.com/facebookresearch/metaseq).
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+  
+```py  
+import torch
+from transformers import pipeline

-Tips:
- OPT has the same architecture as [`BartDecoder`].
- Contrary to GPT2, OPT adds the EOS token `</s>` to the beginning of every prompt.
+pipeline = pipeline(task="text-generation", model="facebook/opt-125m", torch_dtype=torch.float16, device=0)
+pipeline("Once upon a time, in a land far, far away,", max_length=50, num_return_sequences=1)
+```

-> [!NOTE]
-> The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+device = "cuda"
+
+model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16, attn_implementation="sdpa")
+tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+
+prompt = ("Once upon a time, in a land far, far away, ")
+
+model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
+model.to(device)
+
+generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
+tokenizer.batch_decode(generated_ids)[0]
+```
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model facebook/opt-125m --device 0
+```
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to 8-bits.
+
+```py
+import torch
+from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
+
+device = "cuda"
+
+bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+model = AutoModelForCausalLM.from_pretrained("facebook/opt-13b", torch_dtype=torch.float16, attn_implementation="sdpa", quantization_config=bnb_config)
+tokenizer = AutoTokenizer.from_pretrained("facebook/opt-13b")
+
+prompt = ("Once upon a time, in a land far, far away, ")
+
+model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
+model.to(device)
+
+generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
+tokenizer.batch_decode(generated_ids)[0]
+```
+
+## Notes
+
+- OPT adds an `EOS` token `</s>` to the beginning of every prompt.
+
+- The `head_mask` argument is ignored if the attention implementation isn't `"eager"`. Set `attn_implementation="eager"` to enable the `head_mask`.

 ## Resources

-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with OPT. If you're
-interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it.
-The resource should ideally demonstrate something new instead of duplicating an existing resource.
-
-<PipelineTag pipeline="text-generation" />
-
- A notebook on [fine-tuning OPT with PEFT, bitsandbytes, and Transformers](https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o?usp=sharing). 🌎
- A blog post on [decoding strategies with OPT](https://huggingface.co/blog/introducing-csearch#62-example-two---opt).
- [Causal language modeling](https://huggingface.co/course/en/chapter7/6?fw=pt#training-a-causal-language-model-from-scratch) chapter of the 🤗 Hugging Face Course.
- [`OPTForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb).
- [`TFOPTForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_clmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb).
- [`FlaxOPTForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#causal-language-modeling).
-
-<PipelineTag pipeline="text-classification" />
-
- [Text classification task guide](sequence_classification.md)
- [`OPTForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb).
-
-<PipelineTag pipeline="question-answering" />
-
- [`OPTForQuestionAnswering`] is supported by this [question answering example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb).
- [Question answering](https://huggingface.co/course/chapter7/7?fw=pt) chapter
-  of the 🤗 Hugging Face Course.
-
-⚡️ Inference
-
- A blog post on [How 🤗 Accelerate runs very large models thanks to PyTorch](https://huggingface.co/blog/accelerate-large-models) with OPT.
-
-
-## Combining OPT and Flash Attention 2
-
-First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
-
-```bash
-pip install -U flash-attn --no-build-isolation
-```
-
-Make also sure that you have a hardware that is compatible with Flash-Attention 2. Read more about it in the official documentation of flash-attn repository. Make also sure to load your model in half-precision (e.g. `torch.float16``)
-
-To load and run a model using Flash Attention 2, refer to the snippet below:
-
-```python
->>> import torch
->>> from transformers import OPTForCausalLM, GPT2Tokenizer
->>> device = "cuda" # the device to load the model onto
-
->>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
->>> tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350m")
-
->>> prompt = ("A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the "
-              "Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived "
-              "there?")
-
->>> model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
->>> model.to(device)
-
->>> generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
->>> tokenizer.batch_decode(generated_ids)[0]
-'</s>A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived there?\nStatue: I have lived here for about a year.\nHuman: What is your favorite place to eat?\nStatue: I love'
-```
-
-### Expected speedups
-
-Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `facebook/opt-2.7b` checkpoint and the Flash Attention 2 version of the model using two different sequence lengths.
-
-<div style="text-align: center">
-<img src="https://user-images.githubusercontent.com/49240599/281101546-d2fca6d2-ee44-48f3-9534-ba8d5bee4531.png">
-</div>
-
-Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `facebook/opt-350m` checkpoint and the Flash Attention 2 version of the model using two different sequence lengths.
-
-<div style="text-align: center">
-<img src="https://user-images.githubusercontent.com/49240599/281101682-d1144e90-0dbc-46f4-8fc8-c6206cb793c9.png">
-</div>
-
-
-### Using Scaled Dot Product Attention (SDPA)
-PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
-encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
-[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
-or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
-page for more information.
-
-SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
-`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
-
-```python
-from transformers import OPTForCausalLM
-model = OPTForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16, attn_implementation="sdpa")
-...
-```
-
-For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
-
-On a local benchmark (L40S-45GB, PyTorch 2.4.0, OS Debian GNU/Linux 11) using `float16` with
-[facebook/opt-350m](https://huggingface.co/facebook/opt-350m), we saw the
-following speedups during training and inference.
-
-### Training
-
-|    batch_size |    seq_len |  Time per batch (eager - s)   |    Time per batch (sdpa - s) |  Speedup (%)   |  Eager peak mem (MB)   |    sdpa peak mem (MB) |  Mem saving (%)   |
-|--------------:|-----------:|:------------------------------|-----------------------------:|:---------------|:-----------------------|----------------------:|:------------------|
-|             1 |        128 | 0.047                         |                        0.037 | 26.360         | 1474.611               |               1474.32 | 0.019             |
-|             1 |        256 | 0.046                         |                        0.037 | 24.335         | 1498.541               |               1499.49 | -0.063            |
-|             1 |        512 | 0.046                         |                        0.037 | 24.959         | 1973.544               |               1551.35 | 27.215            |
-|             1 |       1024 | 0.062                         |                        0.038 | 65.135         | 4867.113               |               1698.35 | 186.578           |
-|             1 |       2048 | 0.230                         |                        0.039 | 483.933        | 15662.224              |               2715.75 | 476.718           |
-|             2 |        128 | 0.045                         |                        0.037 | 20.455         | 1498.164               |               1499.49 | -0.089            |
-|             2 |        256 | 0.046                         |                        0.037 | 24.027         | 1569.367               |               1551.35 | 1.161             |
-|             2 |        512 | 0.045                         |                        0.037 | 20.965         | 3257.074               |               1698.35 | 91.778            |
-|             2 |       1024 | 0.122                         |                        0.038 | 225.958        | 9054.405               |               2715.75 | 233.403           |
-|             2 |       2048 | 0.464                         |                        0.067 | 593.646        | 30572.058              |               4750.55 | 543.548           |
-|             4 |        128 | 0.045                         |                        0.037 | 21.918         | 1549.448               |               1551.35 | -0.123            |
-|             4 |        256 | 0.044                         |                        0.038 | 18.084         | 2451.768               |               1698.35 | 44.361            |
-|             4 |        512 | 0.069                         |                        0.037 | 84.421         | 5833.180               |               2715.75 | 114.791           |
-|             4 |       1024 | 0.262                         |                        0.062 | 319.475        | 17427.842              |               4750.55 | 266.860           |
-|             4 |       2048 | OOM                           |                        0.062 | Eager OOM      | OOM                    |               4750.55 | Eager OOM         |
-|             8 |        128 | 0.044                         |                        0.037 | 18.436         | 2049.115               |               1697.78 | 20.694            |
-|             8 |        256 | 0.048                         |                        0.036 | 32.887         | 4222.567               |               2715.75 | 55.484            |
-|             8 |        512 | 0.153                         |                        0.06  | 154.862        | 10985.391              |               4750.55 | 131.245           |
-|             8 |       1024 | 0.526                         |                        0.122 | 330.697        | 34175.763              |               8821.18 | 287.428           |
-|             8 |       2048 | OOM                           |                        0.122 | Eager OOM      | OOM                    |               8821.18 | Eager OOM         |
-
-### Inference
-
-|    batch_size |    seq_len |    Per token latency eager (ms) |    Per token latency SDPA (ms) |    Speedup (%) |    Mem eager (MB) |    Mem BT (MB) |    Mem saved (%) |
-|--------------:|-----------:|--------------------------------:|-------------------------------:|---------------:|------------------:|---------------:|-----------------:|
-|             1 |        128 |                          11.634 |                          8.647 |         34.546 |           717.676 |        717.674 |            0     |
-|             1 |        256 |                          11.593 |                          8.86  |         30.851 |           742.852 |        742.845 |            0.001 |
-|             1 |        512 |                          11.515 |                          8.816 |         30.614 |           798.232 |        799.593 |           -0.17  |
-|             1 |       1024 |                          11.556 |                          8.915 |         29.628 |           917.265 |        895.538 |            2.426 |
-|             2 |        128 |                          12.724 |                         11.002 |         15.659 |           762.434 |        762.431 |            0     |
-|             2 |        256 |                          12.704 |                         11.063 |         14.83  |           816.809 |        816.733 |            0.009 |
-|             2 |        512 |                          12.757 |                         10.947 |         16.535 |           917.383 |        918.339 |           -0.104 |
-|             2 |       1024 |                          13.018 |                         11.018 |         18.147 |          1162.65  |       1114.81  |            4.291 |
-|             4 |        128 |                          12.739 |                         10.959 |         16.243 |           856.335 |        856.483 |           -0.017 |
-|             4 |        256 |                          12.718 |                         10.837 |         17.355 |           957.298 |        957.674 |           -0.039 |
-|             4 |        512 |                          12.813 |                         10.822 |         18.393 |          1158.44  |       1158.45  |           -0.001 |
-|             4 |       1024 |                          13.416 |                         11.06  |         21.301 |          1653.42  |       1557.19  |            6.18  |
-|             8 |        128 |                          12.763 |                         10.891 |         17.193 |          1036.13  |       1036.51  |           -0.036 |
-|             8 |        256 |                          12.89  |                         11.104 |         16.085 |          1236.98  |       1236.87  |            0.01  |
-|             8 |        512 |                          13.327 |                         10.939 |         21.836 |          1642.29  |       1641.78  |            0.031 |
-|             8 |       1024 |                          15.181 |                         11.175 |         35.848 |          2634.98  |       2443.35  |            7.843 |
+- Refer to this [notebook](https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o?usp=sharing) for an example of fine-tuning OPT with PEFT, bitsandbytes, and Transformers.
+- The [How 🤗 Accelerate runs very large models thanks to PyTorch](https://huggingface.co/blog/accelerate-large-models) blog post demonstrates how to run OPT for inference.

 ## OPTConfig

--- a/docs/source/en/model_doc/owlv2.md
+++ b/docs/source/en/model_doc/owlv2.md
@ -106,6 +106,13 @@ Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image proce
    - post_process_object_detection
    - post_process_image_guided_detection

+## Owlv2ImageProcessorFast
+
+[[autodoc]] Owlv2ImageProcessorFast
+    - preprocess
+    - post_process_object_detection
+    - post_process_image_guided_detection
+
 ## Owlv2Processor

 [[autodoc]] Owlv2Processor
--- a/docs/source/en/model_doc/patchtsmixer.md
+++ b/docs/source/en/model_doc/patchtsmixer.md
@ -38,7 +38,7 @@ This model was contributed by [ajati](https://huggingface.co/ajati), [vijaye12](

 ## Usage example

-The code snippet below shows how to randomly initialize a PatchTSMixer model. The model is compatible with the [Trainer API](../trainer.md).
+The code snippet below shows how to randomly initialize a PatchTSMixer model. The model is compatible with the [Trainer API](../trainer).

 ```python

--- a/docs/source/en/model_doc/qwen2_moe.md
+++ b/docs/source/en/model_doc/qwen2_moe.md
@ -24,7 +24,7 @@ rendered properly in your Markdown viewer.
 # Qwen2MoE


-[Qwen2MoE]((https://huggingface.co/papers/2407.10671) ) is a Mixture-of-Experts (MoE) variant of [Qwen2](./qwen2), available as a base model and an aligned chat model. It uses SwiGLU activation, group query attention and a mixture of sliding window attention and full attention. The tokenizer can also be adapted to multiple languages and codes.
+[Qwen2MoE](https://huggingface.co/papers/2407.10671) is a Mixture-of-Experts (MoE) variant of [Qwen2](./qwen2), available as a base model and an aligned chat model. It uses SwiGLU activation, group query attention and a mixture of sliding window attention and full attention. The tokenizer can also be adapted to multiple languages and codes.

 The MoE architecture uses upcyled models from the dense language models. For example, Qwen1.5-MoE-A2.7B is upcycled from Qwen-1.8B. It has 14.3B parameters but only 2.7B parameters are activated during runtime.

--- a/docs/source/en/model_doc/segformer.md
+++ b/docs/source/en/model_doc/segformer.md
@ -128,6 +128,12 @@ If you're interested in submitting a resource to be included here, please feel f
    - preprocess
    - post_process_semantic_segmentation

+## SegformerImageProcessorFast
+
+[[autodoc]] SegformerImageProcessorFast
+    - preprocess
+    - post_process_semantic_segmentation
+
 <frameworkcontent>
 <pt>

@ -175,4 +181,4 @@ If you're interested in submitting a resource to be included here, please feel f
    - call

 </tf>
-</frameworkcontent>
+</frameworkcontent>
--- a/docs/source/en/model_doc/superglue.md
+++ b/docs/source/en/model_doc/superglue.md
@ -103,38 +103,11 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
            print(f"Keypoint at {keypoint0.numpy()} matches with keypoint at {keypoint1.numpy()} with score {matching_score}")
    ```

- The example below demonstrates how to visualize matches between two images.
+- Visualize the matches between the images using the built-in plotting functionality.

    ```py
-    import matplotlib.pyplot as plt
-    import numpy as np
-
-    # Create side by side image
-    merged_image = np.zeros((max(image1.height, image2.height), image1.width + image2.width, 3))
-    merged_image[: image1.height, : image1.width] = np.array(image1) / 255.0
-    merged_image[: image2.height, image1.width :] = np.array(image2) / 255.0
-    plt.imshow(merged_image)
-    plt.axis("off")
-
-    # Retrieve the keypoints and matches
-    output = processed_outputs[0]
-    keypoints0 = output["keypoints0"]
-    keypoints1 = output["keypoints1"]
-    matching_scores = output["matching_scores"]
-
-    # Plot the matches
-    for keypoint0, keypoint1, matching_score in zip(keypoints0, keypoints1, matching_scores):
-        plt.plot(
-            [keypoint0[0], keypoint1[0] + image1.width],
-            [keypoint0[1], keypoint1[1]],
-            color=plt.get_cmap("RdYlGn")(matching_score.item()),
-            alpha=0.9,
-            linewidth=0.5,
-        )
-        plt.scatter(keypoint0[0], keypoint0[1], c="black", s=2)
-        plt.scatter(keypoint1[0] + image1.width, keypoint1[1], c="black", s=2)
-
-    plt.savefig("matched_image.png", dpi=300, bbox_inches='tight')
+    # Easy visualization using the built-in plotting method
+    processor.visualize_keypoint_matching(images, processed_outputs)
    ```

 <div class="flex justify-center">
@ -155,6 +128,7 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size

 - preprocess
 - post_process_keypoint_matching
+- visualize_keypoint_matching

 <frameworkcontent>
 <pt>
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@ -130,6 +130,11 @@ processed_outputs = processor.post_process_keypoint_detection(outputs, [image_si

 [[autodoc]] SuperPointImageProcessor

+- preprocess
+
+## SuperPointImageProcessorFast
+
+[[autodoc]] SuperPointImageProcessorFast
 - preprocess
 - post_process_keypoint_detection

--- a/docs/source/en/model_doc/voxtral.md
+++ b/docs/source/en/model_doc/voxtral.md
@ -37,7 +37,11 @@ Voxtral builds on Ministral-3B by adding audio processing capabilities:

 ## Usage

-Let's first load the model!
+### Audio Instruct Mode
+
+The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
+
+➡️ audio + text instruction
 ```python
 from transformers import VoxtralForConditionalGeneration, AutoProcessor
 import torch
@ -47,14 +51,7 @@ repo_id = "mistralai/Voxtral-Mini-3B-2507"

 processor = AutoProcessor.from_pretrained(repo_id)
 model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
-```

-### Audio Instruct Mode
-
-The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
-
-➡️ audio + text instruction
-```python
 conversation = [
    {
        "role": "user",
@ -82,6 +79,15 @@ print("=" * 80)

 ➡️ multi-audio + text instruction 
 ```python
+from transformers import VoxtralForConditionalGeneration, AutoProcessor
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+processor = AutoProcessor.from_pretrained(repo_id)
+model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
 conversation = [
    {
        "role": "user",
@ -113,6 +119,15 @@ print("=" * 80)

 ➡️ multi-turn:
 ```python
+from transformers import VoxtralForConditionalGeneration, AutoProcessor
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+processor = AutoProcessor.from_pretrained(repo_id)
+model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
 conversation = [
    {
        "role": "user",
@ -158,6 +173,15 @@ print("=" * 80)

 ➡️ text only:
 ```python
+from transformers import VoxtralForConditionalGeneration, AutoProcessor
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+processor = AutoProcessor.from_pretrained(repo_id)
+model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
 conversation = [
    {
        "role": "user",
@ -184,6 +208,15 @@ print("=" * 80)

 ➡️ audio only:
 ```python
+from transformers import VoxtralForConditionalGeneration, AutoProcessor
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+processor = AutoProcessor.from_pretrained(repo_id)
+model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
 conversation = [
    {
        "role": "user",
@ -210,6 +243,15 @@ print("=" * 80)

 ➡️ batched inference!
 ```python
+from transformers import VoxtralForConditionalGeneration, AutoProcessor
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+processor = AutoProcessor.from_pretrained(repo_id)
+model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
 conversations = [
    [
        {
@ -262,7 +304,16 @@ for decoded_output in decoded_outputs:
 Use the model to transcribe audio (supports English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)!

 ```python
-inputs = processor.apply_transcrition_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3")
+from transformers import VoxtralForConditionalGeneration, AutoProcessor
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+processor = AutoProcessor.from_pretrained(repo_id)
+model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
+inputs = processor.apply_transcription_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3", model_id=repo_id)
 inputs = inputs.to(device, dtype=torch.bfloat16)

 outputs = model.generate(**inputs, max_new_tokens=500)
--- a/docs/source/en/model_doc/xlstm.md
+++ b/docs/source/en/model_doc/xlstm.md
@ -0,0 +1,47 @@
+<!--Copyright 2025 NXAI GmbH. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+
+# xLSTM
+
+## Overview
+
+The xLSTM model was proposed in [xLSTM: Extended Long Short-Term Memory](https://openreview.net/forum?id=ARAxPPIAhq) by Maximilian Beck*, Korbinian Pöppel*, Markus Spanring, Andreas Auer, Oleksandra Prudnikova, Michael Kopp, Günter Klambauer, Johannes Brandstetter and Sepp Hochreiter.
+xLSTM updates the original LSTM architecture to be competitive with Transformer models by introducing exponential gating, matrix memory expansion, and parallelizable training and ingestion.
+
+The [7B model](https://hf.co/NX-AI/xLSTM-7b) variant was trained by the xLSTM team Maximilian Beck, Korbinian Pöppel, Phillip Lippe, Richard Kurle, Patrick Blies, Sebastian Böck and Sepp Hochreiter at NXAI.
+
+The abstract from the paper is the following:
+
+*In the 1990s, the constant error carousel and gating were introduced as the central ideas of the Long Short-Term Memory (LSTM). Since then, LSTMs have stood the test of time and contributed to numerous deep learning success stories, in particular they constituted the first Large Language Models (LLMs). However, the advent of the Transformer technology with parallelizable self-attention at its core marked the dawn of a new era, outpacing LSTMs at scale. We now raise a simple question: How far do we get in language modeling when scaling LSTMs to billions of parameters, leveraging the latest techniques from modern LLMs, but mitigating known limitations of LSTMs? Firstly, we introduce exponential gating with appropriate normalization and stabilization techniques. Secondly, we modify the LSTM memory structure, obtaining: (i) sLSTM with a scalar memory, a scalar update, and new memory mixing, (ii) mLSTM that is fully parallelizable with a matrix memory and a covariance update rule. Integrating these LSTM extensions into residual block backbones yields xLSTM blocks that are then residually stacked into xLSTM architectures. Exponential gating and modified memory structures boost xLSTM capabilities to perform favorably when compared to state-of-the-art Transformers and State Space Models, both in performance and scaling.*
+
+This model was contributed by [NX-AI](https://huggingface.co/NX-AI).
+The original code can be found [here](https://github.com/NX-AI/xlstm).
+
+
+## xLSTMConfig
+
+[[autodoc]] xLSTMConfig
+
+## xLSTMModel
+
+[[autodoc]] xLSTMModel
+    - forward
+
+## xLSTMForCausalLM
+
+[[autodoc]] xLSTMForCausalLM
+    - forward
--- a/docs/source/en/model_doc/yolos.md
+++ b/docs/source/en/model_doc/yolos.md
@ -13,76 +13,95 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.

 -->
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>

 # YOLOS

-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
+[YOLOS](https://huggingface.co/papers/2106.00666) uses a [Vision Transformer (ViT)](./vit) for object detection with minimal modifications and region priors. It can achieve performance comparable to specialized object detection models and frameworks with knowledge about 2D spatial structures.

-## Overview

-The YOLOS model was proposed in [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://huggingface.co/papers/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
-YOLOS proposes to just leverage the plain [Vision Transformer (ViT)](vit) for object detection, inspired by DETR. It turns out that a base-sized encoder-only Transformer can also achieve 42 AP on COCO, similar to DETR and much more complex frameworks such as Faster R-CNN.
+You can find all the original YOLOS checkpoints under the [HUST Vision Lab](https://huggingface.co/hustvl/models?search=yolos) organization.

-The abstract from the paper is the following:
-
-*Can Transformer perform 2D object- and region-level recognition from a pure sequence-to-sequence perspective with minimal knowledge about the 2D spatial structure? To answer this question, we present You Only Look at One Sequence (YOLOS), a series of object detection models based on the vanilla Vision Transformer with the fewest possible modifications, region priors, as well as inductive biases of the target task. We find that YOLOS pre-trained on the mid-sized ImageNet-1k dataset only can already achieve quite competitive performance on the challenging COCO object detection benchmark, e.g., YOLOS-Base directly adopted from BERT-Base architecture can obtain 42.0 box AP on COCO val. We also discuss the impacts as well as limitations of current pre-train schemes and model scaling strategies for Transformer in vision through YOLOS.*
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/yolos_architecture.png"
-alt="drawing" width="600"/>
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/yolos_architecture.png" alt="drawing" width="600"/>

 <small> YOLOS architecture. Taken from the <a href="https://huggingface.co/papers/2106.00666">original paper</a>.</small>

-This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/hustvl/YOLOS).

-## Using Scaled Dot Product Attention (SDPA)
+> [!TIP]
+> This model was contributed by [nielsr](https://huggingface.co/nielsr).
+> Click on the YOLOS models in the right sidebar for more examples of how to apply YOLOS to different object detection tasks.

-PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function 
-encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the 
-[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
-or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
-page for more information.
+The example below demonstrates how to detect objects with [`Pipeline`] or the [`AutoModel`] class.

-SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set 
-`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-```
-from transformers import AutoModelForObjectDetection
-model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-base", attn_implementation="sdpa", torch_dtype=torch.float16)
-...
+```py
+import torch
+from transformers import pipeline
+
+detector = pipeline(
+    task="object-detection",
+    model="hustvl/yolos-base",
+    torch_dtype=torch.float16,
+    device=0
+)
+detector("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
 ```

-For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
+</hfoption>
+<hfoption id="AutoModel">

-On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `hustvl/yolos-base` model, we saw the following speedups during inference.
+```py
+import torch
+from PIL import Image
+import requests
+from transformers import AutoImageProcessor, AutoModelForObjectDetection

-|   Batch size |   Average inference time (ms), eager mode |   Average inference time (ms), sdpa model |   Speed up, Sdpa / Eager (x) |
-|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
-|            1 |                                       106 |                                        76 |                      1.39 |
-|            2 |                                       154 |                                        90 |                      1.71 |
-|            4 |                                       222 |                                       116 |                      1.91 |
-|            8 |                                       368 |                                       168 |                      2.19 |
+processor = AutoImageProcessor.from_pretrained("hustvl/yolos-base")
+model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-base", torch_dtype=torch.float16, attn_implementation="sdpa").to("cuda")
+
+url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"
+image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+inputs = processor(images=image, return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    outputs = model(**inputs)
+logits = outputs.logits.softmax(-1)
+scores, labels = logits[..., :-1].max(-1)
+boxes = outputs.pred_boxes
+
+threshold = 0.3
+keep = scores[0] > threshold
+
+filtered_scores = scores[0][keep]
+filtered_labels = labels[0][keep]
+filtered_boxes  = boxes[0][keep]
+
+width, height = image.size
+pixel_boxes = filtered_boxes * torch.tensor([width, height, width, height], device=boxes.device)
+
+for score, label, box in zip(filtered_scores, filtered_labels, pixel_boxes):
+    x0, y0, x1, y1 = box.tolist()
+    print(f"Label {model.config.id2label[label.item()]}: {score:.2f} at [{x0:.0f}, {y0:.0f}, {x1:.0f}, {y1:.0f}]")
+```
+
+</hfoption>
+</hfoptions>
+
+
+## Notes
+- Use [`YolosImageProcessor`] for preparing images (and optional targets) for the model. Contrary to [DETR](./detr), YOLOS doesn't require a `pixel_mask`.

 ## Resources

-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with YOLOS.
-
-<PipelineTag pipeline="object-detection"/>
-
- All example notebooks illustrating inference + fine-tuning [`YolosForObjectDetection`] on a custom dataset can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/YOLOS).
- Scripts for finetuning [`YolosForObjectDetection`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection).
- See also: [Object detection task guide](../tasks/object_detection)
-
-If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
-
-<Tip>
-
-Use [`YolosImageProcessor`] for preparing images (and optional targets) for the model. Contrary to [DETR](detr), YOLOS doesn't require a `pixel_mask` to be created.
-
-</Tip>
+- Refer to these [notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/YOLOS) for inference and fine-tuning with [`YolosForObjectDetection`] on a custom dataset.

 ## YolosConfig

--- a/docs/source/en/model_doc/zamba.md
+++ b/docs/source/en/model_doc/zamba.md
@ -69,11 +69,11 @@ print(tokenizer.decode(outputs[0]))
 ## Model card

 The model cards can be found at:
-* [Zamba-7B](MODEL_CARD_ZAMBA-7B-v1.md)
+* [Zamba-7B](https://huggingface.co/Zyphra/Zamba-7B-v1)


 ## Issues
-For issues with model output, or community discussion, please use the Hugging Face community [forum](https://huggingface.co/zyphra/zamba-7b)
+For issues with model output, or community discussion, please use the Hugging Face community [forum](https://huggingface.co/Zyphra/Zamba-7B-v1/discussions)


 ## License
--- a/docs/source/en/model_sharing.md
+++ b/docs/source/en/model_sharing.md
@ -28,7 +28,7 @@ To share a model to the Hub, you need a Hugging Face [account](https://hf.co/joi
 <hfoption id="huggingface-CLI">

 ```bash
-huggingface-cli login
+hf auth login
 ```

 </hfoption>
--- a/docs/source/en/modular_transformers.md
+++ b/docs/source/en/modular_transformers.md
@ -94,7 +94,7 @@ ValueError: You defined `RobertaEmbeddings` in the modular_roberta.py, it should

 ## Implementing a modular file

-The easiest way to start is by browsing Transformers for a model similar to yours in order to inherit from it. Some good starting points are [Mistral](./model_doc/mistral), [Qwen2](./model_doc/qwen2), [Cohere](./model_doc/cohere) and [Cohere](./model_doc/cohere2), and [Llama](./model_doc/llama). Refer to the table below for components your model might be using and where you can inherit from.
+The easiest way to start is by browsing Transformers for a model similar to yours in order to inherit from it. Some good starting points are [Mistral](./model_doc/mistral), [Qwen2](./model_doc/qwen2), [Cohere](./model_doc/cohere) and [Cohere2](./model_doc/cohere2), and [Llama](./model_doc/llama). Refer to the table below for components your model might be using and where you can inherit from.

 | Component | Model |
 |---|---|
--- a/docs/source/en/open_webui.md
+++ b/docs/source/en/open_webui.md
@ -0,0 +1,22 @@
+#  Audio transcriptions with WebUI and `transformers serve`
+
+This guide shows how to do audio transcription for chat purposes, using `transformers serve` and [Open WebUI](https://openwebui.com/). This guide assumes you have Open WebUI installed on your machine and ready to run. Please refer to the examples above to use the text functionalities of `transformers serve` with Open WebUI -- the instructions are the same.
+
+To start, let's launch the server. Some of Open WebUI's requests require [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/CORS), which is disabled by default for security reasons, so you need to enable it:
+
+```shell
+transformers serve --enable-cors
+```
+
+Before you can speak into Open WebUI, you need to update its settings to use your server for speech to text (STT) tasks. Launch Open WebUI, and navigate to the audio tab inside the admin settings. If you're using Open WebUI with the default ports, [this link (default)](http://localhost:3000/admin/settings/audio) or [this link (python deployment)](http://localhost:8080/admin/settings/audio) will take you there. Make the following changes there:
+1. Change the type of "Speech-to-Text Engine" to "OpenAI";
+2. Update the address to your server's address -- `http://localhost:8000/v1` by default;
+3. Type your model of choice into the "STT Model" field, e.g. `openai/whisper-large-v3` ([available models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending)).
+
+If you've done everything correctly, the audio tab should look like this:
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_openwebui_stt_settings.png"/>
+</h3>
+
+You're now ready to speak! Open a new chat, utter a few words after hitting the microphone button, and you should see the corresponding text on the chat input after the model transcribes it.
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@ -177,10 +177,16 @@ There are three supported implementations available.

 SDPA is used by default for PyTorch v2.1.1. and greater when an implementation is available. You could explicitly enable SDPA by setting `attn_implementation="sdpa"` in [`~PreTrainedModel.from_pretrained`] though. Certain attention parameters, such as `head_mask` and `output_attentions=True`, are unsupported and returns a warning that Transformers will fall back to the (slower) eager implementation.

+Refer to the [AttentionInterface](./attention_interface) guide to learn how to change the attention implementation after loading a model.
+
 ```py
 from transformers import AutoModelForCausalLM

 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto", attn_implementation="sdpa")
+
+# Change the model's attention dynamically after loading it
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto")
+model.set_attention_implementation("sdpa")
 ```

 SDPA selects the most performant implementation available, but you can also explicitly select an implementation with [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager. The example below shows how to enable the FlashAttention2 implementation with `enable_flash=True`.
@ -234,7 +240,7 @@ FlashAttention2 support is currently limited to Instinct MI210, Instinct MI250 a
 </hfoption>
 </hfoptions>

-Enable FlashAttention2 by setting `attn_implementation="flash_attention_2"` in [`~PreTrainedModel.from_pretrained`]. FlashAttention2 is only supported for models with the fp16 or bf16 torch type. Make sure to cast your model to the appropriate data type first.
+Enable FlashAttention2 by setting `attn_implementation="flash_attention_2"` in [`~PreTrainedModel.from_pretrained`] or by calling `model.set_attention_implementation("flash_attention_2")` to dynamically update the [attention interface](./attention_interface). FlashAttention2 is only supported for models with the fp16 or bf16 torch type. Make sure to cast your model to the appropriate data type first.

 ```py
 from transformers import AutoModelForCausalLM
--- a/docs/source/en/quantization/fp_quant.md
+++ b/docs/source/en/quantization/fp_quant.md
@ -0,0 +1,66 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# FP-Quant
+
+[FP-Quant](https://github.com/IST-DASLab/FP-Quant) is a family of quantization algorithms tailored for the Blackwell generation of Nvidia GPUs. The goal is to allow for efficient post-training quantization (PTQ) and quantization-aware training (QAT) of LLMs in the [MXFP4 and NVFP4 data-types](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf).
+
+Currently, only PTQ with MXFP4 is supported. Models can either be quantized on the fly with `quantization_config=FPQuantConfig()`:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, FPQuantConfig
+import torch
+
+model = AutoModelForCausalLM.from_pretrained(
+    "qwen/Qwen3-8B",
+    quantization_config=FPQuantConfig(),
+    device_map="cuda",
+    torch_dtype=torch.bfloat16,
+)
+```
+
+or pre-processed with GPTQ for better quality (see [FP Format Quantization Harness](https://github.com/IST-DASLab/FP-Quant)).
+
+A **Blackwell-generation GPU is required** to run the kernels. Runtime support for FP-Quant is implemented through the [QuTLASS](https://github.com/IST-DASLab/qutlass) library and a lightweight PyTorch interface lib [`fp_quant`](https://github.com/IST-DASLab/FP-Quant/tree/master/inference_lib). We recommend installing the former **from source** and the latter with  `pip install fp_quant`.
+
+Users **without a Blackwell-generation GPU** can use the method with `quantization_config=FPQuantConfig(pseudoquant=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This will provide no speedups but will fully emulate the effect of quantization.
+
+> [!TIP]
+> Find models pre-quantized with FP-Quant in the official ISTA-DASLab [collection](https://huggingface.co/collections/ISTA-DASLab/fp-quant-6877c186103a21d3a02568ee).
+
+## torch.compile
+
+FP-Quant is fully compatible with [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html).
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, FPQuantConfig
+
+model = AutoModelForCausalLM.from_pretrained(
+    "qwen/Qwen3-8B",
+    quantization_config=FPQuantConfig(),
+    device_map="cuda",
+    torch_dtype=torch.bfloat16,
+)
+
+model.forward = torch.compile(model.forward, mode="max-autotune", fullgraph=True)
+```
+
+## Speedups
+
+FP-Quant currently performs best for very large batch size processing.
+
+See [QuTLASS README](https://github.com/IST-DASLab/qutlass/blob/main/README.md) for speedups.
--- a/docs/source/en/quantization/overview.md
+++ b/docs/source/en/quantization/overview.md
@ -30,6 +30,7 @@ Use the Space below to help you pick a quantization method depending on your har
 | [bitsandbytes](./bitsandbytes)            | 🟢                   | 🟡 |     🟢     | 🟡 | 🔴                    | 🟡 | 🟢 | 4/8          | 🟢               | 🟢                          | 🟢                      | https://github.com/bitsandbytes-foundation/bitsandbytes |
 | [compressed-tensors](./compressed_tensors) | 🔴                   | 🟢              |     🟢     | 🟢        | 🔴                                 | 🔴              | 🔴              | 1/8          | 🟢               | 🟢                          | 🟢                      | https://github.com/neuralmagic/compressed-tensors |
 | [EETQ](./eetq)                            | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | ?               | 8            | 🟢               | 🟢                          | 🟢                      | https://github.com/NetEase-FuXi/EETQ        |
+| [FP-Quant](./fp_quant)                          | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | 🟢              | 4           | 🔴               | 🟢                          | 🟢                      | https://github.com/IST-DASLab/FP-Quant      |
 | [GGUF / GGML (llama.cpp)](../gguf)        | 🟢                   | 🟢              | 🟢        | 🔴        | 🟢                                 | 🔴              | 🔴              | 1/8          | 🔴               | [See Notes](../gguf)     | [See Notes](../gguf) | https://github.com/ggerganov/llama.cpp      |
 | [GPTQModel](./gptq)                       | 🔴                   | 🟢 | 🟢        | 🟢        | 🟢                                 | 🟢 | 🔴              | 2/3/4/8      | 🟢               | 🟢                          | 🟢                      | https://github.com/ModelCloud/GPTQModel        |
 | [AutoGPTQ](./gptq)                        | 🔴                   | 🔴              | 🟢        | 🟢        | 🔴                                 | 🔴              | 🔴              | 2/3/4/8      | 🟢               | 🟢                          | 🟢                      | https://github.com/AutoGPTQ/AutoGPTQ        |
--- a/docs/source/en/quantization/spqr.md
+++ b/docs/source/en/quantization/spqr.md
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

 # SpQR

-The [SpQR]((https://hf.co/papers/2306.03078)) quantization algorithm involves a 16x16 tiled bi-level group 3-bit quantization structure with sparse outliers.
+The [SpQR](https://hf.co/papers/2306.03078) quantization algorithm involves a 16x16 tiled bi-level group 3-bit quantization structure with sparse outliers.

 <div class="flex justify-center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/spqr-diagram.png">
--- a/docs/source/en/quicktour.md
+++ b/docs/source/en/quicktour.md
@ -49,7 +49,7 @@ notebook_login()
 Make sure the [huggingface_hub[cli]](https://huggingface.co/docs/huggingface_hub/guides/cli#getting-started) package is installed and run the command below. Paste your User Access Token when prompted to log in.

 ```bash
-huggingface-cli login
+hf auth login
 ```

 </hfoption>
--- a/Show More
+++ b/Show More